toaruos/util/gen_wcwidth.krk
2021-10-27 19:23:05 +09:00

85 lines
2.1 KiB
Plaintext

import fileio
import os
# Local scratch copy of the Unicode East Asian Width data file.
let eaw_txt = '/tmp/EastAsianWidth.txt'
# Upstream location of the data file; always the latest published UCD.
let eaw_url = 'https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt'

# Fetch the data file with wget, overwriting any previous download.
os.system(f"wget -O '{eaw_txt}' {eaw_url}")

# Read the whole file into memory as a list of lines for the parser below.
let lines
with fileio.open(eaw_txt,'r') as f:
    lines = f.readlines()
def classify(cp, gr, ct):
    # Map one code point to the column width wcwidth() should report.
    #   cp: the code point, as an integer
    #   gr: its East Asian Width property value (N, Na, A, H, W, F)
    #   ct: the first word of the data file's trailing comment
    #       (compared against the category codes Cf, Me, Mn below)
    # Returns 0, 1 or 2 for the width, or -1 for code points with no width.

    # U+00AD (soft hyphen) is treated as a single column, ahead of
    # every other rule.
    if cp == 0xAD:
        return 1

    # C0 controls, DEL plus the C1 controls, and the surrogate range
    # have no width.
    let is_control = cp < 0x20 or (cp >= 0x7f and cp < 0xa0)
    let is_surrogate = cp >= 0xd800 and cp <= 0xdfff
    if is_control or is_surrogate:
        return -1

    # Combining / format characters, Hangul jamo in U+1160..U+11FF and
    # the zero-width space occupy no columns.
    let is_combining = ct in ['Cf','Me','Mn']
    let is_jamo = cp >= 0x1160 and cp <= 0x11FF
    if is_combining or is_jamo or cp == 0x200b:
        return 0

    # Neutral, narrow, ambiguous and half-width take one column.
    if gr in ['N','Na','A','H']:
        return 1

    # Wide and full-width take two columns.
    if gr in ['W','F']:
        return 2

    # Any other width group is treated as invalid.
    return -1
# Width class for every code point, indexed by code point.
# None means "not mentioned in the data file" until the fix-up pass below.
let classes = [None] * 0x110000

for line in lines:
    # Strip first so that indented comment lines and blank lines (which
    # still carry their newline after readlines()) are recognised.
    line = line.strip()
    if not line or line.startswith('#') or ';' not in line:
        continue

    # Data lines look like "<cp or range>;<width group> # <comment>".
    let codepoint, rest = line.split(';',1)
    # The script downloads the latest UCD, and the fields may be padded
    # with whitespace around the ';', so strip the code point field
    # before converting it.
    codepoint = codepoint.strip()

    # The trailing comment is optional; do not assume a '#' is present.
    let fields = rest.split('#',1)
    let group = fields[0].strip()
    # classify() receives the first word of the comment.
    let ctype = ''
    if len(fields) > 1:
        ctype = fields[1].strip().split(' ')[0]

    # Is this a range?
    if '..' in codepoint:
        let first, final = codepoint.split('..',1)
        first = int(f'0x{first.strip()}')
        final = int(f'0x{final.strip()}')
        # Ranges in the data file are inclusive at both ends.
        for i in range(first, final + 1):
            classes[i] = classify(i, group, ctype)
    else:
        codepoint = int(f'0x{codepoint}')
        classes[codepoint] = classify(codepoint, group, ctype)

# Anything the data file did not mention is reported as invalid (-1).
# Code point 0 is skipped here; the generated function special-cases it.
for i in range(1,0x110000):
    if classes[i] is None:
        classes[i] = -1
# Emit the generated C source on stdout: a wcwidth() built as one long
# if / else-if ladder, one rung per run of equal width classes.
print('/* Generated by util/gen_wcwidth.krk */')
print('#include <wchar.h>')
print('int wcwidth(wchar_t wc) {')
# NUL is handled up front; the table walk starts at code point 1.
print('\tif (wc == 0) return 0;')

# Width class of the run currently being accumulated.
let prev = classes[1]
for i in range(2,0x110000):
    let current = classes[i]
    if current != prev:
        # The run ended just before i: everything below i that was not
        # caught by an earlier rung has width prev.
        print(f'\telse if (wc < {hex(i)}) return {prev};')
    prev = current

# Close the final run, then reject anything past the last code point.
print(f'\telse if (wc < 0x110000) return {prev};')
print('\treturn -1;')
print('}')