kuroko/tools/codectools/gen_dbdata.krk
2022-07-05 11:42:57 +09:00

289 lines
11 KiB
Python

import json
import fileio
from collections import smartrepr
# Load the WHATWG index tables; each entry of `indices` is looked up by index
# name further down (e.g. indices['jis0208']).
let indices
with fileio.open('tools/codectools/indexes.json') as f:
    let indexes_text = f.read()
    indices = json.loads(indexes_text)

# Collect the label lists of the encodings grouped under the "Legacy multi-byte"
# headings, keyed by the lower-cased encoding name.
let aliases = {}
with fileio.open('tools/codectools/encodings.json') as f:
    let sections = json.loads(f.read())
    for section in sections:
        if not section['heading'].startswith('Legacy multi-byte'):
            continue
        for encoding in section['encodings']:
            aliases[encoding['name'].lower()] = encoding['labels']
# Text written first to the generated module: the module docstring, a provenance
# comment, and the imports used by the classes emitted from the templates below.
let boilerplate = '''"""
Defines WHATWG-specified double-byte encodings which do not require dedicated implementations, and
supplies data used by those (in `codecs.bespokecodecs`) which do.
"""
# Generated by tools/codectools/gen_dbdata.krk from WHATWG encodings.json and indexes.json
from collections import xraydict
from codecs.infrastructure import AsciiIncrementalEncoder, AsciiIncrementalDecoder, register_kuroko_codec, encodesto7bit, decodesto7bit, lazy_property
'''
# Source template for one codec: an encoder class, a decoder class and the call
# that registers both. Filled in with str.format(); every placeholder receives
# text that is already valid source (repr()/smartrepr() output or an identifier),
# except {description}, which is spliced into the docstrings as-is.
#
# The class bodies and the bodies of the lazy properties must be indented in the
# emitted text, otherwise the generated module does not parse. The byte-range
# attributes are emitted as attributes of the decoder class.
# NOTE(review): nesting was reconstructed from the statement order (the ranges
# differ per codec and directly follow the decoder's property) - confirm.
let template = '''
class {idname}IncrementalEncoder(AsciiIncrementalEncoder):
    """IncrementalEncoder implementation for {description}"""
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}
class {idname}IncrementalDecoder(AsciiIncrementalDecoder):
    """IncrementalDecoder implementation for {description}"""
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}
    dbrange = {dbrange}
    tbrange = {tbrange}
    trailrange = {trailrange}
register_kuroko_codec({labels}, {idname}IncrementalEncoder, {idname}IncrementalDecoder)
'''
# Source template for Big5: two encoder classes sharing one decoder class, plus
# two registration calls (one per label set). The second encoder's map is an
# xraydict layering {encode2} over the first encoder's map. Filled in with
# str.format(); see the comment embedded in the template for the rationale.
#
# The class bodies and the bodies of the lazy properties must be indented in the
# emitted text, otherwise the generated module does not parse. The byte-range
# attributes are emitted as attributes of the decoder class.
# NOTE(review): nesting was reconstructed from the statement order - confirm.
let template_big5 = '''
class {idnameenc}IncrementalEncoder(AsciiIncrementalEncoder):
    """IncrementalEncoder implementation for {description}"""
    name = {mainlabelenc}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}
class {idnameenc2}IncrementalEncoder(AsciiIncrementalEncoder):
    """IncrementalEncoder implementation for {description2}"""
    name = {mainlabelenc2}
    html5name = None
    @lazy_property
    def encoding_map():
        return xraydict({idnameenc}IncrementalEncoder("strict").encoding_map, {encode2})
class {idnamedec}IncrementalDecoder(AsciiIncrementalDecoder):
    """IncrementalDecoder implementation for {descriptiondec}"""
    name = {mainlabeldec}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}
    dbrange = {dbrange}
    tbrange = {tbrange}
    trailrange = {trailrange}
# Our Big5 deviates from WHATWG somewhat here in that encoding to HKSCS is permitted if
# specifically the HKSCS label is specified, otherwise Big5-ETEN is used. This seems more
# sensible in a non-browser context than the WHATWG behaviour of the encoder behaving the
# same for all labels.
register_kuroko_codec({labels}, {idnameenc}IncrementalEncoder, {idnamedec}IncrementalDecoder)
register_kuroko_codec({labels2}, {idnameenc2}IncrementalEncoder, {idnamedec}IncrementalDecoder)
'''
# Decoding tables, filled in by the loops below. Keys are a single byte value or
# a tuple of byte values; values are a code point (or, for a few Big5-HKSCS
# entries, a tuple of code points).
let decode_shiftjis = {}
let decode_eucjp = {}
let decode_jis7katakana = {}
let decode_uhc = {}
let decode_gbk = {}
let decode_big5eten = {}
let decode_big5hkscs = {}

# Encoding tables, filled in by the loops below (the reverse direction).
let encode_shiftjis = {}
let encode_eucjp = {}
let encode_eucjp_extra = {}
let encode_jis7_onewaykana = {}
let encode_uhc = {}
let encode_gbk = {}
let encode_big5eten = {}
let encode_big5web = {}
let encode_big5hkscs = {}
let encode_big5hkscs_extras = {}

# Byte ranges handed to the generated decoder classes, one group per codec:
#   dbrange    - first bytes of two-byte sequences
#   tbrange    - first bytes of three-byte sequences
#   trailrange - bytes following the first byte
# (roles as implied by how the tables are built; presumably consumed by the
# decoder base class - confirm against codecs.infrastructure)

# Shift_JIS
let dbrange_shiftjis = tuple(range(0x81, 0x9F + 1)) + tuple(range(0xE0, 0xFC + 1))
let tbrange_shiftjis = ()
let trailrange_shiftjis = tuple(range(0x40, 0x7E + 1)) + tuple(range(0x80, 0xFC + 1))

# EUC-JP
let dbrange_eucjp = (0x8E,) + tuple(range(0xA1, 0xFE + 1))
let tbrange_eucjp = (0x8F,)
let trailrange_eucjp = tuple(range(0xA1, 0xFE + 1))

# UHC
let dbrange_uhc = tuple(range(0x81, 0xFE + 1))
let tbrange_uhc = ()
let trailrange_uhc = tuple(range(0x41, 0xFE + 1)) # As WHATWG define it (a superset of the actual range)

# Big5
let dbrange_big5 = tuple(range(0x81, 0xFE + 1))
let tbrange_big5 = ()
let trailrange_big5 = tuple(range(0x40, 0x7E + 1)) + tuple(range(0xA1, 0xFE + 1))
# Fill the Shift_JIS and EUC-JP tables from index jis0208.
for pointer, ucs in enumerate(indices['jis0208']):
    if ucs is None: continue
    # Shift_JIS: 188 pointers per lead byte.
    let sku = (pointer // 188) + 1
    let sten = (pointer % 188) + 1
    let first = 0x80 + sku
    if first >= 0xA0: first += 0x40 # lead bytes jump from 0x9F to 0xE0
    let second = 0x3F + sten
    if second >= 0x7F: second += 1 # 0x7F is skipped as a trail byte
    decode_shiftjis[(first,second)] = ucs
    # The first pointer found for a code point wins, and pointers 8272..8835
    # are never used for encoding.
    if ucs not in encode_shiftjis and not (8272 <= pointer and pointer <= 8835):
        encode_shiftjis[ucs] = (first, second)
    # EUC-JP: 94 pointers per lead byte; only the first 94 rows are reachable.
    let ku = (pointer // 94) + 1
    let ten = (pointer % 94) + 1
    if ku <= 94:
        let firste = 0xA0 + ku
        let seconde = 0xA0 + ten
        decode_eucjp[(firste,seconde)] = ucs
        # WHATWG defines the index pointer of a code point as the *first*
        # matching pointer, so a code point that occurs again later in the
        # index must not replace the mapping already recorded for it.
        if ucs not in encode_eucjp:
            encode_eucjp[ucs] = (firste,seconde)
# Index jis0212 becomes three-byte EUC-JP sequences introduced by 0x8F, with 94
# pointers per second byte. The encoding direction goes into a separate table
# (encode_eucjp_extra) rather than encode_eucjp.
for idx, codepoint in enumerate(indices['jis0212']):
    if codepoint is None:
        continue
    let sequence = (0x8F, 0xA1 + (idx // 94), 0xA1 + (idx % 94))
    decode_eucjp[sequence] = codepoint
    encode_eucjp_extra[codepoint] = sequence
# The 63 code points from U+FF61 onwards: a single byte (0xA1 + n) in Shift_JIS,
# that same byte after 0x8E in EUC-JP, and 0x21 + n in the 7-bit katakana table.
for offset in range(63):
    let kana = 0xFF61 + offset
    let kana_byte = 0xA1 + offset
    decode_shiftjis[kana_byte] = kana
    encode_shiftjis[kana] = kana_byte
    let euc_pair = (0x8E, kana_byte)
    decode_eucjp[euc_pair] = kana
    encode_eucjp[kana] = euc_pair
    decode_jis7katakana[0x21 + offset] = kana
    # One-way 7-bit encoding: take the EUC-JP bytes of the code point that index
    # iso-2022-jp-katakana lists at this position and subtract 0x80 from each.
    let counterpart = indices['iso-2022-jp-katakana'][offset]
    let counterpart_euc = encode_eucjp[counterpart]
    encode_jis7_onewaykana[kana] = (counterpart_euc[0] - 0x80, counterpart_euc[1] - 0x80)
# Decode-only Shift_JIS entries: the 94 * 20 pointers starting at 8836 map to
# consecutive code points starting at U+E000. Nothing is added to the encoder.
for offset in range(94 * 20):
    let idx = 8836 + offset
    let lead = 0xC1 + (idx // 188)
    let trail = 0x40 + (idx % 188)
    if trail >= 0x7F:
        trail += 1
    decode_shiftjis[(lead, trail)] = 0xE000 + offset
# Index euc-kr: 190 pointers per lead byte, lead bytes from 0x81 and trail bytes
# from 0x41, filling both directions of the UHC tables.
for idx, codepoint in enumerate(indices['euc-kr']):
    if codepoint is None:
        continue
    let pair = (0x81 + (idx // 190), 0x41 + (idx % 190))
    decode_uhc[pair] = codepoint
    encode_uhc[codepoint] = pair
# Index gb18030: 190 pointers per lead byte, trail bytes from 0x40 with 0x7F
# skipped. When a code point occurs more than once, only its first pointer is
# used for encoding.
for idx, codepoint in enumerate(indices['gb18030']):
    if codepoint is None:
        continue
    let lead = 0x81 + (idx // 190)
    let trail = 0x40 + (idx % 190)
    if trail >= 0x7F:
        trail += 1
    let pair = (lead, trail)
    decode_gbk[pair] = codepoint
    if codepoint in encode_gbk:
        continue
    encode_gbk[codepoint] = pair
# Fill the Big5 tables from index big5 (157 pointers per lead byte). Every entry
# goes into decode_big5hkscs; the encoder tables depend on which of three pointer
# regions the entry falls in:
#   pointer < 5024           (lead bytes below 0xA1): HKSCS tables only
#   5024 <= pointer < 18997  (lead bytes 0xA1..0xF9): the ETEN region
#   pointer >= 18997         (lead bytes 0xFA and up): HKSCS and "web" tables
# NOTE(review): block nesting was reconstructed from the logic, since the
# indentation was not available - confirm against the upstream file.
for pointer, ucs in enumerate(indices['big5']):
    if ucs == None: continue
    let bku = (pointer // 157) + 1
    let bten = (pointer % 157) + 1
    let first = 0x80 + bku
    let second = 0x3F + bten
    # Trail bytes run 0x40..0x7E and then continue from 0xA1.
    if second >= 0x7F: second += 0x22
    decode_big5hkscs[(first, second)] = ucs
    if pointer < 5024:
        # Unconditional assignment: a later pointer replaces an earlier one here.
        encode_big5hkscs[ucs] = (first, second)
        encode_big5hkscs_extras[ucs] = (first, second)
    else if pointer < 18997:
        decode_big5eten[(first, second)] = ucs
        # First pointer wins, except for the six listed code points, for which
        # a later pointer replaces the earlier one.
        if ucs not in encode_big5eten or ucs in [0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345]:
            encode_big5hkscs[ucs] = (first, second)
            encode_big5web[ucs] = (first, second)
            encode_big5eten[ucs] = (first, second)
            if ucs in encode_big5hkscs_extras: # i.e. a GCCS duplicate of a standard Big5 char
                del encode_big5hkscs_extras[ucs]
    else:
        # Only fills gaps: mappings recorded by the earlier regions are kept.
        if ucs not in encode_big5hkscs:
            encode_big5hkscs[ucs] = (first, second)
            encode_big5hkscs_extras[ucs] = (first, second)
        if ucs not in encode_big5web:
            encode_big5web[ucs] = (first, second)
# Encoder-only additions shared by Shift_JIS and EUC-JP: U+00A5 and U+203E are
# written as the single bytes 0x5C and 0x7E, and U+2212 reuses whatever mapping
# the table already holds for U+FF0D.
for table in [encode_shiftjis, encode_eucjp]:
    table[0xA5] = 0x5C
    table[0x203E] = 0x7E
    table[0x2212] = table[0xFF0D]

# Four Big5-HKSCS byte pairs (lead byte 0x88) stand for a two-code-point
# sequence instead of a single code point; record them in both directions.
# NOTE(review): encode_big5hkscs is not among the tables written to the output
# file further down - confirm these encoder entries reach the generated module.
for trail, sequence in [(0x62, (0xCA, 0x304)), (0x64, (0xCA, 0x30C)),
                        (0xA3, (0xEA, 0x304)), (0xA5, (0xEA, 0x30C))]:
    decode_big5hkscs[(0x88, trail)] = sequence
    encode_big5hkscs[sequence] = (0x88, trail)

# The single byte 0x80 decodes to U+20AC in the GBK table.
decode_gbk[0x80] = 0x20AC
# Write the generated module: the boilerplate header, one templated section per
# codec, then a `_MoreDBData` class holding the extra tables and a module-level
# instance of it.
with fileio.open('modules/codecs/dbdata.krk', 'w') as f:
    f.write(boilerplate)
    # Shift_JIS, registered under the WHATWG labels plus a few extra aliases.
    f.write(template.format(
        mainlabel=repr('windows-31j'),
        weblabel=repr('shift_jis'),
        labels=repr(aliases['shift_jis'] + ["cp932", "932", "mskanji", "shiftjis", "s_jis"]),
        description="Windows-31J (Shift_JIS as implemented by Microsoft).",
        encode=smartrepr(encode_shiftjis), decode=smartrepr(decode_shiftjis), idname='Windows31J',
        dbrange=repr(dbrange_shiftjis), tbrange=repr(tbrange_shiftjis),
        trailrange=repr(trailrange_shiftjis)))
    # EUC-JP.
    f.write(template.format(
        mainlabel=repr("x-euc-jp"),
        weblabel=repr("euc-jp"),
        labels=repr(aliases["euc-jp"] + ["eucjp", "ujis", "u_jis"]),
        description="EUC-JP (web version).",
        encode=smartrepr(encode_eucjp), decode=smartrepr(decode_eucjp), idname="XEucJp",
        dbrange=repr(dbrange_eucjp), tbrange=repr(tbrange_eucjp),
        trailrange=repr(trailrange_eucjp)))
    # UHC / EUC-KR.
    f.write(template.format(
        mainlabel=repr("windows-949"),
        weblabel=repr("euc-kr"),
        labels=repr(aliases["euc-kr"] + ["cp949", "949", "ms949", "uhc", "euckr",
                                         "ks_c_5601", "ksx1001", "ks_x_1001"]),
        description="Unified Hangul Code (extended EUC-KR Wansung, Microsoft's KS C 5601 encoding).",
        encode=smartrepr(encode_uhc), decode=smartrepr(decode_uhc), idname="Windows949",
        dbrange=repr(dbrange_uhc), tbrange=repr(tbrange_uhc),
        trailrange=repr(trailrange_uhc)))
    # Big5: two encoders (ETEN, and HKSCS layered on top of it), one decoder.
    f.write(template_big5.format(
        mainlabelenc=repr("big5-eten"),
        mainlabelenc2=repr("big5-hkscs"),
        mainlabeldec=repr("big5-hkscs"),
        weblabel=repr("big5"),
        description="Big-5 (ETen version).",
        description2="Big-5 (HKSCS version).",
        descriptiondec="Big-5 (HKSCS version).",
        labels=repr(["big5", "cn-big5", "csbig5", "x-x-big5", "big5-eten", "cp950", "950", "ms950"]),
        labels2=repr(["big5-hkscs", "big5hkscs", "hkscs"]),
        encode=smartrepr(encode_big5eten), idnameenc="Big5Eten",
        encode2=smartrepr(encode_big5hkscs_extras), idnameenc2="Big5Hkscs",
        decode=smartrepr(decode_big5hkscs), idnamedec="Big5Hkscs",
        dbrange=repr(dbrange_big5), tbrange=repr(tbrange_big5),
        trailrange=repr(trailrange_big5)))
    # The lines below are emitted as source text, so the indentation inside the
    # string literals is significant: class members need one level and the
    # bodies of the lazy properties need two, or the generated class does not
    # parse.
    f.write("\n# Additional data for bespoke or extra codecs")
    f.write("\nclass _MoreDBData:")
    f.write("\n    @lazy_property")
    f.write("\n    def encode_jis7():")
    f.write("\n        return xraydict(encodesto7bit(XEucJpIncrementalEncoder(\"strict\").encoding_map), {})".format(smartrepr(encode_jis7_onewaykana)))
    f.write("\n    @lazy_property")
    f.write("\n    def decode_jis7():")
    f.write("\n        return decodesto7bit(XEucJpIncrementalDecoder(\"strict\").decoding_map)")
    f.write("\n    decode_jis7katakana = {}".format(smartrepr(decode_jis7katakana)))
    f.write("\n    encode_gbk = {}".format(smartrepr(encode_gbk)))
    f.write("\n    decode_gbk = {}".format(smartrepr(decode_gbk)))
    # Re-pack each entry of index gb18030-ranges as a tuple of its first two
    # items so that it is emitted as a list of tuples.
    let ranges = indices["gb18030-ranges"]
    let rangesout = []
    for i in ranges:
        rangesout.append((i[0], i[1]))
    f.write("\n    gb_surrogate_ranges = {}".format(repr(rangesout)))
    f.write("\n    encode_eucjp_extra = {}".format(smartrepr(encode_eucjp_extra)))
    f.write("\nlet more_dbdata = _MoreDBData()")