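# tools/codectools/gen_dbdata.krk: generates modules/codecs/dbdata.krk from the WHATWG
# encodings.json and indexes.json files found in tools/codectools/.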
import json
|
|
import fileio
|
|
from collections import smartrepr
|
|
|
|
# WHATWG index tables, parsed from indexes.json. Declared before the `with` block so that the
# name stays visible to the rest of the script.
let indices
with fileio.open('tools/codectools/indexes.json') as indexfile:
    let rawindexes = indexfile.read()
    indices = json.loads(rawindexes)

# Lower-cased encoding name -> list of labels, taken only from those sections of
# encodings.json whose heading starts with 'Legacy multi-byte'.
let aliases = {}
with fileio.open('tools/codectools/encodings.json') as encfile:
    let sections = json.loads(encfile.read())
    for section in sections:
        if not section['heading'].startswith('Legacy multi-byte'): continue
        for entry in section['encodings']:
            let key = entry['name'].lower()
            aliases[key] = entry['labels']
|
|
|
|
# Text written verbatim at the top of the generated module: its docstring, a note naming this
# generator and its inputs, and the imports that the generated classes rely on.
let boilerplate = '''"""
Defines WHATWG-specified double-byte encodings which do not require dedicated implementations, and
supplies data used by those (in `codecs.bespokecodecs`) which do.
"""
# Generated by tools/codectools/gen_dbdata.krk from WHATWG encodings.json and indexes.json

from collections import xraydict
from codecs.infrastructure import AsciiIncrementalEncoder, AsciiIncrementalDecoder, register_kuroko_codec, encodesto7bit, decodesto7bit, lazy_property
'''
|
|
|
|
# Source template for one encoder/decoder class pair plus its codec registration.
# It is filled in with str.format(); the fields are:
#   idname                      - prefix of the two generated class names
#   description                 - text used in both class docstrings
#   mainlabel, weblabel         - already-repr()'d values for `name` and `html5name`
#   encode, decode              - source text of the encoding / decoding maps
#   dbrange, tbrange, trailrange - already-repr()'d byte ranges set on the decoder class
#   labels                      - already-repr()'d list of labels passed to register_kuroko_codec
# The class bodies must be indented inside the string: the generated file is itself
# indentation-sensitive source code, so `return` has to sit below its `def`.
let template = '''
class {idname}IncrementalEncoder(AsciiIncrementalEncoder):
    """IncrementalEncoder implementation for {description}"""
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}

class {idname}IncrementalDecoder(AsciiIncrementalDecoder):
    """IncrementalDecoder implementation for {description}"""
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}
    dbrange = {dbrange}
    tbrange = {tbrange}
    trailrange = {trailrange}

register_kuroko_codec({labels}, {idname}IncrementalEncoder, {idname}IncrementalDecoder)
'''
|
|
|
|
# Source template for the Big5 family: two encoder classes, one shared decoder class and two
# registrations. It is filled in with str.format(); the fields are:
#   idnameenc, idnameenc2, idnamedec           - prefixes of the three generated class names
#   description, description2, descriptiondec  - docstring text for each class
#   mainlabelenc, mainlabelenc2, mainlabeldec  - already-repr()'d `name` values
#   weblabel                                   - already-repr()'d `html5name` value
#   encode                                     - source text of the first encoder's map
#   encode2                                    - source text of the extra entries which the second
#                                                encoder passes to xraydict() with the first map
#   decode                                     - source text of the decoding map
#   dbrange, tbrange, trailrange               - already-repr()'d byte ranges for the decoder
#   labels, labels2                            - already-repr()'d label lists for the registrations
# The class bodies must be indented inside the string: the generated file is itself
# indentation-sensitive source code, so `return` has to sit below its `def`.
let template_big5 = '''
class {idnameenc}IncrementalEncoder(AsciiIncrementalEncoder):
    """IncrementalEncoder implementation for {description}"""
    name = {mainlabelenc}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}

class {idnameenc2}IncrementalEncoder(AsciiIncrementalEncoder):
    """IncrementalEncoder implementation for {description2}"""
    name = {mainlabelenc2}
    html5name = None
    @lazy_property
    def encoding_map():
        return xraydict({idnameenc}IncrementalEncoder("strict").encoding_map, {encode2})

class {idnamedec}IncrementalDecoder(AsciiIncrementalDecoder):
    """IncrementalDecoder implementation for {descriptiondec}"""
    name = {mainlabeldec}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}
    dbrange = {dbrange}
    tbrange = {tbrange}
    trailrange = {trailrange}

# Our Big5 deviates from WHATWG somewhat here in that encoding to HKSCS is permitted if
# specifically the HKSCS label is specified, otherwise Big5-ETEN is used. This seems more
# sensible in a non-browser context than the WHATWG behaviour of the encoder behaving the
# same for all labels.
register_kuroko_codec({labels}, {idnameenc}IncrementalEncoder, {idnamedec}IncrementalDecoder)
register_kuroko_codec({labels2}, {idnameenc2}IncrementalEncoder, {idnamedec}IncrementalDecoder)
'''
|
|
|
|
# Decoding tables (byte or byte tuple -> code point); all start empty and are filled in below.
let decode_shiftjis = {}
let decode_eucjp = {}
let decode_jis7katakana = {}
let decode_uhc = {}
let decode_gbk = {}
let decode_big5eten = {}
let decode_big5hkscs = {}

# Encoding tables (code point -> byte or byte tuple); all start empty and are filled in below.
let encode_shiftjis = {}
let encode_eucjp = {}
let encode_eucjp_extra = {}
let encode_jis7_onewaykana = {}
let encode_uhc = {}
let encode_gbk = {}
let encode_big5eten = {}
let encode_big5web = {}
let encode_big5hkscs = {}
let encode_big5hkscs_extras = {}

def _byte_span(lowest, highest):
    # Tuple of every byte value from `lowest` to `highest`, both ends included.
    return tuple(range(lowest, highest + 1))

# Byte ranges handed to the generated decoder classes as `dbrange`.
let dbrange_eucjp = (0x8E,) + _byte_span(0xA1, 0xFE)
let dbrange_shiftjis = _byte_span(0x81, 0x9F) + _byte_span(0xE0, 0xFC)
let dbrange_uhc = _byte_span(0x81, 0xFE)
let dbrange_big5 = _byte_span(0x81, 0xFE)

# Byte ranges handed to the generated decoder classes as `tbrange`; only EUC-JP has one.
let tbrange_eucjp = (0x8F,)
let tbrange_shiftjis = ()
let tbrange_uhc = ()
let tbrange_big5 = ()

# Byte ranges handed to the generated decoder classes as `trailrange`.
let trailrange_eucjp = _byte_span(0xA1, 0xFE)
let trailrange_shiftjis = _byte_span(0x40, 0x7E) + _byte_span(0x80, 0xFC)
let trailrange_uhc = _byte_span(0x41, 0xFE) # As WHATWG define it (a superset of the actual range)
let trailrange_big5 = _byte_span(0x40, 0x7E) + _byte_span(0xA1, 0xFE)
|
|
|
|
# Walk the jis0208 index once, filling the Shift_JIS and the EUC-JP tables from each entry.
for pointer, ucs in enumerate(indices['jis0208']):
    if ucs == None: continue
    # Shift_JIS addressing: 188 index entries per lead byte.
    let sku = (pointer // 188) + 1
    let sten = (pointer % 188) + 1
    let first = 0x80 + sku
    if first >= 0xA0: first += 0x40 # lead bytes continue at 0xE0 once 0x9F has been used
    let second = 0x3F + sten
    if second >= 0x7F: second += 1 # 0x7F is skipped as a trail byte
    decode_shiftjis[(first,second)] = ucs
    # Encoder: the first pointer for a code point wins, and pointers 8272 through 8835 are
    # never used for encoding.
    if ucs not in encode_shiftjis and not (8272 <= pointer and pointer <= 8835):
        encode_shiftjis[ucs] = (first, second)
    # EUC-JP addressing: 94 index entries per lead byte; entries beyond row 94 have no
    # EUC-JP representation.
    let ku = (pointer // 94) + 1
    let ten = (pointer % 94) + 1
    if ku <= 94:
        let firste = 0xA0 + ku
        let seconde = 0xA0 + ten
        decode_eucjp[(firste,seconde)] = ucs
        # The first pointer for a code point wins, matching the Shift_JIS table above and the
        # WHATWG definition of an index pointer. Letting later duplicates overwrite the entry
        # would make the encoder emit the duplicate's code rather than the first one.
        if ucs not in encode_eucjp:
            encode_eucjp[ucs] = (firste,seconde)
|
|
|
|
# jis0212 index entries become three-byte EUC-JP sequences introduced by 0x8F. They are decodable,
# but their encoder entries are kept apart in encode_eucjp_extra.
for position, codepoint in enumerate(indices['jis0212']):
    if codepoint != None:
        let row = position // 94
        let cell = position % 94
        let sequence = (0x8F, 0xA1 + row, 0xA1 + cell)
        decode_eucjp[sequence] = codepoint
        encode_eucjp_extra[codepoint] = sequence
|
|
|
|
# The 63 code points from U+FF61 onwards: one byte (0xA1 onwards) in Shift_JIS, the same byte
# after 0x8E in EUC-JP, and 0x21 onwards in the 7-bit katakana table.
for offset in range(63):
    let halfwidth = 0xFF61 + offset
    let eightbit = 0xA1 + offset
    let ss2pair = (0x8E, eightbit)
    decode_jis7katakana[0x21 + offset] = halfwidth
    decode_shiftjis[eightbit] = halfwidth
    encode_shiftjis[halfwidth] = eightbit
    decode_eucjp[ss2pair] = halfwidth
    encode_eucjp[halfwidth] = ss2pair
    # One-way encoder entry: take the EUC-JP bytes of the matching entry in the
    # iso-2022-jp-katakana index and subtract 0x80 from each of them.
    let counterpart = indices['iso-2022-jp-katakana'][offset]
    let eucpair = encode_eucjp[counterpart]
    encode_jis7_onewaykana[halfwidth] = (eucpair[0] - 0x80, eucpair[1] - 0x80)
|
|
|
|
# Decode-only Shift_JIS entries: the 94 * 20 pointers starting at 8836 map onto consecutive code
# points starting at U+E000. Nothing is added to the encoder table here.
for position in range(8836, 8836 + 94 * 20):
    let lead = 0xC1 + position // 188
    let trail = 0x40 + position % 188
    if trail >= 0x7F:
        trail += 1 # 0x7F is skipped as a trail byte
    decode_shiftjis[(lead, trail)] = 0xE000 + (position - 8836)
|
|
|
|
# euc-kr index: 190 entries per lead byte, lead bytes from 0x81 and trail bytes from 0x41.
for position, codepoint in enumerate(indices['euc-kr']):
    if codepoint != None:
        let pair = (0x81 + position // 190, 0x41 + position % 190)
        decode_uhc[pair] = codepoint
        encode_uhc[codepoint] = pair
|
|
|
|
# gb18030 index (two-byte part): 190 entries per lead byte, lead bytes from 0x81, trail bytes
# from 0x40 with 0x7F skipped. For encoding, the first pointer seen for a code point is kept.
for position, codepoint in enumerate(indices['gb18030']):
    if codepoint != None:
        let lead = 0x81 + position // 190
        let trail = 0x40 + position % 190
        if trail >= 0x7F:
            trail += 1
        let pair = (lead, trail)
        decode_gbk[pair] = codepoint
        if codepoint not in encode_gbk:
            encode_gbk[codepoint] = pair
|
|
|
|
# Build the Big5 tables from the big5 index, which holds 157 entries per lead byte.
for pointer, ucs in enumerate(indices['big5']):
    if ucs == None: continue
    let bku = (pointer // 157) + 1
    let bten = (pointer % 157) + 1
    let first = 0x80 + bku
    let second = 0x3F + bten
    # Trail bytes run from 0x40 to 0x7E and resume at 0xA1, hence the jump of 0x22.
    if second >= 0x7F: second += 0x22
    # The HKSCS decoding table receives every index entry.
    decode_big5hkscs[(first, second)] = ucs
    if pointer < 5024:
        # Pointers below 5024, i.e. lead bytes below 0xA1: only the HKSCS encoder tables get
        # these, and a later duplicate in this range replaces an earlier one.
        encode_big5hkscs[ucs] = (first, second)
        encode_big5hkscs_extras[ucs] = (first, second)
    else if pointer < 18997:
        # Pointers 5024 through 18996 also go into the ETEN decoding table.
        decode_big5eten[(first, second)] = ucs
        # The first pointer wins for encoding, except for the six listed code points, whose
        # entry is overwritten each time so that the last pointer wins.
        if ucs not in encode_big5eten or ucs in [0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345]:
            encode_big5hkscs[ucs] = (first, second)
            encode_big5web[ucs] = (first, second)
            encode_big5eten[ucs] = (first, second)
            if ucs in encode_big5hkscs_extras: # i.e. a GCCS duplicate of a standard Big5 char
                del encode_big5hkscs_extras[ucs]
    else:
        # Pointers from 18997 onwards only fill entries which no earlier pointer has claimed.
        if ucs not in encode_big5hkscs:
            encode_big5hkscs[ucs] = (first, second)
            encode_big5hkscs_extras[ucs] = (first, second)
        if ucs not in encode_big5web:
            encode_big5web[ucs] = (first, second)
|
|
|
|
# Encoder-only additions made identically to the Shift_JIS and EUC-JP tables.
for jistable in (encode_shiftjis, encode_eucjp):
    jistable[0xA5] = 0x5C # U+00A5 YEN SIGN is written as the single byte 0x5C
    jistable[0x203E] = 0x7E # U+203E OVERLINE is written as the single byte 0x7E
    jistable[0x2212] = jistable[0xFF0D] # U+2212 MINUS SIGN reuses the code of U+FF0D

# Four Big5-HKSCS codes correspond to a pair of code points (a letter followed by U+0304 or
# U+030C) rather than to a single code point; they are entered in both directions.
for hkscscode, sequence in (
        ((0x88, 0x62), (0xCA, 0x304)),
        ((0x88, 0x64), (0xCA, 0x30C)),
        ((0x88, 0xA3), (0xEA, 0x304)),
        ((0x88, 0xA5), (0xEA, 0x30C))):
    decode_big5hkscs[hkscscode] = sequence
    encode_big5hkscs[sequence] = hkscscode

# The single byte 0x80 decodes to U+20AC in GBK; no matching encoder entry is added here.
decode_gbk[0x80] = 0x20AC
|
|
|
|
# Write the generated module: the boilerplate header, one templated class pair per encoding
# (Windows-31J, EUC-JP, UHC), the Big5 family, and finally the `_MoreDBData` class holding the
# data used by the codecs which have bespoke implementations.
with fileio.open('modules/codecs/dbdata.krk', 'w') as f:
    f.write(boilerplate)
    f.write(template.format(
        mainlabel=repr('windows-31j'),
        weblabel=repr('shift_jis'),
        labels=repr(aliases['shift_jis'] + ["cp932", "932", "mskanji", "shiftjis", "s_jis"]),
        description="Windows-31J (Shift_JIS as implemented by Microsoft).",
        encode=smartrepr(encode_shiftjis), decode=smartrepr(decode_shiftjis), idname='Windows31J',
        dbrange=repr(dbrange_shiftjis), tbrange=repr(tbrange_shiftjis),
        trailrange=repr(trailrange_shiftjis)))
    f.write(template.format(
        mainlabel=repr("x-euc-jp"),
        weblabel=repr("euc-jp"),
        labels=repr(aliases["euc-jp"] + ["eucjp", "ujis", "u_jis"]),
        description="EUC-JP (web version).",
        encode=smartrepr(encode_eucjp), decode=smartrepr(decode_eucjp), idname="XEucJp",
        dbrange=repr(dbrange_eucjp), tbrange=repr(tbrange_eucjp),
        trailrange=repr(trailrange_eucjp)))
    f.write(template.format(
        mainlabel=repr("windows-949"),
        weblabel=repr("euc-kr"),
        labels=repr(aliases["euc-kr"] + ["cp949", "949", "ms949", "uhc", "euckr",
                                         "ks_c_5601", "ksx1001", "ks_x_1001"]),
        description="Unified Hangul Code (extended EUC-KR Wansung, Microsoft's KS C 5601 encoding).",
        encode=smartrepr(encode_uhc), decode=smartrepr(decode_uhc), idname="Windows949",
        dbrange=repr(dbrange_uhc), tbrange=repr(tbrange_uhc),
        trailrange=repr(trailrange_uhc)))
    f.write(template_big5.format(
        mainlabelenc=repr("big5-eten"),
        mainlabelenc2=repr("big5-hkscs"),
        mainlabeldec=repr("big5-hkscs"),
        weblabel=repr("big5"),
        description="Big-5 (ETen version).",
        description2="Big-5 (HKSCS version).",
        descriptiondec="Big-5 (HKSCS version).",
        labels=repr(["big5", "cn-big5", "csbig5", "x-x-big5", "big5-eten", "cp950", "950", "ms950"]),
        labels2=repr(["big5-hkscs", "big5hkscs", "hkscs"]),
        encode=smartrepr(encode_big5eten), idnameenc="Big5Eten",
        encode2=smartrepr(encode_big5hkscs_extras), idnameenc2="Big5Hkscs",
        decode=smartrepr(decode_big5hkscs), idnamedec="Big5Hkscs",
        dbrange=repr(dbrange_big5), tbrange=repr(tbrange_big5),
        trailrange=repr(trailrange_big5)))
    # Everything below is emitted as the body of one class, so each emitted line carries the
    # indentation which the generated source needs: four spaces for class members and eight for
    # the statements inside the two methods.
    f.write("\n# Additional data for bespoke or extra codecs")
    f.write("\nclass _MoreDBData:")
    f.write("\n    @lazy_property")
    f.write("\n    def encode_jis7():")
    f.write("\n        return xraydict(encodesto7bit(XEucJpIncrementalEncoder(\"strict\").encoding_map), {})".format(smartrepr(encode_jis7_onewaykana)))
    f.write("\n    @lazy_property")
    f.write("\n    def decode_jis7():")
    f.write("\n        return decodesto7bit(XEucJpIncrementalDecoder(\"strict\").decoding_map)")
    f.write("\n    decode_jis7katakana = {}".format(smartrepr(decode_jis7katakana)))
    f.write("\n    encode_gbk = {}".format(smartrepr(encode_gbk)))
    f.write("\n    decode_gbk = {}".format(smartrepr(decode_gbk)))
    # Only the first two items of each gb18030-ranges entry are kept, as a tuple.
    let ranges = indices["gb18030-ranges"]
    let rangesout = []
    for i in ranges:
        rangesout.append((i[0], i[1]))
    f.write("\n    gb_surrogate_ranges = {}".format(repr(rangesout)))
    f.write("\n    encode_eucjp_extra = {}".format(smartrepr(encode_eucjp_extra)))
    f.write("\nlet more_dbdata = _MoreDBData()")
|
|
|
|
|