kuroko/tools/codectools/gen_sbencs.krk
HarJIT 614193b8a1
Codecs package docs, as well as some assorted tweaks or minor additions (#5)
* Add some docs, and remove second Code page 874 codec (they handled the
non-overridden C1 area differently, but we only need one).

* More docs work.

* Doc stuff.

* Adjusted.

* More tweaks (table padding is not the docstring's problem).

* CSS and docstring tweaks.

* Link from modules to parent packages and vice versa.

* More documentation.

* Docstrings for all `codecs` submodules.

* Move encode_jis7_reduced into dbextra_data_7bit (thus completing the lazy
startup, which was apparently not complete already), and add docstrings to
implementations of base-class methods referring up to the base class.

* Remove FUSE junk that somehow made it into the repo.

* Some more docstrings.

* Fix some broken references to `string` (rather than `data`) which would have
caused a problem if any existing error handler had returned a negative
offset (which no current handler does, but it's worth fixing anyway).

* Add a cp042 codec to accompany the x-user-defined codec, and to pave the
way for maybe adding Adobe Symbol, Zapf Dingbats or Wingdings codecs
in future.

* Better Japanese Autodetect behaviour for ISO-2022-JP (add yet another
condition in which it will be detected, making it able to conclusively
detect it prior to end of stream without being fed an entire escape
sequence in one call). Also some docs tweaks.

* idstr() → _idstr() since it's internal.

* Docs for codecs.pifonts.

* Docstrings for dbextra.

* Document the sbextra classes.

* Docstrings for the web encodings.

* Possibly a fairer assessment of likely reality.

* Docstrings for codecs.binascii

* The *encoding* isn't removed (the BOM is).

* Make it clearer when competing OEM code pages use different letter layouts.

* Fix copied in error.

* Stop generating a link to the non-existent "← tools" from tools.gendoc.

* Move .fuse_hidden* exclusion to my user-level config.

* Constrain the table style changes to class .markdownTable, to avoid any
effect on other interface tables generated by Doxygen.

* Refer to `__ispackage__` when generating help.
2021-04-02 16:34:10 +09:00

import json, fileio

let indices
with fileio.open("tools/codectools/indexes.json") as f:
    indices = json.loads(f.read())
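
# Build the per-encoding maps from the WHATWG index data: index position n
# corresponds to byte 0x80 + n, and entries that are None have no mapping.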
def build_sbmap(name):
    if name == "iso-8859-8-i":
        name = "iso-8859-8"
    let encoding_map = {}
    let decoding_map = {}
    for n, i in enumerate(indices[name]):
        if i != None:
            decoding_map[n + 0x80] = i
            encoding_map[i] = n + 0x80
    return (encoding_map, decoding_map)
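
# Source template for each generated codec: an IncrementalEncoder subclass, an
# IncrementalDecoder subclass, and a registration call binding all the labels.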
let template = """
class {idname}IncrementalEncoder(AsciiIncrementalEncoder):
    '''
    IncrementalEncoder implementation for {description}
    '''
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}
class {idname}IncrementalDecoder(AsciiIncrementalDecoder):
    '''
    IncrementalDecoder implementation for {description}
    '''
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}
register_kuroko_codec(
    {labels},
    {idname}IncrementalEncoder,
    {idname}IncrementalDecoder)
"""
let boilerplate = """'''
Defines WHATWG-specified single-byte encodings.
'''
# Generated by tools/codectools/gen_sbencs.krk from WHATWG encodings.json and indexes.json
from codecs.infrastructure import AsciiIncrementalEncoder, AsciiIncrementalDecoder, register_kuroko_codec, lazy_property
"""
# Places where the WHATWG encoding "name" is actually the name of a similar encoding aliased
# together. Granted, this is not very common for single byte encodings (only "KOI8-U" being
# KOI8-RU comes to mind, since e.g. Windows-1252 is named Windows-1252 with "ISO-8859-1"
# merely an alias) but it is, importantly, the rule rather than the exception for the CJK
# codecs (e.g. Windows-31J is an alias of "Shift JIS" which is actually Windows-31J).
let realname = {"koi8-u": "koi8-ru"}
# Additional labels included for Python parity
let parity_labels = {
    "windows-1250": ["1250"],
    "windows-1251": ["1251"],
    # I've assigned several of Python's aliases to the ecma-43-dv codec, but these are minor
    # misspellings of one WHATWG assigns for Windows-1252, so doing likewise to avoid confusion.
    "windows-1252": ["ansi-x3-4-1968", "ansi-x3.4-1986", "1252"],
    "windows-1253": ["1253"],
    "windows-1254": ["1254", "iso-8859-9-1989"],
    "windows-1255": ["1255"],
    "windows-1256": ["1256"],
    "windows-1257": ["1257"],
    "windows-1258": ["1258"],
    "iso-8859-2": ["iso-8859-2-1987"],
    "iso-8859-3": ["iso-8859-3-1988"],
    "iso-8859-4": ["iso-8859-4-1988"],
    "iso-8859-5": ["iso-8859-5-1988"],
    "iso-8859-6": ["iso-8859-6-1987"],
    "iso-8859-7": ["iso-8859-7-1987"],
    "iso-8859-8": ["iso-8859-8-1988"],
    "iso-8859-10": ["iso-8859-10-1992"],
    # Note Python differentiates TIS-620 and ISO-8859-11 for some reason (neither IANA nor WHATWG
    # do). Also, it aliases "iso-ir-166" to the former (since it cites TIS-620) despite it having
    # an NBSP in the registration document (case in point).
    "windows-874": ["iso-8859-11-2001", "tis620", "tis-620-0", "tis-620-2529-0",
                    "tis-620-2529-1", "iso-ir-166", "thai", "cp874"],
    "iso-8859-13": ["l7", "latin7"],
    "iso-8859-14": ["iso-8859-14-1998", "l8", "latin8", "iso-ir-199", "iso_celtic"],
    "iso-8859-15": ["latin9"],
    "iso-8859-16": ["iso-8859-16-2001", "iso8859-16", "l10", "latin10", "iso-ir-226"],
    "macintosh": ["mac-roman", "macroman"],
    "x-mac-cyrillic": ["mac-cyrillic", "maccyrillic"],
}
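
# Human-readable descriptions substituted into the generated class docstrings.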
let descriptions = {
    "windows-1250": "Windows-1250 (Central Europe)",
    "windows-1251": "Windows-1251 (Cyrillic)",
    "windows-1252": "Windows-1252 (Western Europe), ISO-8859-1 modification/extension",
    "windows-1253": "Windows-1253 (Greek)",
    "windows-1254": "Windows-1254 (Turkish), ISO-8859-9 modification/extension",
    "windows-1255": "Windows-1255 (Logical order Hebrew with vowel points)",
    "windows-1256": "Windows-1256 (Arabic)",
    "windows-1257": "Windows-1257 (Baltic Rim)",
    "windows-1258": """Windows-1258 (Vietnam), basic implementation
Note that Windows-1258 includes a mixture of composed forms and combining characters,
and that some grapheme clusters must be represented with a sequence of a composed
form and a combining character even where Unicode has a fully composed form (taken
from other encodings such as VISCII), since Windows-1258 does not include the fully
composed form and includes a combining form for only one of the diacritics.
The encoder is a simple mapping which will accept text in the form generated by the decoder
but, due to the above, some grapheme clusters will not be accepted in either NFC or NFD
normalised form. The decoder does not convert its output to any normalised form. This follows
both Python and WHATWG behaviour. Conversion of text between encodable form and either
normalised form may need to be handled in a separate step by any code using this codec.""",
    "ibm866": """OEM-866 (Russian Cyrillic).
Note: OEM-866 competed with OEM-855 for Cyrillic; OEM-866 preserved all box drawing characters
(rather than only a subset) and was more popular for Russian, but did not provide coverage
for all of the different South Slavic Cyrillic orthographies, unlike OEM-855. Their layouts
for Cyrillic are entirely different.""",
    "iso-8859-2": "ISO/IEC 8859-2 (Central European)",
    "iso-8859-3": "ISO/IEC 8859-3 (Maltese and Esperanto)",
    "iso-8859-4": "ISO/IEC 8859-4 (North European)",
    "iso-8859-5": "ISO/IEC 8859-5 (Cyrillic)",
    "iso-8859-6": "ISO/IEC 8859-6 (Arabic ASMO 708)",
    "iso-8859-7": "ISO/IEC 8859-7 (Greek ELOT 928)",
    "iso-8859-8": "ISO/IEC 8859-8 (Hebrew)",
    "iso-8859-8-i": "ISO/IEC 8859-8 (Hebrew)", # Artifact: they do the same thing inside codecs.
    "iso-8859-10": "ISO/IEC 8859-10 (Nordic)",
    "windows-874": "Windows-874 (Thai), TIS-620 / ISO-8859-11 modification/extension",
    "iso-8859-13": "ISO/IEC 8859-13 (Baltic Rim)",
    "iso-8859-14": "ISO/IEC 8859-14 (Celtic)",
    "iso-8859-15": "ISO/IEC 8859-15 (New Western European)",
    "iso-8859-16": "ISO/IEC 8859-16 (South-Eastern European; Romanian SR 14111)",
    "koi8-r": "the KOI8-R (KOI-8 Cyrillic for Russian) encoding.",
    "koi8-ru": "the KOI8-RU (KOI-8 Cyrillic for Belarusian, Ukrainian and Ruthenian) encoding.",
    "macintosh": "the Macintosh Roman encoding.",
    "x-mac-cyrillic": "the Macintosh Cyrillic encoding.",
    "x-user-defined": """the user-defined extended ASCII encoding.
This maps ASCII bytes as ASCII characters, and non-ASCII bytes to the private use
range U+F780 through U+F7FF, such that the low 8 bits always match the original byte.
This is sometimes useful for round-tripping arbitrary _sensu stricto_ extended
ASCII data without caring about the non-ASCII part. Note, however, that _sensu lato_
extended ASCII may for example use ASCII bytes as trail bytes in a multi-byte code.""",
}
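
# Precompute the x-user-defined maps: byte 0x80 + i corresponds to private-use
# code point U+F780 + i, so the low eight bits always match the original byte.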
let encode_xudef = {}
let decode_xudef = {}
for i in range(128):
    let codepoint = 0xF780 + i
    let byte = 0x80 + i
    encode_xudef[codepoint] = byte
    decode_xudef[byte] = codepoint
let all_weblabels = []
let mapped_to_replacement = []
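
# Emit modules/codecs/sbencs.krk: one encoder/decoder pair for every encoding in
# the WHATWG "Legacy single-byte encodings" section, plus the x-user-defined codec.
# Labels from the other sections are still collected for the isweblabel module.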
with fileio.open("modules/codecs/sbencs.krk", "w") as outf:
outf.write(boilerplate)
with fileio.open("tools/codectools/encodings.json") as f:
for i in json.loads(f.read()):
if i["heading"] == "Legacy single-byte encodings":
for enc in i["encodings"]:
all_weblabels.extend(enc["labels"])
let whatwgname = enc["name"].lower()
let name = realname.get(whatwgname, whatwgname)
let labels = enc["labels"]
if name in parity_labels:
labels.extend(parity_labels[name])
let built = build_sbmap(whatwgname)
let encoding_map = built[0]
let decoding_map = built[1]
let idname = name.title().replace("-", "")
outf.write(template.format(mainlabel=repr(name), encode=repr(encoding_map),
weblabel=repr(whatwgname), description=descriptions.get(name, "TODO"),
decode=repr(decoding_map), labels=repr(labels), idname=idname))
else:
for enc in i["encodings"]:
if enc["name"].lower() != "replacement":
all_weblabels.extend(enc["labels"])
else:
mapped_to_replacement.extend(enc["labels"])
outf.write(template.format(mainlabel=repr("x-user-defined"), encode=repr(encode_xudef),
weblabel=repr("x-user-defined"), description=descriptions.get("x-user-defined", "TODO"),
decode=repr(decode_xudef), labels=repr(["x-user-defined"]), idname="XUserDefined"))
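
# Emit modules/codecs/isweblabel.krk, recording every WHATWG label collected above
# and which labels WHATWG deliberately maps to the "replacement" encoding.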
with fileio.open("modules/codecs/isweblabel.krk", "w") as outf:
outf.write(f"""'''
Allows checking the WHATWG status of a given label (listed, not listed, or mapped to undefined).
'''
# Generated by tools/codectools/gen_sbencs.krk from WHATWG encodings.json
let weblabels = {all_weblabels!r}
let mapped_to_replacement = {mapped_to_replacement!r}
def map_weblabel(label):
'''
If `label` is a regular WHATWG label, returns it; if it is a label mapped to Replacement,
returns `"undefined"`; otherwise, returns `None`.
'''
if label in mapped_to_replacement:
# WHATWG aliases these to replacement to prevent their use in injection/XSS attacks.
return "undefined"
else if label in weblabels:
return label
else:
return None
""")