kuroko/tools/codectools/gen_sbencs.krk
HarJIT 614193b8a1
Codecs package docs, as well as some assorted tweaks or minor additions (#5)
* Add some docs, and remove second Code page 874 codec (they handled the
non-overridden C1 area differently, but we only need one).

* More docs work.

* Doc stuff.

* Adjusted.

* More tweaks (table padding is not the docstring's problem).

* CSS and docstring tweaks.

* Link from modules to parent packages and vice versa.

* More documentation.

* Docstrings for all `codecs` submodules.

* Move encode_jis7_reduced into dbextra_data_7bit (thus completing the lazy
startup, which was apparently not complete already), and add docstrings to
implementations of base-class methods referring up to the base class.

* Remove FUSE junk that somehow made it into the repo.

* Some more docstrings.

* Fix some broken references to `string` (rather than `data`) which would have
caused a problem if any existing error handler had returned a negative
offset (which no current handler does, but it's worth fixing anyway).

* Add a cp042 codec to accompany the x-user-defined codec, and to pave the
way for maybe adding Adobe Symbol, Zapf Dingbats or Wingdings codecs
in future.

* Better Japanese Autodetect behaviour for ISO-2022-JP (add yet another
condition in which it will be detected, making it able to conclusively
detect it prior to end of stream without being fed an entire escape
sequence in one call). Also some docs tweaks.

* idstr() → _idstr() since it's internal.

* Docs for codecs.pifonts.

* Docstrings for dbextra.

* Document the sbextra classes.

* Docstrings for the web encodings.

* Possibly a fairer assessment of likely reality.

* Docstrings for codecs.binascii

* The *encoding* isn't removed (the BOM is).

* Make it clearer when competing OEM code pages use different letter layouts.

* Fix copied in error.

* Stop generating a link to the non-existent "← tools" from tools.gendoc.

* Move .fuse_hidden* exclusion to my user-level config.

* Constrain the table style changes to class .markdownTable, to avoid any
effect on other interface tables generated by Doxygen.

* Refer to `__ispackage__` when generating help.
2021-04-02 16:34:10 +09:00

import json, fileio

let indices
with fileio.open("tools/codectools/indexes.json") as f:
    indices = json.loads(f.read())
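
# Build the per-encoding maps from the WHATWG index data: index position n
# corresponds to byte 0x80 + n, and entries that are None have no mapping.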
def build_sbmap(name):
    if name == "iso-8859-8-i":
        name = "iso-8859-8"
    let encoding_map = {}
    let decoding_map = {}
    for n, i in enumerate(indices[name]):
        if i != None:
            decoding_map[n + 0x80] = i
            encoding_map[i] = n + 0x80
    return (encoding_map, decoding_map)
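
# Source template for each generated codec: an IncrementalEncoder subclass, an
# IncrementalDecoder subclass, and a registration call binding all the labels.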
let template = """
class {idname}IncrementalEncoder(AsciiIncrementalEncoder):
    '''
    IncrementalEncoder implementation for {description}
    '''
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}
class {idname}IncrementalDecoder(AsciiIncrementalDecoder):
    '''
    IncrementalDecoder implementation for {description}
    '''
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}
register_kuroko_codec(
    {labels},
    {idname}IncrementalEncoder,
    {idname}IncrementalDecoder)
"""
let boilerplate = """'''
Defines WHATWG-specified single-byte encodings.
'''
# Generated by tools/codectools/gen_sbencs.krk from WHATWG encodings.json and indexes.json
from codecs.infrastructure import AsciiIncrementalEncoder, AsciiIncrementalDecoder, register_kuroko_codec, lazy_property
"""
# Places where the WHATWG encoding "name" is actually the name of a similar encoding aliased
# together. Granted, this is not very common for single byte encodings (only "KOI8-U" being
# KOI8-RU comes to mind, since e.g. Windows-1252 is named Windows-1252 with "ISO-8859-1"
# merely an alias) but it is, importantly, the rule rather than the exception for the CJK
# codecs (e.g. Windows-31J is an alias of "Shift JIS" which is actually Windows-31J).
let realname = {"koi8-u": "koi8-ru"}
# Additional labels included for Python parity
let parity_labels = {
    "windows-1250": ["1250"],
    "windows-1251": ["1251"],
    # I've assigned several of Python's aliases to the ecma-43-dv codec, but these are minor
    # misspellings of one WHATWG assigns for Windows-1252, so doing likewise to avoid confusion.
    "windows-1252": ["ansi-x3-4-1968", "ansi-x3.4-1986", "1252"],
    "windows-1253": ["1253"],
    "windows-1254": ["1254", "iso-8859-9-1989"],
    "windows-1255": ["1255"],
    "windows-1256": ["1256"],
    "windows-1257": ["1257"],
    "windows-1258": ["1258"],
    "iso-8859-2": ["iso-8859-2-1987"],
    "iso-8859-3": ["iso-8859-3-1988"],
    "iso-8859-4": ["iso-8859-4-1988"],
    "iso-8859-5": ["iso-8859-5-1988"],
    "iso-8859-6": ["iso-8859-6-1987"],
    "iso-8859-7": ["iso-8859-7-1987"],
    "iso-8859-8": ["iso-8859-8-1988"],
    "iso-8859-10": ["iso-8859-10-1992"],
    # Note Python differentiates TIS-620 and ISO-8859-11 for some reason (neither IANA nor WHATWG
    # do). Also, it aliases "iso-ir-166" to the former (since it cites TIS-620) despite it having
    # an NBSP in the registration document (case in point).
    "windows-874": ["iso-8859-11-2001", "tis620", "tis-620-0", "tis-620-2529-0",
                    "tis-620-2529-1", "iso-ir-166", "thai", "cp874"],
    "iso-8859-13": ["l7", "latin7"],
    "iso-8859-14": ["iso-8859-14-1998", "l8", "latin8", "iso-ir-199", "iso_celtic"],
    "iso-8859-15": ["latin9"],
    "iso-8859-16": ["iso-8859-16-2001", "iso8859-16", "l10", "latin10", "iso-ir-226"],
    "macintosh": ["mac-roman", "macroman"],
    "x-mac-cyrillic": ["mac-cyrillic", "maccyrillic"],
}
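
# Human-readable descriptions substituted into the generated class docstrings.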
let descriptions = {
    "windows-1250": "Windows-1250 (Central Europe)",
    "windows-1251": "Windows-1251 (Cyrillic)",
    "windows-1252": "Windows-1252 (Western Europe), ISO-8859-1 modification/extension",
    "windows-1253": "Windows-1253 (Greek)",
    "windows-1254": "Windows-1254 (Turkish), ISO-8859-9 modification/extension",
    "windows-1255": "Windows-1255 (Logical order Hebrew with vowel points)",
    "windows-1256": "Windows-1256 (Arabic)",
    "windows-1257": "Windows-1257 (Baltic Rim)",
    "windows-1258": """Windows-1258 (Vietnam), basic implementation
Note that Windows-1258 includes a mixture of composed forms and combining characters,
and that some grapheme clusters must be represented with a sequence of a composed
form and a combining character even where Unicode has a fully composed form (taken
from other encodings such as VISCII), since Windows-1258 does not include the fully
composed form and includes a combining form for only one of the diacritics.
The encoder is a simple mapping which will accept text in the form generated by the decoder
but, due to the above, some grapheme clusters will not be accepted in either NFC or NFD
normalised form. The decoder does not convert its output to any normalised form. This follows
both Python and WHATWG behaviour. Conversion of text between encodable form and either
normalised form may need to be handled in a separate step by any code using this codec.""",
    "ibm866": """OEM-866 (Russian Cyrillic).
Note: OEM-866 competed with OEM-855 for Cyrillic; OEM-866 preserved all box drawing characters
(rather than only a subset) and was more popular for Russian, but did not provide coverage
for all of the different South Slavic Cyrillic orthographies, unlike OEM-855. Their layouts
for Cyrillic are entirely different.""",
    "iso-8859-2": "ISO/IEC 8859-2 (Central European)",
    "iso-8859-3": "ISO/IEC 8859-3 (Maltese and Esperanto)",
    "iso-8859-4": "ISO/IEC 8859-4 (North European)",
    "iso-8859-5": "ISO/IEC 8859-5 (Cyrillic)",
    "iso-8859-6": "ISO/IEC 8859-6 (Arabic ASMO 708)",
    "iso-8859-7": "ISO/IEC 8859-7 (Greek ELOT 928)",
    "iso-8859-8": "ISO/IEC 8859-8 (Hebrew)",
    "iso-8859-8-i": "ISO/IEC 8859-8 (Hebrew)", # Artifact: they do the same thing inside codecs.
    "iso-8859-10": "ISO/IEC 8859-10 (Nordic)",
    "windows-874": "Windows-874 (Thai), TIS-620 / ISO-8859-11 modification/extension",
    "iso-8859-13": "ISO/IEC 8859-13 (Baltic Rim)",
    "iso-8859-14": "ISO/IEC 8859-14 (Celtic)",
    "iso-8859-15": "ISO/IEC 8859-15 (New Western European)",
    "iso-8859-16": "ISO/IEC 8859-16 (South-Eastern European; Romanian SR 14111)",
    "koi8-r": "the KOI8-R (KOI-8 Cyrillic for Russian) encoding.",
    "koi8-ru": "the KOI8-RU (KOI-8 Cyrillic for Belarusian, Ukrainian and Ruthenian) encoding.",
    "macintosh": "the Macintosh Roman encoding.",
    "x-mac-cyrillic": "the Macintosh Cyrillic encoding.",
    "x-user-defined": """the user-defined extended ASCII encoding.
This maps ASCII bytes as ASCII characters, and non-ASCII bytes to the private use
range U+F780 through U+F7FF, such that the low 8 bits always match the original byte.
This is sometimes useful for round-tripping arbitrary _sensu stricto_ extended
ASCII data without caring about the non-ASCII part. Note, however, that _sensu lato_
extended ASCII may for example use ASCII bytes as trail bytes in a multi-byte code.""",
}
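
# Precompute the x-user-defined maps: byte 0x80 + i corresponds to private-use
# code point U+F780 + i, so the low eight bits always match the original byte.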
let encode_xudef = {}
let decode_xudef = {}
for i in range(128):
    let codepoint = 0xF780 + i
    let byte = 0x80 + i
    encode_xudef[codepoint] = byte
    decode_xudef[byte] = codepoint
let all_weblabels = []
let mapped_to_replacement = []
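
# Emit modules/codecs/sbencs.krk: one encoder/decoder pair for every encoding in
# the WHATWG "Legacy single-byte encodings" section, plus the x-user-defined codec.
# Labels from the other sections are still collected for the isweblabel module.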
with fileio.open("modules/codecs/sbencs.krk", "w") as outf:
outf.write(boilerplate)
with fileio.open("tools/codectools/encodings.json") as f:
for i in json.loads(f.read()):
if i["heading"] == "Legacy single-byte encodings":
for enc in i["encodings"]:
all_weblabels.extend(enc["labels"])
let whatwgname = enc["name"].lower()
let name = realname.get(whatwgname, whatwgname)
let labels = enc["labels"]
if name in parity_labels:
labels.extend(parity_labels[name])
let built = build_sbmap(whatwgname)
let encoding_map = built[0]
let decoding_map = built[1]
let idname = name.title().replace("-", "")
outf.write(template.format(mainlabel=repr(name), encode=repr(encoding_map),
weblabel=repr(whatwgname), description=descriptions.get(name, "TODO"),
decode=repr(decoding_map), labels=repr(labels), idname=idname))
else:
for enc in i["encodings"]:
if enc["name"].lower() != "replacement":
all_weblabels.extend(enc["labels"])
else:
mapped_to_replacement.extend(enc["labels"])
outf.write(template.format(mainlabel=repr("x-user-defined"), encode=repr(encode_xudef),
weblabel=repr("x-user-defined"), description=descriptions.get("x-user-defined", "TODO"),
decode=repr(decode_xudef), labels=repr(["x-user-defined"]), idname="XUserDefined"))
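
# Emit modules/codecs/isweblabel.krk, recording every WHATWG label collected above
# and which labels WHATWG deliberately maps to the "replacement" encoding.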
with fileio.open("modules/codecs/isweblabel.krk", "w") as outf:
outf.write(f"""'''
Allows checking the WHATWG status of a given label (listed, not listed, or mapped to undefined).
'''
# Generated by tools/codectools/gen_sbencs.krk from WHATWG encodings.json
let weblabels = {all_weblabels!r}
let mapped_to_replacement = {mapped_to_replacement!r}
def map_weblabel(label):
'''
If `label` is a regular WHATWG label, returns it; if it is a label mapped to Replacement,
returns `"undefined"`; otherwise, returns `None`.
'''
if label in mapped_to_replacement:
# WHATWG aliases these to replacement to prevent their use in injection/XSS attacks.
return "undefined"
else if label in weblabels:
return label
else:
return None
""")