614193b8a1
* Add some docs, and remove the second code page 874 codec (they handled the non-overridden C1 area differently, but we only need one).
* More docs work.
* Doc stuff.
* Adjusted.
* More tweaks (table padding is not the docstring's problem).
* CSS and docstring tweaks.
* Link from modules to parent packages and vice versa.
* More documentation.
* Docstrings for all `codecs` submodules.
* Move encode_jis7_reduced into dbextra_data_7bit (thus completing the lazy startup, which was apparently not complete already), and add docstrings to implementations of base class methods referring up to the base class.
* Remove FUSE junk that somehow made it into the repo.
* Some more docstrings.
* Fix some broken references to `string` (rather than `data`) which would have caused a problem if any existing error handler had returned a negative offset (no current handler does, but it's worth fixing anyway).
* Add a cp042 codec to accompany the x-user-defined codec, and to pave the way for maybe adding Adobe Symbol, Zapf Dingbats or Wingdings codecs in future.
* Better Japanese autodetect behaviour for ISO-2022-JP (add yet another condition in which it will be detected, making it able to conclusively detect it before end of stream without being fed an entire escape sequence in one call). Also some docs tweaks.
* idstr() → _idstr(), since it's internal.
* Docs for codecs.pifonts.
* Docstrings for dbextra.
* Document the sbextra classes.
* Docstrings for the web encodings.
* Possibly a fairer assessment of likely reality.
* Docstrings for codecs.binascii.
* The *encoding* isn't removed (the BOM is).
* Make it clearer when competing OEM code pages use different letter layouts.
* Fix copied in error.
* Stop generating a link to the non-existent "← tools" from tools.gendoc.
* Move the .fuse_hidden* exclusion to my user-level config.
* Constrain the table style changes to class .markdownTable, to avoid any effect on other interface tables generated by Doxygen.
* Refer to `__ispackage__` when generating help.
import json, fileio

let indices
with fileio.open("tools/codectools/indexes.json") as f:
    indices = json.loads(f.read())
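
# Note (assumption about the input format, matching the loop below): for each single-byte
# encoding, WHATWG's indexes.json gives a 128-entry array of code points covering bytes
# 0x80-0xFF, with null entries for unmapped bytes.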

def build_sbmap(name):
    if name == "iso-8859-8-i":
        name = "iso-8859-8"
    let encoding_map = {}
    let decoding_map = {}
    for n, i in enumerate(indices[name]):
        if i != None:
            decoding_map[n + 0x80] = i
            encoding_map[i] = n + 0x80
    return (encoding_map, decoding_map)
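
# For illustration (values abridged, not executed by this script): build_sbmap("ibm866")
# returns a mirrored pair of dicts, e.g. the decoding map entry 0x80 -> 0x410 (CYRILLIC
# CAPITAL LETTER A) corresponds to the encoding map entry 0x410 -> 0x80.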

let template = """
class {idname}IncrementalEncoder(AsciiIncrementalEncoder):
    '''
    IncrementalEncoder implementation for {description}
    '''
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def encoding_map():
        return {encode}

class {idname}IncrementalDecoder(AsciiIncrementalDecoder):
    '''
    IncrementalDecoder implementation for {description}
    '''
    name = {mainlabel}
    html5name = {weblabel}
    @lazy_property
    def decoding_map():
        return {decode}

register_kuroko_codec(
    {labels},
    {idname}IncrementalEncoder,
    {idname}IncrementalDecoder)
"""

let boilerplate = """'''
Defines WHATWG-specified single-byte encodings.
'''
# Generated by tools/codectools/gen_sbencs.krk from WHATWG encodings.json and indexes.json

from codecs.infrastructure import AsciiIncrementalEncoder, AsciiIncrementalDecoder, register_kuroko_codec, lazy_property
"""

# Places where the WHATWG encoding "name" is actually the name of a similar encoding that is
# aliased together with it. Granted, this is not very common for single-byte encodings (only
# "KOI8-U" being KOI8-RU comes to mind, since e.g. Windows-1252 is named Windows-1252 with
# "ISO-8859-1" merely an alias), but it is, importantly, the rule rather than the exception
# for the CJK codecs (e.g. Windows-31J is an alias of "Shift JIS", which is actually
# Windows-31J).
let realname = {"koi8-u": "koi8-ru"}

# Additional labels included for Python parity
let parity_labels = {
    "windows-1250": ["1250"],
    "windows-1251": ["1251"],
    # I've assigned several of Python's aliases to the ecma-43-dv codec, but these are minor
    # misspellings of a label WHATWG assigns to Windows-1252, so doing likewise here avoids
    # confusion.
    "windows-1252": ["ansi-x3-4-1968", "ansi-x3.4-1986", "1252"],
    "windows-1253": ["1253"],
    "windows-1254": ["1254", "iso-8859-9-1989"],
    "windows-1255": ["1255"],
    "windows-1256": ["1256"],
    "windows-1257": ["1257"],
    "windows-1258": ["1258"],
    "iso-8859-2": ["iso-8859-2-1987"],
    "iso-8859-3": ["iso-8859-3-1988"],
    "iso-8859-4": ["iso-8859-4-1988"],
    "iso-8859-5": ["iso-8859-5-1988"],
    "iso-8859-6": ["iso-8859-6-1987"],
    "iso-8859-7": ["iso-8859-7-1987"],
    "iso-8859-8": ["iso-8859-8-1988"],
    "iso-8859-10": ["iso-8859-10-1992"],
    # Note that Python differentiates TIS-620 and ISO-8859-11 for some reason (neither IANA
    # nor WHATWG does). It also aliases "iso-ir-166" to the former (since its registration
    # cites TIS-620), despite the registration document including an NBSP (case in point).
    "windows-874": ["iso-8859-11-2001", "tis620", "tis-620-0", "tis-620-2529-0",
        "tis-620-2529-1", "iso-ir-166", "thai", "cp874"],
    "iso-8859-13": ["l7", "latin7"],
    "iso-8859-14": ["iso-8859-14-1998", "l8", "latin8", "iso-ir-199", "iso_celtic"],
    "iso-8859-15": ["latin9"],
    "iso-8859-16": ["iso-8859-16-2001", "iso8859-16", "l10", "latin10", "iso-ir-226"],
    "macintosh": ["mac-roman", "macroman"],
    "x-mac-cyrillic": ["mac-cyrillic", "maccyrillic"],
}

let descriptions = {
    "windows-1250": "Windows-1250 (Central Europe)",
    "windows-1251": "Windows-1251 (Cyrillic)",
    "windows-1252": "Windows-1252 (Western Europe), ISO-8859-1 modification/extension",
    "windows-1253": "Windows-1253 (Greek)",
    "windows-1254": "Windows-1254 (Turkish), ISO-8859-9 modification/extension",
    "windows-1255": "Windows-1255 (Logical order Hebrew with vowel points)",
    "windows-1256": "Windows-1256 (Arabic)",
    "windows-1257": "Windows-1257 (Baltic Rim)",
"windows-1258": """Windows-1258 (Vietnam), basic implementation
|
||
|
||
Note that Windows-1258 includes a mixture of composed forms and combining characters,
|
||
and that some grapheme clusters must be represented with a sequence of a composed
|
||
form and a combining character, even though a fully composed form exists in Unicode
|
||
taken from other encodings such as VISCII, since a fully composed form is not included,
|
||
and a combining form is included for only one of the diacritics.
|
||
|
||
The encoder is a simple mapping which will accept text in the form generated by the decoder
|
||
but, due to the above, some grapheme clusters will not be accepted in either NFC or NFD
|
||
normalised form. The decoder does not convert its output to any normalised form. This follows
|
||
both Python and WHATWG behaviour. Conversion of text between encodable form and either
|
||
normalised form may need to be handled in a separate step by any code using this codec.""",
|
||
"ibm866": """OEM-866 (Russian Cyrillic).
|
||
|
||
Note: OEM-866 competed with OEM-855 for Cyrillic; OEM-866 preserved all box drawing characters
|
||
(rather then only a subset) and was more popular for Russian, but did not provide coverage
|
||
for all of the different South Slavic Cyrillic orthographies, unlike OEM-855. Their layouts
|
||
for Cyrillic are entirely different.""",
|
||
"iso-8859-2": "ISO/IEC 8859-2 (Central European)",
|
||
"iso-8859-3": "ISO/IEC 8859-3 (Maltese and Esperanto)",
|
||
"iso-8859-4": "ISO/IEC 8859-4 (North European)",
|
||
"iso-8859-5": "ISO/IEC 8859-5 (Cyrillic)",
|
||
"iso-8859-6": "ISO/IEC 8859-6 (Arabic ASMO 708)",
|
||
"iso-8859-7": "ISO/IEC 8859-7 (Greek ELOT 928)",
|
||
"iso-8859-8": "ISO/IEC 8859-8 (Hebrew)",
|
||
"iso-8859-8-i": "ISO/IEC 8859-8 (Hebrew)", # Artifact: they do the same thing inside codecs.
|
||
"iso-8859-10": "ISO/IEC 8859-10 (Nordic)",
|
||
"windows-874": "Windows-874 (Thai), TIS-620 / ISO-8859-11 modification/extension",
|
||
"iso-8859-13": "ISO/IEC 8859-13 (Baltic Rim)",
|
||
"iso-8859-14": "ISO/IEC 8859-14 (Celtic)",
|
||
"iso-8859-15": "ISO/IEC 8859-15 (New Western European)",
|
||
"iso-8859-16": "ISO/IEC 8859-16 (South-Eastern European; Romanian SR 14111)",
|
||
"koi8-r": "the KOI8-R (KOI-8 Cyrillic for Russian) encoding.",
|
||
"koi8-ru": "the KOI8-RU (KOI-8 Cyrillic for Belarusian, Ukrainian and Ruthenian) encoding.",
|
||
"macintosh": "the Macintosh Roman encoding.",
|
||
"x-mac-cyrillic": "the Macintosh Cyrillic encoding.",
|
||
"x-user-defined": """the user-defined extended ASCII encoding.
|
||
|
||
This maps ASCII bytes as ASCII characters, and non-ASCII bytes to the private use
|
||
range U+F780–F7FF, such that the low 8 bits always match the original byte.
|
||
|
||
This is sometimes useful for round-tripping arbitrary _sensu stricto_ extended
|
||
ASCII data without caring about the non-ASCII part. Note however, that _sensu lato_
|
||
extended ASCII may for example use ASCII bytes as trail bytes in a multi-byte code.""",
|
||
}
|
||
|
||

let encode_xudef = {}
let decode_xudef = {}
for i in range(128):
    let codepoint = 0xF780 + i
    let byte = 0x80 + i
    encode_xudef[codepoint] = byte
    decode_xudef[byte] = codepoint
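
# For illustration: byte 0x80 pairs with U+F780 and byte 0xFF with U+F7FF, so
# decode_xudef[0xFF] == 0xF7FF and encode_xudef[0xF7FF] == 0xFF round-trip exactly.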

let all_weblabels = []
let mapped_to_replacement = []

with fileio.open("modules/codecs/sbencs.krk", "w") as outf:
    outf.write(boilerplate)
    with fileio.open("tools/codectools/encodings.json") as f:
        for i in json.loads(f.read()):
            if i["heading"] == "Legacy single-byte encodings":
                for enc in i["encodings"]:
                    all_weblabels.extend(enc["labels"])
                    let whatwgname = enc["name"].lower()
                    let name = realname.get(whatwgname, whatwgname)
                    let labels = enc["labels"]
                    if name in parity_labels:
                        labels.extend(parity_labels[name])
                    let built = build_sbmap(whatwgname)
                    let encoding_map = built[0]
                    let decoding_map = built[1]
                    let idname = name.title().replace("-", "")
                    outf.write(template.format(mainlabel=repr(name), encode=repr(encoding_map),
                        weblabel=repr(whatwgname), description=descriptions.get(name, "TODO"),
                        decode=repr(decoding_map), labels=repr(labels), idname=idname))
            else:
                for enc in i["encodings"]:
                    if enc["name"].lower() != "replacement":
                        all_weblabels.extend(enc["labels"])
                    else:
                        mapped_to_replacement.extend(enc["labels"])
    outf.write(template.format(mainlabel=repr("x-user-defined"), encode=repr(encode_xudef),
        weblabel=repr("x-user-defined"), description=descriptions.get("x-user-defined", "TODO"),
        decode=repr(decode_xudef), labels=repr(["x-user-defined"]), idname="XUserDefined"))
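
# For illustration, each generated stanza in modules/codecs/sbencs.krk instantiates the
# template above; e.g. for KOI8-R (maps abridged here):
#
#     class Koi8RIncrementalEncoder(AsciiIncrementalEncoder):
#         '''
#         IncrementalEncoder implementation for the KOI8-R (KOI-8 Cyrillic for Russian) encoding.
#         '''
#         name = 'koi8-r'
#         html5name = 'koi8-r'
#         @lazy_property
#         def encoding_map():
#             return {...}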

with fileio.open("modules/codecs/isweblabel.krk", "w") as outf:
    outf.write(f"""'''
Allows checking the WHATWG status of a given label (listed, not listed, or mapped to undefined).
'''
# Generated by tools/codectools/gen_sbencs.krk from WHATWG encodings.json
let weblabels = {all_weblabels!r}
let mapped_to_replacement = {mapped_to_replacement!r}

def map_weblabel(label):
    '''
    If `label` is a regular WHATWG label, returns it; if it is a label mapped to Replacement,
    returns `"undefined"`; otherwise, returns `None`.
    '''
    if label in mapped_to_replacement:
        # WHATWG aliases these to replacement to prevent their use in injection/XSS attacks.
        return "undefined"
    else if label in weblabels:
        return label
    else:
        return None
""")