"""Contains various WHATWG-defined codecs which require dedicated implementations.
|
|
|
|
Also includes `utf-8-sig` which, while not a WHATWG-specified codec _per se_, is detected,
|
|
interpreted and handled by WHATWG BOM tag logic, in preference above any label, before the codec
|
|
gets to see it. WHATWG BOM tag logic is not implemented here (it is not always sensible in a
|
|
non-browser context); hence, they remain separate codecs."""
|
|
from codecs.infrastructure import register_kuroko_codec, ByteCatenator, StringCatenator, UnicodeEncodeError, UnicodeDecodeError, lookup_error, lookup, IncrementalDecoder, IncrementalEncoder, lazy_property
|
|
from codecs.dbdata import more_dbdata
|
|
|
|
class Gb18030IncrementalEncoder(IncrementalEncoder):
    """IncrementalEncoder implementation for GB18030 (Mainland Chinese Unicode format)"""
    name = "gb18030"
    html5name = "gb18030"
    four_byte_codes = True
    def encode(string, final = False):
        """Implements `IncrementalEncoder.encode`"""
        let out = ByteCatenator()
        let offset = 0
        while 1: # offset can be arbitrarily changed by the error handler, so not a for
            if offset >= len(string):
                return out.getvalue()
            let i = string[offset]
            if ord(i) < 0x80:
                out.add(bytes([ord(i)]))
                offset += 1
            else if (not self.four_byte_codes) and (ord(i) == 0x20AC):
                out.add(b"\x80")
                offset += 1
            else if ord(i) in more_dbdata.encode_gbk:
                let target = more_dbdata.encode_gbk[ord(i)]
                if isinstance(target, tuple):
                    for individ in target:
                        out.add(bytes([individ]))
                else:
                    out.add(bytes([target]))
                offset += 1
            else if (not self.four_byte_codes) or (ord(i) == 0xE5E5):
                let reason = ""
                if not self.four_byte_codes:
                    reason = "character not supported in plain GBK mode"
                else if ord(i) == 0xE5E5:
                    reason = "U+E5E5 corresponds to 0xA3A0, which is mapped to U+3000 as legacy"
                let error = UnicodeEncodeError(self.name, string, offset, offset + 1, reason)
                let errorret = lookup_error(self.errors)(error)
                out.add(errorret[0])
                offset = errorret[1]
                if offset < 0:
                    offset += len(string)
            else:
                let pointer
                if ord(i) == 0xE7C7:
                    pointer = 7457
                else:
                    let basepointer = 0
                    let basecodepoint = 0
                    for nexttuple in more_dbdata.gb_surrogate_ranges:
                        let nextpointer = nexttuple[0]
                        let nextcodepoint = nexttuple[1]
                        if nextcodepoint > ord(i):
                            break
                        basepointer = nextpointer
                        basecodepoint = nextcodepoint
                    pointer = (ord(i) - basecodepoint) + basepointer
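                # The pointer is then split into four bytes with mixed radix (the place
                # values being 12600, 1260 and 10). A worked example, for illustration:
                # U+00A5 sits at pointer 36, so 36 // 12600 = 0 → 0x81, (36 % 12600) //
                # 1260 = 0 → 0x30, (36 % 1260) // 10 = 3 → 0x84 and 36 % 10 = 6 → 0x36,
                # i.e. the GB18030 four-byte sequence 81 30 84 36.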
                let running = pointer
                let first = 0x81 + (running // (10 * 126 * 10))
                running %= 10 * 126 * 10
                let second = 0x30 + (running // (10 * 126))
                running %= 10 * 126
                let third = 0x81 + (running // 10)
                let fourth = 0x30 + (running % 10)
                out.add(bytes([first, second, third, fourth]))
                offset += 1


class GbkIncrementalEncoder(Gb18030IncrementalEncoder):
    """IncrementalEncoder implementation for GBK (Chinese),
    extension of GB2312 (Simplified Chinese)"""
    name = "gbk"
    html5name = "gbk"
    four_byte_codes = False


def _get_gbsurrogate_pointer(leader, i):
    """Converts a GB18030 four-byte code (three leader bytes plus a final byte) to its
    pointer value, returning None if it does not lie in a valid four-byte range."""
    let ret = (leader[0] - 0x81) * (10 * 126 * 10)
    ret += (leader[1] - 0x30) * (10 * 126)
    ret += (leader[2] - 0x81) * 10
    ret += i - 0x30
    if (39419 < ret and ret < 189000) or (ret > 1237575): return None
    return ret
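
# A worked example, for illustration: the four bytes 0x81 0x30 0x84 0x36 give
# (0x81-0x81)*12600 + (0x30-0x30)*1260 + (0x84-0x81)*10 + (0x36-0x30) = 36,
# the pointer which the GB18030 range data maps to U+00A5.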


class Gb18030IncrementalDecoder(IncrementalDecoder):
    """IncrementalDecoder implementation for GB18030 (Mainland Chinese Unicode),
    extension of GB2312 (Simplified Chinese)"""
    name = "gb18030"
    html5name = "gb18030"
    def decode(data_in, final = False):
        """Implements `IncrementalDecoder.decode`"""
        let data = self.pending + data_in
        self.pending = b""
        let out = StringCatenator()
        let offset = 0
        let leader = []
        let bytemode = 1
        while 1: # offset can be arbitrarily changed by the error handler, so not a for
            if offset >= len(data):
                return self._handle_truncation(out, bytemode, final, data, offset, leader)
            let i = data[offset]
            if bytemode == 1 and i < 0x80:
                out.add(chr(i))
                offset += 1
            else if bytemode == 1 and (0x81 <= i and i <= 0xFE):
                bytemode = 2
                leader.append(i)
                offset += 1
            else if bytemode == 1 and i == 0x80:
                out.add("€")
                offset += 1
            else if bytemode == 2 and (leader[0], i) in more_dbdata.decode_gbk:
                out.add(chr(more_dbdata.decode_gbk[(leader[0], i)]))
                offset += 1
                bytemode = 1
                leader = []
            else if bytemode == 2 and (0x30 <= i and i <= 0x39):
                bytemode = 4
                leader.append(i)
                offset += 1
            else if bytemode == 4 and len(leader) == 2 and (0x81 <= i and i <= 0xFE):
                leader.append(i)
                offset += 1
            else if bytemode == 4 and len(leader) == 3 and _get_gbsurrogate_pointer(leader, i) != None:
                # Note: compare against None explicitly, since pointer 0 (i.e. 81 30 81 30,
                # U+0080) is valid but falsy.
                let pointer = _get_gbsurrogate_pointer(leader, i)
                let codepoint
                if pointer == 7457:
                    codepoint = 0xE7C7
                else:
                    let basecodepoint = 0
                    let basepointer = 0
                    for nexttuple in more_dbdata.gb_surrogate_ranges:
                        let nextpointer = nexttuple[0]
                        let nextcodepoint = nexttuple[1]
                        if nextpointer > pointer:
                            break
                        basecodepoint = nextcodepoint
                        basepointer = nextpointer
                    codepoint = (pointer - basepointer) + basecodepoint
                out.add(chr(codepoint))
                offset += 1
                bytemode = 1
                leader = []
            else:
                let errorstart = offset - len(leader)
                let errorend = errorstart + bytemode
                # Note: per WHATWG behaviour, if an invalid multi-byte code contains an ASCII
                # byte, parsing shall resume at that byte.
                if bytemode > 1:
                    if len(leader) > 1:
                        errorend -= 3
                    else if i < 0x80:
                        errorend -= 1
                let error = UnicodeDecodeError(self.name, data, errorstart, errorend,
                                               "invalid sequence")
                bytemode = 1
                leader = []
                let errorret = lookup_error(self.errors)(error)
                out.add(errorret[0])
                offset = errorret[1]
                if offset < 0:
                    offset += len(data)


register_kuroko_codec(["gb18030", "gb18030_2000"], Gb18030IncrementalEncoder, Gb18030IncrementalDecoder)
register_kuroko_codec(
    ["chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "936", "cp936", "ms936",
     "gbk", "iso-ir-58", "x-gbk", "gb2312_1980", "gb2312_80"],
    GbkIncrementalEncoder, Gb18030IncrementalDecoder)
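
# A round-trip sketch, for illustration (assuming the codecs' default constructors):
#
#     let dec = Gb18030IncrementalDecoder()
#     dec.decode(b"\x81\x30\x84\x36", True)  # → "¥", via the four-byte range data
#     let enc = Gb18030IncrementalEncoder()
#     enc.encode("¥", True)                  # → b"\x81\x30\x84\x36"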


class Iso2022JpIncrementalEncoder(IncrementalEncoder):
    """IncrementalEncoder implementation for ISO-2022-JP (7-bit stateful Japanese JIS)"""
    name = "iso-2022-jp"
    html5name = "iso-2022-jp"
    encodes_sbcs = []
    @lazy_property
    def encodes_dbcs():
        return [None, None, more_dbdata.encode_jis7]
    encode_supershift_latin = None
    encode_supershift_greek = None
    super_shift = False
    escs_onebyte = {0: 0x42, 1: 0x4A}
    escs_twobyte = {2: 0x42}
    # eager: switch to a lower-numbered state when possible (à la Python)
    # lazy: switch state only when necessary (à la WHATWG)
    attitude = "lazy"
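    # State numbers correspond to G0 designations: state 0 is ASCII (ESC ( B), state 1 is
    # JIS-Roman (ESC ( J) and state 2 is the double-byte JIS X 0208 set (ESC $ B);
    # ensure_state_number below emits the appropriate escape only when the state changes.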
    def ensure_state_number(state, out):
        if self.state == state:
        else if state in self.escs_onebyte:
            out.add(b"\[(")
            out.add(bytes([self.escs_onebyte[state]]))
        else if state in self.escs_twobyte:
            out.add(b"\[$")
            if self.escs_twobyte[state] not in (0x40, 0x41, 0x42):
                out.add(b"(")
            out.add(bytes([self.escs_twobyte[state]]))
        else:
            raise ValueError("set to invalid state: " + repr(state))
        self.state = state
    def encode(string, final = False):
        """Implements `IncrementalEncoder.encode`"""
        let out = ByteCatenator()
        let offset = 0
        while 1: # offset can be arbitrarily changed by the error handler, so not a for
            if offset >= len(string):
                if final:
                    self.ensure_state_number(0, out)
                return out.getvalue()
            let i = string[offset]
            # Do not pass Shift Out, Shift In or Escape through from data lest this generate
            # state changes (SO and SI are used in some ISO-2022-JP variants though not this one)
            let is_ascii = ord(i) < 0x80 and i not in ("\x0E\x0F\[")
            let is_jiscii = (is_ascii and i not in "\\~") or (i in "¥\u203E")
            let is_sets = [is_ascii, is_jiscii]
            let try_state = 2
            while 1:
                if try_state in self.escs_onebyte:
                    is_sets.append(ord(i) in self.encodes_sbcs[try_state])
                else if try_state in self.escs_twobyte:
                    is_sets.append(ord(i) in self.encodes_dbcs[try_state])
                else:
                    break
                try_state += 1
            if (not is_sets[self.state]) or (self.attitude == "eager"):
                for n, j in enumerate(is_sets):
                    if j:
                        self.ensure_state_number(n, out)
                        break
            if not is_sets[self.state]: # i.e. still isn't.
                if self.super_shift and ord(i) in self.encode_supershift_latin:
                    if self.state_greekmode or not self.state_desigsupershift:
                        out.add(b"\[.A")
                        self.state_greekmode = False
                        self.state_desigsupershift = True
                    out.add(b"\[N")
                    out.add(bytes([self.encode_supershift_latin[ord(i)]]))
                    offset += 1
                else if self.super_shift and ord(i) in self.encode_supershift_greek:
                    if not self.state_greekmode:
                        out.add(b"\[.F")
                        self.state_greekmode = True
                        self.state_desigsupershift = True
                    out.add(b"\[N")
                    out.add(bytes([self.encode_supershift_greek[ord(i)]]))
                    offset += 1
                else:
                    let error = UnicodeEncodeError(self.name, string, offset, offset + 1,
                                                   "character not supported by target encoding")
                    let errorret = lookup_error(self.errors)(error)
                    self.ensure_state_number(0, out)
                    out.add(errorret[0])
                    offset = errorret[1]
                    if offset < 0:
                        offset += len(string)
            else if self.state in (0, 1):
                # By this point we know whether ASCII or JIS-Roman is appropriate and have
                # switched to the appropriate one, so we can just treat these the same now.
                let j
                if i == "¥":
                    j = "\\"
                else if i == "\u203E":
                    j = "~"
                else:
                    j = i
                out.add(bytes([ord(j)]))
                offset += 1
            else if self.state in self.escs_onebyte:
                out.add(bytes([self.encodes_sbcs[self.state][ord(i)]]))
                offset += 1
            else if self.state in self.escs_twobyte:
                out.add(bytes(self.encodes_dbcs[self.state][ord(i)]))
                offset += 1
            else:
                raise RuntimeError("inconsistently configured encoder")
    def reset():
        """Implements `IncrementalEncoder.reset`"""
        self.state = 0
        self.state_greekmode = False
        self.state_desigsupershift = False
    def getstate():
        """Implements `IncrementalEncoder.getstate`"""
        return (self.state, self.state_desigsupershift, self.state_greekmode)
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`"""
        self.state = state[0]
        self.state_desigsupershift = state[1]
        self.state_greekmode = state[2]


class Iso2022JpIncrementalDecoder(IncrementalDecoder):
    """IncrementalDecoder implementation for ISO-2022-JP (7-bit stateful Japanese JIS)"""
    name = "iso-2022-jp"
    html5name = "iso-2022-jp"
    @lazy_property
    def decodes_sbcs():
        return [None, None, more_dbdata.decode_jis7katakana]
    @lazy_property
    def decodes_dbcs():
        return [None, None, None, more_dbdata.decode_jis7]
    decode_shiftout = None
    decode_supershift_latin = None
    decode_supershift_greek = None
    escs_onebyte = {0x42: 0, 0x49: 2, 0x4A: 1}
    escs_twobyte = {0x40: 3, 0x42: 3}
    two_byte_modes = [3]
    new_twobytes = False
    shift_out = False
    super_shift = False
    concat_lenient = False
    def decode(data_in, final = False):
        """Implements `IncrementalDecoder.decode`"""
        let data = self.pending + data_in
        self.pending = b""
        let out = StringCatenator()
        let offset = 0
        let leader = []
        let bytemode = 1
        if self.state_set in self.two_byte_modes and not self.state_shiftoutmode:
            bytemode = 2
        let in_esc = False
        while 1: # offset can be arbitrarily changed by the error handler, so not a for
            self.scrutinising_inter646 = self.scrutinising_inter646 and self.state_justswitched
            if offset >= len(data):
                return self._handle_truncation(out, bytemode, final, data, offset, leader)
            let i = data[offset]
            if i == 0x1B and len(leader) == 0:
                in_esc = True
                leader.append(i)
                offset += 1
            else if i == 0x0E and len(leader) == 0 and self.shift_out and not self.state_shiftoutmode:
                # state_justswitched is for the HTML5 version, which doesn't use Shift Out
                self.state_justswitched = False
                self.state_shiftoutmode = True
                offset += 1
            else if i == 0x0F and len(leader) == 0 and self.state_shiftoutmode:
                # state_justswitched is for the HTML5 version, which doesn't use Shift In
                self.state_justswitched = False
                self.state_shiftoutmode = False
                offset += 1
            else if in_esc and len(leader) == 1 and i in (0x24, 0x28):
                leader.append(i)
                offset += 1
            else if in_esc and len(leader) == 1 and i in (0x2E, 0x4E) and self.super_shift:
                leader.append(i)
                offset += 1
            else if in_esc and len(leader) == 2 and leader[1] == 0x2E and i == 0x41:
                self.state_greekmode = False
                in_esc = False
                leader = []
                offset += 1
            else if in_esc and len(leader) == 2 and leader[1] == 0x4E and (
                    not self.state_greekmode and i in self.decode_supershift_latin):
                out.add(chr(self.decode_supershift_latin[i]))
                in_esc = False
                leader = []
                offset += 1
            else if in_esc and len(leader) == 2 and leader[1] == 0x2E and i == 0x46:
                self.state_greekmode = True
                in_esc = False
                leader = []
                offset += 1
            else if in_esc and len(leader) == 2 and leader[1] == 0x4E and (
                    self.state_greekmode and i in self.decode_supershift_greek):
                out.add(chr(self.decode_supershift_greek[i]))
                in_esc = False
                leader = []
                offset += 1
            else if in_esc and len(leader) == 2 and leader[1] == 0x28 and i in self.escs_onebyte:
                if self.state_justswitched:
                    out.add(lookup_error(self.errors)(UnicodeDecodeError(self.name, data,
                        offset - 5, offset + 1, "no content between two G0 re-designations"))[0])
                else if self.state_set == self.escs_onebyte[i]:
                    # Don't penalise a switch to ASCII at the very start.
                    if (self.state_set != 0) or self.state_last646seen:
                        out.add(lookup_error(self.errors)(UnicodeDecodeError(self.name, data,
                            offset - 2, offset + 1, "G0 re-designation to the same one-byte set"))[0])
                else if self.state_set in (0, 1) and self.escs_onebyte[i] in (0, 1) and self.state_last646seen:
                    self.scrutinising_inter646 = True
                self.state_set = self.escs_onebyte[i]
                self.state_justswitched = True
                in_esc = False
                bytemode = 1
                leader = []
                offset += 1
            else if in_esc and len(leader) == 2 and leader[1] == 0x24 and i == 0x28 and self.new_twobytes:
                leader.append(i)
                offset += 1
            else if in_esc and len(leader) in (2, 3) and leader[1] == 0x24 and i in self.escs_twobyte:
                if self.state_justswitched and not self.concat_lenient:
                    # This can break concatenations of two ISO-2022-JP streams, and it is
                    # questionable whether it actually secures anything, but WHATWG does not
                    # exempt double-byte targets from errors on being switched to straight
                    # after another switch.
                    # The concat_lenient attribute can be set by subclasses to stop this, but
                    # this class should be at least as scrutinous as the WHATWG spec.
                    out.add(lookup_error(self.errors)(UnicodeDecodeError(self.name, data,
                        offset - 5, offset + 1, "no content between two G0 re-designations"))[0])
                # Don't error upon re-designation of the same two-byte set: remember that WHATWG
                # treats both \[$@ and \[$B as designating the same set, but not all encoders do
                # necessarily (_vide_ the "jis_encoding" subclassing this class). WHATWG does not
                # require erroring on nil-effect re-designation at all.
                self.state_set = self.escs_twobyte[i]
                self.state_justswitched = True
                in_esc = False
                bytemode = 2
                leader = []
                offset += 1
            else if bytemode == 2 and len(leader) == 0 and not in_esc and not self.state_shiftoutmode:
                self.state_justswitched = False
                leader.append(i)
                offset += 1
            else if bytemode == 1 and self.state_set == 0 and i < 0x80 and not in_esc and not self.state_shiftoutmode:
                self.state_justswitched = False
                if self.scrutinising_inter646:
                    let unjustif = lambda i: i >= 0x20 and i not in (0x5C, 0x7E, 0x7F)
                    if unjustif(self.state_last646seen) and unjustif(i):
                        out.add(lookup_error(self.errors)(UnicodeDecodeError(self.name, data,
                            offset - 3, offset, "unjustified JIS-Roman → ASCII switch"))[0])
                    self.scrutinising_inter646 = False
                self.state_last646seen = i
                out.add(chr(i))
                offset += 1
            else if bytemode == 1 and self.state_set == 1 and i < 0x80 and not in_esc and not self.state_shiftoutmode:
                self.state_justswitched = False
                if self.scrutinising_inter646:
                    let unjustif = lambda i: i >= 0x20 and i not in (0x5C, 0x7E, 0x7F)
                    if unjustif(self.state_last646seen) and unjustif(i):
                        out.add(lookup_error(self.errors)(UnicodeDecodeError(self.name, data,
                            offset - 3, offset, "unjustified ASCII → JIS-Roman switch"))[0])
                    self.scrutinising_inter646 = False
                self.state_last646seen = i
                let char = chr(i)
                if char == "\\":
                    char = "¥"
                else if char == "~":
                    char = "\u203E"
                out.add(char)
                offset += 1
            else if self.state_shiftoutmode and not in_esc and i in self.decode_shiftout:
                out.add(chr(self.decode_shiftout[i]))
                offset += 1
            else if bytemode == 1 and self.state_set not in (0, 1) and not in_esc and (
                    i in self.decodes_sbcs[self.state_set] and not self.state_shiftoutmode):
                self.state_justswitched = False
                out.add(chr(self.decodes_sbcs[self.state_set][i]))
                offset += 1
            else if bytemode == 2 and (leader[0], i) in self.decodes_dbcs[self.state_set] and (
                    not self.state_shiftoutmode and not in_esc):
                let decoded = self.decodes_dbcs[self.state_set][(leader[0], i)]
                if isinstance(decoded, int) and 0x21 <= decoded and decoded <= 0x7E:
                    # Never decode double-byte characters straight to ASCII, since this is very
                    # likely to be used for masking (the characters being maybe either seen as
                    # fullwidth or passed through as unrecognised extensions by filters).
                    out.add(chr(decoded + 0xFEE0))
                else if isinstance(decoded, tuple):
                    for individ in decoded:
                        out.add(chr(individ))
                else:
                    out.add(chr(decoded))
                offset += 1
                leader = []
            else:
                self.state_justswitched = False
                let errorstart = offset - len(leader)
                let errorend
                if bytemode == 2 and i != 0x1B:
                    errorend = errorstart + 2
                else:
                    errorend = errorstart + 1
                let error = UnicodeDecodeError(self.name, data, errorstart, errorend,
                                               "invalid sequence")
                in_esc = False
                leader = []
                let errorret = lookup_error(self.errors)(error)
                out.add(errorret[0])
                offset = errorret[1]
                if offset < 0:
                    offset += len(data)
    def reset():
        """Implements `IncrementalDecoder.reset`"""
        self.pending = b""
        self.state_set = 0
        self.state_greekmode = False
        self.state_shiftoutmode = False
        # The following are used to monitor whether unnecessary shift sequences are being used
        # to mask ASCII characters. Do not be lured into a false sense of security though: our
        # approach expands the WHATWG-specified approach to be a bit more thorough, but it
        # still ignores Shift In / Shift Out (which aren't used in the WHATWG version but are
        # accepted by our "jis_encoding" subclass) and allows both lazy and eager switches
        # between the 646-sets.
        # As a general rule, do not presume to be able to sanitise anything while it is encoded
        # as ISO-2022-JP. You have been warned.
        # Some further reading: https://www.unicode.org/L2/L2020/20202-empty-iso-2022-jp.pdf
        self.state_justswitched = False
        self.state_last646seen = None
        self.scrutinising_inter646 = False
    def getstate():
        """Implements `IncrementalDecoder.getstate`"""
        return (self.pending, self.state_set, self.state_greekmode, self.state_shiftoutmode,
                self.state_justswitched, self.state_last646seen, self.scrutinising_inter646)
    def setstate(state):
        """Implements `IncrementalDecoder.setstate`"""
        self.pending = state[0]
        self.state_set = state[1]
        self.state_greekmode = state[2]
        self.state_shiftoutmode = state[3]
        self.state_justswitched = state[4]
        self.state_last646seen = state[5]
        self.scrutinising_inter646 = state[6]


register_kuroko_codec(["iso-2022-jp", "iso2022-jp", "iso2022jp", "csiso2022jp", "cp50220", "cscp50220"],
                      Iso2022JpIncrementalEncoder, Iso2022JpIncrementalDecoder)
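
# A state-machine sketch, for illustration (byte values per JIS X 0208, assuming the
# default constructor): encoding "Aあ" emits 0x41 in the initial ASCII state, designates
# JIS X 0208 for あ (kuten 04-02, bytes 0x24 0x22), then re-designates ASCII on `final`:
#
#     let enc = Iso2022JpIncrementalEncoder()
#     enc.encode("Aあ", True)  # → bytes 41 1B 24 42 24 22 1B 28 42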


class Utf16IncrementalEncoder(IncrementalEncoder):
    """IncrementalEncoder implementation for UTF-16 with Byte Order Mark"""
    name = "utf-16"
    html5name = "utf-16"
    encoding_map = {}
    endian = "little"
    include_bom = True
    # -1: BOM not yet emitted if applicable
    # 0: BOM emitted
    state = None
    def push_word(word, out):
        if self.endian == "little":
            out.add(bytes([word & 0xFF, (word >> 8) & 0xFF]))
        else if self.endian == "big":
            out.add(bytes([(word >> 8) & 0xFF, word & 0xFF]))
        else:
            raise ValueError("unexpected endian value: " + repr(self.endian))
    def encode(string, final = False):
        """Implements `IncrementalEncoder.encode`"""
        let out = ByteCatenator()
        let offset = 0
        if self.include_bom and self.state == -1:
            self.push_word(0xFEFF, out)
            self.state = 0
        while 1: # offset can be arbitrarily changed by the error handler, so not a for
            if offset >= len(string):
                return out.getvalue()
            let i = string[offset]
            if (ord(i) < 0x10000) and not (0xD800 <= ord(i) and ord(i) < 0xE000):
                self.push_word(ord(i), out)
                offset += 1
            else if ord(i) >= 0x10000:
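                # Astral code points are split across a surrogate pair. A worked example,
                # for illustration: U+1F600 gives astrality 0xF600, hence a lead word of
                # 0xD800 + (0xF600 >> 10) = 0xD83D and a trail word of 0xDC00 + 0x200 = 0xDE00.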
                let astrality = ord(i) - 0x10000
                let lead_surrogate = ((astrality >> 10) & 0x3FF) + 0xD800
                let trail_surrogate = (astrality & 0x3FF) + 0xDC00
                self.push_word(lead_surrogate, out)
                self.push_word(trail_surrogate, out)
                offset += 1
            else: # i.e. trying to encode a surrogate "codepoint"
                let error = UnicodeEncodeError(self.name, string, offset, offset + 1,
                                               "isolated surrogate word")
                let errorret = lookup_error(self.errors)(error)
                for i in errorret[0]:
                    self.push_word(i, out)
                offset = errorret[1]
                if offset < 0:
                    offset += len(string)
    def reset():
        """Implements `IncrementalEncoder.reset`"""
        self.state = -1
    def getstate():
        """Implements `IncrementalEncoder.getstate`"""
        return self.state
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`"""
        self.state = state


class Utf16IncrementalDecoder(IncrementalDecoder):
    """IncrementalDecoder implementation for UTF-16"""
    name = "utf-16"
    html5name = "utf-16"
    force_endian = None # subclass may set to "little" or "big"
    # -1: expecting BOM
    # 0: LE
    # 1: BE
    state = None
    pending = b""
    def decode(data_in, final = False):
        """Implements `IncrementalDecoder.decode`"""
        let data = self.pending + data_in
        self.pending = b""
        let out = StringCatenator()
        let offset = 0
        let leader = []
        let wordmode = 1
        while 1: # offset can be arbitrarily changed by the error handler, so not a for
            if (offset + 1) >= len(data):
                let leader_bytes = []
                for i in leader:
                    if self.state == 1:
                        leader_bytes.append((i >> 8) & 0xFF)
                        leader_bytes.append(i & 0xFF)
                    else:
                        leader_bytes.append(i & 0xFF)
                        leader_bytes.append((i >> 8) & 0xFF)
                if offset == (len(data) - 1): # i.e. one isolated byte at the end
                    leader_bytes.append(data[offset])
                return self._handle_truncation(out, None, final, data, offset, leader_bytes)
            let i
            if self.state != 1:
                i = data[offset] | (data[offset + 1] << 8)
            else:
                i = data[offset + 1] | (data[offset] << 8)
            if self.state == -1:
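                # BOM sniffing, for illustration: the word was provisionally read as
                # little-endian, so leading bytes FF FE read as U+FEFF and select
                # little-endian, while FE FF read as U+FFFE (a noncharacter) and select
                # big-endian; in either case the BOM itself is consumed, not emitted.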
                if self.force_endian == "little":
                    self.state = 0 # keep BOM if endian specified, per Python.
                    i = data[offset] | (data[offset + 1] << 8)
                else if self.force_endian == "big":
                    self.state = 1
                    i = data[offset + 1] | (data[offset] << 8)
                else if i == 0xFEFF:
                    self.state = 0
                    i = None
                else if i == 0xFFFE:
                    self.state = 1
                    i = None
                else:
                    self.state = 0 # Default to LE, per WHATWG, contra Unicode
            if i == None:
                offset += 2
            else if wordmode == 1 and not (0xD800 <= i and i < 0xE000):
                out.add(chr(i))
                offset += 2
            else if wordmode == 1 and (0xD800 <= i and i < 0xDC00):
                leader.append(i)
                wordmode = 2
                offset += 2
            else if wordmode == 2 and (0xDC00 <= i and i < 0xE000):
                out.add(chr((((leader[0] & 0x3FF) << 10) | (i & 0x3FF)) + 0x10000))
                wordmode = 1
                leader = []
                offset += 2
            else:
                let errorstart = offset - (len(leader) * 2)
                let errorend = errorstart + 2
                let error = UnicodeDecodeError(self.name, data, errorstart, errorend,
                                               "isolated surrogate word")
                wordmode = 1
                leader = []
                let errorret = lookup_error(self.errors)(error)
                out.add(errorret[0])
                offset = errorret[1]
                if offset < 0:
                    offset += len(data)
    def reset():
        """Implements `IncrementalDecoder.reset`"""
        self.pending = b""
        self.state = -1
    def getstate():
        """Implements `IncrementalDecoder.getstate`"""
        return (self.pending, self.state)
    def setstate(state):
        """Implements `IncrementalDecoder.setstate`"""
        self.pending = state[0]
        self.state = state[1]


class Utf16BeIncrementalEncoder(Utf16IncrementalEncoder):
    """IncrementalEncoder implementation for UTF-16 Big Endian without Byte Order Mark"""
    name = "utf-16be"
    html5name = "utf-16be"
    endian = "big"
    include_bom = False


class Utf16BeIncrementalDecoder(Utf16IncrementalDecoder):
    """IncrementalDecoder implementation for UTF-16 Big Endian without Byte Order Mark"""
    name = "utf-16be"
    html5name = "utf-16be"
    force_endian = "big"


class Utf16LeIncrementalEncoder(Utf16IncrementalEncoder):
    """IncrementalEncoder implementation for UTF-16 Little Endian without Byte Order Mark"""
    name = "utf-16le"
    html5name = "utf-16le"
    endian = "little"
    include_bom = False


class Utf16LeIncrementalDecoder(Utf16IncrementalDecoder):
    """IncrementalDecoder implementation for UTF-16 Little Endian without Byte Order Mark"""
    name = "utf-16le"
    html5name = "utf-16le"
    force_endian = "little"


# Note: this behaves explicitly differently to WHATWG, since WHATWG has the BOM, if present,
# override *any label at all*, and doesn't specify an encoder for UTF-16. So it aliases UTF-16
# to UTF-16LE, while we take UTF-16 as meaning with BOM and UTF-16LE/BE as without, per Python.
register_kuroko_codec(["utf-16", "utf16", "iso-10646-ucs-2", "ucs-2", "unicode", "csunicode", "u16"],
                      Utf16IncrementalEncoder, Utf16IncrementalDecoder)
register_kuroko_codec(["utf-16le", "utf-16-le", "unicodefeff", "unicodelittleunmarked"],
                      Utf16LeIncrementalEncoder, Utf16LeIncrementalDecoder)
register_kuroko_codec(["utf-16be", "utf-16-be", "unicodefffe", "unicodebigunmarked"],
                      Utf16BeIncrementalEncoder, Utf16BeIncrementalDecoder)
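
# A sketch of the BOM behaviour, for illustration (assuming the default constructors):
#
#     Utf16IncrementalEncoder().encode("A", True)    # → b"\xFF\xFEA\x00" (BOM, then LE "A")
#     Utf16LeIncrementalEncoder().encode("A", True)  # → b"A\x00" (no BOM)
#     Utf16IncrementalDecoder().decode(b"\xFE\xFF\x00A", True)  # BOM sniffed as BE → "A"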


class Utf8IncrementalEncoder(IncrementalEncoder):
    """IncrementalEncoder implementation for UTF-8"""
    name = "utf-8"
    html5name = "utf-8"
    # -1: expecting BOM
    # 0: Normal
    state = None
    include_bom = False
    def encode(string, final = False):
        """Implements `IncrementalEncoder.encode`"""
        # We use UTF-8 natively, so this is fairly simple
        let out = ByteCatenator()
        if self.include_bom and self.state == -1:
            out.add("\uFEFF".encode())
            self.state = 0
        out.add(string.encode())
        return out.getvalue()
    def reset():
        """Implements `IncrementalEncoder.reset`"""
        self.state = -1
    def getstate():
        """Implements `IncrementalEncoder.getstate`"""
        return self.state
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`"""
        self.state = state


class Utf8IncrementalDecoder(IncrementalDecoder):
    """IncrementalDecoder implementation for UTF-8"""
    name = "utf-8"
    html5name = "utf-8"
    # -1: expecting BOM
    # 0: Normal
    state = None
    remove_bom = False
    pending = b""
    def _error_handler(error):
        return lookup_error(self.errors)(error)
    def decode(data_in, final = False):
        """Implements `IncrementalDecoder.decode`"""
        # We use UTF-8 natively, so this only validates it and applies the error handler
        # (and removes a BOM if remove_bom is set)
        let data = self.pending + data_in
        self.pending = b""
        let out = StringCatenator()
        let running_offset = 0
        if self.remove_bom and self.state == -1 and len(data) >= 3:
            if data[0] == 0xEF and data[1] == 0xBB and data[2] == 0xBF:
                running_offset = 3
        if data not in (b"\xEF\xBB", b"\xEF"):
            self.state = 0
        let first_offset = running_offset
        let second_offset = running_offset
        let countdown = 0
        let trail_byte_bottom = 0x80
        let trail_byte_top = 0xBF
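        # trail_byte_bottom and trail_byte_top narrow the permitted first trail byte after
        # a lead byte of 0xE0 (A0–BF, guarding against overlongs), 0xED (80–9F, guarding
        # against surrogates), 0xF0 (90–BF, overlongs again) and 0xF4 (80–8F, guarding
        # against values beyond U+10FFFF), per the RFC 3629 well-formedness table.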
        let dlist = list(data)
        let lingering_reason = None
        let bolster = 1
        while running_offset < len(data):
            let is_error = False
            let reason = lingering_reason or "byte does not begin valid sequence"
            lingering_reason = None
            if countdown == 0:
                if data[running_offset] < 0x80:
                else if 0xC2 <= data[running_offset] and data[running_offset] <= 0xDF:
                    countdown = 1
                else if 0xE0 <= data[running_offset] and data[running_offset] <= 0xEF:
                    if data[running_offset] == 0xE0:
                        trail_byte_bottom = 0xA0
                        lingering_reason = "start of overlong sequence"
                    else if data[running_offset] == 0xED:
                        trail_byte_top = 0x9F
                        lingering_reason = "start of sequence for surrogate code point"
                    countdown = 2
                else if 0xF0 <= data[running_offset] and data[running_offset] <= 0xF4:
                    if data[running_offset] == 0xF0:
                        trail_byte_bottom = 0x90
                        lingering_reason = "start of overlong sequence"
                    else if data[running_offset] == 0xF4:
                        trail_byte_top = 0x8F
                        lingering_reason = "start of sequence beyond Unicode"
                    countdown = 3
                else:
                    is_error = True
                    if 0x80 <= data[running_offset] and data[running_offset] <= 0xBF:
                        reason = "isolated trail byte"
                    else if data[running_offset] in (0xC0, 0xC1):
                        reason = "start of overlong sequence"
                    else if 0xF5 <= data[running_offset] and data[running_offset] <= 0xFD:
                        reason = "start of sequence beyond Unicode"
                    else: reason = "invalid lead byte"
            else:
                if not (trail_byte_bottom <= data[running_offset] and data[running_offset] <= trail_byte_top):
                    is_error = True
                    if not (0x80 <= data[running_offset] and data[running_offset] <= 0xBF):
                        reason = "not followed by trail byte"
                        bolster -= 1
                trail_byte_bottom = 0x80
                trail_byte_top = 0xBF
                countdown -= 1
            running_offset += 1
            if is_error:
                out.add(bytes(dlist[first_offset:second_offset]).decode())
                let error = UnicodeDecodeError(self.name, data, second_offset,
                                               second_offset + bolster, reason)
                let errorret = self._error_handler(error)
                out.add(errorret[0])
                running_offset = errorret[1]
                if running_offset < 0:
                    running_offset += len(data)
                countdown = 0
                bolster = 1
                first_offset = running_offset
                second_offset = running_offset
            else if countdown == 0:
                second_offset = running_offset
                bolster = 1
            else:
                bolster += 1
        out.add(bytes(dlist[first_offset:second_offset]).decode())
        if second_offset < len(data):
            self.pending = bytes(dlist[second_offset:])
        return out.getvalue()
    def reset():
        """Implements `IncrementalDecoder.reset`"""
        self.pending = b""
        self.state = -1
    def getstate():
        """Implements `IncrementalDecoder.getstate`"""
        return (self.pending, self.state)
    def setstate(state):
        """Implements `IncrementalDecoder.setstate`"""
        self.pending = state[0]
        self.state = state[1]


class Utf8SigIncrementalEncoder(Utf8IncrementalEncoder):
    """IncrementalEncoder implementation for UTF-8 with Byte Order Mark"""
    name = "utf-8-sig"
    html5name = None
    include_bom = True


class Utf8SigIncrementalDecoder(Utf8IncrementalDecoder):
    """IncrementalDecoder implementation for UTF-8 with Byte Order Mark"""
    name = "utf-8-sig"
    html5name = None
    remove_bom = True


register_kuroko_codec(["unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "utf-8", "utf8",
                       "x-unicode20utf8", "u8", "utf", "cp65001", "utf8-ucs4", "utf8mb4",
                       "al32utf8"],
                      Utf8IncrementalEncoder, Utf8IncrementalDecoder)
register_kuroko_codec(["utf-8-sig", "utf-8-bom"], Utf8SigIncrementalEncoder, Utf8SigIncrementalDecoder)