614193b8a1
* Add some docs, and remove second Code page 874 codec (they handled the non-overridden C1 area differently, but we only need one). * More docs work. * Doc stuff. * Adjusted. * More tweaks (table padding is not the docstring's problem). * CSS and docstring tweaks. * Link from modules to parent packages and vice versa. * More documentation. * Docstrings for all `codecs` submodules. * Move encode_jis7_reduced into dbextra_data_7bit (thus completing the lazy startup which was apparently not complete already) and docstrings added to implementations of base class methods referring up to the base class. * Remove FUSE junk that somehow made it into the repo. * Some more docstrings. * Fix some broken references to `string` (rather than `data`) which would have caused a problem if any existing error handler had returned a negative offset (which no current handler does, but it's worth fixing anyway). * Add a cp042 codec to accompany the x-user-defined codec, and to pave the way for maybe adding Adobe Symbol, Zapf Dingbats or Wingdings codecs in future. * Better Japanese Autodetect behaviour for ISO-2022-JP (add yet another condition in which it will be detected, making it able to conclusively detect it prior to end of stream without being fed an entire escape sequence in one call). Also some docs tweaks. * idstr() → _idstr() since it's internal. * Docs for codecs.pifonts. * Docstrings for dbextra. * Document the sbextra classes. * Docstrings for the web encodings. * Possibly a fairer assessment of likely reality. * Docstrings for codecs.binascii * The *encoding* isn't removed (the BOM is). * Make it clearer when competing OEM code pages use different letter layouts. * Fix copied in error. * Stop generating linking to non-existent "← tools" from tools.gendoc. * Move .fuse_hidden* exclusion to my user-level config. * Constrain the table style changes to class .markdownTable, to avoid any effect on other interface tables generated by Doxygen. 
* Refer to `__ispackage__` when generating help.
327 lines
13 KiB
Python
327 lines
13 KiB
Python
"""
|
|
Defines functions and codecs pertaining to binary-to-text encodings.
|
|
"""
|
|
from codecs.infrastructure import StringCatenator, ByteCatenator, IncrementalEncoder, IncrementalDecoder, UnicodeDecodeError, UnicodeEncodeError, register_kuroko_codec
|
|
|
|
# Each alphabet is a list of 64 code points; the index into the list is the six-bit value that the
# character at that position stands for.

# Standard Base64: A-Z, a-z, 0-9, then "+" and "/".
let _base64_alphabet = [ord(c) for c in
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"]

# uuencode flavour: the 64 consecutive code points from 0x20 (space) through 0x5F.
let _base64_alphabet_uu = [0x20 + value for value in range(64)]

# BinHex4 flavour: a 64-character set which skips several characters (e.g. "7", "O", "W", "g").
let _base64_alphabet_hqx = [ord(c) for c in
    "!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr"]
|
|
|
|
class Base64IncrementalCreator(IncrementalDecoder):
    """
    IncrementalDecoder implementation to create (yes) Base64 from bytes.

    Every three input bytes become four output characters taken from `alphabet`. When the final
    group is short, the characters that carry no data are replaced by `padchar`. Subclasses
    select another flavour by overriding `name`, `alphabet` and `padchar`.
    """
    name = "inverse-base64"
    alphabet = _base64_alphabet
    padchar = "="
    def decode(data_in, final = False):
        """
        Implements `IncrementalDecoder.decode`

        `data_in` is the next chunk of bytes, and `final` says whether it is the last one. Unless
        `final` is set, an incomplete trailing group (one or two bytes) is not encoded yet but
        handed to the inherited `_handle_truncation`, which also produces the return value.
        """
        let data = self.pending + data_in
        self.pending = b""
        let offset = 0
        let out = StringCatenator()
        # Convert to a list of byte values once up front, rather than once per three-byte group.
        let octets = list(data)
        while 1:
            let subdata = octets[offset:(offset + 3)]
            if (len(subdata) == 0) or (len(subdata) < 3 and not final):
                # `_handle_truncation` is not defined in this file; presumably it stores the
                # leftover bytes as pending and returns the accumulated output (TODO confirm).
                return self._handle_truncation(out, None, final, data, offset, subdata)
            # A short final group is zero-filled so the bit arithmetic below always sees 3 bytes.
            let padchars = 3 - len(subdata)
            subdata = subdata + ([0] * padchars)
            let high = subdata[0]
            let mid = subdata[1]
            let low = subdata[2]
            # Split the 24 bits into four six-bit values, most significant first.
            let charm = high >> 2
            let up = ((high & 3) << 4) | (mid >> 4)
            let down = ((mid & 0xF) << 2) | (low >> 6)
            let strange = low & 0x3F
            let outbit = "".join([chr(self.alphabet[i]) for i in [charm, up, down, strange]])
            if padchars:
                # One pad character per missing input byte replaces the tail of the group.
                outbit = outbit[:-padchars] + (self.padchar * padchars)
            out.add(outbit)
            offset += 3
|
|
|
|
class Base64IncrementalParser(IncrementalEncoder):
    """
    IncrementalEncoder implementation to parse (yes) Base64 from a string to bytes.

    Input is consumed in groups of four significant characters; characters for which `strip()`
    yields an empty string (i.e. whitespace) are skipped. Trailing `padchar` characters in a group
    shorten the output by one byte each. Subclasses select another flavour by overriding `name`,
    `alphabet` and `padchar`.

    NOTE(review): the whitespace skipping also drops the space character, which the uuencode
    alphabet in this file uses for value 0 (and the uuencode subclass uses as `padchar`). Confirm
    how this is meant to behave for that subclass.
    """
    name = "inverse-base64"
    alphabet = _base64_alphabet
    padchar = "="
    def encode(string_in, final = False):
        """
        Implements `IncrementalEncoder.encode`

        Returns the bytes parsed so far. Unless `final` is set, an incomplete trailing group is
        kept in `self.pending` for the next call. Raises `UnicodeEncodeError` for characters
        outside the alphabet and for groups with more than two missing or pad characters.
        """
        let string = self.pending + string_in
        self.pending = ""
        let out = ByteCatenator()
        let offset = 0
        while 1:
            # Gather up to four non-whitespace characters, starting from `offset`.
            let group = ""
            let cursor = offset
            while cursor < len(string) and len(group) < 4:
                let candidate = string[cursor]
                if candidate.strip():
                    group += candidate
                cursor += 1
            if len(group) == 0:
                return out.getvalue()
            if len(group) < 4 and not final:
                self.pending = group
                return out.getvalue()
            # Count both explicit pad characters and characters missing from a short final group,
            # and substitute the zero-valued character for them so four lookups always succeed.
            let padcount = 4 - len(group.rstrip(self.padchar))
            let filled = group
            if padcount:
                filled = group[:(4 - padcount)] + (chr(self.alphabet[0]) * padcount)
            let sextets
            try:
                sextets = [self.alphabet.index(ord(c)) for c in filled]
            except ValueError:
                # Ignore the error specifier
                raise UnicodeEncodeError(self.name, string, offset, cursor,
                    "not Base64")
            if padcount > 2:
                # Ignore the error specifier
                raise UnicodeEncodeError(self.name, string, offset, cursor,
                    "Base64 truncated or with invalid number of pad characters")
            # Reassemble the four six-bit values into three bytes, then drop one per pad.
            let octets = [
                (sextets[0] << 2) | (sextets[1] >> 4),
                ((sextets[1] & 0xF) << 4) | (sextets[2] >> 2),
                ((sextets[2] & 3) << 6) | sextets[3]
            ]
            out.add(bytes(octets[:(3 - padcount)]))
            offset = cursor
    def reset():
        """Implements `IncrementalEncoder.reset`: discards any held-back partial group."""
        self.pending = ""
    def getstate():
        """Implements `IncrementalEncoder.getstate`: the state is the held-back partial group."""
        return self.pending
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`: restores a value from `getstate`."""
        self.pending = state
|
|
|
|
# Register the standard Base64 pair: the parser is the IncrementalEncoder, the creator the
# IncrementalDecoder.
register_kuroko_codec(["inverse-base64"], Base64IncrementalParser, Base64IncrementalCreator)
|
|
|
|
class Base64UUIncrementalCreator(Base64IncrementalCreator):
    """
    IncrementalDecoder implementation to create (yes) the flavour of Base64 used in uuencode.

    Only the alphabet, pad character and codec name differ from `Base64IncrementalCreator`.

    Note that this produces just the encoded characters, not the uuencode file format; it is
    only one component of implementing that format.
    """
    padchar = " "
    alphabet = _base64_alphabet_uu
    name = "inverse-base64uu"
|
|
|
|
class Base64UUIncrementalParser(Base64IncrementalParser):
    """
    IncrementalEncoder implementation to parse (yes) the flavour of Base64 used in uuencode.

    Only the alphabet, pad character and codec name differ from `Base64IncrementalParser`.

    Note that this accepts just the encoded characters, not the uuencode file format; it is
    only one component of implementing that format.
    """
    padchar = " "
    alphabet = _base64_alphabet_uu
    name = "inverse-base64uu"
|
|
|
|
# Register the uuencode-flavour pair: the parser is the IncrementalEncoder, the creator the
# IncrementalDecoder.
register_kuroko_codec(["inverse-base64uu"], Base64UUIncrementalParser, Base64UUIncrementalCreator)
|
|
|
|
class Base64HQXIncrementalCreator(Base64IncrementalCreator):
    """
    IncrementalDecoder implementation to create (yes) the flavour of Base64 used in BinHex4.

    Only the alphabet and codec name differ from `Base64IncrementalCreator`; the pad character is
    inherited unchanged.

    Note that this produces just the encoded characters, not the BinHex4 file format; it is
    only one component of implementing that format.
    """
    alphabet = _base64_alphabet_hqx
    name = "inverse-base64hqx"
|
|
|
|
class Base64HQXIncrementalParser(Base64IncrementalParser):
    """
    IncrementalEncoder implementation to parse (yes) the flavour of Base64 used in BinHex4.

    Only the alphabet and codec name differ from `Base64IncrementalParser`; the pad character is
    inherited unchanged.

    Note that this accepts just the encoded characters, not the BinHex4 file format; it is
    only one component of implementing that format.
    """
    alphabet = _base64_alphabet_hqx
    name = "inverse-base64hqx"
|
|
|
|
# Register the BinHex4-flavour pair: the parser is the IncrementalEncoder, the creator the
# IncrementalDecoder.
register_kuroko_codec(["inverse-base64hqx"], Base64HQXIncrementalParser, Base64HQXIncrementalCreator)
|
|
|
|
|
|
class QuoPriIncrementalCreator(IncrementalDecoder):
    """
    IncrementalDecoder implementation to create (yes) Quoted-Printable from bytes.

    Bytes 0x21 to 0x7E other than "=" are emitted as themselves, as are CR and LF (which also
    restart the line-length count). Tab and space are emitted as themselves except directly before
    a line end (or the end of the final input), where they are escaped. Everything else becomes
    an "=XX" escape. Soft line breaks ("=" followed by CRLF) are inserted so that no output line
    exceeds 76 characters.
    """
    name = "inverse-quopri"
    def decode(data_in, final = False):
        """
        Implements `IncrementalDecoder.decode`

        `data_in` is the next chunk of bytes, and `final` says whether it is the last one. Unless
        `final` is set, the last byte is held back in `self.pending` until the next call.
        """
        let data = self.pending + data_in
        self.pending = b""
        let offset = 0
        let out = StringCatenator()
        while 1:
            # Unless we're final, stop one byte short of the end of the input and shove the
            # last byte in pending. We need to know if a line end follows (end of the *final*
            # input would count as a line end).
            if (offset == len(data)) or ((offset + 1) == len(data) and not final):
                self.pending = bytes(list(data)[offset:])
                return out.getvalue()
            let i = data[offset]
            let next_eol = (offset + 1 == len(data)) or (data[offset + 1] in (0x0A, 0x0D))
            if i > 0x20 and i < 0x7F and i != b"="[0]:
                # A literal character may fill column 76 only when the line ends right after it;
                # otherwise that column has to stay free for the "=" of a soft line break.
                if self.linelength >= 75 and not next_eol:
                    out.add("=\r\n")
                    self.linelength = 0
                out.add(chr(i))
                self.linelength += 1
            else if i in (0x0A, 0x0D):
                out.add(chr(i))
                self.linelength = 0
            else if i in (0x09, 0x20):
                if next_eol:
                    # Whitespace at the end of a line must always be escaped, since trailing
                    # whitespace on an encoded line is liable to be stripped. If the three-character
                    # escape does not fit, break the line first and put the escape on the new line.
                    if self.linelength > 73:
                        out.add("=\r\n")
                        self.linelength = 0
                    let hexbit = hex(i)[2:].upper()
                    if len(hexbit) == 1: hexbit = "0" + hexbit
                    out.add("=" + hexbit)
                    self.linelength += 3
                else:
                    # Literal whitespace obeys the same length limit as other literal characters,
                    # so that long runs of spaces or tabs still get wrapped.
                    if self.linelength >= 75:
                        out.add("=\r\n")
                        self.linelength = 0
                    out.add(chr(i))
                    self.linelength += 1
            else:
                # An escape needs three columns, plus one more for a soft line break unless the
                # line ends right after it.
                if (self.linelength > 73) or (self.linelength > 72 and not next_eol):
                    out.add("=\r\n")
                    self.linelength = 0
                let hexbit = hex(i)[2:].upper()
                if len(hexbit) == 1: hexbit = "0" + hexbit
                out.add("=" + hexbit)
                self.linelength += 3
            offset += 1
    def reset():
        """Implements `IncrementalDecoder.reset`: clears the column count and held-back byte."""
        self.linelength = 0
        self.pending = b""
    def getstate():
        """Implements `IncrementalDecoder.getstate`: returns `(linelength, pending)`."""
        return (self.linelength, self.pending)
    def setstate(state):
        """Implements `IncrementalDecoder.setstate`: restores a tuple from `getstate`."""
        self.linelength = state[0]
        self.pending = state[1]
|
|
|
|
class QuoPriIncrementalParser(IncrementalEncoder):
    """
    IncrementalEncoder implementation to parse (yes) Quoted-Printable from a string to bytes.

    Characters other than "=" are converted to the byte with the same code point. "=" followed by
    two hexadecimal digits (either case) yields the corresponding byte, "=" followed by a line
    ending is dropped as a soft line break, and any other "=" is passed through literally.
    """
    name = "inverse-quopri"
    def encode(string_in, final = False):
        """
        Implements `IncrementalEncoder.encode`

        Returns the bytes parsed so far. Unless `final` is set, an "=" with fewer than two
        characters after it is kept in `self.pending` for the next call.
        """
        let string = self.pending + string_in
        self.pending = ""
        let out = ByteCatenator()
        let hexd = "0123456789ABCDEF"
        let offset = 0
        while 1:
            let current = string[offset:(offset + 1)] # not string[offset] (it will fail at end)
            if len(current) == 0:
                return out.getvalue()
            if current != "=":
                out.add(bytes([ord(current)]))
                offset += 1
                continue
            # From here on we are looking at "=" plus up to two following characters.
            let escape = string[offset:(offset + 3)]
            if len(escape) < 3 and not final:
                self.pending = escape
                return out.getvalue()
            # Python's QuoPri parser is very lenient so we should also be
            let upcased = escape.upper()
            if escape[1:3] == "\r\n":
                # Regular soft line break
                offset += 3
            else if escape[1:2] == "\n":
                # Leniency to LF not CRLF in soft line break (following Python behaviour)
                offset += 2
            else if escape[1:2] == "\r" and escape[2:3] != "\n":
                # Leniency to CR not CRLF in soft line break
                # Python behaviour will actually swallow any data between =\r and \n, which is
                # BAD. I am not following suit.
                offset += 2
            else if len(escape) < 3 or upcased[1] not in hexd or upcased[2] not in hexd:
                # Leniency to = which doesn't start a QuoPri escape
                out.add(b"=")
                offset += 1
            else:
                let topnybble = hexd.index(upcased[1])
                let lownybble = hexd.index(upcased[2])
                out.add(bytes([(topnybble << 4) | lownybble]))
                offset += 3
    def reset():
        """Implements `IncrementalEncoder.reset`: discards any held-back partial escape."""
        self.pending = ""
    def getstate():
        """Implements `IncrementalEncoder.getstate`: the state is the held-back partial escape."""
        return self.pending
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`: restores a value from `getstate`."""
        self.pending = state
|
|
|
|
# Register the Quoted-Printable pair: the parser is the IncrementalEncoder, the creator the
# IncrementalDecoder.
register_kuroko_codec(["inverse-quopri"], QuoPriIncrementalParser, QuoPriIncrementalCreator)
|
|
|
|
|
|
def base64_file_create(data, filename=None, mode=0o666):
    """
    Create a Base64 string containing the provided data, with lines wrapped as required by some
    formats. If a filename and optional UNIX mode are provided, Base64 headers as recognised by
    some modern versions of uudecode are added.

    `data` is the bytes to encode. Each output line holds the encoding of up to 57 input bytes
    (76 characters for a full line) and ends in a line feed. When `filename` is given, the output
    starts with a "begin-base64" line carrying `mode` in octal and the filename, and ends with a
    "====" line. Returns the resulting string.
    """
    let out = StringCatenator()
    let creator = Base64IncrementalCreator("strict")
    if filename != None:
        let octmode = oct(mode)[2:]
        out.add(f"begin-base64 {octmode} {filename}\n")
    # Convert to a list of byte values once up front, rather than once per output line.
    let octets = list(data)
    let offset = 0
    while offset < len(octets):
        let segment = bytes(octets[offset:offset+57])
        # Each line is encoded as a final chunk, so no partial group carries over between lines.
        out.add(creator.decode(segment, True))
        out.add("\n")
        offset += 57
    if filename != None:
        out.add("====\n")
    return out.getvalue()
|
|
|
|
|
|
def uu_file_create(data, filename="-", mode=0o666):
    """
    Create a string in the uuencode file format containing the provided data.

    `data` is the bytes to encode. The output starts with a "begin" line carrying `mode` in octal
    and `filename`. Each following line encodes up to 45 input bytes, prefixed by one character
    from the uuencode alphabet giving the number of bytes on that line. The output closes with a
    line holding a single backtick and an "end" line. Returns the resulting string.
    """
    let out = StringCatenator()
    let creator = Base64UUIncrementalCreator("strict")
    let octmode = oct(mode)[2:]
    out.add(f"begin {octmode} {filename}\n")
    # Convert to a list of byte values once up front, rather than once per output line.
    let octets = list(data)
    let offset = 0
    while offset < len(octets):
        let segment = bytes(octets[offset:offset+45])
        # Length prefix first, then the encoded payload for this line.
        out.add(chr(_base64_alphabet_uu[len(segment)]))
        # Each line is encoded as a final chunk, so no partial group carries over between lines.
        out.add(creator.decode(segment, True))
        out.add("\n")
        offset += 45
    out.add("`\nend\n")
    return out.getvalue()
|
|
|
|
|
|
|