kuroko/modules/codecs/binascii.krk
HarJIT 614193b8a1
Codecs package docs, as well as some assorted tweaks or minor additions (#5)
* Add some docs, and remove second Code page 874 codec (they handled the
non-overridden C1 area differently, but we only need one).

* More docs work.

* Doc stuff.

* Adjusted.

* More tweaks (table padding is not the docstring's problem).

* CSS and docstring tweaks.

* Link from modules to parent packages and vice versa.

* More documentation.

* Docstrings for all `codecs` submodules.

* Move encode_jis7_reduced into dbextra_data_7bit (thus completing the lazy
startup which was apparently not complete already) and docstrings added to
implementations of base class methods referring up to the base class.

* Remove FUSE junk that somehow made it into the repo.

* Some more docstrings.

* Fix some broken references to `string` (rather than `data`) which would have
caused a problem if any existing error handler had returned a negative
offset (which no current handler does, but it's worth fixing anyway).

* Add a cp042 codec to accompany the x-user-defined codec, and to pave the
way for maybe adding Adobe Symbol, Zapf Dingbats or Wingdings codecs
in future.

* Better Japanese Autodetect behaviour for ISO-2022-JP (add yet another
condition in which it will be detected, making it able to conclusively
detect it prior to end of stream without being fed an entire escape
sequence in one call). Also some docs tweaks.

* idstr() → _idstr() since it's internal.

* Docs for codecs.pifonts.

* Docstrings for dbextra.

* Document the sbextra classes.

* Docstrings for the web encodings.

* Possibly a fairer assessment of likely reality.

* Docstrings for codecs.binascii

* The *encoding* isn't removed (the BOM is).

* Make it clearer when competing OEM code pages use different letter layouts.

* Fix copied in error.

* Stop generating links to non-existent "← tools" from tools.gendoc.

* Move .fuse_hidden* exclusion to my user-level config.

* Constrain the table style changes to class .markdownTable, to avoid any
effect on other interface tables generated by Doxygen.

* Refer to `__ispackage__` when generating help.
2021-04-02 16:34:10 +09:00

327 lines
13 KiB
Python

"""
Defines functions and codecs pertaining to binary-to-text encodings.
"""
from codecs.infrastructure import StringCatenator, ByteCatenator, IncrementalEncoder, IncrementalDecoder, UnicodeDecodeError, UnicodeEncodeError, register_kuroko_codec
# Each alphabet is a list of 64 code points; the list index is the 6-bit value that the
# character at that position stands for.

# Conventional Base64: A-Z, a-z, 0-9, then "+" and "/".
let _base64_alphabet = [ord(symbol) for symbol in
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"]
# Flavour used by uuencode: the 64 consecutive code points 0x20 through 0x5F.
let _base64_alphabet_uu = [0x20 + value for value in range(64)]
# Flavour used by BinHex4.
let _base64_alphabet_hqx = [
    ord(symbol) for symbol in
    "!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr"
]
class Base64IncrementalCreator(IncrementalDecoder):
    """
    IncrementalDecoder implementation to create (yes) Base64 from bytes.

    Subclasses select another flavour by overriding `alphabet` (a list of 64 code points
    indexed by 6-bit value) and, where needed, `padchar`.
    """
    name = "inverse-base64"
    alphabet = _base64_alphabet
    padchar = "="
    def decode(data_in, final = False):
        """
        Implements `IncrementalDecoder.decode`

        Consumes the input three bytes at a time, emitting four alphabet characters per
        group. When `final` is true, a short last group is zero-filled and the output
        characters which carry no input bits are replaced with `padchar`.
        """
        let data = self.pending + data_in
        self.pending = b""
        let offset = 0
        let out = StringCatenator()
        # Convert to a list of byte values once; doing this inside the loop would rebuild
        # the whole list for every three-byte group (quadratic in the input length).
        let octets = list(data)
        while 1:
            let subdata = octets[offset:(offset + 3)]
            if (len(subdata) == 0) or (len(subdata) < 3 and not final):
                # Either the input is exhausted, or an incomplete group remains and more
                # input may follow. The helper comes from the base class (not visible in
                # this file); presumably it retains `subdata` for the next call.
                return self._handle_truncation(out, None, final, data, offset, subdata)
            let padchars = 3 - len(subdata)
            subdata = subdata + ([0] * padchars)
            let high = subdata[0]
            let mid = subdata[1]
            let low = subdata[2]
            # Split the 24 bits into four 6-bit values, most significant first.
            let charm = high >> 2
            let up = ((high & 3) << 4) | (mid >> 4)
            let down = ((mid & 0xF) << 2) | (low >> 6)
            let strange = low & 0x3F
            let outbit = "".join([chr(self.alphabet[i]) for i in [charm, up, down, strange]])
            if padchars:
                outbit = outbit[:-padchars] + (self.padchar * padchars)
            out.add(outbit)
            offset += 3
class Base64IncrementalParser(IncrementalEncoder):
    """
    IncrementalEncoder implementation to parse (yes) Base64 from a string to bytes.

    Subclasses select another flavour by overriding `alphabet` and `padchar`.
    """
    name = "inverse-base64"
    alphabet = _base64_alphabet
    padchar = "="
    def encode(string_in, final = False):
        """
        Implements `IncrementalEncoder.encode`

        Whitespace in the input is skipped. Each group of four remaining characters yields
        up to three bytes; an incomplete group is held back in `self.pending` unless
        `final` is true.
        """
        let string = self.pending + string_in
        self.pending = ""
        let out = ByteCatenator()
        let offset = 0
        while True:
            # Collect up to four non-whitespace characters, remembering how far we read.
            let quartet = ""
            let cursor = offset
            while cursor < len(string) and len(quartet) < 4:
                let candidate = string[cursor]
                if candidate.strip():
                    quartet += candidate
                cursor += 1
            if quartet == "":
                return out.getvalue()
            if len(quartet) < 4 and not final:
                self.pending = quartet
                return out.getvalue()
            # Trailing pad characters (or characters missing at the very end of the input)
            # are stood in for by the zero-valued alphabet character.
            let stripped = quartet.rstrip(self.padchar)
            let padchars = 4 - len(stripped)
            let substring = quartet
            if padchars:
                substring = stripped + (chr(self.alphabet[0]) * padchars)
            let sextets = []
            try:
                for symbol in substring:
                    sextets.append(self.alphabet.index(ord(symbol)))
            except ValueError:
                # Always raised, whatever error handler was requested.
                raise UnicodeEncodeError(self.name, string, offset, cursor,
                    "not Base64")
            if padchars > 2:
                # Always raised, whatever error handler was requested.
                raise UnicodeEncodeError(self.name, string, offset, cursor,
                    "Base64 truncated or with invalid number of pad characters")
            # Reassemble three bytes from the four 6-bit values.
            let high = (sextets[0] << 2) | (sextets[1] >> 4)
            let mid = ((sextets[1] & 0xF) << 4) | (sextets[2] >> 2)
            let low = ((sextets[2] & 3) << 6) | sextets[3]
            let octets = [high, mid, low]
            # Each pad character removes one byte from the end of the group.
            out.add(bytes(octets[:(3 - padchars)]))
            offset = cursor
    def reset():
        """Implements `IncrementalEncoder.reset`"""
        self.pending = ""
    def getstate():
        """Implements `IncrementalEncoder.getstate`"""
        return self.pending
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`"""
        self.pending = state
# Register the string-to-bytes parser and bytes-to-string creator under "inverse-base64".
register_kuroko_codec(
    ["inverse-base64"],
    Base64IncrementalParser,
    Base64IncrementalCreator)
class Base64UUIncrementalCreator(Base64IncrementalCreator):
    """
    IncrementalDecoder implementation to create (yes) the uuencode flavour of Base64 from bytes.
    This produces only the encoded characters, not the surrounding uuencode file format; it is
    one building block of an implementation of that format.
    """
    # Only the alphabet and pad character differ from the parent class.
    alphabet = _base64_alphabet_uu
    padchar = " "
    name = "inverse-base64uu"
class Base64UUIncrementalParser(Base64IncrementalParser):
    """
    IncrementalEncoder implementation to parse (yes) the uuencode flavour of Base64 to bytes.
    This accepts only the encoded characters, not the surrounding uuencode file format; it is
    one building block of an implementation of that format.
    """
    # Only the alphabet and pad character differ from the parent class.
    # NOTE(review): the inherited encode() skips every character for which strip() is empty,
    # and 0x20 (space) is both alphabet[0] and padchar here, so spaces in the input appear to
    # be dropped rather than read as zero-valued characters — confirm this is intended.
    alphabet = _base64_alphabet_uu
    padchar = " "
    name = "inverse-base64uu"
# Register the string-to-bytes parser and bytes-to-string creator under "inverse-base64uu".
register_kuroko_codec(
    ["inverse-base64uu"],
    Base64UUIncrementalParser,
    Base64UUIncrementalCreator)
class Base64HQXIncrementalCreator(Base64IncrementalCreator):
    """
    IncrementalDecoder implementation to create (yes) the BinHex4 flavour of Base64 from bytes.
    This produces only the encoded characters, not the surrounding BinHex4 format; it is one
    building block of an implementation of that format.
    """
    # Only the alphabet differs from the parent class; padchar is inherited unchanged.
    alphabet = _base64_alphabet_hqx
    name = "inverse-base64hqx"
class Base64HQXIncrementalParser(Base64IncrementalParser):
    """
    IncrementalEncoder implementation to parse (yes) the BinHex4 flavour of Base64 to bytes.
    This accepts only the encoded characters, not the surrounding BinHex4 format; it is one
    building block of an implementation of that format.
    """
    # Only the alphabet differs from the parent class; padchar is inherited unchanged.
    alphabet = _base64_alphabet_hqx
    name = "inverse-base64hqx"
# Register the string-to-bytes parser and bytes-to-string creator under "inverse-base64hqx".
register_kuroko_codec(
    ["inverse-base64hqx"],
    Base64HQXIncrementalParser,
    Base64HQXIncrementalCreator)
class QuoPriIncrementalCreator(IncrementalDecoder):
    """
    IncrementalDecoder implementation to create (yes) Quoted-Printable from bytes.

    `self.linelength` tracks how many characters have been written to the current output
    line, so that soft line breaks (`=` followed by CRLF) can keep every line within 76
    characters including the `=` of the soft break itself.
    """
    name = "inverse-quopri"
    def _quopri_escape(octet):
        """Return the three-character `=XX` escape (uppercase hexadecimal) for a byte value."""
        let hexbit = hex(octet)[2:].upper()
        if len(hexbit) == 1:
            hexbit = "0" + hexbit
        return "=" + hexbit
    def decode(data_in, final = False):
        """
        Implements `IncrementalDecoder.decode`

        CR and LF bytes are passed through unchanged and reset the line length. Printable
        ASCII other than `=` is passed through; tab and space are passed through unless a
        line end follows, in which case they are escaped. Everything else is escaped.
        """
        let data = self.pending + data_in
        self.pending = b""
        let offset = 0
        let out = StringCatenator()
        while 1:
            # Unless we're final, stop one byte short of the end of the input and shove the
            # last byte in pending. We need to know if a line end follows (end of the *final*
            # input would count as a line end).
            if (offset == len(data)) or ((offset + 1) == len(data) and not final):
                self.pending = bytes(list(data)[offset:])
                return out.getvalue()
            let i = data[offset]
            let next_eol = (offset + 1 == len(data)) or (data[offset + 1] in (0x0A, 0x0D))
            if i > 0x20 and i < 0x7F and i != b"="[0]:
                # A literal character may fill the 76th column only when the line ends right
                # after it; otherwise that column has to stay free for a soft break.
                if self.linelength >= 75 and not next_eol:
                    out.add("=\r\n")
                    self.linelength = 0
                out.add(chr(i))
                self.linelength += 1
            else if i in (0x0A, 0x0D):
                out.add(chr(i))
                self.linelength = 0
            else if i in (0x09, 0x20):
                if next_eol:
                    # Whitespace must never be left bare at the end of an encoded line, so it
                    # is always escaped here, after a soft break if the three-character
                    # escape would not otherwise fit.
                    if self.linelength > 73:
                        out.add("=\r\n")
                        self.linelength = 0
                    out.add(self._quopri_escape(i))
                    self.linelength += 3
                else:
                    # Mid-line whitespace counts toward the line length like any other
                    # literal character, so it needs the same soft-break check.
                    if self.linelength >= 75:
                        out.add("=\r\n")
                        self.linelength = 0
                    out.add(chr(i))
                    self.linelength += 1
            else:
                # An escape takes three columns; one more must stay free for a soft break
                # unless the line ends right after it.
                if (self.linelength > 73) or (self.linelength > 72 and not next_eol):
                    out.add("=\r\n")
                    self.linelength = 0
                out.add(self._quopri_escape(i))
                self.linelength += 3
            offset += 1
    def reset():
        """Implements `IncrementalDecoder.reset`"""
        self.linelength = 0
        self.pending = b""
    def getstate():
        """Implements `IncrementalDecoder.getstate`"""
        return (self.linelength, self.pending)
    def setstate(state):
        """Implements `IncrementalDecoder.setstate`"""
        self.linelength = state[0]
        self.pending = state[1]
class QuoPriIncrementalParser(IncrementalEncoder):
    """
    IncrementalEncoder implementation to parse (yes) Quoted-Printable from a string to bytes.
    """
    name = "inverse-quopri"
    def encode(string_in, final = False):
        """
        Implements `IncrementalEncoder.encode`

        Characters other than `=` are passed through as single bytes. An `=` introduces
        either a soft line break (dropped), a two-digit hexadecimal escape (one byte), or,
        leniently, nothing special, in which case the `=` itself is emitted. An `=` too close
        to the end of non-final input is held back in `self.pending`.
        """
        let string = self.pending + string_in
        self.pending = ""
        let out = ByteCatenator()
        let offset = 0
        let hexd = "0123456789ABCDEF"
        while True:
            # Slice rather than index, since indexing would fail at the end of the string.
            let lead = string[offset:(offset + 1)]
            if lead == "":
                return out.getvalue()
            if lead != "=":
                out.add(bytes([ord(lead)]))
                offset += 1
                continue
            let escape = string[offset:(offset + 3)]
            if len(escape) < 3 and not final:
                self.pending = escape
                return out.getvalue()
            # Whatever follows the "=": at most two characters.
            let tail = escape[1:]
            # Python's QuoPri parser is very lenient so we should also be
            if tail == "\r\n":
                offset += 3
            else if tail[:1] == "\n":
                # Leniency to LF not CRLF in soft line break (following Python behaviour)
                offset += 2
            else if tail[:1] == "\r":
                # Leniency to CR not CRLF in soft line break
                # Python behaviour will actually swallow any data between =\r and \n, which is
                # BAD. I am not following suit.
                offset += 2
            else:
                let digits = tail.upper()
                if len(tail) < 2 or digits[0] not in hexd or digits[1] not in hexd:
                    # Leniency to = which doesn't start a QuoPri escape
                    out.add(bytes([ord(lead)]))
                    offset += 1
                else:
                    let byteval = (hexd.index(digits[0]) << 4) | hexd.index(digits[1])
                    out.add(bytes([byteval]))
                    offset += 3
    def reset():
        """Implements `IncrementalEncoder.reset`"""
        self.pending = ""
    def getstate():
        """Implements `IncrementalEncoder.getstate`"""
        return self.pending
    def setstate(state):
        """Implements `IncrementalEncoder.setstate`"""
        self.pending = state
# Register the string-to-bytes parser and bytes-to-string creator under "inverse-quopri".
register_kuroko_codec(
    ["inverse-quopri"],
    QuoPriIncrementalParser,
    QuoPriIncrementalCreator)
def base64_file_create(data, filename=None, mode=0o666):
    """
    Create a Base64 string containing the provided data, with lines wrapped as required by some
    formats. If a filename and optional UNIX mode are provided, Base64 headers as recognised by
    some modern versions of uudecode are added.

    Each output line encodes up to 57 input bytes (76 Base64 characters) and ends with a line
    feed. When `filename` is given, the output starts with a `begin-base64` line carrying the
    mode in octal and the filename, and ends with a `====` line.
    """
    let out = StringCatenator()
    let creator = Base64IncrementalCreator("strict")
    if filename != None:
        let octmode = oct(mode)[2:]
        out.add(f"begin-base64 {octmode} {filename}\n")
    # Convert to a list of byte values once; doing this per segment would rebuild the whole
    # list for every output line.
    let octets = list(data)
    let offset = 0
    while offset < len(octets):
        let segment = bytes(octets[offset:offset+57])
        # 57 is a multiple of three, so only the last segment can come out padded.
        out.add(creator.decode(segment, True))
        out.add("\n")
        offset += 57
    if filename != None:
        out.add("====\n")
    return out.getvalue()
def uu_file_create(data, filename="-", mode=0o666):
    """
    Create a string in the uuencode file format containing the provided data.

    The output starts with a `begin` line carrying the mode in octal and the filename. Each
    following line encodes up to 45 input bytes, prefixed by one character from the uuencode
    alphabet giving the number of bytes on that line. The output ends with a line holding a
    single backtick, then an `end` line.
    """
    let out = StringCatenator()
    let creator = Base64UUIncrementalCreator("strict")
    let octmode = oct(mode)[2:]
    out.add(f"begin {octmode} {filename}\n")
    # Convert to a list of byte values once; doing this per segment would rebuild the whole
    # list for every output line.
    let octets = list(data)
    let offset = 0
    while offset < len(octets):
        let segment = bytes(octets[offset:offset+45])
        # Length prefix: the segment's byte count expressed as an alphabet character.
        out.add(chr(_base64_alphabet_uu[len(segment)]))
        out.add(creator.decode(segment, True))
        out.add("\n")
        offset += 45
    out.add("`\nend\n")
    return out.getvalue()