614193b8a1
* Add some docs, and remove second Code page 874 codec (they handled the non-overridden C1 area differently, but we only need one). * More docs work. * Doc stuff. * Adjusted. * More tweaks (table padding is not the docstring's problem). * CSS and docstring tweaks. * Link from modules to parent packages and vice versa. * More documentation. * Docstrings for all `codecs` submodules. * Move encode_jis7_reduced into dbextra_data_7bit (thus completing the lazy startup which was apparently not complete already) and docstrings added to implementations of base class methods referring up to the base class. * Remove FUSE junk that somehow made it into the repo. * Some more docstrings. * Fix some broken references to `string` (rather than `data`) which would have caused a problem if any existing error handler had returned a negative offset (which no current handler does, but it's worth fixing anyway). * Add a cp042 codec to accompany the x-user-defined codec, and to pave the way for maybe adding Adobe Symbol, Zapf Dingbats or Wingdings codecs in future. * Better Japanese Autodetect behaviour for ISO-2022-JP (add yet another condition in which it will be detected, making it able to conclusively detect it prior to end of stream without being fed an entire escape sequence in one call). Also some docs tweaks. * idstr() → _idstr() since it's internal. * Docs for codecs.pifonts. * Docstrings for dbextra. * Document the sbextra classes. * Docstrings for the web encodings. * Possibly a fairer assessment of likely reality. * Docstrings for codecs.binascii * The *encoding* isn't removed (the BOM is). * Make it clearer when competing OEM code pages use different letter layouts. * Fix copied in error. * Stop generating linking to non-existent "← tools" from tools.gendoc. * Move .fuse_hidden* exclusion to my user-level config. * Constrain the table style changes to class .markdownTable, to avoid any effect on other interface tables generated by Doxygen. 
* Refer to `__ispackage__` when generating help.
82 lines
3.3 KiB
Python
82 lines
3.3 KiB
Python
"""
|
|
This module includes codecs implementing special handling for symbol fonts.
|
|
"""
|
|
|
|
from codecs.infrastructure import register_kuroko_codec, ByteCatenator, StringCatenator, UnicodeEncodeError, UnicodeDecodeError, lookup_error, lookup, IncrementalEncoder, IncrementalDecoder, lazy_property
|
|
from collections import xraydict
|
|
|
|
class Cp042IncrementalEncoder(IncrementalEncoder):
    """
    Encoder for Windows code page 42 (GDI Symbol), and base class for symbol font encoders.

    Code page 42 pairs each byte with a private-use (PUA) character whose low 8 bits equal the
    byte value, similarly to `x-user-defined`, but using a different PUA range and including
    all non-C0 bytes, not only non-ASCII bytes. This encoder maps such characters back to
    their bytes.
    """
    # Codec label; also reported in the UnicodeEncodeError built for unencodable characters.
    name = "cp042"
    html5name = None
    # Code point -> byte value overrides, consulted before the generic rules below.
    # Empty here, so plain cp042 relies on the generic rules alone.
    encoding_map = {}
    def encode(string, final = False):
        """Implements `IncrementalEncoder.encode`"""
        let out = ByteCatenator()
        let offset = 0
        # A manual index is used instead of a for loop because the error handler may move
        # the offset to an arbitrary position.
        while offset < len(string):
            let codepoint = ord(string[offset])
            let octet = None
            if codepoint in self.encoding_map:
                octet = self.encoding_map[codepoint]
            else if codepoint < 0x100:
                # U+0020 thru U+00FF are accepted by GDI itself, but not by Code page 42
                # as implemented by Microsoft, which has caused problems:
                # http://archives.miloush.net/michkap/archive/2005/11/08/490495.html
                octet = codepoint
            else if (0xF020 <= codepoint) and (codepoint < 0xF100):
                # The code page's own PUA range: U+F020 thru U+F0FF.
                octet = codepoint - 0xF000
            else if (0xF780 <= codepoint) and (codepoint < 0xF800):
                # Accept (not generate) the x-user-defined range as well, because why not?
                octet = codepoint - 0xF700
            else:
                # Unencodable: hand over to the configured error handler, which supplies
                # the replacement output and the offset at which to resume.
                let error = UnicodeEncodeError(self.name, string, offset, offset + 1,
                            "character not supported by target encoding")
                let errorret = lookup_error(self.errors)(error)
                out.add(errorret[0])
                offset = errorret[1]
                if offset < 0:
                    # A negative resume offset counts back from the end of the input.
                    offset += len(string)
                continue
            # Every successful branch emits exactly one byte and consumes one character.
            out.add(bytes([octet]))
            offset += 1
        return out.getvalue()
|
|
|
|
class Cp042IncrementalDecoder(IncrementalDecoder):
    """
    Decoder for Windows code page 42 (GDI Symbol), and base class for symbol font decoders.

    This maps bytes to PUA characters with the low 8 bits matching the original byte encoding,
    similarly to `x-user-defined`, but using a different PUA range and including all non-C0
    bytes, not only non-ASCII bytes.
    """
    # Codec label.
    name = "cp042"
    html5name = None
    # Byte value -> code point overrides, consulted before the generic rules below.
    # Empty here, so plain cp042 relies on the generic rules alone.
    decoding_map = {}
    def decode(data, final = False):
        """Implements `IncrementalDecoder.decode`"""
        # Every byte decodes on its own, so nothing is ever held over between calls.
        self.pending = b""
        let out = StringCatenator()
        for i in data:
            if i in self.decoding_map:
                out.add(chr(self.decoding_map[i]))
            else if i < 0x20:
                # C0 control bytes pass through unchanged.
                out.add(chr(i))
            else:
                # All other bytes land in the PUA at U+F020 thru U+F0FF.
                out.add(chr(i + 0xF000))
        return out.getvalue()
|
|
|
|
# Register the code page 42 encoder/decoder pair under the single label "cp042".
register_kuroko_codec(["cp042"], Cp042IncrementalEncoder, Cp042IncrementalDecoder)
|
|
|
|
|
|
|