2021-02-20 08:10:36 +03:00
|
|
|
'''
Useful collection types not found in the core interpreter.
'''
|
|
|
|
|
2021-01-11 08:08:05 +03:00
|
|
|
class defaultdict(dict):
    '''
    Dictionary subclass that fills in absent keys on lookup by calling a factory.
    '''
    def __init__(self, default_factory=None, *args, **kwargs):
        '''Remember @p default_factory; every other argument is forwarded to dict.'''
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory
    def __missing__(self, key):
        '''Create, store and return the default for @p key; raises KeyError when no factory is set.'''
        if self.default_factory:
            let created = self.default_factory()
            self.__setitem__(key, created)
            return created
        raise KeyError(key)
    def __getitem__(self, key):
        '''Return the value stored under @p key, going through __missing__ when the key is absent.'''
        if key in self:
            return super().__getitem__(key)
        return self.__missing__(key)
|
2021-01-11 08:08:05 +03:00
|
|
|
|
|
|
|
class deque():
    '''
    Doubly-linked list with fast push/pop at both ends but slow (linear) lookup.

    Every node is a three-element list `[previous, item, next]`. `_head` and
    `_tail` reference the first and last node and are both None while the deque
    is empty. When @p maxlen is not None, appending to a full deque discards an
    item from the opposite end so the length never exceeds the bound.
    '''
    def __init__(self, iterable=None, maxlen=None):
        '''Create a deque, optionally filled from @p iterable and bounded to @p maxlen items.'''
        self._head = None
        self._tail = None
        self._size = 0
        self.__inrepr = False
        self.maxlen = maxlen
        if iterable:
            for i in iterable:
                self.append(i)
    def __len__(self):
        '''Number of items currently stored.'''
        return self._size
    def _unlink(self, node):
        '''Detach @p node from the chain, keeping `_head`, `_tail` and `_size` consistent.'''
        let before = node[0]
        let after = node[2]
        # A missing neighbour means the node was an end of the chain, so the
        # matching end pointer moves (to None when the deque becomes empty).
        if before is None:
            self._head = after
        else:
            before[2] = after
        if after is None:
            self._tail = before
        else:
            after[0] = before
        self._size -= 1
    def append(self, item):
        '''Add @p item on the right; a full bounded deque drops its leftmost item.'''
        let ref = [self._tail, item, None]
        if self._tail is None:
            self._head = ref
        else:
            self._tail[2] = ref
        self._tail = ref
        # Count the new node before trimming so popleft()'s decrement leaves
        # the size equal to the real number of nodes.
        self._size += 1
        if self.maxlen is not None and self._size > self.maxlen:
            self.popleft()
    def appendleft(self, item):
        '''Add @p item on the left; a full bounded deque drops its rightmost item.'''
        let ref = [None, item, self._head]
        if self._head is None:
            self._tail = ref
        else:
            self._head[0] = ref
        self._head = ref
        self._size += 1
        if self.maxlen is not None and self._size > self.maxlen:
            self.pop()
    def count(self, x):
        '''Return how many stored items compare equal to @p x.'''
        let counter = 0
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == x: counter += 1
            ptr = ptr[2]
        return counter
    def extend(self, iterable):
        '''Append every item of @p iterable on the right, in order.'''
        for i in iterable:
            self.append(i)
    def extendleft(self, iterable):
        '''Append every item of @p iterable on the left (so their order ends up reversed).'''
        for i in iterable:
            self.appendleft(i)
    def index(self, x, start=None, stop=None):
        '''Return the position of the first item equal to @p x with start <= position < stop; raises ValueError if there is none.'''
        let i = 0
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == x and (start is None or i >= start) and (stop is None or i < stop):
                return i
            i += 1
            ptr = ptr[2]
        raise ValueError("value not found")
    def insert(self, i, x):
        '''
        Insert @p x so that it ends up at position @p i.

        Negative positions count from the right and out-of-range positions are
        clamped to the nearest end. Raises IndexError when the deque is already
        at maxlen.
        '''
        if self._size == self.maxlen:
            raise IndexError('attempt to grow bounded deque beyond bound')
        if i < 0:
            i += self._size
            if i < 0: i = 0
        if i == 0:
            self.appendleft(x)
            return
        if i >= self._size:
            self.append(x)
            return
        # Here 0 < i < size: walk to the node currently at position i. It has
        # a predecessor, and the new node is spliced in between the two.
        let ptr = self._head
        while i > 0:
            ptr = ptr[2]
            i -= 1
        let ref = [ptr[0], x, ptr]
        ptr[0][2] = ref
        ptr[0] = ref
        self._size += 1
    def pop(self):
        '''Remove and return the rightmost item; raises IndexError when empty.'''
        if self._tail is None:
            raise IndexError("pop from empty deque")
        let node = self._tail
        self._unlink(node)
        return node[1]
    def popleft(self):
        '''Remove and return the leftmost item; raises IndexError when empty.'''
        if self._head is None:
            raise IndexError("pop from empty deque")
        let node = self._head
        self._unlink(node)
        return node[1]
    def remove(self, value):
        '''Remove the first item equal to @p value; raises ValueError if there is none.'''
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == value:
                self._unlink(ptr)
                return
            ptr = ptr[2]
        raise ValueError("value not found")
    def rotate(self, n=1):
        '''Rotate @p n steps to the right (to the left when @p n is negative).'''
        if n == 0 or self._size == 0: return
        while n > 0:
            self.appendleft(self.pop())
            n -= 1
        while n < 0:
            self.append(self.popleft())
            n += 1
    def reverse(self):
        '''Reverse the order of the items in place; returns None.'''
        if self._head is None: return None
        let ptr = self._head
        self._head = self._tail
        self._tail = ptr
        while ptr is not None:
            # Swap the two link slots, then continue along the old "next" link.
            let l = ptr[0]
            let r = ptr[2]
            ptr[2] = l
            ptr[0] = r
            ptr = r
        return None
    def __repr__(self):
        '''Return `deque([...])`; a deque reached again while being printed is shown as `deque(...)`.'''
        if self.__inrepr: return 'deque(...)'
        self.__inrepr = True
        let out = 'deque(['
        let ptr = self._head
        while ptr is not None:
            out += repr(ptr[1])
            ptr = ptr[2]
            if ptr is not None:
                out += ', '
        out += '])'
        self.__inrepr = False
        return out
    def __str__(self):
        '''Same text as repr().'''
        return repr(self)
    def __iter__(self):
        '''Return a function that hands out the items left to right, one per call.'''
        let s = self._head
        def _():
            # Once the chain is exhausted the function returns itself --
            # presumably the interpreter's end-of-iteration signal.
            if not s: return _
            let out = s[1]
            s = s[2]
            return out
        return _
    def __getitem__(self, index):
        '''Return the item at @p index (negative values count from the right); raises IndexError when out of range.'''
        if index >= self._size or index < -self._size: raise IndexError("Invalid index")
        if index == 0: return self._head[1]
        if index > 0:
            let ptr = self._head
            while ptr is not None:
                if index == 0: return ptr[1]
                index -= 1
                ptr = ptr[2]
        else:
            # Turn -1, -2, ... into a 0-based step count walking back from the tail.
            index = -(index + 1)
            let ptr = self._tail
            while ptr is not None:
                if index == 0: return ptr[1]
                index -= 1
                ptr = ptr[0]
        raise IndexError("This probably shouldn't happen?")
    def __contains__(self, value):
        '''True when some stored item compares equal to @p value.'''
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == value: return True
            ptr = ptr[2]
        return False
|
|
|
|
|
2021-03-24 14:53:02 +03:00
|
|
|
def smartrepr(data):
    '''
    Build the repr of @p data; for a dict or list, a line break is emitted whenever
    the line being built grows past 4000 characters. Anything else gets plain repr().
    '''
    let isDict = isinstance(data, dict)
    if not isDict and not isinstance(data, list):
        return repr(data)
    let finished = ""
    let line = "{" if isDict else "["
    for entry in (data.keys() if isDict else data):
        if isDict:
            line += repr(entry) + ": " + repr(data[entry]) + ","
        else:
            line += repr(entry) + ","
        if len(line) <= 4000:
            line += " "
        else:
            # The current line is long enough: flush it and start a new one.
            finished += line + "\n"
            line = ""
    return finished + line + ("}" if isDict else "]")
|
|
|
|
|
|
|
|
class xraydict:
    '''
    Provides an alternative to the following code without copying anything:

        let modified = base.copy()
        modified.update(overlay)
        for i in modified.keys():
            if i in exclude:
                del modified[i]

    When a filter_function is given, a pair is additionally hidden unless
    filter_function(key, value) is true for its effective (overlay-first) value.

    The methods are declared without a `self` parameter yet use `self` in their
    bodies, i.e. they rely on the interpreter binding it implicitly.
    '''
    def __init__(base, overlay, exclude=[], filter_function=None):
        '''
        @p base and @p overlay are mappings (overlay wins on shared keys),
        @p exclude is a collection of hidden keys (tested with `in`), and
        @p filter_function, if not None, is called as filter_function(key, value).
        '''
        self.base = base
        self.overlay = overlay
        self.exclude = exclude
        self.filter_function = filter_function
    def __contains__(thing):
        '''True when @p thing is a visible key: present, not excluded, and accepted by the filter.'''
        if thing in self.exclude or (thing not in self.base and thing not in self.overlay):
            return False
        if self.filter_function is None:
            return True
        return self.filter_function(thing, self._getitem_internal(thing))
    def __getitem__(key):
        '''Return the effective value for @p key; raises KeyError if the key is absent, excluded or filtered out.'''
        let value = self._getitem_internal(key)
        if self.filter_function is not None and not self.filter_function(key, value):
            raise KeyError(f"element with key {key!r} and value {value!r} excluded from xraydict")
        return value
    def _getitem_internal(key):
        '''Look @p key up in overlay, then base, honouring the exclusions but not the filter.'''
        if key in self.exclude:
            raise KeyError(f"element with key {key!r} excluded from xraydict")
        if key in self.overlay:
            return self.overlay[key]
        return self.base[key]
    def _visible(key, value):
        '''True when the pair passes both the exclusion collection and the filter function.'''
        if key in self.exclude:
            return False
        return self.filter_function is None or self.filter_function(key, value)
    def items():
        '''
        Return a list of the visible (key, value) tuples, each key exactly once.

        Keys of base come first, carrying the overlay value where overlay
        redefines them; keys found only in overlay follow.
        '''
        let ret = []
        for i, v in self.base.items():
            # The overlay value replaces the base one, and it is that effective
            # value the filter function must judge.
            let value = self.overlay[i] if i in self.overlay else v
            if self._visible(i, value):
                ret.append((i, value))
        for i, v in self.overlay.items():
            # Keys shared with base were already emitted by the loop above.
            if i not in self.base and self._visible(i, v):
                ret.append((i, v))
        return ret
    def keys():
        '''Return a list of the visible keys, in the same order as items().'''
        return [pair[0] for pair in self.items()]
    def __iter__():
        '''Iterate over the visible keys.'''
        return self.keys().__iter__()
|
|
|
|
|
|
|
|
|
|
|
|
|