kuroko/modules/collections.krk
HarJIT a580a835b8
Codecs revisited (#28)
* xraydict functionality and usage improvements

Add a filter_function to xraydict, allowing fewer big data structures. Make
uses of xraydict prefer exclusion sets to exclusion lists, to avoid
repeated linear search of a list.

* Make `big5_coded_forms_from_hkscs` a set, remove set trailing commas.

* Remove `big5_coded_forms_from_hkscs` in favour of a filter function.

* Similarly, use sets for 7-bit exclusion lists except when really short.

* Revise mappings for seven 78JIS codepoints.

Mappings for 25-23 and 90-22 were previously the same as those used for
97JIS; they have been swapped to correspond with how the IBM extension
versus the standard code are mapped in the "old sequence" (78JIS-based)
as opposed to the "new sequence".

Mappings for 32-70, 34-45, 35-29, 39-77 and 54-02 in 78JIS have been
changed to reflect disunifications made in 2000-JIS and 2004-JIS, assigning
the 1978-edition unsimplified variants of those characters separate coded
forms (where previously, only swaps and disunifications in 83JIS and
disunifications in 90JIS (including JIS X 0212) had been considered).

This only affects the `jis_encoding` codec (including the decoding
direction for `iso-2022-jp-2`, `iso-2022-jp-3` and `iso-2022-jp-2004`),
and the decoding is only affected when `ESC $ @` (not `ESC $ B`) is used.
The `iso-2022-jp` codec is unaffected, and remains similar to (but more
consistently pedantic than) the WHATWG specification, thus using the same
table for both 78JIS and 97JIS.

* Make `johab-ebcdic` decoder use many-to-one, not corporate PUA.

Many-to-one decodes are not uncommon in CJK encodings (e.g. Windows-31J),
and mapping to the IBM Corporate PUA (code page 1449) would probably make
it render as completely the wrong character if at all in practice.

* Switch `cp950_no_eudc_encoding_map` away from a hardcoded exclusion list.

* Codec support for `x-mac-korean`.

* Add a test bit for the UTF-8 wrapper.

* Document the unique error-condition definition of the ISO-2022-JP codec.

* Update docs now there is an actual implementation for `x-mac-korean`.

* Further explanations of the hazards of `jis_encoding`.

* Sanitised → Sanitised or escaped.

* Further clarify the status with not verifying Shift In.

* Corrected description of End State 2.

* Changes to MacKorean to avoid mapping non-ASCII using ASCII punctuation.

* Extraneous word "still".

* Fix omitting MacKorean single-byte codes.
2022-07-23 08:32:54 +09:00

290 lines
9.0 KiB
Python

'''
Useful collection types not found in the core interpreter.
'''
class defaultdict(dict):
    '''
    Dictionary subclass that fills in absent keys on lookup by calling a factory.
    '''
    def __init__(self, default_factory=None, *args, **kwargs):
        # Remaining arguments are handed to the dict initializer unchanged.
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory
    def __missing__(self, key):
        '''Build, store and return the default for @p key; raises KeyError when no factory is set.'''
        if self.default_factory:
            let fresh = self.default_factory()
            self.__setitem__(key, fresh)
            return fresh
        raise KeyError(key)
    def __getitem__(self, key):
        # Present keys take the ordinary dict path; absent ones go through __missing__.
        if key in self:
            return super().__getitem__(key)
        return self.__missing__(key)
class deque():
    '''
    Doubly linked list with fast push/pop at both ends but slow indexed lookup.

    Every node is a three-element list `[previous, value, next]`; `_head` and
    `_tail` are both None exactly when the deque is empty, and `_size` always
    equals the number of linked nodes. When @p maxlen is not None the deque is
    bounded: appending to a full deque discards an item from the opposite end.
    '''
    def __init__(self, iterable=None, maxlen=None):
        self._head = None
        self._tail = None
        self._size = 0
        self.__inrepr = False
        self.maxlen = maxlen
        if iterable:
            for i in iterable:
                self.append(i)
    def __len__(self):
        return self._size
    def _unlink(self, node):
        '''Detach @p node from the chain, keeping head, tail and size consistent.'''
        let before = node[0]
        let after = node[2]
        if before is None:
            self._head = after
        else:
            before[2] = after
        if after is None:
            self._tail = before
        else:
            after[0] = before
        # The removed node keeps its own links so an iterator currently
        # standing on it can still step forward.
        self._size -= 1
    def append(self, item):
        '''Add @p item at the right end, evicting from the left if the deque is full.'''
        if self.maxlen is not None:
            # A zero-capacity deque silently discards everything.
            if self.maxlen <= 0: return
            # Make room first so the size bookkeeping stays exact.
            if self._size >= self.maxlen: self.popleft()
        let ref = [self._tail, item, None]
        if self._tail is None:
            self._head = ref
        else:
            self._tail[2] = ref
        self._tail = ref
        self._size += 1
    def appendleft(self, item):
        '''Add @p item at the left end, evicting from the right if the deque is full.'''
        if self.maxlen is not None:
            if self.maxlen <= 0: return
            if self._size >= self.maxlen: self.pop()
        let ref = [None, item, self._head]
        if self._head is None:
            self._tail = ref
        else:
            self._head[0] = ref
        self._head = ref
        self._size += 1
    def count(self, x):
        '''Return how many stored items compare equal to @p x.'''
        let counter = 0
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == x: counter++
            ptr = ptr[2]
        return counter
    def extend(self, iterable):
        '''Append every item of @p iterable on the right.'''
        for i in iterable:
            self.append(i)
    def extendleft(self, iterable):
        '''Append every item of @p iterable on the left (so their order ends up reversed).'''
        for i in iterable:
            self.appendleft(i)
    def index(self, x, start=None, stop=None):
        '''Return the first position of @p x within [start, stop); raises ValueError if absent.'''
        let i = 0
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == x and (start == None or i >= start) and (stop == None or i < stop):
                return i
            i++
            ptr = ptr[2]
        raise ValueError("value not found")
    def insert(self, i, x):
        '''
        Insert @p x so that it ends up at position @p i.

        Negative positions count from the right and out-of-range positions are
        clamped to the nearest end. Raises IndexError when a bounded deque is
        already full.
        '''
        if self.maxlen is not None and self._size >= self.maxlen:
            raise IndexError('attempt to grow bounded deque beyond bound')
        if i < 0:
            i += self._size
            if i < 0: i = 0
        if i >= self._size:
            return self.append(x)
        if i == 0:
            return self.appendleft(x)
        # Here 0 < i < size, so the target node has a predecessor.
        let ptr = self._head
        while i > 0:
            ptr = ptr[2]
            i -= 1
        let before = ptr[0]
        let ref = [before, x, ptr]
        before[2] = ref
        ptr[0] = ref
        self._size += 1
    def pop(self):
        '''Remove and return the rightmost item; raises IndexError when empty.'''
        if self._tail is None:
            raise IndexError("pop from empty deque")
        let node = self._tail
        self._unlink(node)
        return node[1]
    def popleft(self):
        '''Remove and return the leftmost item; raises IndexError when empty.'''
        if self._head is None:
            raise IndexError("pop from empty deque")
        let node = self._head
        self._unlink(node)
        return node[1]
    def remove(self, value):
        '''Remove the first item equal to @p value; raises ValueError if there is none.'''
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == value:
                self._unlink(ptr)
                return
            ptr = ptr[2]
        raise ValueError("value not found")
    def rotate(self, n=1):
        '''Rotate @p n steps to the right (to the left when @p n is negative).'''
        if n == 0 or self._size == 0: return
        while n > 0:
            self.appendleft(self.pop())
            n--
        while n < 0:
            self.append(self.popleft())
            n++
    def reverse(self):
        '''Reverse the deque in place by swapping every node's links.'''
        if self._head is None: return None
        let ptr = self._head
        self._head = self._tail
        self._tail = ptr
        while ptr is not None:
            let l = ptr[0]
            let r = ptr[2]
            ptr[2] = l
            ptr[0] = r
            # `r` was the next node before the swap, so this walks the old order.
            ptr = r
        return None
    def __repr__(self):
        # Guard against a deque that (directly or indirectly) contains itself.
        if self.__inrepr: return 'deque(...)'
        self.__inrepr = True
        let out = 'deque(['
        let ptr = self._head
        while ptr is not None:
            out += repr(ptr[1])
            ptr = ptr[2]
            if ptr is not None:
                out += ', '
        out += '])'
        self.__inrepr = False
        return out
    def __str__(self):
        return repr(self)
    def __iter__(self):
        let s = self._head
        def _():
            # The iterator function returns itself to signal exhaustion.
            if s is None: return _
            let out = s[1]
            s = s[2]
            return out
        return _
    def __getitem__(self, index):
        '''Return the item at @p index (negative counts from the right); raises IndexError if out of range.'''
        if index >= self._size or index < -self._size: raise IndexError("Invalid index")
        if index >= 0:
            let ptr = self._head
            while ptr is not None:
                if index == 0: return ptr[1]
                index--
                ptr = ptr[2]
        else:
            # Convert -1, -2, ... into 0, 1, ... steps back from the tail.
            index = -(index + 1)
            let ptr = self._tail
            while ptr is not None:
                if index == 0: return ptr[1]
                index--
                ptr = ptr[0]
        raise IndexError("This probably shouldn't happen?")
    def __contains__(self, value):
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == value: return True
            ptr = ptr[2]
        return False
def smartrepr(data):
    '''
    Render a dict or list as text, starting a new line each time the current
    line grows past roughly 4000 characters. Any other value gets a plain repr.
    '''
    def wrapped(opener, entries, closer):
        # `finished` holds completed lines; `line` is the one being filled.
        let finished = ""
        let line = opener
        for entry in entries:
            line += entry + ","
            if len(line) > 4000:
                finished += line + "\n"
                line = ""
            else:
                line += " "
        return finished + line + closer
    if isinstance(data, dict):
        return wrapped("{", [repr(k) + ": " + repr(data[k]) for k in data.keys()], "}")
    if isinstance(data, list):
        return wrapped("[", [repr(v) for v in data], "]")
    return repr(data)
class xraydict:
    '''
    Provides an alternative to the following code without copying anything:
        let modified = base.copy()
        modified.update(overlay)
        for i in modified.keys():
            if i in exclude:
                del modified[i]

    An optional @p filter_function is called as `filter_function(key, value)`
    with the effective value (the overlay's if the key is there, otherwise the
    base's); entries for which it returns a false value are hidden as well.
    '''
    def __init__(base, overlay, exclude=[], filter_function=None):
        self.base = base
        self.overlay = overlay
        self.exclude = exclude
        self.filter_function = filter_function
    def __contains__(thing):
        if thing in self.exclude or (thing not in self.base and thing not in self.overlay):
            return False
        if self.filter_function is None:
            return True
        return self.filter_function(thing, self._getitem_internal(thing))
    def __getitem__(key):
        '''Return the effective value for @p key; raises KeyError if it is missing, excluded or filtered out.'''
        let value = self._getitem_internal(key)
        if self.filter_function is not None and not self.filter_function(key, value):
            raise KeyError(f"element with key {key!r} and value {value!r} excluded from xraydict")
        return value
    def _getitem_internal(key):
        '''Look up @p key honouring the exclusions and the overlay, but not the filter.'''
        if key in self.exclude:
            raise KeyError(f"element with key {key!r} excluded from xraydict")
        if key in self.overlay:
            return self.overlay[key]
        return self.base[key]
    def items():
        '''
        Return a list of `(key, value)` pairs, one per visible key, carrying the
        same value that `__getitem__` would return for that key.
        '''
        let ret = []
        for key, value in self.base.items():
            if key in self.exclude:
                continue
            # The overlay shadows the base, so report its value instead.
            let effective = value
            if key in self.overlay:
                effective = self.overlay[key]
            if self.filter_function is None or self.filter_function(key, effective):
                ret.append((key, effective))
        for key, value in self.overlay.items():
            # Keys shared with the base were already handled above.
            if key in self.base or key in self.exclude:
                continue
            if self.filter_function is None or self.filter_function(key, value):
                ret.append((key, value))
        return ret
    def keys():
        return [pair[0] for pair in self.items()]
    def __iter__():
        return self.keys().__iter__()