kuroko/modules/collections.krk

290 lines
9.0 KiB
Python
Raw Normal View History

2021-02-20 08:10:36 +03:00
'''
Useful collection types not found in the core interpreter.
'''
2021-01-11 08:08:05 +03:00
class defaultdict(dict):
    '''
    Mapping that fills in missing keys on lookup by calling a factory.

    The factory is stored as @p default_factory and is called with no
    arguments; its result is stored under the missing key and returned.
    '''
    def __init__(self, default_factory=None, *args, **kwargs):
        # Remaining arguments are forwarded untouched to the dict initializer.
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory
    def __missing__(self, key):
        '''Create, store and return the default value for @p key; raises KeyError without a factory.'''
        let factory = self.default_factory
        if not factory:
            raise KeyError(key)
        let value = factory()
        self.__setitem__(key, value)
        return value
    def __getitem__(self, key):
        '''Return the value for @p key, creating it through __missing__ when absent.'''
        if key in self:
            return super().__getitem__(key)
        return self.__missing__(key)
2021-01-11 08:08:05 +03:00
class deque():
    '''
    Doubly linked list with fast push/pop at both ends but slow (linear) lookup.

    Each node is a three-element list `[previous, item, next]`. `_head` and
    `_tail` reference the end nodes (both None when empty) and `_size` tracks
    the number of nodes. When @p maxlen is not None, appending to a full deque
    discards an item from the opposite end.
    '''
    def __init__(self, iterable=None, maxlen=None):
        self._head = None
        self._tail = None
        self._size = 0
        self.__inrepr = False
        self.maxlen = maxlen
        if iterable:
            for i in iterable:
                self.append(i)
    def __len__(self):
        return self._size
    def append(self, item):
        '''Add @p item on the right, dropping the leftmost item if the bound is exceeded.'''
        let ref = [self._tail, item, None]
        if self._tail is None:
            self._head = ref
        else:
            self._tail[2] = ref
        self._tail = ref
        # Count the new node first so the size stays accurate after eviction.
        self._size += 1
        if self.maxlen is not None and self._size > self.maxlen:
            self.popleft()
    def appendleft(self, item):
        '''Add @p item on the left, dropping the rightmost item if the bound is exceeded.'''
        let ref = [None, item, self._head]
        if self._head is None:
            self._tail = ref
        else:
            self._head[0] = ref
        self._head = ref
        self._size += 1
        if self.maxlen is not None and self._size > self.maxlen:
            self.pop()
    def count(self, x):
        '''Return how many stored items compare equal to @p x.'''
        let counter = 0
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == x:
                counter += 1
            ptr = ptr[2]
        return counter
    def extend(self, iterable):
        '''Append every item of @p iterable on the right, in order.'''
        for i in iterable:
            self.append(i)
    def extendleft(self, iterable):
        '''Append every item of @p iterable on the left (so their order ends up reversed).'''
        for i in iterable:
            self.appendleft(i)
    def index(self, x, start=None, stop=None):
        '''Return the first position of @p x within [start, stop); raises ValueError if absent.'''
        let i = 0
        let ptr = self._head
        while ptr is not None:
            # Nothing at or beyond stop can match, so stop walking.
            if stop is not None and i >= stop:
                break
            if ptr[1] == x and (start is None or i >= start):
                return i
            i += 1
            ptr = ptr[2]
        raise ValueError("value not found")
    def insert(self, i, x):
        '''
        Insert @p x so that it ends up at position @p i.

        Negative positions count from the right; positions past either end
        are clamped, so the item is pushed on that end. Raises IndexError
        if the deque is bounded and already full.
        '''
        if self.maxlen is not None and self._size >= self.maxlen:
            raise IndexError('attempt to grow bounded deque beyond bound')
        if i < 0:
            i += self._size
            if i < 0:
                i = 0
        if i >= self._size:
            return self.append(x)
        if i == 0:
            return self.appendleft(x)
        # 0 < i < size: the node currently at i has a predecessor.
        let ptr = self._head
        while i > 0:
            ptr = ptr[2]
            i -= 1
        let before = ptr[0]
        let ref = [before, x, ptr]
        before[2] = ref
        ptr[0] = ref
        self._size += 1
    def pop(self):
        '''Remove and return the rightmost item; raises IndexError when empty.'''
        if self._tail is None:
            raise IndexError("pop from empty deque")
        let node = self._tail
        self._tail = node[0]
        if self._tail is None:
            # That was the only node: clear the other end as well.
            self._head = None
        else:
            self._tail[2] = None
        self._size -= 1
        return node[1]
    def popleft(self):
        '''Remove and return the leftmost item; raises IndexError when empty.'''
        if self._head is None:
            raise IndexError("pop from empty deque")
        let node = self._head
        self._head = node[2]
        if self._head is None:
            # That was the only node: clear the other end as well.
            self._tail = None
        else:
            self._head[0] = None
        self._size -= 1
        return node[1]
    def remove(self, value):
        '''Remove the first item equal to @p value; raises ValueError if absent.'''
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == value:
                let l = ptr[0]
                let r = ptr[2]
                # Unlink, moving the end pointers when an end node goes away.
                if l is None:
                    self._head = r
                else:
                    l[2] = r
                if r is None:
                    self._tail = l
                else:
                    r[0] = l
                self._size -= 1
                return
            ptr = ptr[2]
        raise ValueError("value not found")
    def rotate(self, n=1):
        '''Rotate @p n steps to the right (to the left when @p n is negative).'''
        if n == 0 or self._size == 0: return
        while n > 0:
            self.appendleft(self.pop())
            n -= 1
        while n < 0:
            self.append(self.popleft())
            n += 1
    def reverse(self):
        '''Reverse the items in place; returns None.'''
        if self._head is None: return None
        let ptr = self._head
        self._head = self._tail
        self._tail = ptr
        # Swap the links of every node, walking in the original order.
        while ptr is not None:
            let l = ptr[0]
            let r = ptr[2]
            ptr[2] = l
            ptr[0] = r
            ptr = r
        return None
    def __repr__(self):
        # The flag short-circuits a deque that (indirectly) contains itself.
        if self.__inrepr: return 'deque(...)'
        self.__inrepr = True
        let out = 'deque(['
        let ptr = self._head
        while ptr is not None:
            out += repr(ptr[1])
            ptr = ptr[2]
            if ptr is not None:
                out += ', '
        out += '])'
        self.__inrepr = False
        return out
    def __str__(self):
        return repr(self)
    def __iter__(self):
        '''Return an iterator function that yields items left to right and returns itself when done.'''
        let s = self._head
        def _():
            if s is None: return _
            let out = s[1]
            s = s[2]
            return out
        return _
    def __getitem__(self, index):
        '''Return the item at @p index (negative counts from the right); raises IndexError when out of range.'''
        if index >= self._size or index < -self._size: raise IndexError("Invalid index")
        if index >= 0:
            let ptr = self._head
            while ptr is not None:
                if index == 0: return ptr[1]
                index -= 1
                ptr = ptr[2]
        else:
            # Walk backwards from the tail: -1 is the tail itself.
            index = -(index + 1)
            let ptr = self._tail
            while ptr is not None:
                if index == 0: return ptr[1]
                index -= 1
                ptr = ptr[0]
        raise IndexError("This probably shouldn't happen?")
    def __contains__(self, value):
        '''Return True if any stored item compares equal to @p value.'''
        let ptr = self._head
        while ptr is not None:
            if ptr[1] == value: return True
            ptr = ptr[2]
        return False
def smartrepr(data):
    '''
    repr a large dictionary or list such that line breaks are inserted every 4000 characters or so.

    Anything that is neither a dict nor a list is passed straight to repr().
    '''
    def wrap(opener, closer, entries):
        # Finished lines go to `done`; `line` is the line still being filled.
        let done = ""
        let line = opener
        for entry in entries:
            line += entry + ","
            if len(line) > 4000:
                done += line + "\n"
                line = ""
            else:
                line += " "
        return done + line + closer
    if isinstance(data, dict):
        return wrap("{", "}", [repr(key) + ": " + repr(data[key]) for key in data.keys()])
    if isinstance(data, list):
        return wrap("[", "]", [repr(item) for item in data])
    return repr(data)
class xraydict:
    '''
    Read-only view of @p base with @p overlay layered on top of it.

    Provides an alternative to the following code without copying anything:
        let modified = base.copy()
        modified.update(overlay)
        for i in modified.keys():
            if i in exclude:
                del modified[i]

    Keys found in @p exclude are hidden. When @p filter_function is given it
    is called as filter_function(key, value) and entries for which it returns
    a false value are hidden as well.
    '''
    def __init__(base, overlay, exclude=[], filter_function=None):
        self.base = base
        self.overlay = overlay
        self.exclude = exclude
        self.filter_function = filter_function
    def __contains__(thing):
        '''Return whether @p thing is a visible key of the combined mapping.'''
        if thing in self.exclude or (thing not in self.base and thing not in self.overlay):
            return False
        if self.filter_function is None:
            return True
        return self.filter_function(thing, self._getitem_internal(thing))
    def __getitem__(key):
        '''Return the visible value for @p key; raises KeyError if it is absent, excluded or filtered out.'''
        let value = self._getitem_internal(key)
        if self.filter_function is not None and not self.filter_function(key, value):
            raise KeyError(f"element with key {key!r} and value {value!r} excluded from xraydict")
        return value
    def _getitem_internal(key):
        '''Look up @p key honouring the exclusions but not filter_function; the overlay wins over the base.'''
        if key in self.exclude:
            raise KeyError(f"element with key {key!r} excluded from xraydict")
        if key in self.overlay:
            return self.overlay[key]
        return self.base[key]
    def items():
        '''
        Return a list of (key, value) tuples for every visible entry.

        Each key appears once. A key present in both mappings is reported at
        its position in @p base but with the overlay's value, and
        filter_function is applied to that overlay value.
        '''
        let ret = []
        for i, v in self.base.items():
            if i in self.exclude:
                continue
            # The overlay's value shadows the base's, as in _getitem_internal.
            let value = v
            if i in self.overlay:
                value = self.overlay[i]
            if self.filter_function is None or self.filter_function(i, value):
                ret.append((i, value))
        for i, v in self.overlay.items():
            # Keys shared with the base were already handled above.
            if i in self.base or i in self.exclude:
                continue
            if self.filter_function is None or self.filter_function(i, v):
                ret.append((i, v))
        return ret
    def keys():
        '''Return a list of the visible keys, in the order produced by items().'''
        return [pair[0] for pair in self.items()]
    def __iter__():
        return self.keys().__iter__()