One fix and one improvement to GB18030: (#15)

— The codec had been failing to decode 0x81308130 to U+0080, even though
it successfully encoded it. Since U+0080 is not used for anything in most
contexts (it's allocated as a control code in the ECMA-35 sense, but
ECMA-48 does not use it) this is unlikely to have hurt anything, but I
have fixed it anyway (it arose from a valid pointer of 0 and a result of
None being conflated in a truthiness test, since both are falsy).

— The encoding and decoding of GB18030 four-byte codes now use binary
search rather than linear search. This significantly improves performance
on four-byte codes; performance on two-byte codes is unaffected.
This commit is contained in:
HarJIT 2021-08-12 11:17:59 +01:00 committed by GitHub
parent 9435cbf442
commit f5f314a42d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -51,16 +51,22 @@ class Gb18030IncrementalEncoder(IncrementalEncoder):
if ord(i) == 0xE7C7:
pointer = 7457
else:
let basepointer = 0
let basecodepoint = 0
for nexttuple in more_dbdata.gb_surrogate_ranges:
let nextpointer = nexttuple[0]
let nextcodepoint = nexttuple[1]
if nextcodepoint > ord(i):
let leftindex = 0
let rightindex = len(more_dbdata.gb_surrogate_ranges) - 1
while leftindex != rightindex:
if leftindex == rightindex - 1:
if more_dbdata.gb_surrogate_ranges[rightindex][1] > ord(i):
rightindex = leftindex
else:
leftindex = rightindex
break
basepointer = nextpointer
basecodepoint = nextcodepoint
pointer = (ord(i) - basecodepoint) + basepointer
let centreindex = (leftindex + rightindex) // 2
if more_dbdata.gb_surrogate_ranges[centreindex][1] > ord(i):
rightindex = centreindex
else:
leftindex = centreindex
pointer = ((ord(i) - more_dbdata.gb_surrogate_ranges[leftindex][1])
+ more_dbdata.gb_surrogate_ranges[rightindex][0])
let running = pointer
let first = 0x81 + (running // (10 * 126 * 10))
running %= 10 * 126 * 10
@ -125,22 +131,28 @@ class Gb18030IncrementalDecoder(IncrementalDecoder):
else if bytemode == 4 and len(leader) == 2 and (0x81 <= i and i <= 0xFE):
leader.append(i)
offset += 1
else if bytemode == 4 and len(leader) == 3 and _get_gbsurrogate_pointer(leader, i):
else if bytemode == 4 and len(leader) == 3 and (_get_gbsurrogate_pointer(leader, i) != None):
let pointer = _get_gbsurrogate_pointer(leader, i)
let codepoint
if pointer == 7457:
codepoint = 0xE7C7
else:
let basecodepoint = 0
let basepointer = 0
for nexttuple in more_dbdata.gb_surrogate_ranges:
let nextpointer = nexttuple[0]
let nextcodepoint = nexttuple[1]
if nextpointer > pointer:
let leftindex = 0
let rightindex = len(more_dbdata.gb_surrogate_ranges) - 1
while leftindex != rightindex:
if leftindex == rightindex - 1:
if more_dbdata.gb_surrogate_ranges[rightindex][0] > pointer:
rightindex = leftindex
else:
leftindex = rightindex
break
basecodepoint = nextcodepoint
basepointer = nextpointer
codepoint = (pointer - basepointer) + basecodepoint
let centreindex = (leftindex + rightindex) // 2
if more_dbdata.gb_surrogate_ranges[centreindex][0] > pointer:
rightindex = centreindex
else:
leftindex = centreindex
codepoint = ((pointer - more_dbdata.gb_surrogate_ranges[leftindex][0])
+ more_dbdata.gb_surrogate_ranges[rightindex][1])
out.add(chr(codepoint))
offset += 1
bytemode = 1