One fix and one improvement to GB18030: (#15)
- Fix: the codec had been failing to decode 0x81308130 to U+0080, even though it encoded U+0080 successfully. The cause was a conditional that conflated 0 with None: the pointer for that byte sequence is 0, which is falsy, so the truthiness check on the pointer failed even though the pointer was valid. Since U+0080 is not used for anything in most contexts (it's allocated as a control code in the ECMA-35 sense, but ECMA-48 does not use it) this is unlikely to have hurt anything, but I have fixed it anyway.
- Improvement: the encoding and decoding of GB18030 four-byte codes now use binary search rather than linear search over the range table. This significantly improves performance on four-byte codes, though performance on two-byte codes is unaffected.
This commit is contained in:
parent
9435cbf442
commit
f5f314a42d
@ -51,16 +51,22 @@ class Gb18030IncrementalEncoder(IncrementalEncoder):
|
|||||||
if ord(i) == 0xE7C7:
|
if ord(i) == 0xE7C7:
|
||||||
pointer = 7457
|
pointer = 7457
|
||||||
else:
|
else:
|
||||||
let basepointer = 0
|
let leftindex = 0
|
||||||
let basecodepoint = 0
|
let rightindex = len(more_dbdata.gb_surrogate_ranges) - 1
|
||||||
for nexttuple in more_dbdata.gb_surrogate_ranges:
|
while leftindex != rightindex:
|
||||||
let nextpointer = nexttuple[0]
|
if leftindex == rightindex - 1:
|
||||||
let nextcodepoint = nexttuple[1]
|
if more_dbdata.gb_surrogate_ranges[rightindex][1] > ord(i):
|
||||||
if nextcodepoint > ord(i):
|
rightindex = leftindex
|
||||||
|
else:
|
||||||
|
leftindex = rightindex
|
||||||
break
|
break
|
||||||
basepointer = nextpointer
|
let centreindex = (leftindex + rightindex) // 2
|
||||||
basecodepoint = nextcodepoint
|
if more_dbdata.gb_surrogate_ranges[centreindex][1] > ord(i):
|
||||||
pointer = (ord(i) - basecodepoint) + basepointer
|
rightindex = centreindex
|
||||||
|
else:
|
||||||
|
leftindex = centreindex
|
||||||
|
pointer = ((ord(i) - more_dbdata.gb_surrogate_ranges[leftindex][1])
|
||||||
|
+ more_dbdata.gb_surrogate_ranges[rightindex][0])
|
||||||
let running = pointer
|
let running = pointer
|
||||||
let first = 0x81 + (running // (10 * 126 * 10))
|
let first = 0x81 + (running // (10 * 126 * 10))
|
||||||
running %= 10 * 126 * 10
|
running %= 10 * 126 * 10
|
||||||
@ -125,22 +131,28 @@ class Gb18030IncrementalDecoder(IncrementalDecoder):
|
|||||||
else if bytemode == 4 and len(leader) == 2 and (0x81 <= i and i <= 0xFE):
|
else if bytemode == 4 and len(leader) == 2 and (0x81 <= i and i <= 0xFE):
|
||||||
leader.append(i)
|
leader.append(i)
|
||||||
offset += 1
|
offset += 1
|
||||||
else if bytemode == 4 and len(leader) == 3 and _get_gbsurrogate_pointer(leader, i):
|
else if bytemode == 4 and len(leader) == 3 and (_get_gbsurrogate_pointer(leader, i) != None):
|
||||||
let pointer = _get_gbsurrogate_pointer(leader, i)
|
let pointer = _get_gbsurrogate_pointer(leader, i)
|
||||||
let codepoint
|
let codepoint
|
||||||
if pointer == 7457:
|
if pointer == 7457:
|
||||||
codepoint = 0xE7C7
|
codepoint = 0xE7C7
|
||||||
else:
|
else:
|
||||||
let basecodepoint = 0
|
let leftindex = 0
|
||||||
let basepointer = 0
|
let rightindex = len(more_dbdata.gb_surrogate_ranges) - 1
|
||||||
for nexttuple in more_dbdata.gb_surrogate_ranges:
|
while leftindex != rightindex:
|
||||||
let nextpointer = nexttuple[0]
|
if leftindex == rightindex - 1:
|
||||||
let nextcodepoint = nexttuple[1]
|
if more_dbdata.gb_surrogate_ranges[rightindex][0] > pointer:
|
||||||
if nextpointer > pointer:
|
rightindex = leftindex
|
||||||
|
else:
|
||||||
|
leftindex = rightindex
|
||||||
break
|
break
|
||||||
basecodepoint = nextcodepoint
|
let centreindex = (leftindex + rightindex) // 2
|
||||||
basepointer = nextpointer
|
if more_dbdata.gb_surrogate_ranges[centreindex][0] > pointer:
|
||||||
codepoint = (pointer - basepointer) + basecodepoint
|
rightindex = centreindex
|
||||||
|
else:
|
||||||
|
leftindex = centreindex
|
||||||
|
codepoint = ((pointer - more_dbdata.gb_surrogate_ranges[leftindex][0])
|
||||||
|
+ more_dbdata.gb_surrogate_ranges[rightindex][1])
|
||||||
out.add(chr(codepoint))
|
out.add(chr(codepoint))
|
||||||
offset += 1
|
offset += 1
|
||||||
bytemode = 1
|
bytemode = 1
|
||||||
|
Loading…
Reference in New Issue
Block a user