Up until now the fts4 "unicode61" tokenizer has treated all private use codepoints except the first and last of each of the three ranges as alphanumeric (eligible to be part of tokens). This commit fixes this so that all private use codepoints are considered alphanumeric. In other words, it fixes the handling of codepoints 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000 and 0x10FFFD.

FossilOrigin-Name: 6cfd9af5250029c0d275be027b4208c48954a8a1
This commit is contained in:
dan 2013-06-05 16:17:21 +00:00
parent f5ad80397d
commit f2c9229f73
5 changed files with 56 additions and 36 deletions

View File

@ -101,28 +101,27 @@ int sqlite3FtsUnicodeIsalnum(int c){
0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
0x43FFF401,
0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
0x380400F0,
};
static const unsigned int aAscii[4] = {
0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,

View File

@ -239,7 +239,10 @@ proc an_load_unicodedata_text {zName} {
foreach $lField $fields {}
set iCode [expr "0x$code"]
set bAlnum [expr {[lsearch {L N} [string range $general_category 0 0]]>=0}]
set bAlnum [expr {
[lsearch {L N} [string range $general_category 0 0]] >= 0
|| $general_category=="Co"
}]
if { !$bAlnum } { lappend lRet $iCode }
}
@ -360,7 +363,7 @@ proc print_isalnum {zFunc lRange} {
}
assert( aEntry[0]<key );
assert( key>=aEntry[iRes] );
return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
}
return 1;}
puts "\}"
@ -729,7 +732,7 @@ proc print_fileheader {} {
*/
}]
puts ""
puts "#if !defined(SQLITE_DISABLE_FTS3_UNICODE)"
puts "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)"
puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
puts ""
puts "#include <assert.h>"
@ -805,4 +808,4 @@ if {$::generate_test_code} {
}
puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
# Close the SQLITE_ENABLE_FTS4_UNICODE61 guard in the generated C file. The
# trailing C comment repeats the guard condition, which is "defined(...)",
# not its negation.
puts "#endif /* defined(SQLITE_ENABLE_FTS4_UNICODE61) */"

View File

@ -1,5 +1,5 @@
C Fix\sa\stypo\sin\sa\scollating\sfunction\sinside\sthe\se_reindex.test\sscript.
D 2013-06-03T20:39:15.752
C Up\suntil\snow\sthe\sfts4\s"unicode61"\stokenizer\shas\streated\sall\sprivate\suse\scodepoints\sexcept\sthe\sfirst\sand\slast\sof\seach\sof\sthe\sthree\sranges\sas\salphanumeric\s(eligible\sto\sbe\spart\sof\stokens).\sThis\scommit\sfixes\sthis\sso\sthat\sall\sprivate\suse\scodepoints\sare\sconsidered\salphanumeric.\sIn\sother\swords,\sit\sfixes\sthe\shandling\sof\scodepoints\s0xE000,\s0xF8FF,\s0xF0000,\s0xFFFFD,\s0x100000\sand\s0x10FFFD.
D 2013-06-05T16:17:21.916
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 5e41da95d92656a5004b03d3576e8b226858a28e
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@ -95,14 +95,14 @@ F ext/fts3/fts3_tokenizer.c bbdc731bc91338050675c6d1da9ab82147391e16
F ext/fts3/fts3_tokenizer.h 64c6ef6c5272c51ebe60fc607a896e84288fcbc3
F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
F ext/fts3/fts3_unicode.c 92391b4b4fb043564c6539ea9b8661e3bcba47b9
F ext/fts3/fts3_unicode2.c a863f05f758af36777dffc2facc898bc73fec896
F ext/fts3/fts3_unicode2.c 0113d3acf13429e6dc38e0647d1bc71211c31a4d
F ext/fts3/fts3_write.c 6a1fc0e922e76b68e594bf7bc33bac72af9dc47b
F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
F ext/fts3/tool/fts3view.c 6cfc5b67a5f0e09c0d698f9fd012c784bfaa9197
F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
F ext/fts3/unicode/mkunicode.tcl 7a9bc018e2962abb79563c5a39fe581fcbf2f675
F ext/fts3/unicode/mkunicode.tcl dc6f268eb526710e2c6e496c372471d773d0c368
F ext/icu/README.txt d9fbbad0c2f647c3fdf715fc9fd64af53aedfc43
F ext/icu/icu.c 7538f98eab2854cf17fa5f7797bffa6c76e3863b
F ext/icu/sqliteicu.h 728867a802baa5a96de7495e9689a8e01715ef37
@ -550,7 +550,7 @@ F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
F test/fts4merge4.test c19c85ca1faa7b6d536832b49c12e1867235f584
F test/fts4unicode.test 25ccad45896f8e50f6a694cff738a35f798cdb40
F test/fts4unicode.test c8ac44217bf6c17812b03eaafa6c06995ad304c2
F test/full.test 6b3c8fb43c6beab6b95438c1675374b95fab245d
F test/func.test b0fc34fdc36897769651975a2b0a606312753643
F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
@ -1093,7 +1093,7 @@ F tool/vdbe-compress.tcl f12c884766bd14277f4fcedcae07078011717381
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
F tool/win/sqlite.vsix 97894c2790eda7b5bce3cc79cb2a8ec2fde9b3ac
P 3bd5ad095b23102dd3379cb62997cbf23cc67b7a
R 0bab77d0f95310ae1c21cfea10915144
U drh
Z 1023ee14390bd42e471d5323a67fa234
P 4d74fccf02134a998a84097b021ba9d501e34ff0
R 659aea33cb10f326783eda2b62f9d699
U dan
Z 74ecc7396dceda2a9a9f04f8bd9d8ced

View File

@ -1 +1 @@
4d74fccf02134a998a84097b021ba9d501e34ff0
6cfd9af5250029c0d275be027b4208c48954a8a1

View File

@ -384,5 +384,23 @@ foreach T $tokenizers {
do_isspace_test 6.$T.19 $T {8287 12288}
}
#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
# The codepoints exercised here all come from the U+E000..U+F8FF private
# use range: both endpoints (U+E000 and U+F8FF), their immediate
# neighbours, and one codepoint from the middle of the range. The
# supplementary-plane private use ranges are not covered by this loop.
#
# For each codepoint two configurations are run. In both, every "*" in the
# template strings is replaced by the codepoint under test:
#
#   1. No tokenizer options: the expected result keeps "hello*world"
#      together, i.e. the codepoint is treated as part of the token.
#   2. "separators=*": the codepoint is declared a separator, so the
#      expected result lists "hello" and "world" separately.
#
# NOTE(review): the exact layout of the expected-result string is defined
# by do_unicode_token_test3, which is outside this section - confirm there.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "0 hello*world hello*world"
    2 "separators=*" "0 hello hello 1 world world"
  } {
    # Substitute the codepoint under test for the "*" placeholder.
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}
finish_test