Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5

and FTS3/4.

FossilOrigin-Name: 06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
This commit is contained in:
dan 2018-12-03 16:14:49 +00:00
parent 8c53b4e7f6
commit e89feee5c3
12 changed files with 320 additions and 94 deletions

View File

@ -82,7 +82,7 @@ typedef struct unicode_cursor unicode_cursor;
struct unicode_tokenizer {
sqlite3_tokenizer base;
int bRemoveDiacritic;
int eRemoveDiacritic;
int nException;
int *aiException;
};
@ -227,17 +227,20 @@ static int unicodeCreate(
pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
if( pNew==NULL ) return SQLITE_NOMEM;
memset(pNew, 0, sizeof(unicode_tokenizer));
pNew->bRemoveDiacritic = 1;
pNew->eRemoveDiacritic = 1;
for(i=0; rc==SQLITE_OK && i<nArg; i++){
const char *z = azArg[i];
int n = (int)strlen(z);
if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
pNew->bRemoveDiacritic = 1;
pNew->eRemoveDiacritic = 1;
}
else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
pNew->bRemoveDiacritic = 0;
pNew->eRemoveDiacritic = 0;
}
else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
pNew->eRemoveDiacritic = 2;
}
else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
@ -350,7 +353,7 @@ static int unicodeNext(
/* Write the folded case of the last character read to the output */
zEnd = z;
iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic);
if( iOut ){
WRITE_UTF8(zOut, iOut);
}

View File

@ -159,32 +159,47 @@ int sqlite3FtsUnicodeIsalnum(int c){
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c){
static int remove_diacritic(int c, int bComplex){
unsigned short aDia[] = {
0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
62924, 63050, 63082, 63274, 63390,
3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896,
3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106,
4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344,
4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198,
6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468,
61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
63182, 63242, 63274, 63310, 63368, 63390,
};
char aChar[] = {
'\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
'\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
'e', 'i', 'o', 'u', 'y',
'\0', 'a'|0x00, 'c'|0x00, 'e'|0x00, 'i'|0x00, 'n'|0x00,
'o'|0x00, 'u'|0x00, 'y'|0x00, 'y'|0x00, 'a'|0x00, 'c'|0x00,
'd'|0x00, 'e'|0x00, 'e'|0x00, 'g'|0x00, 'h'|0x00, 'i'|0x00,
'j'|0x00, 'k'|0x00, 'l'|0x00, 'n'|0x00, 'o'|0x00, 'r'|0x00,
's'|0x00, 't'|0x00, 'u'|0x00, 'u'|0x00, 'w'|0x00, 'y'|0x00,
'z'|0x00, 'o'|0x00, 'u'|0x00, 'a'|0x00, 'i'|0x00, 'o'|0x00,
'u'|0x00, 'u'|0x80, 'a'|0x80, 'g'|0x00, 'k'|0x00, 'o'|0x00,
'o'|0x80, 'j'|0x00, 'g'|0x00, 'n'|0x00, 'a'|0x80, 'a'|0x00,
'e'|0x00, 'i'|0x00, 'o'|0x00, 'r'|0x00, 'u'|0x00, 's'|0x00,
't'|0x00, 'h'|0x00, 'a'|0x00, 'e'|0x00, 'o'|0x80, 'o'|0x00,
'o'|0x80, 'y'|0x00, '\0', '\0', '\0', '\0',
'\0', '\0', '\0', '\0', 'a'|0x00, 'b'|0x00,
'c'|0x80, 'd'|0x00, 'd'|0x00, 'e'|0x80, 'e'|0x00, 'e'|0x80,
'f'|0x00, 'g'|0x00, 'h'|0x00, 'h'|0x00, 'i'|0x00, 'i'|0x80,
'k'|0x00, 'l'|0x00, 'l'|0x80, 'l'|0x00, 'm'|0x00, 'n'|0x00,
'o'|0x80, 'p'|0x00, 'r'|0x00, 'r'|0x80, 'r'|0x00, 's'|0x00,
's'|0x80, 't'|0x00, 'u'|0x00, 'u'|0x80, 'v'|0x00, 'w'|0x00,
'w'|0x00, 'x'|0x00, 'y'|0x00, 'z'|0x00, 'h'|0x00, 't'|0x00,
'w'|0x00, 'y'|0x00, 'a'|0x00, 'a'|0x80, 'a'|0x80, 'a'|0x80,
'e'|0x00, 'e'|0x80, 'e'|0x80, 'i'|0x00, 'o'|0x00, 'o'|0x80,
'o'|0x80, 'o'|0x80, 'u'|0x00, 'u'|0x80, 'u'|0x80, 'y'|0x00,
};
unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
@ -201,7 +216,8 @@ static int remove_diacritic(int c){
}
}
assert( key>=aDia[iRes] );
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}
@ -228,7 +244,7 @@ int sqlite3FtsUnicodeIsdiacritic(int c){
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
/* Each entry in the following array defines a rule for folding a range
** of codepoints to lower case. The rule applies to a range of nRange
** codepoints starting at codepoint iCode.
@ -351,7 +367,9 @@ int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
assert( ret>0 );
}
if( bRemoveDiacritic ) ret = remove_diacritic(ret);
if( eRemoveDiacritic ){
ret = remove_diacritic(ret, eRemoveDiacritic==2);
}
}
else if( c>=66560 && c<66600 ){

View File

@ -9,11 +9,12 @@ proc print_rd {map} {
set nRange 1
set iFirst [lindex $map 0 0]
set cPrev [lindex $map 0 1]
set fPrev [lindex $map 0 2]
foreach m [lrange $map 1 end] {
foreach {i c} $m {}
foreach {i c f} $m {}
if {$cPrev == $c} {
if {$cPrev == $c && $fPrev==$f} {
for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
if {[info exists tl_lookup_table($j)]==0} break
}
@ -29,13 +30,16 @@ proc print_rd {map} {
lappend lRange [list $iFirst $nRange]
lappend aChar $cPrev
lappend aFlag $fPrev
set iFirst $i
set cPrev $c
set fPrev $f
set nRange 1
}
lappend lRange [list $iFirst $nRange]
lappend aChar $cPrev
lappend aFlag $fPrev
puts "/*"
puts "** If the argument is a codepoint corresponding to a lowercase letter"
@ -45,7 +49,7 @@ proc print_rd {map} {
puts "** E\"). The resuls of passing a codepoint that corresponds to an"
puts "** uppercase letter are undefined."
puts "*/"
puts "static int ${::remove_diacritic}(int c)\{"
puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
puts " unsigned short aDia\[\] = \{"
puts -nonewline " 0, "
set i 1
@ -60,13 +64,17 @@ proc print_rd {map} {
puts ""
puts " \};"
puts " char aChar\[\] = \{"
puts -nonewline " '\\0', "
puts -nonewline " '\\0', "
set i 1
foreach c $aChar {
set str "'$c', "
if {$c == ""} { set str "'\\0', " }
foreach c $aChar f $aFlag {
if { $f } {
set str "'$c'|0x80, "
} else {
set str "'$c'|0x00, "
}
if {$c == ""} { set str "'\\0', " }
if {($i % 12)==0} {puts "" ; puts -nonewline " " }
if {($i % 6)==0} {puts "" ; puts -nonewline " " }
incr i
puts -nonewline "$str"
}
@ -87,7 +95,8 @@ proc print_rd {map} {
}
}
assert( key>=aDia[iRes] );
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
puts "\}"
}
@ -95,7 +104,8 @@ proc print_isdiacritic {zFunc map} {
set lCode [list]
foreach m $map {
foreach {code char} $m {}
foreach {code char flag} $m {}
if {$flag} continue
if {$code && $char == ""} { lappend lCode $code }
}
set lCode [lsort -integer $lCode]
@ -472,7 +482,7 @@ proc print_fold {zFunc} {
puts "** The results are undefined if the value passed to this function"
puts "** is less than zero."
puts "*/"
puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
set liOff [tl_generate_ioff_table $lRecord]
tl_print_table_header
@ -516,7 +526,9 @@ proc print_fold {zFunc} {
assert( ret>0 );
}
if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
if( eRemoveDiacritic ){
ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
}
}
}]

View File

@ -7,12 +7,24 @@
# character that it should be replaced with, or an empty string if the
# codepoint should simply be removed from the input. Examples:
#
# { 224 a } (replace codepoint 224 to "a")
# { 769 "" } (remove codepoint 769 from input)
# { 224 a 0 } (replace codepoint 224 to "a")
# { 769 "" 0 } (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
# The third value in the list is always either 0 or 1. 0 if the
# UnicodeData.txt file maps the codepoint to a single ASCII character and
# a diacritic, or 1 if the mapping is indirect. For example, consider the
# two entries:
#
# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
#
# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
# diacritic). The second is an indirect mapping, as it maps to the
# first codepoint plus 0302 (a diacritic).
#
proc rd_load_unicodedata_text {zName} {
global tl_lookup_table
@ -53,18 +65,29 @@ proc rd_load_unicodedata_text {zName} {
set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
# Filter out upper-case characters, as they will be mapped to their
# lower-case equivalents before this data is used.
if {[info exists tl_lookup_table($iCode)]} continue
# Check if this is an indirect mapping. If so, set bIndirect to true
# and change $iAscii to the indirectly mapped ASCII character.
set bIndirect 0
if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
set iAscii $mapping($iAscii)
set bIndirect 1
}
if { ($iAscii >= 97 && $iAscii <= 122)
|| ($iAscii >= 65 && $iAscii <= 90)
} {
lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
set mapping($iCode) $iAscii
set dia($iDia) 1
}
}
foreach d [array names dia] {
lappend lRet [list $d ""]
lappend lRet [list $d "" 0]
}
set lRet [lsort -integer -index 0 $lRet]

View File

@ -234,13 +234,18 @@ struct Unicode61Tokenizer {
unsigned char aTokenChar[128]; /* ASCII range token characters */
char *aFold; /* Buffer to fold text into */
int nFold; /* Size of aFold[] in bytes */
int bRemoveDiacritic; /* True if remove_diacritics=1 is set */
int eRemoveDiacritic; /* True if remove_diacritics=1 is set */
int nException;
int *aiException;
unsigned char aCategory[32]; /* True for token char categories */
};
/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
#define FTS5_REMOVE_DIACRITICS_NONE 0
#define FTS5_REMOVE_DIACRITICS_SIMPLE 1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
static int fts5UnicodeAddExceptions(
Unicode61Tokenizer *p, /* Tokenizer object */
const char *z, /* Characters to treat as exceptions */
@ -361,7 +366,7 @@ static int fts5UnicodeCreate(
int i;
memset(p, 0, sizeof(Unicode61Tokenizer));
p->bRemoveDiacritic = 1;
p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
p->nFold = 64;
p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
if( p->aFold==0 ){
@ -382,10 +387,15 @@ static int fts5UnicodeCreate(
for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
const char *zArg = azArg[i+1];
if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
rc = SQLITE_ERROR;
}else{
p->eRemoveDiacritic = (zArg[0] - '0');
assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
|| p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
|| p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
);
}
p->bRemoveDiacritic = (zArg[0]=='1');
}else
if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
rc = fts5UnicodeAddExceptions(p, zArg, 1);
@ -499,7 +509,7 @@ static int fts5UnicodeTokenize(
READ_UTF8(zCsr, zTerm, iCode);
if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
non_ascii_tokenchar:
iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
if( iCode ) WRITE_UTF8(zOut, iCode);
}else{
break;

View File

@ -28,32 +28,47 @@
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int fts5_remove_diacritic(int c){
static int fts5_remove_diacritic(int c, int bComplex){
unsigned short aDia[] = {
0, 1797, 1848, 1859, 1891, 1928, 1940, 1995,
2024, 2040, 2060, 2110, 2168, 2206, 2264, 2286,
2344, 2383, 2472, 2488, 2516, 2596, 2668, 2732,
2782, 2842, 2894, 2954, 2984, 3000, 3028, 3336,
3456, 3696, 3712, 3728, 3744, 3896, 3912, 3928,
3968, 4008, 4040, 4106, 4138, 4170, 4202, 4234,
4266, 4296, 4312, 4344, 4408, 4424, 4472, 4504,
6148, 6198, 6264, 6280, 6360, 6429, 6505, 6529,
61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
62924, 63050, 63082, 63274, 63390,
3456, 3696, 3712, 3728, 3744, 3766, 3832, 3896,
3912, 3928, 3944, 3968, 4008, 4040, 4056, 4106,
4138, 4170, 4202, 4234, 4266, 4296, 4312, 4344,
4408, 4424, 4442, 4472, 4488, 4504, 6148, 6198,
6264, 6280, 6360, 6429, 6505, 6529, 61448, 61468,
61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
63182, 63242, 63274, 63310, 63368, 63390,
};
char aChar[] = {
'\0', 'a', 'c', 'e', 'i', 'n', 'o', 'u', 'y', 'y', 'a', 'c',
'd', 'e', 'e', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'o', 'r',
's', 't', 'u', 'u', 'w', 'y', 'z', 'o', 'u', 'a', 'i', 'o',
'u', 'g', 'k', 'o', 'j', 'g', 'n', 'a', 'e', 'i', 'o', 'r',
'u', 's', 't', 'h', 'a', 'e', 'o', 'y', '\0', '\0', '\0', '\0',
'\0', '\0', '\0', '\0', 'a', 'b', 'd', 'd', 'e', 'f', 'g', 'h',
'h', 'i', 'k', 'l', 'l', 'm', 'n', 'p', 'r', 'r', 's', 't',
'u', 'v', 'w', 'w', 'x', 'y', 'z', 'h', 't', 'w', 'y', 'a',
'e', 'i', 'o', 'u', 'y',
'\0', 'a'|0x00, 'c'|0x00, 'e'|0x00, 'i'|0x00, 'n'|0x00,
'o'|0x00, 'u'|0x00, 'y'|0x00, 'y'|0x00, 'a'|0x00, 'c'|0x00,
'd'|0x00, 'e'|0x00, 'e'|0x00, 'g'|0x00, 'h'|0x00, 'i'|0x00,
'j'|0x00, 'k'|0x00, 'l'|0x00, 'n'|0x00, 'o'|0x00, 'r'|0x00,
's'|0x00, 't'|0x00, 'u'|0x00, 'u'|0x00, 'w'|0x00, 'y'|0x00,
'z'|0x00, 'o'|0x00, 'u'|0x00, 'a'|0x00, 'i'|0x00, 'o'|0x00,
'u'|0x00, 'u'|0x80, 'a'|0x80, 'g'|0x00, 'k'|0x00, 'o'|0x00,
'o'|0x80, 'j'|0x00, 'g'|0x00, 'n'|0x00, 'a'|0x80, 'a'|0x00,
'e'|0x00, 'i'|0x00, 'o'|0x00, 'r'|0x00, 'u'|0x00, 's'|0x00,
't'|0x00, 'h'|0x00, 'a'|0x00, 'e'|0x00, 'o'|0x80, 'o'|0x00,
'o'|0x80, 'y'|0x00, '\0', '\0', '\0', '\0',
'\0', '\0', '\0', '\0', 'a'|0x00, 'b'|0x00,
'c'|0x80, 'd'|0x00, 'd'|0x00, 'e'|0x80, 'e'|0x00, 'e'|0x80,
'f'|0x00, 'g'|0x00, 'h'|0x00, 'h'|0x00, 'i'|0x00, 'i'|0x80,
'k'|0x00, 'l'|0x00, 'l'|0x80, 'l'|0x00, 'm'|0x00, 'n'|0x00,
'o'|0x80, 'p'|0x00, 'r'|0x00, 'r'|0x80, 'r'|0x00, 's'|0x00,
's'|0x80, 't'|0x00, 'u'|0x00, 'u'|0x80, 'v'|0x00, 'w'|0x00,
'w'|0x00, 'x'|0x00, 'y'|0x00, 'z'|0x00, 'h'|0x00, 't'|0x00,
'w'|0x00, 'y'|0x00, 'a'|0x00, 'a'|0x80, 'a'|0x80, 'a'|0x80,
'e'|0x00, 'e'|0x80, 'e'|0x80, 'i'|0x00, 'o'|0x00, 'o'|0x80,
'o'|0x80, 'o'|0x80, 'u'|0x00, 'u'|0x80, 'u'|0x80, 'y'|0x00,
};
unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
@ -70,7 +85,8 @@ static int fts5_remove_diacritic(int c){
}
}
assert( key>=aDia[iRes] );
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}
@ -97,7 +113,7 @@ int sqlite3Fts5UnicodeIsdiacritic(int c){
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){
/* Each entry in the following array defines a rule for folding a range
** of codepoints to lower case. The rule applies to a range of nRange
** codepoints starting at codepoint iCode.
@ -220,7 +236,9 @@ int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
assert( ret>0 );
}
if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
if( eRemoveDiacritic ){
ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2);
}
}
else if( c>=66560 && c<66600 ){
@ -231,11 +249,9 @@ int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
}
#if 0
int sqlite3Fts5UnicodeNCat(void) {
return 32;
}
#endif
int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){
aArray[0] = 1;
@ -756,7 +772,7 @@ void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
int n = (aFts5UnicodeData[iTbl] >> 5) + i;
for(; i<128 && i<n; i++){
aAscii[i] = (u8)bToken;
aAscii[i] = bToken;
}
iTbl++;
}

View File

@ -189,7 +189,7 @@ do_catchsql_test 6.2 {
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
CREATE VIRTUAL TABLE a3 USING fts5(
x, y, tokenize = 'unicode61 remove_diacritics 2'
x, y, tokenize = 'unicode61 remove_diacritics 3'
);
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {

View File

@ -0,0 +1,65 @@
# 2014 June 17
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The
# focus of this script is testing the FTS5 module.
#
source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5umlaut

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

# t1 is created without a tokenize option; t2 explicitly requests the
# unicode61 tokenizer with remove_diacritics set to 2.
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(
    x,
    tokenize="unicode61 remove_diacritics 2"
  );
}

# Each row of the table below is:
#
#   tn    test case number
#   q     text used both as a query and as document content
#   res1  expected number of matches when using table t1
#   res2  expected number of matches when using table t2
#
# For each row, $q is matched against a row containing 'Ha Noi', and
# 'Ha Noi' is matched against a row containing $q, for both tables. Every
# case gets its own test name (1.$tn.1 to 1.$tn.4) so that a failure
# report identifies exactly which table and direction went wrong.
foreach {tn q res1 res2} {
  1 "Hà Nội"                  0 1
  2 "Hà Noi"                  1 1
  3 "Ha Noi"                  1 1
  4 "Ha N\u1ed9i"             0 1
  5 "Ha N\u006fi"             1 1
  6 "Ha N\u006f\u0302i"       1 1
  7 "Ha N\u006f\u0323\u0302i" 1 1
} {
  # t1: query $q against a plain-ASCII document.
  do_execsql_test 1.$tn.1 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t1($q)
  } $res1

  # t1: plain-ASCII query against a document containing $q.
  do_execsql_test 1.$tn.2 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t1('Ha Noi')
  } $res1

  # t2: query $q against a plain-ASCII document.
  do_execsql_test 1.$tn.3 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t2($q)
  } $res2

  # t2: plain-ASCII query against a document containing $q.
  do_execsql_test 1.$tn.4 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t2('Ha Noi')
  } $res2
}

finish_test

View File

@ -36,24 +36,26 @@ foreach x [an_load_unicodedata_text $UD] {
}
foreach {y} [rd_load_unicodedata_text $UD] {
foreach {code ascii} $y {}
foreach {code ascii f} $y {}
if {$ascii==""} {
set int 0
} else {
binary scan $ascii c int
}
set aDiacritic($code) $int
set aDiacritic($code,$f) $int
if {$f==0} { set aDiacritic($code,1) $int }
}
proc tcl_fold {i {bRemoveDiacritic 0}} {
global tl_lookup_table
global aDiacritic
set f [expr $bRemoveDiacritic==2]
if {[info exists tl_lookup_table($i)]} {
set i $tl_lookup_table($i)
}
if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
set i $aDiacritic($i)
if {$bRemoveDiacritic && [info exists aDiacritic($i,$f)]} {
set i $aDiacritic($i,$f)
}
expr $i
}
@ -85,7 +87,7 @@ do_execsql_test 1.1 {
SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
} {0 {}}
do_execsql_test 1.2 {
do_execsql_test 1.2.1 {
WITH ii(i) AS (
SELECT -1
UNION ALL
@ -95,6 +97,16 @@ do_execsql_test 1.2 {
WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
} {0 {}}
do_execsql_test 1.2.2 {
WITH ii(i) AS (
SELECT -1
UNION ALL
SELECT i+1 FROM ii WHERE i<100000
)
SELECT count(*), min(i) FROM ii
WHERE fts5_fold(i,2)!=CAST(tcl_fold(i,2) AS int);
} {0 {}}
do_execsql_test 1.3 {
WITH ii(i) AS (
SELECT -1

View File

@ -1,5 +1,5 @@
C Update\sthe\sautoconf\smakefile\sfor\sMSVC.
D 2018-12-03T14:58:07.611
C Add\sthe\s"remove_diacritics=2"\soption\sto\sthe\sunicode61\stokenizer\sin\sboth\sFTS5\nand\sFTS3/4.
D 2018-12-03T16:14:49.664
F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
F Makefile.in a050c8670ea0d7b37b2192306cbb50d392acd9902b84e9b56f3444d006f97a6c
@ -96,8 +96,8 @@ F ext/fts3/fts3_tokenize_vtab.c a47c2a33de6db00816704315ac0a9afdfa1c71fa5b99f791
F ext/fts3/fts3_tokenizer.c a22bf311a71f3efa9d7012d8cc48fc9b0f3dace7
F ext/fts3/fts3_tokenizer.h 64c6ef6c5272c51ebe60fc607a896e84288fcbc3
F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
F ext/fts3/fts3_unicode.c 525a3bd9a7564603c5c061b7de55403a565307758a94600e8a2f6b00d1c40d9d
F ext/fts3/fts3_unicode2.c cc04fc672bfd42b1e650398cb0bf71f64f9aae032cfe75bbcfe75b9cf966029c
F ext/fts3/fts3_unicode.c b1902e9ad47a6569fbb8ecb5ce52f20fe59b590d5c5e3bbdd56b10b03bdf632b
F ext/fts3/fts3_unicode2.c 90e65f4291c8ecceee284ecc8d5d48734e95ecd4b008e06f36f14e77f93d655f
F ext/fts3/fts3_write.c a85bc4885fde7f1b44c9de013b62f7cd3332dc59e208053d878729b1d04745bc
F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100
@ -105,8 +105,8 @@ F ext/fts3/tool/fts3cov.sh c331d006359456cf6f8f953e37f2b9c7d568f3863f00bb5f7eb87
F ext/fts3/tool/fts3view.c 202801a2056995b763864d60c2dee744d46f1677
F ext/fts3/unicode/CaseFolding.txt 8c678ca52ecc95e16bc7afc2dbf6fc9ffa05db8c
F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
F ext/fts3/unicode/mkunicode.tcl 0069320b64db6ee269c5e95f1f150d070fbf0a863fc7b3549d7e52bd068fb118
F ext/fts3/unicode/parseunicode.tcl 024ae0bdd96309d7b8fc479148191e9b3001dc74017a3f65f9a27de3b3ff968b
F ext/fts3/unicode/mkunicode.tcl 106bb4ff6365b36301fa4a009e5b4bf6ed02a2fbe9156349be9dfd9a92697cde
F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a03cf1e6f52a6959fc77eb
F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0
F ext/fts5/fts5.h 5edc74ca603d71284d475886e6e91b5c5cf2e8e93e9ba3a36ba2f2440ee97948
F ext/fts5/fts5Int.h 39f12034b598df4e0f59bbe6cf03af03a905a534b04f182d155641a04e1eb797
@ -121,8 +121,8 @@ F ext/fts5/fts5_storage.c 4bec8a1b3905978b22a67bca5f4a3cfdb94af234cf51efb36f4f2d
F ext/fts5/fts5_tcl.c 39bcbae507f594aad778172fa914cad0f585bf92fd3b078c686e249282db0d95
F ext/fts5/fts5_test_mi.c 65864ba1e5c34a61d409c4c587e0bbe0466eb4f8f478d85dc42a92caad1338e6
F ext/fts5/fts5_test_tok.c 80de1a4b1a3caa216c3be8862440f0117a8357dd9b7cfc5a2a2ce11fe6eb64ae
F ext/fts5/fts5_tokenize.c ebd13d034f3dc7c841e1c32c364a4fca5cc2e05a0b91682a93fa1e6defcd4292
F ext/fts5/fts5_unicode2.c 543cf0987c27ad59e5a7a6222480b917b5431009b7b139027c9581a63e39e37e
F ext/fts5/fts5_tokenize.c ca2b6a033794945ac505241a86b0aa978709c23aa2e6121984d3e3ede96003c8
F ext/fts5/fts5_unicode2.c 051f207a76a90890009a8b5009ca0c9a327342ec4d10c2145b61a334784b713a
F ext/fts5/fts5_varint.c a5aceacda04dafcbae725413d7a16818ecd65738
F ext/fts5/fts5_vocab.c fbe38044889b2d2d99babeeef239c620fb0332bb928a84506ac748d81500b354
F ext/fts5/fts5parse.y eb526940f892ade5693f22ffd6c4f2702543a9059942772526eac1fde256bb05
@ -206,10 +206,11 @@ F ext/fts5/test/fts5synonym.test 1651815b8008de170e8e600dcacc17521d765482ea8f074
F ext/fts5/test/fts5synonym2.test b54cce5c34ec08ed616f646635538ae82e34a0e28f947ec60b6fadbc4b3fb17a
F ext/fts5/test/fts5tok1.test ce6551e41ff56f30b69963577324624733bed0d1753589f06120d664d9cd45c9
F ext/fts5/test/fts5tok2.test dcacb32d4a2a3f0dd3215d4a3987f78ae4be21a2
F ext/fts5/test/fts5tokenizer.test 6aeb5e8061ffc0ff9a5299f27beaee3b2b4b8b336d4f107262bca338bea8f8e9
F ext/fts5/test/fts5tokenizer.test ac3c9112b263a639fb0508ae73a3ee886bf4866d2153771a8e8a20c721305a43
F ext/fts5/test/fts5umlaut.test a42fe2fe6387c40c49ab27ccbd070e1ae38e07f38d05926482cc0bccac9ad602
F ext/fts5/test/fts5unicode.test 17056f4efe6b0a5d4f41fdf7a7dc9af2873004562eaa899d40633b93dc95f5a9
F ext/fts5/test/fts5unicode2.test 9b3df486de05fb4bde4aa7ee8de2e6dae1df6eb90e3f2e242c9383b95d314e3e
F ext/fts5/test/fts5unicode3.test c3caecbe8264629ffe653b43ca5790b9793eba4422f92203e5247558e5a534e7
F ext/fts5/test/fts5unicode3.test 9cbc82e2b02e2e3b7504103580c90f095e07fe8230b1951a9ed7558717b5feb7
F ext/fts5/test/fts5unicode4.test 6463301d669f963c83988017aa354108be0b947d325aef58d3abddf27147b687
F ext/fts5/test/fts5unindexed.test 9021af86a0fb9fc616f7a69a996db0116e7936d0db63892db6bafabbec21af4d
F ext/fts5/test/fts5update.test 0737876e20e97a6a6abf45de19fc99315727bcee6a83fadcada1cc080b9aa8f0
@ -956,6 +957,7 @@ F test/fts4merge4.test d895b1057a7798b67e03455d0fa50e9ea836c47b
F test/fts4noti.test 5553d7bb2e20bf4a06b23e849352efc022ce6309
F test/fts4onepass.test d69ddc4ee3415e40b0c5d1d0408488a87614d4f63ba9c44f3e52db541d6b7cc7
F test/fts4opt.test 0fd0cc84000743ff2a883b9b84b4a5be07249f0ba790c8848a757164cdd46b2a
F test/fts4umlaut.test 1d28e2a2dffa794e15babadebdece091431b09d740be79c08eb6aba1173a8c84
F test/fts4unicode.test ceca76422abc251818cb25dabe33d3c3970da5f7c90e1540f190824e6b3a7c95
F test/full.test 6b3c8fb43c6beab6b95438c1675374b95fab245d
F test/func.test 09dda479bcfc568f99f3070413e9672a8eeedc1be9c5d819bf55d4788c2583b7
@ -1779,7 +1781,7 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93
F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
P 15824ccda0f110794a479b58fbf36082d8c383f34bae9dc0921d96547fb37869
R 6917ee4bf0492fb65a72a63e44f83083
U mistachkin
Z 0fd7a981eb53d48862b14fa362aff3ba
P 675aba1f8b989cfd99370704ecb09031026dc3321cccad122ea91d816e02fdba
R 7123bc66be985a6ec633c5a7bea2ac49
U dan
Z 3c4b639682109864205c768a98540aa1

View File

@ -1 +1 @@
675aba1f8b989cfd99370704ecb09031026dc3321cccad122ea91d816e02fdba
06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec

65
test/fts4umlaut.test Normal file
View File

@ -0,0 +1,65 @@
# 2018 December 3
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The
# focus of this script is testing the FTS4 module.
#
set testdir [file dirname $argv0]
source $testdir/tester.tcl
set testprefix fts4umlaut

# If FTS3/4 is not compiled in, omit this file.
ifcapable !fts3 {
  finish_test
  return
}

# Both tables are FTS4 tables so that this script depends only on the
# capability checked above. t1 uses the unicode61 tokenizer with its
# default diacritic handling; t2 explicitly requests remove_diacritics=2.
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts4(x, tokenize=unicode61);
  CREATE VIRTUAL TABLE t2 USING fts4(
    x,
    tokenize=unicode61 "remove_diacritics=2"
  );
}

# Each row of the table below is:
#
#   tn    test case number
#   q     text used both as a query and as document content
#   res1  expected number of matches when using table t1
#   res2  expected number of matches when using table t2
#
# For each row, $q is matched against a row containing 'Ha Noi', and
# 'Ha Noi' is matched against a row containing $q, for both tables. Every
# case gets its own test name (1.$tn.1 to 1.$tn.4) so that a failure
# report identifies exactly which table and direction went wrong.
foreach {tn q res1 res2} {
  1 "Hà Nội"                  0 1
  2 "Hà Noi"                  1 1
  3 "Ha Noi"                  1 1
  4 "Ha N\u1ed9i"             0 1
  5 "Ha N\u006fi"             1 1
  6 "Ha N\u006f\u0302i"       1 1
  7 "Ha N\u006f\u0323\u0302i" 1 1
} {
  # t1: query $q against a plain-ASCII document.
  do_execsql_test 1.$tn.1 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t1 WHERE t1 MATCH $q
  } $res1

  # t1: plain-ASCII query against a document containing $q.
  do_execsql_test 1.$tn.2 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t1 WHERE t1 MATCH 'Ha Noi'
  } $res1

  # t2: query $q against a plain-ASCII document.
  do_execsql_test 1.$tn.3 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t2 WHERE t2 MATCH $q
  } $res2

  # t2: plain-ASCII query against a document containing $q.
  do_execsql_test 1.$tn.4 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t2 WHERE t2 MATCH 'Ha Noi'
  } $res2
}

finish_test