From 25cdf46ae4aea2bf4b7a93052d896cc765bc39db Mon Sep 17 00:00:00 2001 From: dan Date: Thu, 7 Jun 2012 15:53:48 +0000 Subject: [PATCH] Add the "tokenchars=" and "separators=" options, for customizing the set of characters considered to be token separators, to the unicode61 tokenizer. FossilOrigin-Name: e56fb462aa1f11bb23303ae0dc62815c21e26a52 --- ext/fts3/fts3_tokenizer.c | 28 +++---- ext/fts3/fts3_unicode.c | 162 ++++++++++++++++++++++++++++++++------ manifest | 16 ++-- manifest.uuid | 2 +- test/fts4unicode.test | 88 +++++++++++++++++++++ 5 files changed, 251 insertions(+), 45 deletions(-) diff --git a/ext/fts3/fts3_tokenizer.c b/ext/fts3/fts3_tokenizer.c index f6b044ff6a..4a7a17567a 100644 --- a/ext/fts3/fts3_tokenizer.c +++ b/ext/fts3/fts3_tokenizer.c @@ -209,10 +209,9 @@ int sqlite3Fts3InitTokenizer( /* ** Implementation of a special SQL scalar function for testing tokenizers ** designed to be used in concert with the Tcl testing framework. This -** function must be called with two arguments: +** function must be called with two or more arguments: ** -** SELECT (, ); -** SELECT (, ); +** SELECT (, ..., ); ** ** where is the name passed as the second argument ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer') @@ -249,27 +248,27 @@ static void testFunc( const char *zInput; int nInput; - const char *zArg = 0; + const char *azArg[64]; const char *zToken; int nToken; int iStart; int iEnd; int iPos; + int i; Tcl_Obj *pRet; - assert( argc==2 || argc==3 ); + if( argc<2 ){ + sqlite3_result_error(context, "insufficient arguments", -1); + return; + } nName = sqlite3_value_bytes(argv[0]); zName = (const char *)sqlite3_value_text(argv[0]); nInput = sqlite3_value_bytes(argv[argc-1]); zInput = (const char *)sqlite3_value_text(argv[argc-1]); - if( argc==3 ){ - zArg = (const char *)sqlite3_value_text(argv[1]); - } - pHash = (Fts3Hash *)sqlite3_user_data(context); p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1); @@ -283,7 +282,11 @@ static void testFunc( pRet = Tcl_NewObj(); Tcl_IncrRefCount(pRet); - if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ + for(i=1; ixCreate(argc-2, azArg, &pTokenizer) ){ zErr = "error in xCreate()"; goto finish; } @@ -467,10 +470,7 @@ int sqlite3Fts3InitHashTable( } #ifdef SQLITE_TEST if( SQLITE_OK==rc ){ - rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0); - } - if( SQLITE_OK==rc ){ - rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0); + rc = sqlite3_create_function(db, zTest, -1, any, p, testFunc, 0, 0); } if( SQLITE_OK==rc ){ rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0); diff --git a/ext/fts3/fts3_unicode.c b/ext/fts3/fts3_unicode.c index 5dcfb30acc..79941edbb8 100644 --- a/ext/fts3/fts3_unicode.c +++ b/ext/fts3/fts3_unicode.c @@ -83,6 +83,8 @@ typedef struct unicode_cursor unicode_cursor; struct unicode_tokenizer { sqlite3_tokenizer base; int bRemoveDiacritic; + int nException; + int *aiException; }; struct unicode_cursor { @@ -95,6 +97,121 @@ struct unicode_cursor { int nAlloc; /* space allocated at zToken */ }; + +/* +** Destroy a tokenizer allocated by unicodeCreate(). +*/ +static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){ + if( pTokenizer ){ + unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer; + sqlite3_free(p->aiException); + sqlite3_free(p); + } + return SQLITE_OK; +} + +/* +** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE +** statement has specified that the tokenizer for this table shall consider +** all characters in string zIn/nIn to be separators (if bAlnum==0) or +** token characters (if bAlnum==1). +** +** For each codepoint in the zIn/nIn string, this function checks if the +** sqlite3FtsUnicodeIsalnum() function already returns the desired result. +** If so, no action is taken. Otherwise, the codepoint is added to the +** unicode_tokenizer.aiException[] array. For the purposes of tokenization, +** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all +** codepoints in the aiException[] array. +** +** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic() +** identifies as a diacritic) occurs in the zIn/nIn string it is ignored. +** It is not possible to change the behaviour of the tokenizer with respect +** to these codepoints. +*/ +static int unicodeAddExceptions( + unicode_tokenizer *p, /* Tokenizer to add exceptions to */ + int bAlnum, /* Replace Isalnum() return value with this */ + const char *zIn, /* Array of characters to make exceptions */ + int nIn /* Length of z in bytes */ +){ + const unsigned char *z = (const unsigned char *)zIn; + const unsigned char *zTerm = &z[nIn]; + int iCode; + int nEntry = 0; + + assert( bAlnum==0 || bAlnum==1 ); + + while( zaiException, (p->nException+nEntry)*sizeof(int)); + if( aNew==0 ) return SQLITE_NOMEM; + nNew = p->nException; + + z = (const unsigned char *)zIn; + while( zi; j--) aNew[j] = aNew[j-1]; + aNew[i] = iCode; + nNew++; + } + } + p->aiException = aNew; + p->nException = nNew; + } + + return SQLITE_OK; +} + +/* +** Return true if the p->aiException[] array contains the value iCode. +*/ +static int unicodeIsException(unicode_tokenizer *p, int iCode){ + if( p->nException>0 ){ + int *a = p->aiException; + int iLo = 0; + int iHi = p->nException-1; + + while( iHi>=iLo ){ + int iTest = (iHi + iLo) / 2; + if( iCode==a[iTest] ){ + return 1; + }else if( iCode>a[iTest] ){ + iLo = iTest+1; + }else{ + iHi = iTest-1; + } + } + } + + return 0; +} + +/* +** Return true if, for the purposes of tokenization, codepoint iCode is +** considered a token character (not a separator). +*/ +static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){ + assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); + return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode); +} + /* ** Create a new tokenizer instance. */ @@ -105,14 +222,14 @@ static int unicodeCreate( ){ unicode_tokenizer *pNew; /* New tokenizer object */ int i; + int rc = SQLITE_OK; + pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer)); - if( pNew==NULL ){ - return SQLITE_NOMEM; - } + if( pNew==NULL ) return SQLITE_NOMEM; memset(pNew, 0, sizeof(unicode_tokenizer)); pNew->bRemoveDiacritic = 1; - for(i=0; ibRemoveDiacritic = 0; } + else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){ + rc = unicodeAddExceptions(pNew, 1, &z[11], n-11); + } + else if( n>=11 && memcmp("separators=", z, 11)==0 ){ + rc = unicodeAddExceptions(pNew, 0, &z[11], n-11); + } else{ /* Unrecognized argument */ - return SQLITE_ERROR; + rc = SQLITE_ERROR; } } - *pp = &pNew->base; - return SQLITE_OK; -} - -/* -** Destroy a tokenizer allocated by unicodeCreate(). -*/ -static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){ - sqlite3_free(pTokenizer); - return SQLITE_OK; + if( rc!=SQLITE_OK ){ + unicodeDestroy((sqlite3_tokenizer *)pNew); + pNew = 0; + } + *pp = (sqlite3_tokenizer *)pNew; + return rc; } /* @@ -190,14 +309,15 @@ static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){ ** have been opened by a prior call to simpleOpen(). */ static int unicodeNext( - sqlite3_tokenizer_cursor *p, /* Cursor returned by simpleOpen */ + sqlite3_tokenizer_cursor *pC, /* Cursor returned by simpleOpen */ const char **paToken, /* OUT: Token text */ int *pnToken, /* OUT: Number of bytes at *paToken */ int *piStart, /* OUT: Starting offset of token */ int *piEnd, /* OUT: Ending offset of token */ int *piPos /* OUT: Position integer of token */ ){ - unicode_cursor *pCsr = (unicode_cursor *)p; + unicode_cursor *pCsr = (unicode_cursor *)pC; + unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer); int iCode; char *zOut; const unsigned char *z = &pCsr->aInput[pCsr->iOff]; @@ -210,7 +330,7 @@ static int unicodeNext( ** the input. */ while( z=zTerm ) return SQLITE_DONE; @@ -230,9 +350,7 @@ static int unicodeNext( /* Write the folded case of the last character read to the output */ zEnd = z; - iOut = sqlite3FtsUnicodeFold(iCode, - ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic - ); + iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic); if( iOut ){ WRITE_UTF8(zOut, iOut); } @@ -240,7 +358,7 @@ static int unicodeNext( /* If the cursor is not at EOF, read the next character */ if( z>=zTerm ) break; READ_UTF8(z, zTerm, iCode); - }while( sqlite3FtsUnicodeIsalnum(iCode) + }while( unicodeIsAlnum(p, iCode) || sqlite3FtsUnicodeIsdiacritic(iCode) ); diff --git a/manifest b/manifest index 69af0ff4c4..f87782fe92 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sa\smalloc/free\smismatch\sin\spager.c\s(sqlite3_free()\scalled\son\sa\sbuffer\sallocated\sby\ssqlite3DbMalloc()). -D 2012-06-07T07:24:04.829 +C Add\sthe\s"tokenchars="\sand\s"separators="\soptions,\sfor\scustomizing\sthe\sset\sof\scharacters\sconsidered\sto\sbe\stoken\sseparators,\sto\sthe\sunicode61\stokenizer. +D 2012-06-07T15:53:48.974 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 4f37eb61be9d38643cdd839a74b8e3bad724cfcf F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -67,10 +67,10 @@ F ext/fts3/fts3_porter.c a465b49fcb8249a755792f87516eff182efa42b3 F ext/fts3/fts3_snippet.c bf67520ae9d2352a65368ed101729ff701c08808 F ext/fts3/fts3_term.c a521f75132f9a495bdca1bdd45949b3191c52763 F ext/fts3/fts3_test.c 348f7d08cae05285794e23dc4fe8b8fdf66e264a -F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce +F ext/fts3/fts3_tokenizer.c e94a8b901066031437ccfe4769fc76370257cede F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68 F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004 -F ext/fts3/fts3_unicode.c b9660ab4d7231d92d1853f34dc1a035efb59aa6d +F ext/fts3/fts3_unicode.c 49e36e6ba59f79e6bd6a8bfe434570fe48d20559 F ext/fts3/fts3_unicode2.c 2965d217c37079f1dbbdbd2c58f843be285d73f2 F ext/fts3/fts3_write.c 6a6391d6b01114f885e24e1f66bbc11ffba0e9e2 F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9 @@ -501,7 +501,7 @@ F test/fts4langid.test 24a6e41063b416bbdf371ff6b4476fa41c194aa7 F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891 F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7 -F test/fts4unicode.test f394585139ff878f9af0c83791a5f612d45a5984 +F test/fts4unicode.test 247e6c64563b5f930aec0f89a5b01ed6b4b129cd F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01 F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a @@ -1005,7 +1005,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381 -P 208825cd830748a2ca456affc57be75bbe618e47 -R 615bebd2a4edb95abd2eb071a3babed8 +P 506008f000ba4af0b35da023b8c52f7a3f5033bd +R 90fdd2c25413aca73a47ceb66fb14b8e U dan -Z def87c7e203d2567dcd1543b1ddecb6d +Z 483c809c7f7cc8104f2baffc94efe46a diff --git a/manifest.uuid b/manifest.uuid index 716f35ce92..28edd46d99 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -506008f000ba4af0b35da023b8c52f7a3f5033bd \ No newline at end of file +e56fb462aa1f11bb23303ae0dc62815c21e26a52 \ No newline at end of file diff --git a/test/fts4unicode.test b/test/fts4unicode.test index 5393d0d020..3abceb68b2 100644 --- a/test/fts4unicode.test +++ b/test/fts4unicode.test @@ -31,6 +31,18 @@ proc do_unicode_token_test2 {tn input res} { " [list [list {*}$res]]] } +proc do_unicode_token_test3 {tn args} { + set res [lindex $args end] + set sql "SELECT fts3_tokenizer_test('unicode61'" + foreach a [lrange $args 0 end-1] { + append sql ", '" + append sql [string map {' ''} $a] + append sql "'" + } + append sql ")" + uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]] +} + do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D} do_unicode_token_test 1.1 {Ä Ö Ü} {0 ä Ä 1 ö Ö 2 ü Ü} do_unicode_token_test 1.2 {xÄx xÖx xÜx} {0 xäx xÄx 1 xöx xÖx 2 xüx xÜx} @@ -236,6 +248,82 @@ do_test 4.3 { } } {} +#------------------------------------------------------------------------- + +do_unicode_token_test3 5.1 {tokenchars=} { + sqlite3_reset sqlite3_column_int +} { + 0 sqlite3 sqlite3 + 1 reset reset + 2 sqlite3 sqlite3 + 3 column column + 4 int int +} + +do_unicode_token_test3 5.2 {tokenchars=_} { + sqlite3_reset sqlite3_column_int +} { + 0 sqlite3_reset sqlite3_reset + 1 sqlite3_column_int sqlite3_column_int +} + +do_unicode_token_test3 5.3 {separators=xyz} { + Laotianxhorseyrunszfast +} { + 0 laotian Laotian + 1 horse horse + 2 runs runs + 3 fast fast +} + +do_unicode_token_test3 5.4 {tokenchars=xyz} { + Laotianxhorseyrunszfast +} { + 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast +} + +do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} { + sqlite3_resetxsqlite3_column_intyhonda_phantom +} { + 0 sqlite3_reset sqlite3_reset + 1 sqlite3_column_int sqlite3_column_int + 2 honda_phantom honda_phantom +} + +do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" { + 0 abc abc 1 def def +} + +do_unicode_token_test3 5.7 \ + "tokenchars=\u2444\u2445" \ + "separators=\u05D0\u05D1\u05D2" \ + "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \ + [list \ + 0 \u2444fre\u2445sh \u2444fre\u2445sh \ + 1 water water \ + 2 fish fish \ + 3 \u2445timer \u2445timer \ + ] + +# Check that it is not possible to add a standalone diacritic codepoint +# to either separators or tokenchars. +do_unicode_token_test3 5.8 "separators=\u0301" \ + "hello\u0301world \u0301helloworld" \ + "0 helloworld hello\u0301world 1 helloworld helloworld" + +do_unicode_token_test3 5.9 "tokenchars=\u0301" \ + "hello\u0301world \u0301helloworld" \ + "0 helloworld hello\u0301world 1 helloworld helloworld" + +do_unicode_token_test3 5.10 "separators=\u0301" \ + "remove_diacritics=0" \ + "hello\u0301world \u0301helloworld" \ + "0 hello\u0301world hello\u0301world 1 helloworld helloworld" + +do_unicode_token_test3 5.11 "tokenchars=\u0301" \ + "remove_diacritics=0" \ + "hello\u0301world \u0301helloworld" \ + "0 hello\u0301world hello\u0301world 1 helloworld helloworld" finish_test