Add the "tokenchars=" and "separators=" options, for customizing the set of characters considered to be token separators, to the unicode61 tokenizer.
FossilOrigin-Name: e56fb462aa1f11bb23303ae0dc62815c21e26a52
parent a879342b0e
commit 25cdf46ae4
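At the SQL level, the new arguments reach the unicode61 tokenizer through the tokenize= clause of a CREATE VIRTUAL TABLE statement. A minimal usage sketch (table and column names, and the option values, are hypothetical):

  CREATE VIRTUAL TABLE doc USING fts4(
    body,
    tokenize=unicode61 "tokenchars=_" "separators=xyz"
  );
  -- With the options above, '_' is treated as a token character and
  -- 'x', 'y' and 'z' are treated as separators when doc is tokenized.

Each double-quoted option string is passed as one element of the azArg[] array given to the tokenizer's xCreate() method.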
ext/fts3/fts3_tokenizer.c

@@ -209,10 +209,9 @@ int sqlite3Fts3InitTokenizer(
 /*
 ** Implementation of a special SQL scalar function for testing tokenizers
 ** designed to be used in concert with the Tcl testing framework. This
-** function must be called with two arguments:
+** function must be called with two or more arguments:
 **
-**   SELECT <function-name>(<key-name>, <input-string>);
-**   SELECT <function-name>(<key-name>, <pointer>);
+**   SELECT <function-name>(<key-name>, ..., <input-string>);
 **
 ** where <function-name> is the name passed as the second argument
 ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
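In an SQLITE_TEST build this scalar function is registered under the hash-table name with "_test" appended (i.e. 'fts3_tokenizer_test' in the standard test fixture), so the new variable-argument form can be exercised directly from SQL. A minimal sketch, with a hypothetical input string:

  SELECT fts3_tokenizer_test('unicode61', 'tokenchars=_', 'separators=x', 'abcxdef_ghi');

Every argument between the tokenizer name and the final input string is forwarded to the tokenizer's xCreate() method; the function then reports the tokens produced from the input string.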
@@ -249,27 +248,27 @@ static void testFunc(
   const char *zInput;
   int nInput;
 
-  const char *zArg = 0;
+  const char *azArg[64];
 
   const char *zToken;
   int nToken;
   int iStart;
   int iEnd;
   int iPos;
+  int i;
 
   Tcl_Obj *pRet;
 
-  assert( argc==2 || argc==3 );
+  if( argc<2 ){
+    sqlite3_result_error(context, "insufficient arguments", -1);
+    return;
+  }
 
   nName = sqlite3_value_bytes(argv[0]);
   zName = (const char *)sqlite3_value_text(argv[0]);
   nInput = sqlite3_value_bytes(argv[argc-1]);
   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
 
-  if( argc==3 ){
-    zArg = (const char *)sqlite3_value_text(argv[1]);
-  }
-
   pHash = (Fts3Hash *)sqlite3_user_data(context);
   p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
 
@@ -283,7 +282,11 @@ static void testFunc(
   pRet = Tcl_NewObj();
   Tcl_IncrRefCount(pRet);
 
-  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
+  for(i=1; i<argc-1; i++){
+    azArg[i-1] = (const char *)sqlite3_value_text(argv[i]);
+  }
+
+  if( SQLITE_OK!=p->xCreate(argc-2, azArg, &pTokenizer) ){
     zErr = "error in xCreate()";
     goto finish;
   }
@@ -467,10 +470,7 @@ int sqlite3Fts3InitHashTable(
   }
 #ifdef SQLITE_TEST
   if( SQLITE_OK==rc ){
-    rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0);
-  }
-  if( SQLITE_OK==rc ){
-    rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0);
+    rc = sqlite3_create_function(db, zTest, -1, any, p, testFunc, 0, 0);
   }
   if( SQLITE_OK==rc ){
     rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0);
ext/fts3/fts3_unicode.c

@@ -83,6 +83,8 @@ typedef struct unicode_cursor unicode_cursor;
 struct unicode_tokenizer {
   sqlite3_tokenizer base;
   int bRemoveDiacritic;
+  int nException;
+  int *aiException;
 };
 
 struct unicode_cursor {
@@ -95,6 +97,121 @@ struct unicode_cursor {
   int nAlloc;                     /* space allocated at zToken */
 };
 
+
+/*
+** Destroy a tokenizer allocated by unicodeCreate().
+*/
+static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
+  if( pTokenizer ){
+    unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
+    sqlite3_free(p->aiException);
+    sqlite3_free(p);
+  }
+  return SQLITE_OK;
+}
+
+/*
+** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
+** statement has specified that the tokenizer for this table shall consider
+** all characters in string zIn/nIn to be separators (if bAlnum==0) or
+** token characters (if bAlnum==1).
+**
+** For each codepoint in the zIn/nIn string, this function checks if the
+** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
+** If so, no action is taken. Otherwise, the codepoint is added to the
+** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
+** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
+** codepoints in the aiException[] array.
+**
+** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
+** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
+** It is not possible to change the behaviour of the tokenizer with respect
+** to these codepoints.
+*/
+static int unicodeAddExceptions(
+  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
+  int bAlnum,                     /* Replace Isalnum() return value with this */
+  const char *zIn,                /* Array of characters to make exceptions */
+  int nIn                         /* Length of z in bytes */
+){
+  const unsigned char *z = (const unsigned char *)zIn;
+  const unsigned char *zTerm = &z[nIn];
+  int iCode;
+  int nEntry = 0;
+
+  assert( bAlnum==0 || bAlnum==1 );
+
+  while( z<zTerm ){
+    READ_UTF8(z, zTerm, iCode);
+    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
+    if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
+     && sqlite3FtsUnicodeIsdiacritic(iCode)==0
+    ){
+      nEntry++;
+    }
+  }
+
+  if( nEntry ){
+    int *aNew;                    /* New aiException[] array */
+    int nNew;                     /* Number of valid entries in array aNew[] */
+
+    aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
+    if( aNew==0 ) return SQLITE_NOMEM;
+    nNew = p->nException;
+
+    z = (const unsigned char *)zIn;
+    while( z<zTerm ){
+      READ_UTF8(z, zTerm, iCode);
+      if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
+       && sqlite3FtsUnicodeIsdiacritic(iCode)==0
+      ){
+        int i, j;
+        for(i=0; i<nNew && aNew[i]<iCode; i++);
+        for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
+        aNew[i] = iCode;
+        nNew++;
+      }
+    }
+    p->aiException = aNew;
+    p->nException = nNew;
+  }
+
+  return SQLITE_OK;
+}
+
+/*
+** Return true if the p->aiException[] array contains the value iCode.
+*/
+static int unicodeIsException(unicode_tokenizer *p, int iCode){
+  if( p->nException>0 ){
+    int *a = p->aiException;
+    int iLo = 0;
+    int iHi = p->nException-1;
+
+    while( iHi>=iLo ){
+      int iTest = (iHi + iLo) / 2;
+      if( iCode==a[iTest] ){
+        return 1;
+      }else if( iCode>a[iTest] ){
+        iLo = iTest+1;
+      }else{
+        iHi = iTest-1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+** Return true if, for the purposes of tokenization, codepoint iCode is
+** considered a token character (not a separator).
+*/
+static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
+  assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
+  return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
+}
+
 /*
 ** Create a new tokenizer instance.
 */
@@ -105,14 +222,14 @@ static int unicodeCreate(
 ){
   unicode_tokenizer *pNew;        /* New tokenizer object */
   int i;
+  int rc = SQLITE_OK;
+
   pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
-  if( pNew==NULL ){
-    return SQLITE_NOMEM;
-  }
+  if( pNew==NULL ) return SQLITE_NOMEM;
   memset(pNew, 0, sizeof(unicode_tokenizer));
   pNew->bRemoveDiacritic = 1;
 
-  for(i=0; i<nArg; i++){
+  for(i=0; rc==SQLITE_OK && i<nArg; i++){
     const char *z = azArg[i];
     int n = strlen(z);
 
@@ -122,22 +239,24 @@ static int unicodeCreate(
     else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
       pNew->bRemoveDiacritic = 0;
     }
+    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
+      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
+    }
+    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
+      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
+    }
     else{
       /* Unrecognized argument */
-      return SQLITE_ERROR;
+      rc = SQLITE_ERROR;
     }
   }
 
-  *pp = &pNew->base;
-  return SQLITE_OK;
-}
-
-/*
-** Destroy a tokenizer allocated by unicodeCreate().
-*/
-static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
-  sqlite3_free(pTokenizer);
-  return SQLITE_OK;
+  if( rc!=SQLITE_OK ){
+    unicodeDestroy((sqlite3_tokenizer *)pNew);
+    pNew = 0;
+  }
+  *pp = (sqlite3_tokenizer *)pNew;
+  return rc;
 }
 
 /*
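Taken together with the option parsing above, the behaviour at the SQL level can be sketched as follows (table names are hypothetical): both options may be combined with each other and with remove_diacritics=0, while any unrecognized argument now makes unicodeCreate() return SQLITE_ERROR, which should surface as an error from the CREATE VIRTUAL TABLE statement.

  CREATE VIRTUAL TABLE t1 USING fts4(
    x,
    tokenize=unicode61 "tokenchars=_" "separators=zyx" "remove_diacritics=0"
  );
  -- Expected to fail: "no_such_option=1" is not recognized by unicodeCreate().
  -- CREATE VIRTUAL TABLE t2 USING fts4(x, tokenize=unicode61 "no_such_option=1");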
@@ -190,14 +309,15 @@ static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
 ** have been opened by a prior call to simpleOpen().
 */
 static int unicodeNext(
-  sqlite3_tokenizer_cursor *p,    /* Cursor returned by simpleOpen */
+  sqlite3_tokenizer_cursor *pC,   /* Cursor returned by simpleOpen */
   const char **paToken,           /* OUT: Token text */
   int *pnToken,                   /* OUT: Number of bytes at *paToken */
   int *piStart,                   /* OUT: Starting offset of token */
   int *piEnd,                     /* OUT: Ending offset of token */
   int *piPos                      /* OUT: Position integer of token */
 ){
-  unicode_cursor *pCsr = (unicode_cursor *)p;
+  unicode_cursor *pCsr = (unicode_cursor *)pC;
+  unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
   int iCode;
   char *zOut;
   const unsigned char *z = &pCsr->aInput[pCsr->iOff];
@@ -210,7 +330,7 @@ static int unicodeNext(
   ** the input. */
   while( z<zTerm ){
     READ_UTF8(z, zTerm, iCode);
-    if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
+    if( unicodeIsAlnum(p, iCode) ) break;
     zStart = z;
   }
   if( zStart>=zTerm ) return SQLITE_DONE;
@@ -230,9 +350,7 @@ static int unicodeNext(
 
     /* Write the folded case of the last character read to the output */
     zEnd = z;
-    iOut = sqlite3FtsUnicodeFold(iCode,
-        ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic
-    );
+    iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
     if( iOut ){
       WRITE_UTF8(zOut, iOut);
     }
@@ -240,7 +358,7 @@ static int unicodeNext(
 
     /* If the cursor is not at EOF, read the next character */
     if( z>=zTerm ) break;
     READ_UTF8(z, zTerm, iCode);
-  }while( sqlite3FtsUnicodeIsalnum(iCode)
+  }while( unicodeIsAlnum(p, iCode)
        || sqlite3FtsUnicodeIsdiacritic(iCode)
   );
 
manifest
@@ -1,5 +1,5 @@
-C Fix\sa\smalloc/free\smismatch\sin\spager.c\s(sqlite3_free()\scalled\son\sa\sbuffer\sallocated\sby\ssqlite3DbMalloc()).
-D 2012-06-07T07:24:04.829
+C Add\sthe\s"tokenchars="\sand\s"separators="\soptions,\sfor\scustomizing\sthe\sset\sof\scharacters\sconsidered\sto\sbe\stoken\sseparators,\sto\sthe\sunicode61\stokenizer.
+D 2012-06-07T15:53:48.974
 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
 F Makefile.in 4f37eb61be9d38643cdd839a74b8e3bad724cfcf
 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@@ -67,10 +67,10 @@ F ext/fts3/fts3_porter.c a465b49fcb8249a755792f87516eff182efa42b3
 F ext/fts3/fts3_snippet.c bf67520ae9d2352a65368ed101729ff701c08808
 F ext/fts3/fts3_term.c a521f75132f9a495bdca1bdd45949b3191c52763
 F ext/fts3/fts3_test.c 348f7d08cae05285794e23dc4fe8b8fdf66e264a
-F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce
+F ext/fts3/fts3_tokenizer.c e94a8b901066031437ccfe4769fc76370257cede
 F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68
 F ext/fts3/fts3_tokenizer1.c 5c98225a53705e5ee34824087478cf477bdb7004
-F ext/fts3/fts3_unicode.c b9660ab4d7231d92d1853f34dc1a035efb59aa6d
+F ext/fts3/fts3_unicode.c 49e36e6ba59f79e6bd6a8bfe434570fe48d20559
 F ext/fts3/fts3_unicode2.c 2965d217c37079f1dbbdbd2c58f843be285d73f2
 F ext/fts3/fts3_write.c 6a6391d6b01114f885e24e1f66bbc11ffba0e9e2
 F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9
@@ -501,7 +501,7 @@ F test/fts4langid.test 24a6e41063b416bbdf371ff6b4476fa41c194aa7
 F test/fts4merge.test c424309743fdd203f8e56a1f1cd7872cd66cc0ee
 F test/fts4merge2.test 5faa558d1b672f82b847d2a337465fa745e46891
 F test/fts4merge3.test aab02a09f50fe6baaddc2e159c3eabc116d45fc7
-F test/fts4unicode.test f394585139ff878f9af0c83791a5f612d45a5984
+F test/fts4unicode.test 247e6c64563b5f930aec0f89a5b01ed6b4b129cd
 F test/func.test 9809b7622d721904a8cc33c1ffb87f46d506ed01
 F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f
 F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a
@@ -1005,7 +1005,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06
 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
 F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
-P 208825cd830748a2ca456affc57be75bbe618e47
-R 615bebd2a4edb95abd2eb071a3babed8
+P 506008f000ba4af0b35da023b8c52f7a3f5033bd
+R 90fdd2c25413aca73a47ceb66fb14b8e
 U dan
-Z def87c7e203d2567dcd1543b1ddecb6d
+Z 483c809c7f7cc8104f2baffc94efe46a
manifest.uuid

@@ -1 +1 @@
-506008f000ba4af0b35da023b8c52f7a3f5033bd
+e56fb462aa1f11bb23303ae0dc62815c21e26a52
test/fts4unicode.test

@@ -31,6 +31,18 @@ proc do_unicode_token_test2 {tn input res} {
   " [list [list {*}$res]]]
 }
 
+proc do_unicode_token_test3 {tn args} {
+  set res [lindex $args end]
+  set sql "SELECT fts3_tokenizer_test('unicode61'"
+  foreach a [lrange $args 0 end-1] {
+    append sql ", '"
+    append sql [string map {' ''} $a]
+    append sql "'"
+  }
+  append sql ")"
+  uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
+}
+
 do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
 do_unicode_token_test 1.1 {Ä Ö Ü} {0 ä Ä 1 ö Ö 2 ü Ü}
 do_unicode_token_test 1.2 {xÄx xÖx xÜx} {0 xäx xÄx 1 xöx xÖx 2 xüx xÜx}
@@ -236,6 +248,82 @@ do_test 4.3 {
   }
 } {}
 
+#-------------------------------------------------------------------------
+
+do_unicode_token_test3 5.1 {tokenchars=} {
+  sqlite3_reset sqlite3_column_int
+} {
+  0 sqlite3 sqlite3
+  1 reset reset
+  2 sqlite3 sqlite3
+  3 column column
+  4 int int
+}
+
+do_unicode_token_test3 5.2 {tokenchars=_} {
+  sqlite3_reset sqlite3_column_int
+} {
+  0 sqlite3_reset sqlite3_reset
+  1 sqlite3_column_int sqlite3_column_int
+}
+
+do_unicode_token_test3 5.3 {separators=xyz} {
+  Laotianxhorseyrunszfast
+} {
+  0 laotian Laotian
+  1 horse horse
+  2 runs runs
+  3 fast fast
+}
+
+do_unicode_token_test3 5.4 {tokenchars=xyz} {
+  Laotianxhorseyrunszfast
+} {
+  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
+}
+
+do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
+  sqlite3_resetxsqlite3_column_intyhonda_phantom
+} {
+  0 sqlite3_reset sqlite3_reset
+  1 sqlite3_column_int sqlite3_column_int
+  2 honda_phantom honda_phantom
+}
+
+do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
+  0 abc abc 1 def def
+}
+
+do_unicode_token_test3 5.7 \
+  "tokenchars=\u2444\u2445" \
+  "separators=\u05D0\u05D1\u05D2" \
+  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
+  [list \
+    0 \u2444fre\u2445sh \u2444fre\u2445sh \
+    1 water water \
+    2 fish fish \
+    3 \u2445timer \u2445timer \
+  ]
+
+# Check that it is not possible to add a standalone diacritic codepoint
+# to either separators or tokenchars.
+do_unicode_token_test3 5.8 "separators=\u0301" \
+  "hello\u0301world \u0301helloworld" \
+  "0 helloworld hello\u0301world 1 helloworld helloworld"
+
+do_unicode_token_test3 5.9 "tokenchars=\u0301" \
+  "hello\u0301world \u0301helloworld" \
+  "0 helloworld hello\u0301world 1 helloworld helloworld"
+
+do_unicode_token_test3 5.10 "separators=\u0301" \
+  "remove_diacritics=0" \
+  "hello\u0301world \u0301helloworld" \
+  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
+
+do_unicode_token_test3 5.11 "tokenchars=\u0301" \
+  "remove_diacritics=0" \
+  "hello\u0301world \u0301helloworld" \
+  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
+
+
 finish_test