From 996073b3aec31f41354a25148bbc995c1391633c Mon Sep 17 00:00:00 2001 From: dan Date: Sat, 3 Mar 2012 18:46:41 +0000 Subject: [PATCH] Add the xLanguageid method to sqlite3_fts3_tokenizer versions 1 and greater. FossilOrigin-Name: f8e9c445dd358c40e5a7bf3756b9f291909dbea7 --- ext/fts3/fts3.c | 10 +- ext/fts3/fts3Int.h | 6 +- ext/fts3/fts3_expr.c | 43 ++++++-- ext/fts3/fts3_snippet.c | 16 +-- ext/fts3/fts3_test.c | 199 ++++++++++++++++++++++++++++++++++++++ ext/fts3/fts3_tokenizer.c | 3 +- ext/fts3/fts3_tokenizer.h | 11 ++- ext/fts3/fts3_write.c | 39 ++++---- manifest | 30 +++--- manifest.uuid | 2 +- test/fts4langid.test | 41 +++++++- test/permutations.test | 3 +- 12 files changed, 340 insertions(+), 63 deletions(-) diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c index dec7f8722c..f16191f31f 100644 --- a/ext/fts3/fts3.c +++ b/ext/fts3/fts3.c @@ -2958,8 +2958,11 @@ static int fts3FilterMethod( return SQLITE_NOMEM; } - rc = sqlite3Fts3ExprParse(p->pTokenizer, p->azColumn, p->bHasStat, - p->nColumn, iCol, zQuery, -1, &pCsr->pExpr + pCsr->iLangid = 0; + if( nVal==2 ) pCsr->iLangid = sqlite3_value_int(apVal[1]); + + rc = sqlite3Fts3ExprParse(p->pTokenizer, pCsr->iLangid, + p->azColumn, p->bHasStat, p->nColumn, iCol, zQuery, -1, &pCsr->pExpr ); if( rc!=SQLITE_OK ){ if( rc==SQLITE_ERROR ){ @@ -2969,9 +2972,6 @@ static int fts3FilterMethod( return rc; } - pCsr->iLangid = 0; - if( nVal==2 ) pCsr->iLangid = sqlite3_value_int(apVal[1]); - rc = sqlite3Fts3ReadLock(p); if( rc!=SQLITE_OK ) return rc; diff --git a/ext/fts3/fts3Int.h b/ext/fts3/fts3Int.h index 078b5b987b..393cd6aea1 100644 --- a/ext/fts3/fts3Int.h +++ b/ext/fts3/fts3Int.h @@ -498,7 +498,7 @@ void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *, void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *); /* fts3_expr.c */ -int sqlite3Fts3ExprParse(sqlite3_tokenizer *, +int sqlite3Fts3ExprParse(sqlite3_tokenizer *, int, char **, int, int, int, const char *, int, Fts3Expr ** ); void sqlite3Fts3ExprFree(Fts3Expr *); @@ -507,6 +507,10 @@ int sqlite3Fts3ExprInitTestInterface(sqlite3 *db); int sqlite3Fts3InitTerm(sqlite3 *db); #endif +int sqlite3Fts3OpenTokenizer(sqlite3_tokenizer *, int, const char *, int, + sqlite3_tokenizer_cursor ** +); + /* fts3_aux.c */ int sqlite3Fts3InitAux(sqlite3 *db); diff --git a/ext/fts3/fts3_expr.c b/ext/fts3/fts3_expr.c index 1c3a79071c..a6e3492242 100644 --- a/ext/fts3/fts3_expr.c +++ b/ext/fts3/fts3_expr.c @@ -92,6 +92,7 @@ int sqlite3_fts3_enable_parentheses = 0; typedef struct ParseContext ParseContext; struct ParseContext { sqlite3_tokenizer *pTokenizer; /* Tokenizer module */ + int iLangid; /* Language id used with tokenizer */ const char **azCol; /* Array of column names for fts3 table */ int bFts4; /* True to allow FTS4-only syntax */ int nCol; /* Number of entries in azCol[] */ @@ -127,6 +128,33 @@ static void *fts3MallocZero(int nByte){ return pRet; } +int sqlite3Fts3OpenTokenizer( + sqlite3_tokenizer *pTokenizer, + int iLangid, + const char *z, + int n, + sqlite3_tokenizer_cursor **ppCsr +){ + sqlite3_tokenizer_module const *pModule = pTokenizer->pModule; + sqlite3_tokenizer_cursor *pCsr = 0; + int rc; + + rc = pModule->xOpen(pTokenizer, z, n, &pCsr); + assert( rc==SQLITE_OK || pCsr==0 ); + if( rc==SQLITE_OK ){ + pCsr->pTokenizer = pTokenizer; + if( pModule->iVersion>=1 ){ + rc = pModule->xLanguageid(pCsr, iLangid); + if( rc!=SQLITE_OK ){ + pModule->xClose(pCsr); + pCsr = 0; + } + } + } + *ppCsr = pCsr; + return rc; +} + /* ** Extract the next token from buffer z (length n) using the tokenizer @@ -154,15 +182,13 @@ static int getNextToken( Fts3Expr *pRet = 0; int nConsumed = 0; - rc = pModule->xOpen(pTokenizer, z, n, &pCursor); + rc = sqlite3Fts3OpenTokenizer(pTokenizer, pParse->iLangid, z, n, &pCursor); if( rc==SQLITE_OK ){ const char *zToken; int nToken, iStart, iEnd, iPosition; int nByte; /* total space to allocate */ - pCursor->pTokenizer = pTokenizer; rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition); - if( rc==SQLITE_OK ){ nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken; pRet = (Fts3Expr *)fts3MallocZero(nByte); @@ -268,10 +294,10 @@ static int getNextString( ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase ** structures. */ - rc = pModule->xOpen(pTokenizer, zInput, nInput, &pCursor); + rc = sqlite3Fts3OpenTokenizer( + pTokenizer, pParse->iLangid, zInput, nInput, &pCursor); if( rc==SQLITE_OK ){ int ii; - pCursor->pTokenizer = pTokenizer; for(ii=0; rc==SQLITE_OK; ii++){ const char *zByte; int nByte, iBegin, iEnd, iPos; @@ -745,6 +771,7 @@ exprparse_out: */ int sqlite3Fts3ExprParse( sqlite3_tokenizer *pTokenizer, /* Tokenizer module */ + int iLangid, /* Language id for tokenizer */ char **azCol, /* Array of column names for fts3 table */ int bFts4, /* True to allow FTS4-only syntax */ int nCol, /* Number of entries in azCol[] */ @@ -755,11 +782,13 @@ int sqlite3Fts3ExprParse( int nParsed; int rc; ParseContext sParse; + + memset(&sParse, 0, sizeof(ParseContext)); sParse.pTokenizer = pTokenizer; + sParse.iLangid = iLangid; sParse.azCol = (const char **)azCol; sParse.nCol = nCol; sParse.iDefaultCol = iDefaultCol; - sParse.nNest = 0; sParse.bFts4 = bFts4; if( z==0 ){ *ppExpr = 0; @@ -950,7 +979,7 @@ static void fts3ExprTest( } rc = sqlite3Fts3ExprParse( - pTokenizer, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr + pTokenizer, 0, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr ); if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){ sqlite3_result_error(context, "Error parsing expression", -1); diff --git a/ext/fts3/fts3_snippet.c b/ext/fts3/fts3_snippet.c index 23ef25c5d4..fd5bc9786b 100644 --- a/ext/fts3/fts3_snippet.c +++ b/ext/fts3/fts3_snippet.c @@ -532,6 +532,7 @@ static int fts3StringAppend( */ static int fts3SnippetShift( Fts3Table *pTab, /* FTS3 table snippet comes from */ + int iLangid, /* Language id to use in tokenizing */ int nSnippet, /* Number of tokens desired for snippet */ const char *zDoc, /* Document text to extract snippet from */ int nDoc, /* Size of buffer zDoc in bytes */ @@ -567,11 +568,10 @@ static int fts3SnippetShift( /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) ** or more tokens in zDoc/nDoc. */ - rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC); if( rc!=SQLITE_OK ){ return rc; } - pC->pTokenizer = pTab->pTokenizer; while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){ const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3; rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); @@ -631,11 +631,10 @@ static int fts3SnippetText( /* Open a token cursor on the document. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; - rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC); if( rc!=SQLITE_OK ){ return rc; } - pC->pTokenizer = pTab->pTokenizer; while( rc==SQLITE_OK ){ int iBegin; /* Offset in zDoc of start of token */ @@ -657,7 +656,9 @@ static int fts3SnippetText( if( !isShiftDone ){ int n = nDoc - iBegin; - rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask); + rc = fts3SnippetShift( + pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask + ); isShiftDone = 1; /* Now that the shift has been done, check if the initial "..." are @@ -1390,9 +1391,10 @@ void sqlite3Fts3Offsets( } /* Initialize a tokenizer iterator to iterate through column iCol. */ - rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC); + rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, + zDoc, nDoc, &pC + ); if( rc!=SQLITE_OK ) goto offsets_out; - pC->pTokenizer = pTab->pTokenizer; rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); while( rc==SQLITE_OK ){ diff --git a/ext/fts3/fts3_test.c b/ext/fts3/fts3_test.c index 72735f3d12..7cbc9eae9f 100644 --- a/ext/fts3/fts3_test.c +++ b/ext/fts3/fts3_test.c @@ -13,6 +13,9 @@ ** This file is not part of the production FTS code. It is only used for ** testing. It contains a Tcl command that can be used to test if a document ** matches an FTS NEAR expression. +** +** As of March 2012, it also contains a version 1 tokenizer used for testing +** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly. */ #include @@ -314,11 +317,207 @@ static int fts3_configure_incr_load_cmd( return TCL_OK; } +/************************************************************************** +** Beginning of test tokenizer code. +** +** For language 0, this tokenizer is similar to the default 'simple' +** tokenizer. For other languages L, the following: +** +** * Odd numbered languages are case-sensitive. Even numbered +** languages are not. +** +** * Language ids 100 or greater are considered an error. +** +** The implementation assumes that the input contains only ASCII characters +** (i.e. those that may be encoded in UTF-8 using a single byte). +*/ +typedef struct test_tokenizer { + sqlite3_tokenizer base; +} test_tokenizer; + +typedef struct test_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + const char *aInput; /* Input being tokenized */ + int nInput; /* Size of the input in bytes */ + int iInput; /* Current offset in aInput */ + int iToken; /* Index of next token to be returned */ + char *aBuffer; /* Buffer containing current token */ + int nBuffer; /* Number of bytes allocated at pToken */ + int iLangid; /* Configured language id */ +} test_tokenizer_cursor; + +static int testTokenizerCreate( + int argc, const char * const *argv, + sqlite3_tokenizer **ppTokenizer +){ + test_tokenizer *pNew; + + pNew = sqlite3_malloc(sizeof(test_tokenizer)); + if( !pNew ) return SQLITE_NOMEM; + memset(pNew, 0, sizeof(test_tokenizer)); + + *ppTokenizer = (sqlite3_tokenizer *)pNew; + return SQLITE_OK; +} + +static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){ + test_tokenizer *p = (test_tokenizer *)pTokenizer; + sqlite3_free(p); + return SQLITE_OK; +} + +static int testTokenizerOpen( + sqlite3_tokenizer *pTokenizer, /* The tokenizer */ + const char *pInput, int nBytes, /* String to be tokenized */ + sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ +){ + int rc = SQLITE_OK; /* Return code */ + test_tokenizer_cursor *pCsr; /* New cursor object */ + + UNUSED_PARAMETER(pTokenizer); + + pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor)); + if( pCsr==0 ){ + rc = SQLITE_NOMEM; + }else{ + memset(pCsr, 0, sizeof(test_tokenizer_cursor)); + pCsr->aInput = pInput; + if( nBytes<0 ){ + pCsr->nInput = strlen(pInput); + }else{ + pCsr->nInput = nBytes; + } + } + + *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; + return rc; +} + +static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){ + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + sqlite3_free(pCsr->aBuffer); + sqlite3_free(pCsr); + return SQLITE_OK; +} + +static int testIsTokenChar(char c){ + return (c>='a' && c<='z') || (c>='A' && c<='Z'); +} +static int testTolower(char c){ + char ret = c; + if( ret>='A' && ret<='Z') ret = ret - ('A'-'a'); + return ret; +} + +static int testTokenizerNext( + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */ + const char **ppToken, /* OUT: *ppToken is the token text */ + int *pnBytes, /* OUT: Number of bytes in token */ + int *piStartOffset, /* OUT: Starting offset of token */ + int *piEndOffset, /* OUT: Ending offset of token */ + int *piPosition /* OUT: Position integer of token */ +){ + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + int rc = SQLITE_OK; + const char *p; + const char *pEnd; + + p = &pCsr->aInput[pCsr->iInput]; + pEnd = &pCsr->aInput[pCsr->nInput]; + + /* Skip past any white-space */ + assert( p<=pEnd ); + while( ppCsr->nBuffer ){ + sqlite3_free(pCsr->aBuffer); + pCsr->aBuffer = sqlite3_malloc(nToken); + } + if( pCsr->aBuffer==0 ){ + rc = SQLITE_NOMEM; + }else{ + int i; + + if( pCsr->iLangid & 0x00000001 ){ + for(i=0; iaBuffer[i] = pToken[i]; + }else{ + for(i=0; iaBuffer[i] = testTolower(pToken[i]); + } + pCsr->iToken++; + pCsr->iInput = p - pCsr->aInput; + + *ppToken = pCsr->aBuffer; + *pnBytes = nToken; + *piStartOffset = pToken - pCsr->aInput; + *piEndOffset = p - pCsr->aInput; + *piPosition = pCsr->iToken; + } + } + + return rc; +} + +static int testTokenizerLanguage( + sqlite3_tokenizer_cursor *pCursor, + int iLangid +){ + int rc = SQLITE_OK; + test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; + pCsr->iLangid = iLangid; + if( pCsr->iLangid>=100 ){ + rc = SQLITE_ERROR; + } + return rc; +} + +static int fts3_test_tokenizer_cmd( + ClientData clientData, + Tcl_Interp *interp, + int objc, + Tcl_Obj *CONST objv[] +){ + static const sqlite3_tokenizer_module testTokenizerModule = { + 1, + testTokenizerCreate, + testTokenizerDestroy, + testTokenizerOpen, + testTokenizerClose, + testTokenizerNext, + testTokenizerLanguage + }; + const sqlite3_tokenizer_module *pPtr = &testTokenizerModule; + if( objc!=1 ){ + Tcl_WrongNumArgs(interp, 1, objv, ""); + return TCL_ERROR; + } + Tcl_SetObjResult(interp, Tcl_NewByteArrayObj( + (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *) + )); + return TCL_OK; +} + +/* +** End of tokenizer code. +**************************************************************************/ + int Sqlitetestfts3_Init(Tcl_Interp *interp){ Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0); Tcl_CreateObjCommand(interp, "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0 ); + Tcl_CreateObjCommand( + interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0 + ); return TCL_OK; } #endif /* ifdef SQLITE_TEST */ diff --git a/ext/fts3/fts3_tokenizer.c b/ext/fts3/fts3_tokenizer.c index 6494bb96d8..f6b044ff6a 100644 --- a/ext/fts3/fts3_tokenizer.c +++ b/ext/fts3/fts3_tokenizer.c @@ -288,11 +288,10 @@ static void testFunc( goto finish; } pTokenizer->pModule = p; - if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){ + if( sqlite3Fts3OpenTokenizer(pTokenizer, 0, zInput, nInput, &pCsr) ){ zErr = "error in xOpen()"; goto finish; } - pCsr->pTokenizer = pTokenizer; while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); diff --git a/ext/fts3/fts3_tokenizer.h b/ext/fts3/fts3_tokenizer.h index 615644506c..c91c7ed790 100644 --- a/ext/fts3/fts3_tokenizer.h +++ b/ext/fts3/fts3_tokenizer.h @@ -52,7 +52,7 @@ typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; struct sqlite3_tokenizer_module { /* - ** Structure version. Should always be set to 0. + ** Structure version. Should always be set to 0 or 1. */ int iVersion; @@ -133,6 +133,15 @@ struct sqlite3_tokenizer_module { int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ int *piPosition /* OUT: Number of tokens returned before this one */ ); + + /*********************************************************************** + ** Methods below this point are only available if iVersion>=1. + */ + + /* + ** Configure the language id of a tokenizer cursor. + */ + int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid); }; struct sqlite3_tokenizer { diff --git a/ext/fts3/fts3_write.c b/ext/fts3/fts3_write.c index decbe0b73a..8f97c8be98 100644 --- a/ext/fts3/fts3_write.c +++ b/ext/fts3/fts3_write.c @@ -657,6 +657,7 @@ static int fts3PendingTermsAddOne( */ static int fts3PendingTermsAdd( Fts3Table *p, /* Table into which text will be inserted */ + int iLangid, /* Language id to use */ const char *zText, /* Text of document to be inserted */ int iCol, /* Column into which text is being inserted */ u32 *pnWord /* OUT: Number of tokens inserted */ @@ -686,11 +687,10 @@ static int fts3PendingTermsAdd( return SQLITE_OK; } - rc = pModule->xOpen(pTokenizer, zText, -1, &pCsr); + rc = sqlite3Fts3OpenTokenizer(pTokenizer, iLangid, zText, -1, &pCsr); if( rc!=SQLITE_OK ){ return rc; } - pCsr->pTokenizer = pTokenizer; xNext = pModule->xNext; while( SQLITE_OK==rc @@ -783,11 +783,16 @@ void sqlite3Fts3PendingTermsClear(Fts3Table *p){ ** Argument apVal is the same as the similarly named argument passed to ** fts3InsertData(). Parameter iDocid is the docid of the new row. */ -static int fts3InsertTerms(Fts3Table *p, sqlite3_value **apVal, u32 *aSz){ +static int fts3InsertTerms( + Fts3Table *p, + int iLangid, + sqlite3_value **apVal, + u32 *aSz +){ int i; /* Iterator variable */ for(i=2; inColumn+2; i++){ const char *zText = (const char *)sqlite3_value_text(apVal[i]); - int rc = fts3PendingTermsAdd(p, zText, i-2, &aSz[i-2]); + int rc = fts3PendingTermsAdd(p, iLangid, zText, i-2, &aSz[i-2]); if( rc!=SQLITE_OK ){ return rc; } @@ -933,13 +938,11 @@ static void fts3DeleteTerms( if( rc==SQLITE_OK ){ if( SQLITE_ROW==sqlite3_step(pSelect) ){ int i; - rc = fts3PendingTermsDocid(p, - langidFromSelect(p, pSelect), - sqlite3_column_int64(pSelect, 0) - ); + int iLangid = langidFromSelect(p, pSelect); + rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pSelect, 0)); for(i=1; rc==SQLITE_OK && i<=p->nColumn; i++){ const char *zText = (const char *)sqlite3_column_text(pSelect, i); - rc = fts3PendingTermsAdd(p, zText, -1, &aSz[i-1]); + rc = fts3PendingTermsAdd(p, iLangid, zText, -1, &aSz[i-1]); aSz[p->nColumn] += sqlite3_column_bytes(pSelect, i); } if( rc!=SQLITE_OK ){ @@ -3102,13 +3105,12 @@ static int fts3DoRebuild(Fts3Table *p){ while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){ int iCol; - rc = fts3PendingTermsDocid(p, - langidFromSelect(p, pStmt), sqlite3_column_int64(pStmt, 0) - ); + int iLangid = langidFromSelect(p, pStmt); + rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pStmt, 0)); aSz[p->nColumn] = 0; for(iCol=0; rc==SQLITE_OK && iColnColumn; iCol++){ const char *z = (const char *) sqlite3_column_text(pStmt, iCol+1); - rc = fts3PendingTermsAdd(p, z, iCol, &aSz[iCol]); + rc = fts3PendingTermsAdd(p, iLangid, z, iCol, &aSz[iCol]); aSz[p->nColumn] += sqlite3_column_bytes(pStmt, iCol+1); } if( p->bHasDocsize ){ @@ -3227,14 +3229,13 @@ int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *pCsr){ const char *zText = (const char *)sqlite3_column_text(pCsr->pStmt, i+1); sqlite3_tokenizer_cursor *pTC = 0; - rc = pModule->xOpen(pT, zText, -1, &pTC); + rc = sqlite3Fts3OpenTokenizer(pT, pCsr->iLangid, zText, -1, &pTC); while( rc==SQLITE_OK ){ char const *zToken; /* Buffer containing token */ int nToken; /* Number of bytes in token */ int iDum1, iDum2; /* Dummy variables */ int iPos; /* Position of token in zText */ - pTC->pTokenizer = pT; rc = pModule->xNext(pTC, &zToken, &nToken, &iDum1, &iDum2, &iPos); for(pDef=pCsr->pDeferred; pDef && rc==SQLITE_OK; pDef=pDef->pNext){ Fts3PhraseToken *pPT = pDef->pToken; @@ -3467,6 +3468,7 @@ int sqlite3Fts3UpdateMethod( /* If this is an INSERT or UPDATE operation, insert the new record. */ if( nArg>1 && rc==SQLITE_OK ){ + int iLangid = sqlite3_value_int(apVal[2 + p->nColumn + 2]); if( bInsertDone==0 ){ rc = fts3InsertData(p, apVal, pRowid); if( rc==SQLITE_CONSTRAINT && p->zContentTbl==0 ){ @@ -3474,14 +3476,11 @@ int sqlite3Fts3UpdateMethod( } } if( rc==SQLITE_OK && (!isRemove || *pRowid!=p->iPrevDocid ) ){ - rc = fts3PendingTermsDocid(p, - sqlite3_value_int(apVal[2 + p->nColumn + 2]), - *pRowid - ); + rc = fts3PendingTermsDocid(p, iLangid, *pRowid); } if( rc==SQLITE_OK ){ assert( p->iPrevDocid==*pRowid ); - rc = fts3InsertTerms(p, apVal, aSzIns); + rc = fts3InsertTerms(p, iLangid, apVal, aSzIns); } if( p->bHasDocsize ){ fts3InsertDocsize(&rc, p, aSzIns); diff --git a/manifest b/manifest index 15fcd20c9b..ef077935d3 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fix\sproblems\swith\scombining\scontent=\sand\slanguageid=\sin\sa\ssingle\sfts4\stable. -D 2012-03-02T19:53:02.350 +C Add\sthe\sxLanguageid\smethod\sto\ssqlite3_fts3_tokenizer\sversions\s1\sand\sgreater. +D 2012-03-03T18:46:41.456 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in 3f79a373e57c3b92dabf76f40b065e719d31ac34 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -63,22 +63,22 @@ F ext/fts3/README.content fdc666a70d5257a64fee209f97cf89e0e6e32b51 F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9 F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d -F ext/fts3/fts3.c fd89caa4169520c32cf46ca5a62df6dd48201422 +F ext/fts3/fts3.c fcda9a9ff7ccfb9fe4388d36063e3405a652e15f F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe -F ext/fts3/fts3Int.h 521d300f2af4e741f53c4e2dd540275fb64533eb +F ext/fts3/fts3Int.h d1d7f964ddee067bcd16a6af4ba7ecf66220056d F ext/fts3/fts3_aux.c 72de4cb43db7bfc2f68fbda04b7d8095ae9a6239 -F ext/fts3/fts3_expr.c f5df26bddf46a5916b2a5f80c4027996e92b7b15 +F ext/fts3/fts3_expr.c dbc7ba4c3a6061adde0f38ed8e9b349568299551 F ext/fts3/fts3_hash.c 8dd2d06b66c72c628c2732555a32bc0943114914 F ext/fts3/fts3_hash.h 8331fb2206c609f9fc4c4735b9ab5ad6137c88ec F ext/fts3/fts3_icu.c 6c8f395cdf9e1e3afa7fadb7e523dbbf381c6dfa F ext/fts3/fts3_porter.c b7e5276f9f0a5fc7018b6fa55ce0f31f269ef881 -F ext/fts3/fts3_snippet.c 1f9ee6a8e0e242649645968dcec4deb253d86c2a +F ext/fts3/fts3_snippet.c c9e126c20760988aa7c43c6ea1379db34738282e F ext/fts3/fts3_term.c d3466cf99432291be08e379d89645462431809d6 -F ext/fts3/fts3_test.c 24fa13f330db011500acb95590da9eee24951894 -F ext/fts3/fts3_tokenizer.c 9ff7ec66ae3c5c0340fa081958e64f395c71a106 -F ext/fts3/fts3_tokenizer.h 13ffd9fcb397fec32a05ef5cd9e0fa659bf3dbd3 +F ext/fts3/fts3_test.c a026412a41450a014ccb7abdd5efaa7c9711d49e +F ext/fts3/fts3_tokenizer.c 3da7254a9881f7e270ab28e2004e0d22b3212bce +F ext/fts3/fts3_tokenizer.h 66dec98e365854b6cd2d54f1a96bb6d428fc5a68 F ext/fts3/fts3_tokenizer1.c 0dde8f307b8045565cf63797ba9acfaff1c50c68 -F ext/fts3/fts3_write.c 35b98a42f9bbdd28af1b1f3bb0c09ff07090a764 +F ext/fts3/fts3_write.c f87bb2d27d31cb7a7bf306747079095393c9d073 F ext/fts3/fts3speed.tcl b54caf6a18d38174f1a6e84219950d85e98bb1e9 F ext/fts3/mkfts3amal.tcl 252ecb7fe6467854f2aa237bf2c390b74e71f100 F ext/icu/README.txt bf8461d8cdc6b8f514c080e4e10dc3b2bbdfefa9 @@ -496,7 +496,7 @@ F test/fts3snippet.test 8e956051221a34c7daeb504f023cb54d5fa5a8b2 F test/fts3sort.test 95be0b19d7e41c44b29014f13ea8bddd495fd659 F test/fts4aa.test 6e7f90420b837b2c685f3bcbe84c868492d40a68 F test/fts4content.test 17b2360f7d1a9a7e5aa8022783f5c5731b6dfd4f -F test/fts4langid.test be989b5cddcd7596b87232af193f6c4560a34272 +F test/fts4langid.test fabdd5a8db0fa00292e0704809f566e3fb6dba3a F test/func.test 6c5ce11e3a0021ca3c0649234e2d4454c89110ca F test/func2.test 772d66227e4e6684b86053302e2d74a2500e1e0f F test/func3.test 001021e5b88bd02a3b365a5c5fd8f6f49d39744a @@ -635,7 +635,7 @@ F test/pageropt.test 9191867ed19a2b3db6c42d1b36b6fbc657cd1ab0 F test/pagesize.test 1dd51367e752e742f58e861e65ed7390603827a0 F test/pcache.test 065aa286e722ab24f2e51792c1f093bf60656b16 F test/pcache2.test a83efe2dec0d392f814bfc998def1d1833942025 -F test/permutations.test fa6f0e5f13fe0b1d3f7a7613179b7f7b20028184 +F test/permutations.test 2b5a1b64a8e5114757457fbce9010387d1fe7682 F test/pragma.test f6111ded4d56b79436a60a757d62f3c96a9cf3f5 F test/pragma2.test 3a55f82b954242c642f8342b17dffc8b47472947 F test/printf.test ec9870c4dce8686a37818e0bf1aba6e6a1863552 @@ -992,7 +992,7 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06 F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/warnings-clang.sh 9f406d66e750e8ac031c63a9ef3248aaa347ef2a F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381 -P 181bc35731f19c3e4497ba3338c209918d34ea69 -R acfb20f690a18ac8d67e116ae8c76f7d +P 22491e7bc38aee43819b888e04241cb6a6ef73a3 +R b3e8f4cc7fcd5d7bfa4510a8d2434f16 U dan -Z b771fbbc3f5dd49e39970a5f917fc8b7 +Z 8fb02bd9c9b9481fcea2986d10daaba1 diff --git a/manifest.uuid b/manifest.uuid index def5beae28..868a4cc932 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -22491e7bc38aee43819b888e04241cb6a6ef73a3 \ No newline at end of file +f8e9c445dd358c40e5a7bf3756b9f291909dbea7 \ No newline at end of file diff --git a/test/fts4langid.test b/test/fts4langid.test index b9110cb768..08f1a21c17 100644 --- a/test/fts4langid.test +++ b/test/fts4langid.test @@ -40,7 +40,6 @@ set ::testprefix fts4langid # 3.* - Tests with content= tables. Both where there is a real # underlying content table and where there is not. # -# # 4.* - Test that if one is provided, the tokenizer xLanguage method # is called to configure the tokenizer before tokenizing query # or document text. @@ -342,7 +341,45 @@ do_test_query1 3.3.4 {"zero one" OR "one two"} { or_merge_lists [rowid_list "zero one"] [rowid_list "one two"] } +#------------------------------------------------------------------------- +# Test cases 4.* +# +proc build_multilingual_db_2 {db} { + $db eval { + CREATE VIRTUAL TABLE t4 USING fts4( + tokenize=testtokenizer, + languageid=lid + ); + } + for {set i 0} {$i < 50} {incr i} { + execsql { + INSERT INTO t4(docid, content, lid) VALUES($i, 'The Quick Brown Fox', $i) + } + } +} +do_test 4.1.0 { + reset_db + set ptr [fts3_test_tokenizer] + execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) } + build_multilingual_db_2 db +} {} +do_execsql_test 4.1.1 { + SELECT docid FROM t4 WHERE t4 MATCH 'quick'; +} {0} +do_execsql_test 4.1.2 { + SELECT docid FROM t4 WHERE t4 MATCH 'quick' AND lid=1; +} {} +do_execsql_test 4.1.3 { + SELECT docid FROM t4 WHERE t4 MATCH 'Quick' AND lid=1; +} {1} +for {set i 0} {$i < 50} {incr i} { + do_execsql_test 4.1.4.$i { + SELECT count(*) FROM t4 WHERE t4 MATCH 'fox' AND lid=$i; + } [expr 0==($i%2)] +} +do_catchsql_test 4.1.5 { + INSERT INTO t4(content, lid) VALUES('hello world', 101) +} {1 {SQL logic error or missing database}} finish_test - diff --git a/test/permutations.test b/test/permutations.test index f55db1be73..26c1b2a514 100644 --- a/test/permutations.test +++ b/test/permutations.test @@ -184,8 +184,7 @@ test_suite "fts3" -prefix "" -description { fts3aux1.test fts3comp1.test fts3auto.test fts4aa.test fts4content.test fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test - fts3corrupt2.test - fts3first.test + fts3corrupt2.test fts3first.test fts4langid.test }