Change the fts5 tokenizer API to allow more than one token to occupy a single position within a document.

FossilOrigin-Name: 90b85b42f2b2dd3e939b129b7df2b822a05e243d
This commit is contained in:
dan 2015-08-28 19:56:47 +00:00
parent 2798f0b54b
commit 57e0add3f9
12 changed files with 99 additions and 58 deletions

View File

@ -217,7 +217,7 @@ struct Fts5ExtensionApi {
int (*xTokenize)(Fts5Context*,
const char *pText, int nText, /* Text to tokenize */
void *pCtx, /* Context passed to xToken() */
int (*xToken)(void*, const char*, int, int, int) /* Callback */
int (*xToken)(void*, const char*, int, int, int, int) /* Callback */
);
int (*xPhraseCount)(Fts5Context*);
@ -309,17 +309,24 @@ struct fts5_tokenizer {
void (*xDelete)(Fts5Tokenizer*);
int (*xTokenize)(Fts5Tokenizer*,
void *pCtx,
int flags,
const char *pText, int nText,
int (*xToken)(
void *pCtx, /* Copy of 2nd argument to xTokenize() */
const char *pToken, /* Pointer to buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Byte offset of token within input text */
int iEnd /* Byte offset of end of token within input text */
int iEnd, /* Byte offset of end of token within input text */
int iPos /* Number of tokens before this one in input text */
)
);
};
#define FTS5_TOKENIZE_QUERY 0x0001
#define FTS5_TOKENIZE_PREFIX 0x0002
#define FTS5_TOKENIZE_DOCUMENT 0x0004
#define FTS5_TOKENIZE_AUX 0x0008
/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/
@ -329,7 +336,7 @@ struct fts5_tokenizer {
*/
typedef struct fts5_api fts5_api;
struct fts5_api {
int iVersion; /* Currently always set to 1 */
int iVersion; /* Currently always set to 2 */
/* Create a new tokenizer */
int (*xCreateTokenizer)(

View File

@ -166,9 +166,10 @@ int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig);
int sqlite3Fts5Tokenize(
Fts5Config *pConfig, /* FTS5 Configuration object */
int flags, /* FTS5_TOKENIZE_* flags */
const char *pText, int nText, /* Text to tokenize */
void *pCtx, /* Context passed to xToken() */
int (*xToken)(void*, const char*, int, int, int) /* Callback */
int (*xToken)(void*, const char*, int, int, int, int) /* Callback */
);
void sqlite3Fts5Dequote(char *z);

View File

@ -151,11 +151,14 @@ static int fts5HighlightCb(
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStartOff, /* Start offset of token */
int iEndOff /* End offset of token */
int iEndOff, /* End offset of token */
int iPos
){
HighlightContext *p = (HighlightContext*)pContext;
int rc = SQLITE_OK;
int iPos = p->iPos++;
if( iPos<p->iPos ) return SQLITE_OK;
p->iPos = iPos+1;
if( p->iRangeEnd>0 ){
if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;

View File

@ -645,12 +645,15 @@ int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig){
*/
int sqlite3Fts5Tokenize(
Fts5Config *pConfig, /* FTS5 Configuration object */
int flags, /* FTS5_TOKENIZE_* flags */
const char *pText, int nText, /* Text to tokenize */
void *pCtx, /* Context passed to xToken() */
int (*xToken)(void*, const char*, int, int, int) /* Callback */
int (*xToken)(void*, const char*, int, int, int, int) /* Callback */
){
if( pText==0 ) return SQLITE_OK;
return pConfig->pTokApi->xTokenize(pConfig->pTok, pCtx, pText, nText, xToken);
return pConfig->pTokApi->xTokenize(
pConfig->pTok, pCtx, flags, pText, nText, xToken
);
}
/*

View File

@ -1341,7 +1341,8 @@ static int fts5ParseTokenize(
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
int iEnd, /* End offset of token */
int iPos
){
int rc = SQLITE_OK;
const int SZALLOC = 8;
@ -1417,8 +1418,11 @@ Fts5ExprPhrase *sqlite3Fts5ParseTerm(
rc = fts5ParseStringFromToken(pToken, &z);
if( rc==SQLITE_OK ){
int flags = FTS5_TOKENIZE_QUERY | (bPrefix ? FTS5_TOKENIZE_QUERY : 0);
int n;
sqlite3Fts5Dequote(z);
rc = sqlite3Fts5Tokenize(pConfig, z, strlen(z), &sCtx, fts5ParseTokenize);
n = strlen(z);
rc = sqlite3Fts5Tokenize(pConfig, flags, z, n, &sCtx, fts5ParseTokenize);
}
sqlite3_free(z);
if( rc ){

View File

@ -1498,11 +1498,13 @@ static int fts5ApiTokenize(
Fts5Context *pCtx,
const char *pText, int nText,
void *pUserData,
int (*xToken)(void*, const char*, int, int, int)
int (*xToken)(void*, const char*, int, int, int, int)
){
Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
return sqlite3Fts5Tokenize(pTab->pConfig, pText, nText, pUserData, xToken);
return sqlite3Fts5Tokenize(
pTab->pConfig, FTS5_TOKENIZE_AUX, pText, nText, pUserData, xToken
);
}
static int fts5ApiPhraseCount(Fts5Context *pCtx){
@ -1658,10 +1660,11 @@ static int fts5ColumnSizeCb(
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
int iEnd, /* End offset of token */
int iPos
){
int *pCnt = (int*)pContext;
*pCnt = *pCnt + 1;
*pCnt = iPos+1;
return SQLITE_OK;
}
@ -1691,7 +1694,9 @@ static int fts5ApiColumnSize(Fts5Context *pCtx, int iCol, int *pnToken){
pCsr->aColumnSize[i] = 0;
rc = fts5ApiColumnText(pCtx, i, &z, &n);
if( rc==SQLITE_OK ){
rc = sqlite3Fts5Tokenize(pConfig, z, n, p, fts5ColumnSizeCb);
rc = sqlite3Fts5Tokenize(
pConfig, FTS5_TOKENIZE_AUX, z, n, p, fts5ColumnSizeCb
);
}
}
}
@ -2344,7 +2349,7 @@ int sqlite3_fts5_init(
void *p = (void*)pGlobal;
memset(pGlobal, 0, sizeof(Fts5Global));
pGlobal->db = db;
pGlobal->api.iVersion = 1;
pGlobal->api.iVersion = 2;
pGlobal->api.xCreateFunction = fts5CreateAux;
pGlobal->api.xCreateTokenizer = fts5CreateTokenizer;
pGlobal->api.xFindTokenizer = fts5FindTokenizer;

View File

@ -362,11 +362,13 @@ static int fts5StorageInsertCallback(
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
int iEnd, /* End offset of token */
int iPos
){
Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
Fts5Index *pIdx = pCtx->pStorage->pIndex;
int iPos = pCtx->szCol++;
assert( iPos+1>=pCtx->szCol );
pCtx->szCol = iPos+1;
return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, iPos, pToken, nToken);
}
@ -394,6 +396,7 @@ static int fts5StorageDeleteFromIndex(Fts5Storage *p, i64 iDel){
if( pConfig->abUnindexed[iCol-1] ) continue;
ctx.szCol = 0;
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pSeek, iCol),
sqlite3_column_bytes(pSeek, iCol),
(void*)&ctx,
@ -565,6 +568,7 @@ int sqlite3Fts5StorageSpecialDelete(
if( pConfig->abUnindexed[iCol] ) continue;
ctx.szCol = 0;
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_value_text(apVal[iCol]),
sqlite3_value_bytes(apVal[iCol]),
(void*)&ctx,
@ -654,6 +658,7 @@ int sqlite3Fts5StorageRebuild(Fts5Storage *p){
ctx.szCol = 0;
if( pConfig->abUnindexed[ctx.iCol]==0 ){
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pScan, ctx.iCol+1),
sqlite3_column_bytes(pScan, ctx.iCol+1),
(void*)&ctx,
@ -771,6 +776,7 @@ int sqlite3Fts5StorageInsert(
ctx.szCol = 0;
if( pConfig->abUnindexed[ctx.iCol]==0 ){
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_value_text(apVal[ctx.iCol+2]),
sqlite3_value_bytes(apVal[ctx.iCol+2]),
(void*)&ctx,
@ -841,10 +847,12 @@ static int fts5StorageIntegrityCallback(
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
int iEnd, /* End offset of token */
int iPos
){
Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
int iPos = pCtx->szCol++;
assert( iPos+1>=pCtx->szCol );
pCtx->szCol = iPos+1;
pCtx->cksum ^= sqlite3Fts5IndexCksum(
pCtx->pConfig, pCtx->iRowid, pCtx->iCol, iPos, pToken, nToken
);
@ -886,8 +894,8 @@ int sqlite3Fts5StorageIntegrity(Fts5Storage *p){
if( pConfig->abUnindexed[i] ) continue;
ctx.iCol = i;
ctx.szCol = 0;
rc = sqlite3Fts5Tokenize(
pConfig,
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pScan, i+1),
sqlite3_column_bytes(pScan, i+1),
(void*)&ctx,

View File

@ -142,7 +142,7 @@ struct F5tAuxData {
static int xTokenizeCb(
void *pCtx,
const char *zToken, int nToken,
int iStart, int iEnd
int iStart, int iEnd, int iPos
){
F5tFunction *p = (F5tFunction*)pCtx;
Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript);
@ -585,7 +585,7 @@ struct F5tTokenizeCtx {
static int xTokenizeCb2(
void *pCtx,
const char *zToken, int nToken,
int iStart, int iEnd
int iStart, int iEnd, int iPos
){
F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
if( p->bSubst ){
@ -666,7 +666,9 @@ static int f5tTokenize(
ctx.bSubst = (objc==5);
ctx.pRet = pRet;
ctx.zInput = zText;
rc = tokenizer.xTokenize(pTok, (void*)&ctx, zText, nText, xTokenizeCb2);
rc = tokenizer.xTokenize(
pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, zText, nText, xTokenizeCb2
);
tokenizer.xDelete(pTok);
if( rc!=SQLITE_OK ){
Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", 0);
@ -748,8 +750,9 @@ static void f5tTokenizerDelete(Fts5Tokenizer *p){
static int f5tTokenizerTokenize(
Fts5Tokenizer *p,
void *pCtx,
int flags,
const char *pText, int nText,
int (*xToken)(void*, const char*, int, int, int)
int (*xToken)(void*, const char*, int, int, int, int)
){
F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
void *pOldCtx;

View File

@ -116,13 +116,15 @@ static void asciiFold(char *aOut, const char *aIn, int nByte){
static int fts5AsciiTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
int flags,
const char *pText, int nText,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
int rc = SQLITE_OK;
int ie;
int is = 0;
int iPos = 0;
char aFold[64];
int nFold = sizeof(aFold);
@ -158,7 +160,7 @@ static int fts5AsciiTokenize(
asciiFold(pFold, &pText[is], nByte);
/* Invoke the token callback */
rc = xToken(pCtx, pFold, nByte, is, ie);
rc = xToken(pCtx, pFold, nByte, is, ie, iPos++);
is = ie+1;
}
@ -385,12 +387,14 @@ static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
static int fts5UnicodeTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
int flags,
const char *pText, int nText,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
int rc = SQLITE_OK;
unsigned char *a = p->aTokenChar;
int iPos = 0;
unsigned char *zTerm = (unsigned char*)&pText[nText];
unsigned char *zCsr = (unsigned char *)pText;
@ -475,7 +479,7 @@ static int fts5UnicodeTokenize(
}
/* Invoke the token callback */
rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
rc = xToken(pCtx, aFold, zOut-aFold, is, ie, iPos++);
}
tokenize_done:
@ -553,7 +557,7 @@ static int fts5PorterCreate(
typedef struct PorterContext PorterContext;
struct PorterContext {
void *pCtx;
int (*xToken)(void*, const char*, int, int, int);
int (*xToken)(void*, const char*, int, int, int, int);
char *aBuf;
};
@ -1121,7 +1125,8 @@ static int fts5PorterCb(
const char *pToken,
int nToken,
int iStart,
int iEnd
int iEnd,
int iPos
){
PorterContext *p = (PorterContext*)pCtx;
@ -1175,10 +1180,10 @@ static int fts5PorterCb(
nBuf--;
}
return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd);
return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd, iPos);
pass_through:
return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd);
return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd, iPos);
}
/*
@ -1187,8 +1192,9 @@ static int fts5PorterCb(
static int fts5PorterTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
int flags,
const char *pText, int nText,
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
PorterContext sCtx;
@ -1196,7 +1202,7 @@ static int fts5PorterTokenize(
sCtx.pCtx = pCtx;
sCtx.aBuf = p->aBuf;
return p->tokenizer.xTokenize(
p->pTokenizer, (void*)&sCtx, pText, nText, fts5PorterCb
p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
);
}

View File

@ -355,10 +355,10 @@ do_execsql_test 10.1 {
#---------------------------------------------------------------------------
# Test the 'y' matchinfo flag
#
set sqlite_fts3_enable_parentheses 1
reset_db
sqlite3_fts5_register_matchinfo db
do_execsql_test 11.0 {
CREATE VIRTUAL TABLE tt USING fts3(x, y);
CREATE VIRTUAL TABLE tt USING fts5(x, y);
INSERT INTO tt VALUES('c d a c d d', 'e a g b d a'); -- 1
INSERT INTO tt VALUES('c c g a e b', 'c g d g e c'); -- 2
INSERT INTO tt VALUES('b e f d e g', 'b a c b c g'); -- 3
@ -432,19 +432,18 @@ foreach {tn expr res} {
SELECT rowid, mit(matchinfo(tt, 'b')) FROM tt WHERE tt MATCH $expr
} $r2
}
set sqlite_fts3_enable_parentheses 0
#---------------------------------------------------------------------------
# Test the 'b' matchinfo flag
#
set sqlite_fts3_enable_parentheses 1
reset_db
sqlite3_fts5_register_matchinfo db
db func mit mit
do_test 12.0 {
set cols [list]
for {set i 0} {$i < 50} {incr i} { lappend cols "c$i" }
execsql "CREATE VIRTUAL TABLE tt USING fts3([join $cols ,])"
execsql "CREATE VIRTUAL TABLE tt USING fts5([join $cols ,])"
} {}
do_execsql_test 12.1 {
@ -452,6 +451,5 @@ do_execsql_test 12.1 {
SELECT mit(matchinfo(tt, 'b')) FROM tt WHERE tt MATCH 'abc';
} [list [list [expr 1<<4] [expr 1<<(45-32)]]]
set sqlite_fts3_enable_parentheses 0
finish_test

View File

@ -1,5 +1,5 @@
C Fix\scompiler\swarnings\sin\srbu\scode.
D 2015-08-28T16:41:45.530
C Change\sthe\sfts5\stokenizer\sAPI\sto\sallow\smore\sthan\sone\stoken\sto\soccupy\sa\ssingle\sposition\swithin\sa\sdocument.
D 2015-08-28T19:56:47.300
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in e2218eb228374422969de7b1680eda6864affcef
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@ -105,19 +105,19 @@ F ext/fts3/unicode/UnicodeData.txt cd07314edb62d49fde34debdaf92fa2aa69011e7
F ext/fts3/unicode/mkunicode.tcl 95cf7ec186e48d4985e433ff8a1c89090a774252
F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95
F ext/fts5/extract_api_docs.tcl 06583c935f89075ea0b32f85efa5dd7619fcbd03
F ext/fts5/fts5.h 1950ec0544de667a24c1d8af9b2fde5db7db3bc9
F ext/fts5/fts5Int.h 45f2ceb3c030f70e2cc4c199e9f700c2f2367f77
F ext/fts5/fts5_aux.c 044cb176a815f4388308738437f6e130aa384fb0
F ext/fts5/fts5.h b9dfb487ada3caab4400210609b8309b71a4fb4d
F ext/fts5/fts5Int.h b0cfe44ec9451f766b77c4e5f771e7919c6dc8d5
F ext/fts5/fts5_aux.c 7d0e275ee94ad7afdd4208d6b071b4319e8f9ca0
F ext/fts5/fts5_buffer.c 80f9ba4431848cb857e3d2158f5280093dcd8015
F ext/fts5/fts5_config.c fdfa63ae8e527ecfaa50f94063c610429cc887cf
F ext/fts5/fts5_expr.c d075d36c84975a1cfcf070442d28e28027b61c25
F ext/fts5/fts5_config.c ab81c8ccff6c0fb79f21c369e18e8e0dec365ec5
F ext/fts5/fts5_expr.c f53917b6e68dee62e4c525466edacacf82eb7cbc
F ext/fts5/fts5_hash.c 4bf4b99708848357b8a2b5819e509eb6d3df9246
F ext/fts5/fts5_index.c 076c4995bf06a6d1559a6e31f9a86b90f2105374
F ext/fts5/fts5_main.c fc47ad734dfb55765b7542a345cee981170e7caa
F ext/fts5/fts5_storage.c 22ec9b5d35a39e2b5b65daf4ba7cd47fbb2d0df5
F ext/fts5/fts5_tcl.c 96a3b9e982c4a64a242eefd752fa6669cd405a67
F ext/fts5/fts5_main.c 7afdb84ac40b0e5bbb920a07a5cd5e062963816c
F ext/fts5/fts5_storage.c 9c263323479a4aa554738e421813cd05615d379c
F ext/fts5/fts5_tcl.c 41e2d6b455547a157085fd35fd59d4fd890dc7d3
F ext/fts5/fts5_test_mi.c 80a9e86fb4c5b6b58f8fefac05e9b96d1a6574e1
F ext/fts5/fts5_tokenize.c 2836f6728bd74c7efac7487f5d9c27ca3e1b509c
F ext/fts5/fts5_tokenize.c 07a894410bc074685ddc0a9d89b5e7bf57ea4482
F ext/fts5/fts5_unicode2.c 78273fbd588d1d9bd0a7e4e0ccc9207348bae33c
F ext/fts5/fts5_varint.c 3f86ce09cab152e3d45490d7586b7ed2e40c13f1
F ext/fts5/fts5_vocab.c 4622e0b7d84a488a1585aaa56eb214ee67a988bc
@ -160,7 +160,7 @@ F ext/fts5/test/fts5fault6.test 234dc6355f8d3f8b5be2763f30699d770247c215
F ext/fts5/test/fts5full.test 6f6143af0c6700501d9fd597189dfab1555bb741
F ext/fts5/test/fts5hash.test 42eb066f667e9a389a63437cb7038c51974d4fc6
F ext/fts5/test/fts5integrity.test 29f41d2c7126c6122fbb5d54e556506456876145
F ext/fts5/test/fts5matchinfo.test ee6e7b130096c708c12049fa9c1ceb628954c4f9
F ext/fts5/test/fts5matchinfo.test 2163b0013e824bba65499da9e34ea4da41349cc2
F ext/fts5/test/fts5merge.test 8f3cdba2ec9c5e7e568246e81b700ad37f764367
F ext/fts5/test/fts5near.test b214cddb1c1f1bddf45c75af768f20145f7e71cc
F ext/fts5/test/fts5optimize.test 42741e7c085ee0a1276140a752d4407d97c2c9f5
@ -1380,7 +1380,10 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
P a84cf4f5d326270a61faf4ff867260f2dd1e68a6
R 25627e3633787096f1c6a6ac573b9337
P 0fdc36fe35ae2fc8e9688fe6c53437f4d47502d9
R 694c0e23ba08ed9bcc32d2c502ed8f13
T *branch * fts5-incompatible
T *sym-fts5-incompatible *
T -sym-trunk *
U dan
Z 8fac2bb28cf0b676d514df6920f608e8
Z 745a50831400d199b74f44c2476ec260

View File

@ -1 +1 @@
0fdc36fe35ae2fc8e9688fe6c53437f4d47502d9
90b85b42f2b2dd3e939b129b7df2b822a05e243d