diff --git a/ext/fts5/fts5_aux.c b/ext/fts5/fts5_aux.c index 6281cf60d0..2426c1dc51 100644 --- a/ext/fts5/fts5_aux.c +++ b/ext/fts5/fts5_aux.c @@ -411,19 +411,26 @@ static void fts5SnippetFunction( } } -typedef struct Fts5GatherCtx Fts5GatherCtx; + +/* +** Context object passed by fts5GatherTotals() to xQueryPhrase callback +** fts5GatherCallback(). +*/ struct Fts5GatherCtx { - int nCol; - int iPhrase; - int *anVal; + int nCol; /* Number of columns in FTS table */ + int iPhrase; /* Phrase currently under investigation */ + int *anVal; /* Array to populate */ }; +/* +** Callback used by fts5GatherTotals() with the xQueryPhrase() API. +*/ static int fts5GatherCallback( const Fts5ExtensionApi *pApi, Fts5Context *pFts, - void *pUserData + void *pUserData /* Pointer to Fts5GatherCtx object */ ){ - Fts5GatherCtx *p = (Fts5GatherCtx*)pUserData; + struct Fts5GatherCtx *p = (struct Fts5GatherCtx*)pUserData; int i = 0; int iPrev = -1; i64 iPos = 0; @@ -466,7 +473,7 @@ static int fts5GatherTotals( int nPhrase = pApi->xPhraseCount(pFts); int nCol = pApi->xColumnCount(pFts); int nByte = nCol * nPhrase * sizeof(int); - Fts5GatherCtx sCtx; + struct Fts5GatherCtx sCtx; sCtx.nCol = nCol; anVal = sCtx.anVal = (int*)sqlite3_malloc(nByte); @@ -492,24 +499,19 @@ static int fts5GatherTotals( typedef struct Fts5Bm25Context Fts5Bm25Context; struct Fts5Bm25Context { - int nPhrase; - int nCol; + int nPhrase; /* Number of phrases in query */ + int nCol; /* Number of columns in FTS table */ double *aIDF; /* Array of IDF values */ double *aAvg; /* Average size of each column in tokens */ }; -static void fts5Bm25Function( +static int fts5Bm25GetContext( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ - sqlite3_context *pCtx, /* Context for returning result/error */ - int nVal, /* Number of values in apVal[] array */ - sqlite3_value **apVal /* Array of trailing arguments */ + Fts5Bm25Context **pp /* OUT: Context 
object */ ){ - const double k1 = 1.2; - const double B = 0.75; - - int rc = SQLITE_OK; Fts5Bm25Context *p; + int rc = SQLITE_OK; p = pApi->xGetAuxdata(pFts, 0); if( p==0 ){ @@ -530,11 +532,14 @@ static void fts5Bm25Function( memset(p, 0, nByte); p->aAvg = (double*)&p[1]; p->aIDF = (double*)&p->aAvg[nCol]; + p->nCol = nCol; + p->nPhrase = nPhrase; } if( rc==SQLITE_OK ){ rc = pApi->xRowCount(pFts, &nRow); assert( nRow>0 || rc!=SQLITE_OK ); + if( nRow<2 ) nRow = 2; } for(ic=0; rc==SQLITE_OK && icaIDF[idx] = log( (0.5 + nRow - anVal[idx]) / (0.5 + anVal[idx]) ); - if( p->aIDF[idx]<0.0 ) p->aIDF[idx] = 0.0; + /* Calculate the IDF (Inverse Document Frequency) for phrase ip + ** in column ic. This is done using the standard BM25 formula as + ** found on wikipedia: + ** + ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) ) + ** + ** where "N" is the total number of documents in the set and nHit + ** is the number that contain at least one instance of the phrase + ** under consideration. + ** + ** The problem with this is that if (N < 2*nHit), the IDF is + ** negative. Which is undesirable. So the minimum allowable IDF is + ** (1e-6) - roughly the same as a term that appears in just over + ** half of a set of 5,000,000 documents. 
*/ + int idx = ip * nCol + ic; /* Index in aIDF[] and anVal[] arrays */ + int nHit = anVal[idx]; /* Number of docs matching "ic: ip" */ + + p->aIDF[idx] = log( (0.5 + nRow - nHit) / (0.5 + nHit) ); + if( p->aIDF[idx]<=0.0 ) p->aIDF[idx] = 1e-6; + assert( p->aIDF[idx]>=0.0 ); } } @@ -560,39 +582,122 @@ static void fts5Bm25Function( } if( rc!=SQLITE_OK ){ sqlite3_free(p); + p = 0; } } + *pp = p; + return rc; +} + +static void fts5Bm25DebugContext( + int *pRc, /* IN/OUT: Return code */ + Fts5Buffer *pBuf, /* Buffer to populate */ + Fts5Bm25Context *p /* Context object to decode */ +){ + int ip; + int ic; + + sqlite3Fts5BufferAppendString(pRc, pBuf, "idf "); + if( p->nPhrase>1 || p->nCol>1 ){ + sqlite3Fts5BufferAppendString(pRc, pBuf, "{"); + } + for(ip=0; ipnPhrase; ip++){ + if( ip>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " "); + if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{"); + for(ic=0; icnCol; ic++){ + if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " "); + sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aIDF[ip*p->nCol+ic]); + } + if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}"); + } + if( p->nPhrase>1 || p->nCol>1 ){ + sqlite3Fts5BufferAppendString(pRc, pBuf, "}"); + } + + sqlite3Fts5BufferAppendString(pRc, pBuf, " avgdl "); + if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{"); + for(ic=0; icnCol; ic++){ + if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " "); + sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aAvg[ic]); + } + if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}"); +} + +static void fts5Bm25DebugRow( + int *pRc, + Fts5Buffer *pBuf, + Fts5Bm25Context *p, + const Fts5ExtensionApi *pApi, + Fts5Context *pFts +){ +} + +static void fts5Bm25Function( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + sqlite3_context *pCtx, /* Context for returning result/error */ + int nVal, /* Number of values in apVal[] 
array */ + sqlite3_value **apVal /* Array of trailing arguments */ +){ + const double k1 = 1.2; + const double B = 0.75; + int rc = SQLITE_OK; + Fts5Bm25Context *p; + + rc = fts5Bm25GetContext(pApi, pFts, &p); + if( rc==SQLITE_OK ){ + /* If the bDebug flag is set, instead of returning a numeric rank, this + ** function returns a text value showing how the rank is calculated. */ + Fts5Buffer debug; + int bDebug = (pApi->xUserData(pFts)!=0); + memset(&debug, 0, sizeof(Fts5Buffer)); + int ip; double score = 0.0; + if( bDebug ){ + fts5Bm25DebugContext(&rc, &debug, p); + fts5Bm25DebugRow(&rc, &debug, p, pApi, pFts); + } + for(ip=0; rc==SQLITE_OK && ipnPhrase; ip++){ int iPrev = 0; int nHit = 0; int i = 0; i64 iPos = 0; - while( rc==SQLITE_OK && 0==pApi->xPoslist(pFts, ip, &i, &iPos) ){ + while( rc==SQLITE_OK ){ + int bDone = pApi->xPoslist(pFts, ip, &i, &iPos); int iCol = FTS5_POS2COLUMN(iPos); - if( iCol!=iPrev && nHit>0 ){ + if( (iCol!=iPrev || bDone) && nHit>0 ){ int sz = 0; int idx = ip * p->nCol + iPrev; + double bm25; rc = pApi->xColumnSize(pFts, iPrev, &sz); - score += p->aIDF[idx] * nHit * (k1+1.0) / - (nHit + k1 * (1.0 - B + B * sz / p->aAvg[iCol])); + bm25 = (p->aIDF[idx] * nHit * (k1+1.0)) / + (nHit + k1 * (1.0 - B + B * sz / p->aAvg[iPrev])); + + + score = score + bm25; nHit = 0; } + if( bDone ) break; nHit++; iPrev = iCol; } } - - if( rc==SQLITE_OK ){ - sqlite3_result_double(pCtx, score); - } + if( rc==SQLITE_OK ){ + if( bDebug ){ + sqlite3_result_text(pCtx, (const char*)debug.p, -1, SQLITE_TRANSIENT); + }else{ + sqlite3_result_double(pCtx, score); + } + } + sqlite3_free(debug.p); } if( rc!=SQLITE_OK ){ @@ -852,6 +957,7 @@ int sqlite3Fts5AuxInit(Fts5Global *pGlobal){ void (*xDestroy)(void*); /* Destructor function */ } aBuiltin [] = { { "bm25", 0, fts5Bm25Function, 0 }, + { "bm25debug", (void*)1, fts5Bm25Function, 0 }, { "snippet", 0, fts5SnippetFunction, 0 }, { "fts5_test", 0, fts5TestFunction, 0 }, }; diff --git a/manifest b/manifest index 
e50b5af932..84cd1f32f8 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\sextension\sapis\sxRowCount,\sxQueryPhrase,\sxSetAuxdata\sand\sxGetAuxdata.\sAnd\sa\sranking\sfunction\sthat\suses\sall\sof\sthe\sabove. -D 2014-07-25T20:30:47.445 +C Add\stests\sand\sfixes\sfor\sbm25()\sfunction. +D 2014-07-26T18:38:51.294 F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f F Makefile.in b03432313a3aad96c706f8164fb9f5307eaf19f5 F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23 @@ -106,7 +106,7 @@ F ext/fts3/unicode/mkunicode.tcl dc6f268eb526710e2c6e496c372471d773d0c368 F ext/fts5/fts5.c 1496aff16dd9b0a013d14b6c8cf5b7df8c170abe F ext/fts5/fts5.h 8ace10d5b249a3baa983c79e7a1306d2a79cfd6a F ext/fts5/fts5Int.h 92fb9c4f759674ef569aebc338f363e167a8933c -F ext/fts5/fts5_aux.c f8bed7a86b65cb07cffdafbf4f0611f127b36274 +F ext/fts5/fts5_aux.c 78adc5db0ff4d6834df220ba6b3caa351d98b971 F ext/fts5/fts5_buffer.c 248c61ac9fec001602efc72a45704f3b8d367c00 F ext/fts5/fts5_config.c 94f1b4cb4de6a7cd5780c14adb0198e289df8cef F ext/fts5/fts5_expr.c 65c1918002f2ec1755e4c0c28bf007659409fbd8 @@ -599,7 +599,7 @@ F test/fts5aa.test a2c7bbc18f25f0b57ea8fc483c8a8830273b9ed4 F test/fts5ab.test dc04ed48cf93ca957d174406e6c192f2ff4f3397 F test/fts5ac.test 9be418d037763f4cc5d86f4239db41fc86bb4f85 F test/fts5ad.test 2ed38bbc865678cb2905247120d02ebba7f20e07 -F test/fts5ae.test 1424ec557d543ace1f3cf6d231b247bc7b9f337c +F test/fts5ae.test 24b337571c51a10da1ae439b96b70317813a2fd4 F test/fts5af.test 5f53d0a52280b63caf5a519d6994c4d428835155 F test/fts5ea.test ff43b40f8879ba50b82def70f2ab67c195d1a1d4 F test/full.test 6b3c8fb43c6beab6b95438c1675374b95fab245d @@ -1196,7 +1196,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1 F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4 F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32 F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f -P bdc58fd28a63ac9632c3df6c7768a9a236566605 -R 
2e8cb20122478987f116ef8ff9f6144b +P c4d50428ab97f77e6721c4f8d03eaaf3ea91f3eb +R 3301ccb2b839356242606883792ca77e U dan -Z 5dd5c36b8a0e52d63a87d23e7179571f +Z 456b4a2f1abc554b124e25c35490489e diff --git a/manifest.uuid b/manifest.uuid index 8319bdf8c4..17caf8ac8b 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -c4d50428ab97f77e6721c4f8d03eaaf3ea91f3eb \ No newline at end of file +71d32f53e81921e43c933cc968cb1c18d83fe1e0 \ No newline at end of file diff --git a/test/fts5ae.test b/test/fts5ae.test index 4480c081df..bb4904f210 100644 --- a/test/fts5ae.test +++ b/test/fts5ae.test @@ -229,6 +229,42 @@ do_execsql_test 7.4 { SELECT fts5_test(t7, 'rowcount') FROM t7 WHERE t7 MATCH 'a'; } {5 5 5 5} +#do_execsql_test 7.4 { +# SELECT rowid, bm25debug(t7) FROM t7 WHERE t7 MATCH 'a'; +#} {5 5 5 5} +# + +#------------------------------------------------------------------------- +# +do_test 8.1 { + execsql { CREATE VIRTUAL TABLE t8 USING fts5(x, y) } + foreach {rowid x y} { + 0 {A o} {o o o C o o o o o o o o} + 1 {o o B} {o o o C C o o o o o o o} + 2 {A o o} {o o o o D D o o o o o o} + 3 {o B} {o o o o o D o o o o o o} + 4 {E o G} {H o o o o o o o o o o o} + 5 {F o G} {I o J o o o o o o o o o} + 6 {E o o} {H o J o o o o o o o o o} + 7 {o o o} {o o o o o o o o o o o o} + 9 {o o o} {o o o o o o o o o o o o} + } { + execsql { INSERT INTO t8(rowid, x, y) VALUES($rowid, $x, $y) } + } +} {} + +foreach {tn q res} { + 1 {a} {0 2} + 2 {b} {3 1} + 3 {c} {1 0} + 4 {d} {2 3} + 5 {g AND (e OR f)} {5 4} + 6 {j AND (h OR i)} {5 6} +} { + do_execsql_test 8.2.$tn { + SELECT rowid FROM t8 WHERE t8 MATCH $q ORDER BY bm25(t8) DESC; + } $res +} finish_test