Add tests and fixes for bm25() function.
FossilOrigin-Name: 71d32f53e81921e43c933cc968cb1c18d83fe1e0
This commit is contained in:
parent
700b33d7a5
commit
454b5ce524
@ -411,19 +411,26 @@ static void fts5SnippetFunction(
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct Fts5GatherCtx Fts5GatherCtx;
|
||||
|
||||
/*
|
||||
** Context object passed by fts5GatherTotals() to xQueryPhrase callback
|
||||
** fts5GatherCallback().
|
||||
*/
|
||||
struct Fts5GatherCtx {
|
||||
int nCol;
|
||||
int iPhrase;
|
||||
int *anVal;
|
||||
int nCol; /* Number of columns in FTS table */
|
||||
int iPhrase; /* Phrase currently under investigation */
|
||||
int *anVal; /* Array to populate */
|
||||
};
|
||||
|
||||
/*
|
||||
** Callback used by fts5GatherTotals() with the xQueryPhrase() API.
|
||||
*/
|
||||
static int fts5GatherCallback(
|
||||
const Fts5ExtensionApi *pApi,
|
||||
Fts5Context *pFts,
|
||||
void *pUserData
|
||||
void *pUserData /* Pointer to Fts5GatherCtx object */
|
||||
){
|
||||
Fts5GatherCtx *p = (Fts5GatherCtx*)pUserData;
|
||||
struct Fts5GatherCtx *p = (struct Fts5GatherCtx*)pUserData;
|
||||
int i = 0;
|
||||
int iPrev = -1;
|
||||
i64 iPos = 0;
|
||||
@ -466,7 +473,7 @@ static int fts5GatherTotals(
|
||||
int nPhrase = pApi->xPhraseCount(pFts);
|
||||
int nCol = pApi->xColumnCount(pFts);
|
||||
int nByte = nCol * nPhrase * sizeof(int);
|
||||
Fts5GatherCtx sCtx;
|
||||
struct Fts5GatherCtx sCtx;
|
||||
|
||||
sCtx.nCol = nCol;
|
||||
anVal = sCtx.anVal = (int*)sqlite3_malloc(nByte);
|
||||
@ -492,24 +499,19 @@ static int fts5GatherTotals(
|
||||
|
||||
typedef struct Fts5Bm25Context Fts5Bm25Context;
|
||||
struct Fts5Bm25Context {
|
||||
int nPhrase;
|
||||
int nCol;
|
||||
int nPhrase; /* Number of phrases in query */
|
||||
int nCol; /* Number of columns in FTS table */
|
||||
double *aIDF; /* Array of IDF values */
|
||||
double *aAvg; /* Average size of each column in tokens */
|
||||
};
|
||||
|
||||
static void fts5Bm25Function(
|
||||
static int fts5Bm25GetContext(
|
||||
const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
|
||||
Fts5Context *pFts, /* First arg to pass to pApi functions */
|
||||
sqlite3_context *pCtx, /* Context for returning result/error */
|
||||
int nVal, /* Number of values in apVal[] array */
|
||||
sqlite3_value **apVal /* Array of trailing arguments */
|
||||
Fts5Bm25Context **pp /* OUT: Context object */
|
||||
){
|
||||
const double k1 = 1.2;
|
||||
const double B = 0.75;
|
||||
|
||||
int rc = SQLITE_OK;
|
||||
Fts5Bm25Context *p;
|
||||
int rc = SQLITE_OK;
|
||||
|
||||
p = pApi->xGetAuxdata(pFts, 0);
|
||||
if( p==0 ){
|
||||
@ -530,11 +532,14 @@ static void fts5Bm25Function(
|
||||
memset(p, 0, nByte);
|
||||
p->aAvg = (double*)&p[1];
|
||||
p->aIDF = (double*)&p->aAvg[nCol];
|
||||
p->nCol = nCol;
|
||||
p->nPhrase = nPhrase;
|
||||
}
|
||||
|
||||
if( rc==SQLITE_OK ){
|
||||
rc = pApi->xRowCount(pFts, &nRow);
|
||||
assert( nRow>0 || rc!=SQLITE_OK );
|
||||
if( nRow<2 ) nRow = 2;
|
||||
}
|
||||
|
||||
for(ic=0; rc==SQLITE_OK && ic<nCol; ic++){
|
||||
@ -548,9 +553,26 @@ static void fts5Bm25Function(
|
||||
}
|
||||
for(ic=0; ic<nCol; ic++){
|
||||
for(ip=0; rc==SQLITE_OK && ip<nPhrase; ip++){
|
||||
int idx = ip * nCol + ic;
|
||||
p->aIDF[idx] = log( (0.5 + nRow - anVal[idx]) / (0.5 + anVal[idx]) );
|
||||
if( p->aIDF[idx]<0.0 ) p->aIDF[idx] = 0.0;
|
||||
/* Calculate the IDF (Inverse Document Frequency) for phrase ip
|
||||
** in column ic. This is done using the standard BM25 formula as
|
||||
** found on wikipedia:
|
||||
**
|
||||
** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
|
||||
**
|
||||
** where "N" is the total number of documents in the set and nHit
|
||||
** is the number that contain at least one instance of the phrase
|
||||
** under consideration.
|
||||
**
|
||||
** The problem with this is that if (N < 2*nHit), the IDF is
|
||||
** negative. Which is undesirable. So the minimum allowable IDF is
|
||||
** (1e-6) - roughly the same as a term that appears in just over
|
||||
** half of a set of 5,000,000 documents. */
|
||||
int idx = ip * nCol + ic; /* Index in aIDF[] and anVal[] arrays */
|
||||
int nHit = anVal[idx]; /* Number of docs matching "ic: ip" */
|
||||
|
||||
p->aIDF[idx] = log( (0.5 + nRow - nHit) / (0.5 + nHit) );
|
||||
if( p->aIDF[idx]<=0.0 ) p->aIDF[idx] = 1e-6;
|
||||
assert( p->aIDF[idx]>=0.0 );
|
||||
}
|
||||
}
|
||||
|
||||
@ -560,39 +582,122 @@ static void fts5Bm25Function(
|
||||
}
|
||||
if( rc!=SQLITE_OK ){
|
||||
sqlite3_free(p);
|
||||
p = 0;
|
||||
}
|
||||
}
|
||||
|
||||
*pp = p;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
** Append a human-readable description of Fts5Bm25Context object p to
** buffer pBuf. The text has the form:
**
**     idf <idf-values> avgdl <avg-values>
**
** <idf-values> lists p->aIDF[], phrase by phrase; entry (ip, ic) is read
** from p->aIDF[ip*p->nCol + ic]. <avg-values> lists p->aAvg[], one entry
** per column. All values are formatted with "%f" and separated by single
** spaces. Lists are wrapped in "{" and "}" only when they hold more than
** one element.
**
** pRc is handed unchanged to every buffer-append call.
** NOTE(review): presumably those helpers do nothing once *pRc holds an
** error code - confirm against the sqlite3Fts5Buffer* implementations.
*/
static void fts5Bm25DebugContext(
|
||||
int *pRc, /* IN/OUT: Return code */
|
||||
Fts5Buffer *pBuf, /* Buffer to populate */
|
||||
Fts5Bm25Context *p /* Context object to decode */
|
||||
){
|
||||
int ip;
|
||||
int ic;
|
||||
|
||||
sqlite3Fts5BufferAppendString(pRc, pBuf, "idf ");
|
||||
/* Outer braces are only emitted if more than one IDF value is listed */
if( p->nPhrase>1 || p->nCol>1 ){
|
||||
sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
|
||||
}
|
||||
for(ip=0; ip<p->nPhrase; ip++){
|
||||
if( ip>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
|
||||
/* With multiple columns, each phrase gets its own brace-enclosed list */
if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
|
||||
for(ic=0; ic<p->nCol; ic++){
|
||||
if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
|
||||
sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aIDF[ip*p->nCol+ic]);
|
||||
}
|
||||
if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
|
||||
}
|
||||
if( p->nPhrase>1 || p->nCol>1 ){
|
||||
sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
|
||||
}
|
||||
|
||||
/* Second section: one aAvg[] value per column */
sqlite3Fts5BufferAppendString(pRc, pBuf, " avgdl ");
|
||||
if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "{");
|
||||
for(ic=0; ic<p->nCol; ic++){
|
||||
if( ic>0 ) sqlite3Fts5BufferAppendString(pRc, pBuf, " ");
|
||||
sqlite3Fts5BufferAppendPrintf(pRc, pBuf, "%f", p->aAvg[ic]);
|
||||
}
|
||||
if( p->nCol>1 ) sqlite3Fts5BufferAppendString(pRc, pBuf, "}");
|
||||
}
|
||||
|
||||
/*
** Per-row companion to the bm25 debug output. The body is currently
** empty: nothing is appended to pBuf and *pRc is left untouched.
**
** NOTE(review): the parameter roles noted below are inferred from the
** parameter names and types only, since the body uses none of them -
** confirm once this function is implemented.
*/
static void fts5Bm25DebugRow(
|
||||
int *pRc,                       /* presumably IN/OUT: Return code */
|
||||
Fts5Buffer *pBuf,               /* presumably the buffer to populate */
|
||||
Fts5Bm25Context *p,             /* presumably the bm25 context object */
|
||||
const Fts5ExtensionApi *pApi,   /* FTS5 extension API (unused here) */
|
||||
Fts5Context *pFts               /* FTS5 context handle (unused here) */
|
||||
){
|
||||
}
|
||||
|
||||
static void fts5Bm25Function(
|
||||
const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
|
||||
Fts5Context *pFts, /* First arg to pass to pApi functions */
|
||||
sqlite3_context *pCtx, /* Context for returning result/error */
|
||||
int nVal, /* Number of values in apVal[] array */
|
||||
sqlite3_value **apVal /* Array of trailing arguments */
|
||||
){
|
||||
const double k1 = 1.2;
|
||||
const double B = 0.75;
|
||||
int rc = SQLITE_OK;
|
||||
Fts5Bm25Context *p;
|
||||
|
||||
rc = fts5Bm25GetContext(pApi, pFts, &p);
|
||||
|
||||
if( rc==SQLITE_OK ){
|
||||
/* If the bDebug flag is set, instead of returning a numeric rank, this
|
||||
** function returns a text value showing how the rank is calculated. */
|
||||
Fts5Buffer debug;
|
||||
int bDebug = (pApi->xUserData(pFts)!=0);
|
||||
memset(&debug, 0, sizeof(Fts5Buffer));
|
||||
|
||||
int ip;
|
||||
double score = 0.0;
|
||||
|
||||
if( bDebug ){
|
||||
fts5Bm25DebugContext(&rc, &debug, p);
|
||||
fts5Bm25DebugRow(&rc, &debug, p, pApi, pFts);
|
||||
}
|
||||
|
||||
for(ip=0; rc==SQLITE_OK && ip<p->nPhrase; ip++){
|
||||
int iPrev = 0;
|
||||
int nHit = 0;
|
||||
int i = 0;
|
||||
i64 iPos = 0;
|
||||
|
||||
while( rc==SQLITE_OK && 0==pApi->xPoslist(pFts, ip, &i, &iPos) ){
|
||||
while( rc==SQLITE_OK ){
|
||||
int bDone = pApi->xPoslist(pFts, ip, &i, &iPos);
|
||||
int iCol = FTS5_POS2COLUMN(iPos);
|
||||
if( iCol!=iPrev && nHit>0 ){
|
||||
if( (iCol!=iPrev || bDone) && nHit>0 ){
|
||||
int sz = 0;
|
||||
int idx = ip * p->nCol + iPrev;
|
||||
double bm25;
|
||||
rc = pApi->xColumnSize(pFts, iPrev, &sz);
|
||||
|
||||
score += p->aIDF[idx] * nHit * (k1+1.0) /
|
||||
(nHit + k1 * (1.0 - B + B * sz / p->aAvg[iCol]));
|
||||
bm25 = (p->aIDF[idx] * nHit * (k1+1.0)) /
|
||||
(nHit + k1 * (1.0 - B + B * sz / p->aAvg[iPrev]));
|
||||
|
||||
|
||||
score = score + bm25;
|
||||
nHit = 0;
|
||||
}
|
||||
if( bDone ) break;
|
||||
nHit++;
|
||||
iPrev = iCol;
|
||||
}
|
||||
}
|
||||
|
||||
if( rc==SQLITE_OK ){
|
||||
sqlite3_result_double(pCtx, score);
|
||||
}
|
||||
|
||||
if( rc==SQLITE_OK ){
|
||||
if( bDebug ){
|
||||
sqlite3_result_text(pCtx, (const char*)debug.p, -1, SQLITE_TRANSIENT);
|
||||
}else{
|
||||
sqlite3_result_double(pCtx, score);
|
||||
}
|
||||
}
|
||||
sqlite3_free(debug.p);
|
||||
}
|
||||
|
||||
if( rc!=SQLITE_OK ){
|
||||
@ -852,6 +957,7 @@ int sqlite3Fts5AuxInit(Fts5Global *pGlobal){
|
||||
void (*xDestroy)(void*); /* Destructor function */
|
||||
} aBuiltin [] = {
|
||||
{ "bm25", 0, fts5Bm25Function, 0 },
|
||||
{ "bm25debug", (void*)1, fts5Bm25Function, 0 },
|
||||
{ "snippet", 0, fts5SnippetFunction, 0 },
|
||||
{ "fts5_test", 0, fts5TestFunction, 0 },
|
||||
};
|
||||
|
14
manifest
14
manifest
@ -1,5 +1,5 @@
|
||||
C Add\sextension\sapis\sxRowCount,\sxQueryPhrase,\sxSetAuxdata\sand\sxGetAuxdata.\sAnd\sa\sranking\sfunction\sthat\suses\sall\sof\sthe\sabove.
|
||||
D 2014-07-25T20:30:47.445
|
||||
C Add\stests\sand\sfixes\sfor\sbm25()\sfunction.
|
||||
D 2014-07-26T18:38:51.294
|
||||
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
|
||||
F Makefile.in b03432313a3aad96c706f8164fb9f5307eaf19f5
|
||||
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
|
||||
@ -106,7 +106,7 @@ F ext/fts3/unicode/mkunicode.tcl dc6f268eb526710e2c6e496c372471d773d0c368
|
||||
F ext/fts5/fts5.c 1496aff16dd9b0a013d14b6c8cf5b7df8c170abe
|
||||
F ext/fts5/fts5.h 8ace10d5b249a3baa983c79e7a1306d2a79cfd6a
|
||||
F ext/fts5/fts5Int.h 92fb9c4f759674ef569aebc338f363e167a8933c
|
||||
F ext/fts5/fts5_aux.c f8bed7a86b65cb07cffdafbf4f0611f127b36274
|
||||
F ext/fts5/fts5_aux.c 78adc5db0ff4d6834df220ba6b3caa351d98b971
|
||||
F ext/fts5/fts5_buffer.c 248c61ac9fec001602efc72a45704f3b8d367c00
|
||||
F ext/fts5/fts5_config.c 94f1b4cb4de6a7cd5780c14adb0198e289df8cef
|
||||
F ext/fts5/fts5_expr.c 65c1918002f2ec1755e4c0c28bf007659409fbd8
|
||||
@ -599,7 +599,7 @@ F test/fts5aa.test a2c7bbc18f25f0b57ea8fc483c8a8830273b9ed4
|
||||
F test/fts5ab.test dc04ed48cf93ca957d174406e6c192f2ff4f3397
|
||||
F test/fts5ac.test 9be418d037763f4cc5d86f4239db41fc86bb4f85
|
||||
F test/fts5ad.test 2ed38bbc865678cb2905247120d02ebba7f20e07
|
||||
F test/fts5ae.test 1424ec557d543ace1f3cf6d231b247bc7b9f337c
|
||||
F test/fts5ae.test 24b337571c51a10da1ae439b96b70317813a2fd4
|
||||
F test/fts5af.test 5f53d0a52280b63caf5a519d6994c4d428835155
|
||||
F test/fts5ea.test ff43b40f8879ba50b82def70f2ab67c195d1a1d4
|
||||
F test/full.test 6b3c8fb43c6beab6b95438c1675374b95fab245d
|
||||
@ -1196,7 +1196,7 @@ F tool/vdbe_profile.tcl 67746953071a9f8f2f668b73fe899074e2c6d8c1
|
||||
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
|
||||
F tool/warnings.sh 0abfd78ceb09b7f7c27c688c8e3fe93268a13b32
|
||||
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
|
||||
P bdc58fd28a63ac9632c3df6c7768a9a236566605
|
||||
R 2e8cb20122478987f116ef8ff9f6144b
|
||||
P c4d50428ab97f77e6721c4f8d03eaaf3ea91f3eb
|
||||
R 3301ccb2b839356242606883792ca77e
|
||||
U dan
|
||||
Z 5dd5c36b8a0e52d63a87d23e7179571f
|
||||
Z 456b4a2f1abc554b124e25c35490489e
|
||||
|
@ -1 +1 @@
|
||||
c4d50428ab97f77e6721c4f8d03eaaf3ea91f3eb
|
||||
71d32f53e81921e43c933cc968cb1c18d83fe1e0
|
@ -229,6 +229,42 @@ do_execsql_test 7.4 {
|
||||
SELECT fts5_test(t7, 'rowcount') FROM t7 WHERE t7 MATCH 'a';
|
||||
} {5 5 5 5}
|
||||
|
||||
#do_execsql_test 7.4 {
|
||||
# SELECT rowid, bm25debug(t7) FROM t7 WHERE t7 MATCH 'a';
|
||||
#} {5 5 5 5}
|
||||
#
|
||||
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
do_test 8.1 {
|
||||
execsql { CREATE VIRTUAL TABLE t8 USING fts5(x, y) }
|
||||
foreach {rowid x y} {
|
||||
0 {A o} {o o o C o o o o o o o o}
|
||||
1 {o o B} {o o o C C o o o o o o o}
|
||||
2 {A o o} {o o o o D D o o o o o o}
|
||||
3 {o B} {o o o o o D o o o o o o}
|
||||
4 {E o G} {H o o o o o o o o o o o}
|
||||
5 {F o G} {I o J o o o o o o o o o}
|
||||
6 {E o o} {H o J o o o o o o o o o}
|
||||
7 {o o o} {o o o o o o o o o o o o}
|
||||
9 {o o o} {o o o o o o o o o o o o}
|
||||
} {
|
||||
execsql { INSERT INTO t8(rowid, x, y) VALUES($rowid, $x, $y) }
|
||||
}
|
||||
} {}
|
||||
|
||||
foreach {tn q res} {
|
||||
1 {a} {0 2}
|
||||
2 {b} {3 1}
|
||||
3 {c} {1 0}
|
||||
4 {d} {2 3}
|
||||
5 {g AND (e OR f)} {5 4}
|
||||
6 {j AND (h OR i)} {5 6}
|
||||
} {
|
||||
do_execsql_test 8.2.$tn {
|
||||
SELECT rowid FROM t8 WHERE t8 MATCH $q ORDER BY bm25(t8) DESC;
|
||||
} $res
|
||||
}
|
||||
|
||||
|
||||
finish_test
|
||||
|
Loading…
Reference in New Issue
Block a user