sqlite/ext/fts3/fts3_snippet.c
dan 378d0ab97b Fixes for the matchinfo() function related to FTS4 common token handling.
FossilOrigin-Name: deb80eac9112d21835dfd3cee08ed8f09d975bf7
2010-10-23 19:07:30 +00:00

1235 lines
41 KiB
C

/*
** 2009 Oct 23
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
#include "fts3Int.h"
#include <string.h>
#include <assert.h>
/*
** Used as an fts3ExprIterate() context when loading phrase doclists to
** Fts3Expr.aDoclist[]/nDoclist.
**
** fts3ExprLoadDoclists() fills in pCsr and zeroes the two counters; the
** callback fts3ExprLoadDoclistsCb1() then bumps the counters once for each
** phrase node it is handed. Field order matters: the object is initialized
** positionally.
*/
typedef struct LoadDoclistCtx LoadDoclistCtx;
struct LoadDoclistCtx {
Fts3Cursor *pCsr; /* FTS3 cursor whose query phrases are being loaded */
int nPhrase; /* Number of phrases seen so far */
int nToken; /* Number of tokens seen so far (summed over phrases) */
};
/*
** The following types are used as part of the implementation of the
** fts3BestSnippet() routine.
**
** A SnippetIter walks the candidate snippets of one column. It is zeroed
** and populated by fts3BestSnippet() and stepped with
** fts3SnippetNextCandidate().
*/
typedef struct SnippetIter SnippetIter;
typedef struct SnippetPhrase SnippetPhrase;
typedef struct SnippetFragment SnippetFragment;
struct SnippetIter {
Fts3Cursor *pCsr; /* Cursor snippet is being generated from */
int iCol; /* Extract snippet from this column */
int nSnippet; /* Requested snippet length (in tokens) */
int nPhrase; /* Number of phrases in query */
SnippetPhrase *aPhrase; /* Array of size nPhrase */
int iCurrent; /* First token of current snippet (-1 before first step) */
};
/*
** Per-phrase state of a SnippetIter. Two independent cursors run over the
** same position list: the "head" cursor is kept just beyond the end of the
** current candidate snippet and the "tail" cursor at the first position
** inside it (see fts3SnippetNextCandidate()). A cursor whose pointer is
** NULL has either run off the end of the list or belongs to a phrase with
** no position list for the column.
*/
struct SnippetPhrase {
int nToken; /* Number of tokens in phrase */
char *pList; /* Pointer to start of phrase position list */
int iHead; /* Next value in position list */
char *pHead; /* Position list data following iHead */
int iTail; /* Next value in trailing position list */
char *pTail; /* Position list data following iTail */
};
/*
** Describes one fragment of text chosen by fts3BestSnippet(). Bit i of
** "covered" corresponds to query phrase i. Bit k of "hlmask" corresponds
** to the token at offset k from iPos within the fragment.
*/
struct SnippetFragment {
int iCol; /* Column snippet is extracted from */
int iPos; /* Index of first token in snippet */
u64 covered; /* Mask of query phrases covered */
u64 hlmask; /* Mask of snippet terms to highlight */
};
/*
** This type is used as an fts3ExprIterate() context object while
** accumulating the data returned by the matchinfo() function.
**
** As built by fts3GetMatchinfo(), aMatchinfo[0] holds the phrase count and
** aMatchinfo[1] the column count. They are followed by three u32 values
** for every (phrase, column) pair and, when the table has docsize data, by
** a further (1 + 2*nCol) values.
*/
typedef struct MatchInfo MatchInfo;
struct MatchInfo {
Fts3Cursor *pCursor; /* FTS3 Cursor */
int nCol; /* Number of columns in table */
u32 *aMatchinfo; /* Pre-allocated buffer */
};
/*
** The snippet() and offsets() functions both return text values. An instance
** of the following structure is used to accumulate those values while the
** functions are running. See fts3StringAppend() for details.
**
** An all-zero StrBuffer is a valid empty buffer. Callers initialize it
** positionally, so the field order must not change.
*/
typedef struct StrBuffer StrBuffer;
struct StrBuffer {
char *z; /* Pointer to buffer containing string */
int n; /* Length of z in bytes (excl. nul-term) */
int nAlloc; /* Allocated size of buffer z in bytes */
};
/*
** Step a position-list iterator forward by one entry.
**
** A position list holds unique integers in ascending order. Each entry is
** stored as an FTS3 varint whose value is the entry minus the previous
** entry, plus two. So the position-list:
**
**     4 9 113
**
** is stored as the three varints:
**
**     6 7 106
**
** On entry, *pp points at the first byte of an entry and *piPos holds the
** value of the entry before it. On return, *piPos holds the value of the
** entry just decoded and *pp points at the varint that follows it.
*/
static void fts3GetDeltaPosition(char **pp, int *piPos){
  char *pEntry = *pp;             /* First byte of the varint to decode */
  int iDelta;                     /* Decoded value (difference plus two) */
  int nByte;                      /* Size of the varint in bytes */

  nByte = sqlite3Fts3GetVarint32(pEntry, &iDelta);
  *pp = &pEntry[nByte];
  *piPos += (iDelta-2);
}
/*
** Recursive worker for fts3ExprIterate() (see below).
**
** Phrase nodes beneath pExpr are visited from left to right. Each one is
** passed to callback x together with the current value of *piPhrase, and
** *piPhrase is incremented after the callback returns. The right-hand
** sub-tree of a NOT node is never visited. If a callback returns anything
** other than SQLITE_OK, no further nodes are visited and that value is
** returned.
*/
static int fts3ExprIterate2(
  Fts3Expr *pExpr,                /* Expression to iterate phrases of */
  int *piPhrase,                  /* Pointer to phrase counter */
  int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
  void *pCtx                      /* Second argument to pass to callback */
){
  int rc;                         /* Return code */
  const int eNode = pExpr->eType; /* Type of expression node pExpr */

  if( eNode==FTSQUERY_PHRASE ){
    /* Leaf node: hand it to the callback, then advance the counter. */
    rc = x(pExpr, *piPhrase, pCtx);
    (*piPhrase)++;
    return rc;
  }

  /* Every non-phrase node is a binary operator. */
  assert( pExpr->pLeft && pExpr->pRight );
  rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
  if( rc!=SQLITE_OK || eNode==FTSQUERY_NOT ){
    return rc;
  }
  return fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
}
/*
** Invoke callback x once for every phrase node of the FTS3 query pExpr,
** skipping any phrase that lies within the right-hand-side of a NOT
** operator. Phrases are numbered from zero in the order they are visited;
** the number is passed as the second argument of the callback.
**
** The first callback result other than SQLITE_OK ends the iteration and
** is returned. If every callback succeeds, SQLITE_OK is returned.
*/
static int fts3ExprIterate(
  Fts3Expr *pExpr,                /* Expression to iterate phrases of */
  int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
  void *pCtx                      /* Second argument to pass to callback */
){
  int nVisited = 0;               /* Phrase counter shared by the recursion */
  int rc;                         /* Result of the traversal */

  rc = fts3ExprIterate2(pExpr, &nVisited, x, pCtx);
  return rc;
}
/*
** pExpr must be a phrase node. Its doclist (Fts3Expr.aDoclist[]), and the
** doclists of every phrase node to its left in the query tree, are
** expected to be loaded already.
**
** While the current phrase is the right-hand operand of a NEAR operator,
** sqlite3Fts3ExprNearTrim() is applied to it and the phrase immediately to
** its left, and the walk then continues from that left-hand phrase. In this
** way the NEAR restriction is propagated leftwards along a chain of
** NEAR-connected phrases.
**
** The first value other than SQLITE_OK returned by
** sqlite3Fts3ExprNearTrim() ends the walk and is returned; otherwise
** SQLITE_OK is returned.
*/
static int fts3ExprNearTrim(Fts3Expr *pExpr){
  Fts3Expr *pRhs = pExpr;         /* Phrase currently treated as the RHS */
  int rc;                         /* Return code */

  assert( pExpr->eType==FTSQUERY_PHRASE );

  for(rc=SQLITE_OK; rc==SQLITE_OK; ){
    Fts3Expr *pNear = pRhs->pParent;   /* Candidate NEAR operator */
    Fts3Expr *pLhs;                    /* Phrase to the left of pNear */

    /* Stop unless pRhs is the right-hand operand of a NEAR node. */
    if( pNear==0 ) break;
    if( pNear->eType!=FTSQUERY_NEAR ) break;
    if( pNear->pRight!=pRhs ) break;

    /* The left operand is either a phrase or another NEAR node whose own
    ** right operand is the phrase adjacent to pRhs. */
    pLhs = pNear->pLeft;
    if( pLhs->eType!=FTSQUERY_PHRASE ){
      assert( pLhs->eType==FTSQUERY_NEAR );
      assert( pLhs->pRight->eType==FTSQUERY_PHRASE );
      pLhs = pLhs->pRight;
    }

    rc = sqlite3Fts3ExprNearTrim(pLhs, pRhs, pNear->nNear);
    pRhs = pLhs;
  }

  return rc;
}
/*
** fts3ExprIterate() callback used by fts3ExprLoadDoclists(). The context
** is a LoadDoclistCtx object.
**
** Each invocation adds one to the phrase count and the number of tokens
** in the phrase to the token count. If the doclist for the phrase is not
** yet loaded, it is loaded into Fts3Expr.aDoclist[]/nDoclist and then
** trimmed by fts3ExprNearTrim(). The node is flagged as loaded even when
** the load itself reports an error.
*/
static int fts3ExprLoadDoclistsCb1(Fts3Expr *pExpr, int iPhrase, void *ctx){
  LoadDoclistCtx *pLoad = (LoadDoclistCtx *)ctx;
  int rc;
  UNUSED_PARAMETER(iPhrase);

  pLoad->nPhrase += 1;
  pLoad->nToken += pExpr->pPhrase->nToken;

  if( pExpr->isLoaded ){
    return SQLITE_OK;
  }

  rc = sqlite3Fts3ExprLoadDoclist(pLoad->pCsr, pExpr);
  pExpr->isLoaded = 1;
  if( rc!=SQLITE_OK ){
    return rc;
  }
  return fts3ExprNearTrim(pExpr);
}
/*
** fts3ExprIterate() callback used by fts3ExprLoadDoclists() once all
** doclists have been loaded.
**
** For a phrase that has a doclist, the first varint of the doclist is
** decoded into Fts3Expr.iCurrent and Fts3Expr.pCurrent is left pointing at
** the byte that follows it. Phrases without a doclist are left untouched.
** Always returns SQLITE_OK.
*/
static int fts3ExprLoadDoclistsCb2(Fts3Expr *pExpr, int iPhrase, void *ctx){
  char *aList = pExpr->aDoclist;  /* Start of the doclist, or NULL */
  UNUSED_PARAMETER(iPhrase);
  UNUSED_PARAMETER(ctx);

  if( aList!=0 ){
    int nVarint;                  /* Size of the leading varint in bytes */
    pExpr->iCurrent = 0;
    nVarint = sqlite3Fts3GetVarint(aList, &pExpr->iCurrent);
    pExpr->pCurrent = &aList[nVarint];
  }
  return SQLITE_OK;
}
/*
** Make sure the doclist of every matchable phrase in the query attached
** to FTS3 cursor pCsr is loaded. A phrase is matchable unless it is
** descended, directly or indirectly, from the right-hand-side of a NOT
** operator.
**
** If pnPhrase is not NULL, *pnPhrase is set to the number of matchable
** phrases visited. If pnToken is not NULL, *pnToken is set to the total
** number of tokens in those phrases. Both outputs are written even if an
** error occurs, in which case they reflect only the phrases visited before
** the error.
*/
static int fts3ExprLoadDoclists(
  Fts3Cursor *pCsr,               /* Fts3 cursor for current query */
  int *pnPhrase,                  /* OUT: Number of phrases in query */
  int *pnToken                    /* OUT: Number of tokens in query */
){
  LoadDoclistCtx sLoad;           /* Context for fts3ExprIterate() */
  int rc;                         /* Return Code */

  sLoad.pCsr = pCsr;
  sLoad.nPhrase = 0;
  sLoad.nToken = 0;

  /* Pass 1 loads the doclists; pass 2 (which cannot fail) rewinds the
  ** per-phrase doclist cursors. */
  rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb1, (void *)&sLoad);
  if( rc==SQLITE_OK ){
    (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb2, 0);
  }

  if( pnPhrase!=0 ){
    *pnPhrase = sLoad.nPhrase;
  }
  if( pnToken!=0 ){
    *pnToken = sLoad.nToken;
  }
  return rc;
}
/*
** Move the position-list iterator (*ppIter, *piIter) forward until its
** current value is greater than or equal to iNext.
**
** If *ppIter is NULL on entry this is a no-op. If the end of the list is
** reached before such a value is found, *ppIter is set to NULL and
** *piIter to -1.
*/
static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
  char *pList = *ppIter;          /* Next undecoded byte of the list */
  int iPos;                       /* Most recently decoded position */

  if( pList==0 ) return;

  for(iPos=*piIter; iPos<iNext; ){
    if( (*pList & 0xFE)==0 ){
      /* A 0x00 or 0x01 byte terminates the list. */
      *piIter = -1;
      *ppIter = 0;
      return;
    }
    fts3GetDeltaPosition(&pList, &iPos);
  }

  *piIter = iPos;
  *ppIter = pList;
}
/*
** Advance the snippet iterator to the next candidate snippet.
**
** Returns 0 if the iterator now describes a candidate, or 1 if there are
** no more candidates. On a return of 1 the iterator is left unmodified.
*/
static int fts3SnippetNextCandidate(SnippetIter *pIter){
  int ii;                         /* Loop counter */
  int iNextHit = 0x7FFFFFFF;      /* Smallest head position of any phrase */
  int iFirst;                     /* First token of the new candidate */

  if( pIter->iCurrent<0 ){
    /* First call since the iterator was set up. The first candidate always
    ** starts at token 0, whatever it scores. Move each phrase's head cursor
    ** to its first position at or beyond the end of that window. */
    pIter->iCurrent = 0;
    for(ii=0; ii<pIter->nPhrase; ii++){
      SnippetPhrase *p = &pIter->aPhrase[ii];
      fts3SnippetAdvance(&p->pHead, &p->iHead, pIter->nSnippet);
    }
    return 0;
  }

  /* Find the nearest phrase occurrence beyond the current window. */
  for(ii=0; ii<pIter->nPhrase; ii++){
    SnippetPhrase *p = &pIter->aPhrase[ii];
    if( p->pHead==0 ) continue;
    if( p->iHead<iNextHit ){
      iNextHit = p->iHead;
    }
  }
  if( iNextHit==0x7FFFFFFF ){
    return 1;
  }

  /* The new window is positioned so that this occurrence is its final
  ** token. Head cursors move past the window, tail cursors to its start. */
  iFirst = iNextHit - pIter->nSnippet + 1;
  pIter->iCurrent = iFirst;
  for(ii=0; ii<pIter->nPhrase; ii++){
    SnippetPhrase *p = &pIter->aPhrase[ii];
    fts3SnippetAdvance(&p->pHead, &p->iHead, iNextHit+1);
    fts3SnippetAdvance(&p->pTail, &p->iTail, iFirst);
  }
  return 0;
}
/*
** Report on the candidate snippet that iterator pIter currently describes.
**
** Starting from each phrase's tail cursor, every phrase occurrence that
** begins before the end of the window is counted. An occurrence scores
** 1000 if its phrase is neither in mCovered nor already counted for this
** candidate, and 1 otherwise. Each occurrence also sets highlight bits for
** the tokens of the phrase.
*/
static void fts3SnippetDetails(
  SnippetIter *pIter,             /* Snippet iterator */
  u64 mCovered,                   /* Bitmask of phrases already covered */
  int *piToken,                   /* OUT: First token of proposed snippet */
  int *piScore,                   /* OUT: "Score" for this snippet */
  u64 *pmCover,                   /* OUT: Bitmask of phrases covered */
  u64 *pmHighlight                /* OUT: Bitmask of terms to highlight */
){
  const int iFirst = pIter->iCurrent;           /* First token of snippet */
  const int iLimit = iFirst + pIter->nSnippet;  /* One past last token */
  int nScore = 0;                 /* Score of this snippet */
  u64 mHere = 0;                  /* Phrases found in this snippet */
  u64 mHl = 0;                    /* Tokens to highlight in this snippet */
  int iPhrase;                    /* Used to iterate through phrases */

  for(iPhrase=0; iPhrase<pIter->nPhrase; iPhrase++){
    SnippetPhrase *pPhrase = &pIter->aPhrase[iPhrase];
    char *pPos = pPhrase->pTail;
    int iPos;

    if( pPos==0 ) continue;
    iPos = pPhrase->iTail;

    while( iPos<iLimit ){
      const u64 mPhrase = (u64)1 << iPhrase;
      const u64 mPos = (u64)1 << (iPos - iFirst);
      int iTok;

      assert( iPos>=iFirst );
      nScore += ((mHere|mCovered) & mPhrase) ? 1 : 1000;
      mHere |= mPhrase;

      /* mPos is shifted right once per token of the phrase. */
      for(iTok=0; iTok<pPhrase->nToken; iTok++){
        mHl |= (mPos>>iTok);
      }

      if( (*pPos & 0x0FE)==0 ) break;     /* End of position list */
      fts3GetDeltaPosition(&pPos, &iPos);
    }
  }

  /* Set the output variables before returning. */
  *piToken = iFirst;
  *piScore = nScore;
  *pmCover = mHere;
  *pmHighlight = mHl;
}
/*
** fts3ExprIterate() callback used by fts3BestSnippet(). The context is
** the SnippetIter being set up; each call fills in the aPhrase[] entry
** for phrase number iPhrase.
**
** If the phrase has a position list for the current row and column, the
** first position is decoded and both the head and tail cursors are set to
** it. Otherwise the entry must still be in its zeroed state. Always
** returns SQLITE_OK.
*/
static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
  SnippetIter *pIter = (SnippetIter *)ctx;
  SnippetPhrase *pOut = &pIter->aPhrase[iPhrase];
  char *pList;                    /* Start of the position list, or NULL */
  char *pNext;                    /* Data following the first position */
  int iFirst = 0;                 /* Value of the first position */

  pOut->nToken = pExpr->pPhrase->nToken;
  pList = sqlite3Fts3FindPositions(pExpr, pIter->pCsr->iPrevId, pIter->iCol);
  if( pList==0 ){
    assert( pOut->pList==0 && pOut->pHead==0 && pOut->pTail==0 );
    return SQLITE_OK;
  }

  pNext = pList;
  fts3GetDeltaPosition(&pNext, &iFirst);
  pOut->pList = pList;
  pOut->pHead = pNext;
  pOut->pTail = pNext;
  pOut->iHead = iFirst;
  pOut->iTail = iFirst;
  return SQLITE_OK;
}
/*
** Choose the "best" run of nSnippet contiguous tokens in column iCol of
** the current row. The best snippet is the candidate with the highest
** score, where a score is the sum of:
**
**   (a) +1 point for each occurrence of a matchable phrase in the snippet.
**
**   (b) +1000 points for the first occurrence of each matchable phrase in
**       the snippet for which the corresponding mCovered bit is not set.
**
** When two candidates tie, the one found first is kept.
**
** The chosen snippet is described in *pFragment and its score is written
** to *piScore. A bit is also set in *pmSeen for every phrase that has at
** least one occurrence in the column.
**
** Returns SQLITE_OK, SQLITE_NOMEM, or an error from loading doclists. On
** error the output parameters are not written.
*/
static int fts3BestSnippet(
  int nSnippet,                   /* Desired snippet length */
  Fts3Cursor *pCsr,               /* Cursor to create snippet for */
  int iCol,                       /* Index of column to create snippet from */
  u64 mCovered,                   /* Mask of phrases already covered */
  u64 *pmSeen,                    /* IN/OUT: Mask of phrases seen */
  SnippetFragment *pFragment,     /* OUT: Best snippet found */
  int *piScore                    /* OUT: Score of snippet pFragment */
){
  SnippetIter sIter;              /* Iterates through snippet candidates */
  int nPhrase;                    /* Number of phrases in expression */
  int nByte;                      /* Size of sIter.aPhrase[] in bytes */
  int iBest = -1;                 /* Best snippet score found so far */
  int ii;                         /* Loop counter */
  int rc;                         /* Return Code */

  memset(&sIter, 0, sizeof(sIter));

  /* Count the phrases in the expression. The same call makes sure the
  ** doclist of each phrase is loaded. */
  rc = fts3ExprLoadDoclists(pCsr, &nPhrase, 0);
  if( rc!=SQLITE_OK ) return rc;

  /* Allocate and zero one SnippetPhrase per phrase. */
  nByte = sizeof(SnippetPhrase) * nPhrase;
  sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
  if( sIter.aPhrase==0 ) return SQLITE_NOMEM;
  memset(sIter.aPhrase, 0, nByte);

  /* Fill in the iterator, then populate aPhrase[] from the position lists
  ** of the current row. The callback always returns SQLITE_OK. */
  sIter.pCsr = pCsr;
  sIter.iCol = iCol;
  sIter.nSnippet = nSnippet;
  sIter.nPhrase = nPhrase;
  sIter.iCurrent = -1;
  (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);

  /* Record which phrases occur in this column. */
  for(ii=0; ii<nPhrase; ii++){
    if( sIter.aPhrase[ii].pHead ){
      *pmSeen |= (u64)1 << ii;
    }
  }

  /* Score every candidate, remembering the best one in *pFragment. */
  pFragment->iCol = iCol;
  while( fts3SnippetNextCandidate(&sIter)==0 ){
    int iPos;
    int iScore;
    u64 mCover;
    u64 mHighlight;

    fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
    assert( iScore>=0 );
    if( iScore<=iBest ) continue;

    iBest = iScore;
    pFragment->iPos = iPos;
    pFragment->hlmask = mHighlight;
    pFragment->covered = mCover;
  }

  sqlite3_free(sIter.aPhrase);
  *piScore = iBest;
  return SQLITE_OK;
}
/*
** Append nAppend bytes from zAppend to the string held in buffer pStr,
** keeping the result nul-terminated.
**
** If nAppend is negative, zAppend must be a nul-terminated string and
** its length is measured with strlen().
**
** Returns SQLITE_OK on success. If the buffer cannot be enlarged,
** SQLITE_NOMEM is returned and *pStr is left as it was.
*/
static int fts3StringAppend(
  StrBuffer *pStr,                /* Buffer to append to */
  const char *zAppend,            /* Pointer to data to append to buffer */
  int nAppend                     /* Size of zAppend in bytes (or -1) */
){
  int nNeed;                      /* Bytes needed, including nul-terminator */

  if( nAppend<0 ){
    nAppend = (int)strlen(zAppend);
  }

  /* Grow the allocation if it is not strictly larger than what is needed
  ** to accommodate the existing text, the new text and the terminator. */
  nNeed = pStr->n + nAppend + 1;
  if( nNeed>=pStr->nAlloc ){
    int nNew = pStr->nAlloc + nAppend + 100;
    char *zBuf = sqlite3_realloc(pStr->z, nNew);
    if( zBuf==0 ){
      return SQLITE_NOMEM;
    }
    pStr->z = zBuf;
    pStr->nAlloc = nNew;
  }

  /* Copy the new data into place and terminate the string. */
  memcpy(pStr->z + pStr->n, zAppend, nAppend);
  pStr->n += nAppend;
  pStr->z[pStr->n] = '\0';

  return SQLITE_OK;
}
/*
** fts3BestSnippet() often selects snippets whose final token is a query
** term. For example, if 'X' is a highlighted term and '.' is a
** non-highlighted term, it may select:
**
**     ........X.....X
**
** This function moves the start of the snippet forward in the document so
** that roughly as many non-highlighted terms follow the last highlighted
** term as precede the first one. For example:
**
**     ....X.....X....
**
** The adjustment is made while extracting the snippet text rather than
** while selecting the snippet, because selection works from doclists only
** and so cannot tell whether the document has any tokens after the final
** highlighted term.
**
** zDoc/nDoc is the document text beginning at the first token of the
** unshifted snippet. If a shift is made, *piPos is increased and *pHlmask
** is shifted right by the same number of tokens. Returns SQLITE_OK, or an
** error code from the tokenizer.
*/
static int fts3SnippetShift(
  Fts3Table *pTab,                /* FTS3 table snippet comes from */
  int nSnippet,                   /* Number of tokens desired for snippet */
  const char *zDoc,               /* Document text to extract snippet from */
  int nDoc,                       /* Size of buffer zDoc in bytes */
  int *piPos,                     /* IN/OUT: First token of snippet */
  u64 *pHlmask                    /* IN/OUT: Mask of tokens to highlight */
){
  const u64 mHl = *pHlmask;       /* Local copy of initial highlight-mask */
  int nLeft = 0;                  /* Tokens to the left of first highlight */
  int nRight = 0;                 /* Tokens to the right of last highlight */
  int nDesired;                   /* Ideal number of tokens to shift forward */
  int nShift;                     /* Number of tokens to shift snippet by */
  int iToken = 0;                 /* Token counter */
  int rc;                         /* Return Code */
  sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
  sqlite3_tokenizer_cursor *pC;   /* Tokenizer cursor open on zDoc/nDoc */

  /* With nothing highlighted there is nothing to centre. */
  if( mHl==0 ) return SQLITE_OK;

  while( (mHl & ((u64)1 << nLeft))==0 ) nLeft++;
  while( (mHl & ((u64)1 << (nSnippet-1-nRight)))==0 ) nRight++;
  nDesired = (nLeft-nRight)/2;
  if( nDesired<=0 ) return SQLITE_OK;

  /* Ideally the snippet would start nDesired tokens later. Tokenize
  ** zDoc/nDoc to find out whether (nSnippet+nDesired) tokens are available.
  ** If they are not, shift by however many are. */
  pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  pC->pTokenizer = pTab->pTokenizer;
  while( rc==SQLITE_OK && iToken<(nSnippet+nDesired) ){
    const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iToken);
  }
  pMod->xClose(pC);
  if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){
    return rc;
  }

  nShift = (rc==SQLITE_DONE)+iToken-nSnippet;
  assert( nShift<=nDesired );
  if( nShift>0 ){
    *piPos += nShift;
    *pHlmask = mHl >> nShift;
  }
  return SQLITE_OK;
}
/*
** Append the text of snippet fragment pFragment, taken from the current
** row of cursor pCsr, to string buffer pOut.
**
** The column text is tokenized from the start. Tokens before the fragment
** are skipped. Once the first token of the fragment is reached the fragment
** may be shifted forward by fts3SnippetShift(). Each token of the fragment
** is then copied out together with the text separating it from the
** previous token, with zOpen/zClose wrapped around highlighted tokens.
**
** If the column value is NULL nothing is appended and SQLITE_OK is
** returned. Otherwise returns SQLITE_OK or an error code.
*/
static int fts3SnippetText(
  Fts3Cursor *pCsr,               /* FTS3 Cursor */
  SnippetFragment *pFragment,     /* Snippet to extract */
  int iFragment,                  /* Fragment number */
  int isLast,                     /* True for final fragment in snippet */
  int nSnippet,                   /* Number of tokens in extracted snippet */
  const char *zOpen,              /* String inserted before highlighted term */
  const char *zClose,             /* String inserted after highlighted term */
  const char *zEllipsis,          /* String inserted between snippets */
  StrBuffer *pOut                 /* Write output here */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
  sqlite3_tokenizer_cursor *pC;   /* Tokenizer cursor open on zDoc/nDoc */
  const char *zDoc;               /* Document text to extract snippet from */
  int nDoc;                       /* Size of zDoc in bytes */
  int rc;                         /* Return code */
  int iToken = 0;                 /* Current token number of document */
  int iPrevEnd = 0;               /* Byte offset of end of previous token */
  int bShifted = 0;               /* True after snippet is shifted */
  int iFirst = pFragment->iPos;   /* First token of snippet */
  u64 mHl = pFragment->hlmask;    /* Highlight-mask for snippet */
  const int iQueryCol = pFragment->iCol+1;  /* Statement column to read */
  const char *ZDUMMY;             /* Dummy argument used with tokenizer */
  int DUMMY1;                     /* Dummy argument used with tokenizer */

  zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iQueryCol);
  if( zDoc==0 ){
    /* A NULL pointer for a value that is not SQL NULL means OOM. */
    int eType = sqlite3_column_type(pCsr->pStmt, iQueryCol);
    return (eType!=SQLITE_NULL) ? SQLITE_NOMEM : SQLITE_OK;
  }
  nDoc = sqlite3_column_bytes(pCsr->pStmt, iQueryCol);

  /* Open a token cursor on the document. */
  pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  pC->pTokenizer = pTab->pTokenizer;

  while( rc==SQLITE_OK ){
    int iBegin;                   /* Offset in zDoc of start of token */
    int iFin;                     /* Offset in zDoc of end of token */
    int bHighlight;               /* True for highlighted terms */

    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iToken);
    if( rc==SQLITE_DONE ){
      /* The document ran out before the snippet did. Append whatever
      ** follows the previous token, up to the end of the document. */
      rc = fts3StringAppend(pOut, &zDoc[iPrevEnd], -1);
      break;
    }
    if( rc!=SQLITE_OK ) break;
    if( iToken<iFirst ) continue;

    if( !bShifted ){
      int nRemain = nDoc - iBegin;
      rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], nRemain, &iFirst, &mHl);
      bShifted = 1;

      /* With the shift done, decide whether a leading ellipsis is wanted.
      ** It is if (a) this is not the first fragment, or (b) the fragment
      ** does not begin at position 0 of its column. */
      if( rc==SQLITE_OK && (iFirst>0 || iFragment>0) ){
        rc = fts3StringAppend(pOut, zEllipsis, -1);
      }

      /* On error the loop condition ends the loop. Otherwise skip tokens
      ** that the shift has pushed out of the snippet. */
      if( rc!=SQLITE_OK || iToken<iFirst ) continue;
    }

    if( iToken>=(iFirst+nSnippet) ){
      /* Past the end of the fragment. */
      if( isLast ){
        rc = fts3StringAppend(pOut, zEllipsis, -1);
      }
      break;
    }

    /* Set bHighlight to true if this term should be highlighted. */
    bHighlight = (mHl & ((u64)1 << (iToken-iFirst)))!=0;

    if( iToken>iFirst ){
      rc = fts3StringAppend(pOut, &zDoc[iPrevEnd], iBegin-iPrevEnd);
    }
    if( rc==SQLITE_OK && bHighlight ){
      rc = fts3StringAppend(pOut, zOpen, -1);
    }
    if( rc==SQLITE_OK ){
      rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
    }
    if( rc==SQLITE_OK && bHighlight ){
      rc = fts3StringAppend(pOut, zClose, -1);
    }
    iPrevEnd = iFin;
  }

  pMod->xClose(pC);
  return rc;
}
/*
** Count the entries in a column-list, which is a delta-encoded list of
** term offsets within a single column of a single row.
**
** On entry *ppCollist must point to the first byte of the first varint in
** the column-list. On return it points to the byte that terminated the
** list. That byte is either the 0x00 that ends the whole position-list, or
** the 0x01 that introduces the column number of the next column-list.
**
** The return value is the number of varints in the column-list.
*/
static int fts3ColumnlistCount(char **ppCollist){
  char *p = *ppCollist;           /* Next byte to examine */
  char cPrev = 0;                 /* 0x80 bit of the previous byte */
  int nVarint = 0;                /* Number of complete varints seen */

  /* A 0x00 or 0x01 byte ends the list, but only when it is not the final
  ** byte of a multi-byte varint. */
  for(;;){
    if( (0xFE & (*p | cPrev))==0 ) break;
    cPrev = *p & 0x80;
    p++;
    if( cPrev==0 ){
      nVarint++;
    }
  }

  *ppCollist = p;
  return nVarint;
}
/*
** Tally the hits recorded in a position-list into array aOut.
**
** On entry *pp points at the start of a position-list. The list is a
** series of column-lists ending in a 0x00 byte. Each column-list other
** than one for column 0 is introduced by 0x01 and a column-number varint.
** Column iCol owns the three slots starting at aOut[iCol*3]. For every
** column-list:
**
**   * the number of entries in it is added to aOut[iCol*3], and
**   * if isGlobal is true, aOut[iCol*3+1] is incremented by one.
**
** On return *pp points at the byte after the terminating 0x00. Column
** numbers are not range-checked here.
*/
static void fts3LoadColumnlistCounts(char **pp, u32 *aOut, int isGlobal){
  char *p;                        /* Read cursor within the position-list */

  for(p=*pp; *p!=0; ){
    sqlite3_int64 iCol = 0;       /* Column of this column-list */
    u32 *aCol;                    /* Slots belonging to column iCol */
    int nHit;                     /* Entries in this column-list */

    if( *p==0x01 ){
      p++;
      p += sqlite3Fts3GetVarint(p, &iCol);
    }
    nHit = fts3ColumnlistCount(&p);
    assert( nHit>0 );

    aCol = &aOut[iCol*3];
    if( isGlobal ){
      aCol[1]++;
    }
    aCol[0] += nHit;
  }

  *pp = &p[1];
}
/*
** fts3ExprIterate() callback used to collect the "global" matchinfo stats
** for a single query. The "global" stats are those elements of the
** matchinfo array that are constant for all rows returned by the current
** query. The context is a MatchInfo object.
**
** For phrase iPhrase, the second and third value of each per-column triple
** are filled in by walking the entire doclist of the phrase.
**
** If the cursor has deferred tokens, the doclist used is one obtained from
** sqlite3Fts3ExprLoadFtDoclist(), provided at least one token of the
** phrase has its bFulltext flag set. Otherwise both values are set, for
** every column, to the first value of the docsize section of the array.
** NOTE(review): that value is presumably the total number of documents.
*/
static int fts3ExprGlobalMatchinfoCb(
  Fts3Expr *pExpr,                /* Phrase expression node */
  int iPhrase,                    /* Phrase number (numbered from zero) */
  void *pCtx                      /* Pointer to MatchInfo structure */
){
  MatchInfo *pInfo = (MatchInfo *)pCtx;
  Fts3Cursor *pCsr = pInfo->pCursor;
  const int iStart = 2 + (iPhrase * pInfo->nCol * 3) + 1;
  u32 *aRow = &pInfo->aMatchinfo[iStart];  /* Global slots for this phrase */
  char *pList;                    /* Read cursor within the doclist */
  char *pListEnd;                 /* First byte past the doclist */
  char *aFree = 0;                /* Doclist buffer to free, if any */

  assert( pExpr->isLoaded );
  assert( pExpr->eType==FTSQUERY_PHRASE );

  if( pCsr->pDeferred==0 ){
    pList = pExpr->aDoclist;
    pListEnd = &pExpr->aDoclist[pExpr->nDoclist];
  }else{
    Fts3Phrase *pPhrase = pExpr->pPhrase;
    int ii;

    /* Look for a token with the bFulltext flag set. */
    for(ii=0; ii<pPhrase->nToken; ii++){
      if( pPhrase->aToken[ii].bFulltext ) break;
    }

    if( ii>=pPhrase->nToken ){
      /* No such token. Use the leading docsize value for every column. */
      int nDoc = pInfo->aMatchinfo[2 + 3*pInfo->nCol*pInfo->aMatchinfo[0]];
      for(ii=0; ii<pInfo->nCol; ii++){
        aRow[ii*3] = nDoc;
        aRow[ii*3 + 1] = nDoc;
      }
      return SQLITE_OK;
    }else{
      int nFree = 0;
      int rc = sqlite3Fts3ExprLoadFtDoclist(pCsr, pExpr, &aFree, &nFree);
      if( rc!=SQLITE_OK ) return rc;
      pList = aFree;
      pListEnd = &aFree[nFree];
    }
  }

  /* Fill in the global hit count matrix row for this phrase. */
  while( pList<pListEnd ){
    while( *pList++ & 0x80 );     /* Skip past docid. */
    fts3LoadColumnlistCounts(&pList, aRow, 1);
  }

  sqlite3_free(aFree);
  return SQLITE_OK;
}
/*
** fts3ExprIterate() callback used to collect the "local" matchinfo stats
** for a single query. The "local" stats are those elements of the
** matchinfo array that differ from row to row. The context is a MatchInfo
** object.
**
** For phrase iPhrase, the first value of each per-column triple is reset
** to zero and then set to the number of hits for that column in the
** current row. Phrases with no doclist are skipped. Always returns
** SQLITE_OK.
*/
static int fts3ExprLocalMatchinfoCb(
  Fts3Expr *pExpr,                /* Phrase expression node */
  int iPhrase,                    /* Phrase number */
  void *pCtx                      /* Pointer to MatchInfo structure */
){
  MatchInfo *pInfo = (MatchInfo *)pCtx;
  u32 *aRow;                      /* Slots belonging to this phrase */
  char *pList;                    /* Position-list for the current row */
  int iCol;                       /* Used to iterate through columns */

  if( pExpr->aDoclist==0 ){
    return SQLITE_OK;
  }

  aRow = &pInfo->aMatchinfo[2 + (iPhrase * pInfo->nCol * 3)];
  for(iCol=0; iCol<pInfo->nCol; iCol++){
    aRow[iCol*3] = 0;
  }

  pList = sqlite3Fts3FindPositions(pExpr, pInfo->pCursor->iPrevId, -1);
  if( pList ){
    fts3LoadColumnlistCounts(&pList, aRow, 0);
  }
  return SQLITE_OK;
}
/*
** Populate pCsr->aMatchinfo[] with data for the current row. The
** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32):
**
**   aMatchinfo[0]      Number of matchable phrases in the query
**   aMatchinfo[1]      Number of columns in the table
**   aMatchinfo[2..]    Three values for each (phrase, column) pair
**   (then)             1 + 2*nCol docsize values, if the table has them
**
** The array is allocated, and its row-independent ("global") part filled
** in, on the first call for a query. The row-dependent ("local") part is
** refreshed whenever pCsr->isMatchinfoNeeded is set.
**
** Returns SQLITE_OK on success, or an error code. If the global part
** cannot be built, the partly-filled array is freed rather than cached, so
** that a later call starts afresh. If the local part fails,
** isMatchinfoNeeded stays set so that the row is not treated as up to
** date.
*/
static int fts3GetMatchinfo(Fts3Cursor *pCsr){
  MatchInfo sInfo;
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int rc = SQLITE_OK;

  sInfo.pCursor = pCsr;
  sInfo.nCol = pTab->nColumn;

  if( pCsr->aMatchinfo==0 ){
    /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
    ** matchinfo function has been called for this query. In this case
    ** allocate the array used to accumulate the matchinfo data and
    ** initialize those elements that are constant for every row.
    */
    int nPhrase;                  /* Number of phrases */
    int nMatchinfo;               /* Number of u32 elements in match-info */

    /* Load doclists for each phrase in the query. */
    rc = fts3ExprLoadDoclists(pCsr, &nPhrase, 0);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    nMatchinfo = 2 + 3*sInfo.nCol*nPhrase;
    if( pTab->bHasDocsize ){
      nMatchinfo += 1 + 2*pTab->nColumn;
    }

    sInfo.aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo);
    if( !sInfo.aMatchinfo ){
      return SQLITE_NOMEM;
    }
    memset(sInfo.aMatchinfo, 0, sizeof(u32)*nMatchinfo);

    /* First element of match-info is the number of phrases in the query */
    sInfo.aMatchinfo[0] = nPhrase;
    sInfo.aMatchinfo[1] = sInfo.nCol;
    if( pTab->bHasDocsize ){
      int ofst = 2 + 3*sInfo.aMatchinfo[0]*sInfo.aMatchinfo[1];
      rc = sqlite3Fts3MatchinfoDocsizeGlobal(pCsr, &sInfo.aMatchinfo[ofst]);
    }

    /* The global callback may need to load a doclist and so can fail. Its
    ** return code must not be discarded. */
    if( rc==SQLITE_OK ){
      rc = fts3ExprIterate(
          pCsr->pExpr, fts3ExprGlobalMatchinfoCb, (void*)&sInfo
      );
    }
    if( rc!=SQLITE_OK ){
      sqlite3_free(sInfo.aMatchinfo);
      return rc;
    }

    pCsr->aMatchinfo = sInfo.aMatchinfo;
    pCsr->isMatchinfoNeeded = 1;
  }

  sInfo.aMatchinfo = pCsr->aMatchinfo;
  if( pCsr->isMatchinfoNeeded ){
    /* The local callback always returns SQLITE_OK. */
    (void)fts3ExprIterate(pCsr->pExpr, fts3ExprLocalMatchinfoCb, (void*)&sInfo);
    if( pTab->bHasDocsize ){
      int ofst = 2 + 3*sInfo.aMatchinfo[0]*sInfo.aMatchinfo[1];
      rc = sqlite3Fts3MatchinfoDocsizeLocal(pCsr, &sInfo.aMatchinfo[ofst]);
    }
    if( rc==SQLITE_OK ){
      pCsr->isMatchinfoNeeded = 0;
    }
  }

  return rc;
}
/*
** Implementation of snippet() function.
**
** The result is set on pCtx: the snippet text on success, or an error
** code on failure. If the cursor has no query expression, or if nToken is
** zero, the result is an empty string.
**
** nToken is limited to the range -64..+64. Highlight masks are 64 bits
** wide, so a fragment cannot span more than 64 tokens. A positive nToken
** is the approximate total size of the snippet and is divided among the
** fragments. A negative nToken requests fragments of -nToken tokens each.
*/
void sqlite3Fts3Snippet(
  sqlite3_context *pCtx,          /* SQLite function call context */
  Fts3Cursor *pCsr,               /* Cursor object */
  const char *zStart,             /* Snippet start text - "<b>" */
  const char *zEnd,               /* Snippet end text - "</b>" */
  const char *zEllipsis,          /* Snippet ellipsis text - "<b>...</b>" */
  int iCol,                       /* Extract snippet from this column */
  int nToken                      /* Approximate number of tokens in snippet */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int rc = SQLITE_OK;
  int i;
  StrBuffer res = {0, 0, 0};

  /* The returned text includes up to four fragments of text extracted from
  ** the data in the current row. The first iteration of the for(...) loop
  ** below attempts to locate a single fragment of text nToken tokens in
  ** size that contains at least one instance of all phrases in the query
  ** expression that appear in the current row. If such a fragment of text
  ** cannot be found, the second iteration of the loop attempts to locate
  ** a pair of fragments, and so on.
  */
  int nSnippet = 0;               /* Number of fragments in this snippet */
  SnippetFragment aSnippet[4];    /* Maximum of 4 fragments per snippet */
  int nFToken = -1;               /* Number of tokens in each fragment */

  /* A zero-token snippet is empty by definition. Returning here also keeps
  ** the fragment size strictly positive below. */
  if( !pCsr->pExpr || nToken==0 ){
    sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
    return;
  }

  /* Limit the snippet length to 64 tokens so that every token offset
  ** within a fragment fits in a u64 mask. This also makes the negation of
  ** nToken below safe. */
  if( nToken<-64 ) nToken = -64;
  if( nToken>64 ) nToken = 64;

  for(nSnippet=1; 1; nSnippet++){
    int iSnip;                    /* Loop counter 0..nSnippet-1 */
    u64 mCovered = 0;             /* Bitmask of phrases covered by snippet */
    u64 mSeen = 0;                /* Bitmask of phrases seen by BestSnippet() */

    if( nToken>=0 ){
      nFToken = (nToken+nSnippet-1) / nSnippet;
    }else{
      nFToken = -1 * nToken;
    }

    for(iSnip=0; iSnip<nSnippet; iSnip++){
      int iBestScore = -1;        /* Best score of columns checked so far */
      int iRead;                  /* Used to iterate through columns */
      SnippetFragment *pFragment = &aSnippet[iSnip];

      memset(pFragment, 0, sizeof(*pFragment));

      /* Loop through all columns of the table being considered for snippets.
      ** If the iCol argument to this function was negative, this means all
      ** columns of the FTS3 table. Otherwise, only column iCol is considered.
      */
      for(iRead=0; iRead<pTab->nColumn; iRead++){
        SnippetFragment sF;
        int iS;
        if( iCol>=0 && iRead!=iCol ) continue;

        /* Find the best snippet of nFToken tokens in column iRead. */
        rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
        if( rc!=SQLITE_OK ){
          goto snippet_out;
        }
        if( iS>iBestScore ){
          *pFragment = sF;
          iBestScore = iS;
        }
      }

      mCovered |= pFragment->covered;
    }

    /* If all query phrases seen by fts3BestSnippet() are present in at least
    ** one of the nSnippet snippet fragments, break out of the loop.
    */
    assert( (mCovered&mSeen)==mCovered );
    if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
  }

  assert( nFToken>0 );

  for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
    rc = fts3SnippetText(pCsr, &aSnippet[i],
        i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
    );
  }

 snippet_out:
  sqlite3Fts3SegmentsClose(pTab);
  if( rc!=SQLITE_OK ){
    sqlite3_result_error_code(pCtx, rc);
    sqlite3_free(res.z);
  }else{
    /* Ownership of res.z passes to SQLite, which frees it. */
    sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
  }
}
/*
** Types used by the implementation of the offsets() function. There is one
** TermOffset for each token of each matchable phrase in the query. A
** TermOffsetCtx is the fts3ExprIterate() context that
** fts3ExprTermOffsetInit() uses to fill in the aTerm[] array for one
** column.
*/
typedef struct TermOffset TermOffset;
typedef struct TermOffsetCtx TermOffsetCtx;
struct TermOffset {
char *pList; /* Position-list */
int iPos; /* Position just read from pList */
int iOff; /* Offset of this term from read positions */
};
struct TermOffsetCtx {
int iCol; /* Column of table to populate aTerm for */
int iTerm; /* Index in aTerm[] of the next entry to fill in */
sqlite3_int64 iDocid; /* Docid passed to sqlite3Fts3FindPositions() */
TermOffset *aTerm; /* Array of term iterators being populated */
};
/*
** fts3ExprIterate() callback used by sqlite3Fts3Offsets(). The context is
** a TermOffsetCtx object.
**
** One TermOffset entry is appended to the context's aTerm[] array for each
** token of the phrase. Every entry shares the phrase's position list for
** the context's docid and column, with the first position already decoded.
** If the phrase has no such list, the entries get a NULL list and position
** 0. The iOff field runs from (number of tokens - 1) for the first token
** of the phrase down to 0 for the last. Always returns SQLITE_OK.
*/
static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
  TermOffsetCtx *pOffCtx = (TermOffsetCtx *)ctx;
  char *pList;                    /* Pointer to position list for phrase */
  int iPos = 0;                   /* First position in position-list */
  int nTerm;                      /* Number of tokens in phrase */
  int iOff;                       /* Counts down from nTerm-1 to 0 */
  UNUSED_PARAMETER(iPhrase);

  pList = sqlite3Fts3FindPositions(pExpr, pOffCtx->iDocid, pOffCtx->iCol);
  nTerm = pExpr->pPhrase->nToken;
  if( pList ){
    fts3GetDeltaPosition(&pList, &iPos);
    assert( iPos>=0 );
  }

  for(iOff=nTerm-1; iOff>=0; iOff--){
    TermOffset *pT = &pOffCtx->aTerm[pOffCtx->iTerm++];
    pT->iOff = iOff;
    pT->pList = pList;
    pT->iPos = iPos;
  }

  return SQLITE_OK;
}
/*
** Implementation of offsets() function.
**
** The result of SQL function context pCtx is set to a text value made up
** of space-separated integers. Four integers are appended for each term
** instance located:
**
**   * the column number,
**   * the index of the term within the TermOffset array (terms are numbered
**     in the order fts3ExprTermOffsetInit() visits them),
**   * the start offset of the token as reported by the tokenizer, and
**   * the size of the token (end offset minus start offset).
**
** If the cursor has no expression, the result is an empty string. If an
** error occurs, the result is set to the error code using
** sqlite3_result_error_code() and any partially built string is freed.
*/
void sqlite3Fts3Offsets(
  sqlite3_context *pCtx,          /* SQLite function call context */
  Fts3Cursor *pCsr                /* Cursor object */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
  const char *ZDUMMY;             /* Dummy argument used with xNext() */
  int NDUMMY;                     /* Dummy argument used with xNext() */
  int rc;                         /* Return Code */
  int nToken;                     /* Number of tokens in query */
  int iCol;                       /* Column currently being processed */
  StrBuffer res = {0, 0, 0};      /* Result string */
  TermOffsetCtx sCtx;             /* Context for fts3ExprTermOffsetInit() */

  if( !pCsr->pExpr ){
    sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
    return;
  }

  memset(&sCtx, 0, sizeof(sCtx));
  assert( pCsr->isRequireSeek==0 );

  /* Count the number of terms in the query */
  rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
  if( rc!=SQLITE_OK ) goto offsets_out;

  /* Allocate the array of TermOffset iterators. */
  sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
  if( 0==sCtx.aTerm ){
    rc = SQLITE_NOMEM;
    goto offsets_out;
  }
  sCtx.iDocid = pCsr->iPrevId;

  /* Loop through the table columns, appending offset information to
  ** string-buffer res for each column.
  */
  for(iCol=0; iCol<pTab->nColumn; iCol++){
    sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
    int iStart;                   /* Start offset of current token */
    int iEnd;                     /* End offset of current token */
    int iCurrent;                 /* Position of current token */
    const char *zDoc;             /* Text of column iCol */
    int nDoc;                     /* Size of zDoc in bytes */

    /* Initialize the contents of sCtx.aTerm[] for column iCol. There is
    ** no way that this operation can fail, so the return code from
    ** fts3ExprIterate() can be discarded.
    */
    sCtx.iCol = iCol;
    sCtx.iTerm = 0;
    (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);

    /* Retrieve the text stored in column iCol. If an SQL NULL is stored
    ** in column iCol, jump immediately to the next iteration of the loop.
    ** If an OOM occurs while retrieving the data (this can happen if SQLite
    ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM
    ** to the caller.
    */
    zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
    if( zDoc==0 ){
      if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
        continue;
      }
      rc = SQLITE_NOMEM;
      goto offsets_out;
    }

    /* Initialize a tokenizer iterator to iterate through column iCol. */
    rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
    if( rc!=SQLITE_OK ) goto offsets_out;
    pC->pTokenizer = pTab->pTokenizer;

    rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
    while( rc==SQLITE_OK ){
      int i;                      /* Used to loop through terms */
      int iMinPos = 0x7FFFFFFF;   /* Position of next token */
      TermOffset *pTerm = 0;      /* TermOffset associated with next token */

      /* Find the term whose next instance has the smallest token position
      ** (position-list value less the term's offset within its phrase). */
      for(i=0; i<nToken; i++){
        TermOffset *pT = &sCtx.aTerm[i];
        if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
          iMinPos = pT->iPos-pT->iOff;
          pTerm = pT;
        }
      }

      if( !pTerm ){
        /* All offsets for this column have been gathered. */
        break;
      }else{
        assert( iCurrent<=iMinPos );

        /* Advance the selected term to its next position. A next byte of
        ** 0x00 or 0x01 is treated as the end of the list for this column,
        ** in which case the term is retired by clearing its list pointer. */
        if( 0==(0xFE&*pTerm->pList) ){
          pTerm->pList = 0;
        }else{
          fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
        }

        /* Advance the tokenizer until it reaches token position iMinPos. */
        while( rc==SQLITE_OK && iCurrent<iMinPos ){
          rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
        }
        if( rc==SQLITE_OK ){
          char aBuffer[64];
          /* The pointer difference has type ptrdiff_t, which may be wider
          ** than int. It is cast explicitly so that the variadic argument
          ** matches the "%d" conversion. The value is always in the range
          ** 0..nToken-1, so the cast is lossless. */
          sqlite3_snprintf(sizeof(aBuffer), aBuffer,
              "%d %d %d %d ",
              iCol, (int)(pTerm-sCtx.aTerm), iStart, iEnd-iStart
          );
          rc = fts3StringAppend(&res, aBuffer, -1);
        }else if( rc==SQLITE_DONE ){
          /* The tokenizer ran out of tokens before reaching a position
          ** present in the position list. */
          rc = SQLITE_CORRUPT;
        }
      }
    }
    if( rc==SQLITE_DONE ){
      rc = SQLITE_OK;
    }

    pMod->xClose(pC);
    if( rc!=SQLITE_OK ) goto offsets_out;
  }

 offsets_out:
  sqlite3_free(sCtx.aTerm);
  assert( rc!=SQLITE_DONE );
  sqlite3Fts3SegmentsClose(pTab);
  if( rc!=SQLITE_OK ){
    sqlite3_result_error_code(pCtx,  rc);
    sqlite3_free(res.z);
  }else{
    /* Each group of integers appended above ends with a space; passing
    ** (res.n-1) presumably excludes the final trailing space from the
    ** result - this relies on StrBuffer.n being the number of bytes
    ** appended so far. */
    sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
  }
  return;
}
/*
** Implementation of matchinfo() function.
**
** If the cursor has no expression, the result is a zero-length blob.
** Otherwise fts3GetMatchinfo() is invoked to populate pCsr->aMatchinfo[]
** and a copy of that array is returned as a blob. The blob contains
** (2 + aMatchinfo[0]*aMatchinfo[1]*3) 32-bit values, plus a further
** (1 + 2*nColumn) 32-bit values if the table has the bHasDocsize flag set.
**
** If fts3GetMatchinfo() fails, the result is set to its error code.
*/
void sqlite3Fts3Matchinfo(sqlite3_context *pContext, Fts3Cursor *pCsr){
  Fts3Table *pVtab;               /* Table that pCsr belongs to */
  int nByte;                      /* Size of the result blob in bytes */
  int rc;                         /* Return code */

  if( pCsr->pExpr==0 ){
    sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
    return;
  }

  rc = fts3GetMatchinfo(pCsr);
  pVtab = (Fts3Table *)pCsr->base.pVtab;
  sqlite3Fts3SegmentsClose(pVtab);
  if( rc!=SQLITE_OK ){
    sqlite3_result_error_code(pContext, rc);
    return;
  }

  nByte = sizeof(u32)*(2+pCsr->aMatchinfo[0]*pCsr->aMatchinfo[1]*3);
  if( pVtab->bHasDocsize ){
    nByte += sizeof(u32)*(1 + 2*pVtab->nColumn);
  }
  sqlite3_result_blob(pContext, pCsr->aMatchinfo, nByte, SQLITE_TRANSIENT);
}
#endif