Implementation of the snippet() function for FTS1. Includes a few
simple test cases, but more testing is needed. (CVS 3431)

FossilOrigin-Name: c7ee60d00976efab25a830e7416538010c734129
This commit is contained in:
drh 2006-09-21 02:03:08 +00:00
parent d47522807e
commit 8b62817797
4 changed files with 224 additions and 32 deletions

View File

@ -53,8 +53,7 @@ void initStringBuffer(StringBuffer *sb){
sb->s[0] = '\0';
}
void append(StringBuffer *sb, const char *zFrom){
int nFrom = strlen(zFrom);
void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
if( sb->len + nFrom >= sb->alloced ){
sb->alloced = sb->len + nFrom + 100;
sb->s = realloc(sb->s, sb->alloced+1);
@ -63,8 +62,12 @@ void append(StringBuffer *sb, const char *zFrom){
return;
}
}
strcpy(sb->s + sb->len, zFrom);
memcpy(sb->s + sb->len, zFrom, nFrom);
sb->len += nFrom;
sb->s[sb->len] = 0;
}
void append(StringBuffer *sb, const char *zFrom){
  /* zFrom is nul-terminated: measure it, then let nappend() do the
  ** bounded copy and re-terminate the buffer. */
  int nFrom = strlen(zFrom);
  nappend(sb, zFrom, nFrom);
}
/* We encode variable-length integers in little-endian order using seven bits
@ -909,16 +912,16 @@ typedef struct Snippet {
int nMatch; /* Total number of matches */
int nAlloc; /* Space allocated for aMatch[] */
struct snippetMatch { /* One entry for each matching term */
char exemplar; /* True if this match should be shown in the snippet */
char snStatus; /* Status flag for use while constructing snippets */
short int iCol; /* The column that contains the match */
short int iTerm; /* The index in Query.pTerms[] of the matching term */
short int nByte; /* Number of bytes in the term */
short int nContext; /* Number of bytes of context for this match */
int iStart; /* The offset to the first character of the term */
int iContext; /* Start of the context */
} *aMatch; /* Points to space obtained from malloc */
char *zOffset; /* Text rendering of aMatch[] */
int nOffset; /* strlen(zOffset) */
char *zSnippet; /* Snippet text */
int nSnippet; /* strlen(zSnippet) */
} Snippet;
@ -2002,6 +2005,7 @@ static void queryClear(Query *q){
static void snippetClear(Snippet *p){
  /* Release the three heap buffers hanging off the Snippet (the order
  ** does not matter), then zero the whole structure so that every
  ** pointer and counter in it reads as 0 afterwards. */
  free(p->zSnippet);
  free(p->zOffset);
  free(p->aMatch);
  memset(p, 0, sizeof *p);
}
/*
@ -2025,7 +2029,6 @@ static void snippetAppendMatch(
}
i = p->nMatch++;
pMatch = &p->aMatch[i];
pMatch->exemplar = 0;
pMatch->iCol = iCol;
pMatch->iTerm = iTerm;
pMatch->iStart = iStart;
@ -2166,28 +2169,162 @@ static void snippetOffsetText(Snippet *p){
}
/*
** Scan all matches in Snippet and mark the exemplars. Exemplars are
** matches that we definitely want to include in the snippet.
** zDoc[0..nDoc-1] is a phrase of text. aMatch[0..nMatch-1] is a set
** of matching words, some of which might be in zDoc. zDoc is column
** number iCol.
**
** Generally speaking, each keyword in the search phrase will have
** a single exemplar. When a keyword matches at multiple points
** within the document, the trick is figuring which of these matches
** should be the exemplar.
** iBreak is a suggested spot in zDoc where we could begin or end an
** excerpt. Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static void snippetFindExemplars(Snippet *p, Query *pQ){
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;

  /* A break within 10 bytes of either end of the document snaps to
  ** that end.  These two guards also guarantee 10 < iBreak < nDoc-10
  ** below, so zDoc[iBreak-10] .. zDoc[iBreak+10] are all in bounds. */
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }

  /* Skip matches that belong to earlier columns, then matches that end
  ** before the suggested break.
  **
  ** NOTE(review): the second loop and the aMatch[i-1] test below do not
  ** check that the entry is still in column iCol, so an offset taken
  ** from a neighbouring column's match could be returned - confirm
  ** whether callers rely on this before tightening it. */
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    /* Prefer to break exactly at the start of a nearby match so that a
    ** matching word is never cut in half. */
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }

  /* Otherwise search outward, one byte at a time and up to 10 bytes in
  ** each direction, for whitespace and break just after it.
  **
  ** The bytes are cast to unsigned char before the isspace() call:
  ** document text may contain bytes >= 0x80 (e.g. UTF-8), which are
  ** negative when plain char is signed, and passing a negative value
  ** other than EOF to isspace() is undefined behaviour. */
  for(i=1; i<=10; i++){
    if( isspace((unsigned char)zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( isspace((unsigned char)zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }

  /* No better spot found: use the suggestion unchanged. */
  return iBreak;
}
/*
** Allowed values for Snippet.aMatch[].snStatus
**
** snippetText() first resets every match to SNIPPET_IGNORE and then
** flags the first match of each query term as SNIPPET_DESIRED.  While
** an excerpt is being written, later SNIPPET_DESIRED matches of a term
** that has just been emitted are switched back to SNIPPET_IGNORE.
*/
#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
/*
** Generate the text of a snippet.
*/
static void snippetText(
fulltext_cursor *pCursor, /* The cursor we need the snippet for */
const char *zStartMark, /* Markup to appear before each match */
const char *zEndMark, /* Markup to appear after each match */
const char *zEllipsis /* Ellipsis mark */
){
/*
** The finished text is stored in pCursor->snippet.zSnippet and its
** length in pCursor->snippet.nSnippet; any snippet text already held
** there is freed first.  Nothing is returned.
*/
int i, j;
/* NOTE(review): the next four lines use pQ, p and the exemplar field,
** none of which are declared in this function.  They look like leftovers
** of the older snippetFindExemplars(Snippet*,Query*) routine - confirm
** before relying on them. */
for(i=0; i<pQ->nTerms; i++){
for(j=0; j<p->nMatch; j++){
if( p->aMatch[j].iTerm==i ){
p->aMatch[j].exemplar = 1;
struct snippetMatch *aMatch; /* Shorthand for pCursor->snippet.aMatch */
int nMatch; /* Shorthand for pCursor->snippet.nMatch */
int nDesired; /* Number of SNIPPET_DESIRED matches still to be handled */
StringBuffer sb; /* The snippet text is accumulated here */
int tailCol = -1; /* Column of the previous excerpt, -1 if none yet */
int tailOffset = -1; /* Offset in that column where the previous excerpt ended */
int iCol; /* Column of the match currently being excerpted */
int nDoc; /* Number of bytes in zDoc */
const char *zDoc; /* Text of column iCol for the current row */
int iStart, iEnd; /* The excerpt covers zDoc[iStart..iEnd-1] */
int wantEllipsis; /* True if an ellipsis should precede this excerpt */
int tailEllipsis = 0; /* True if the previous excerpt stopped short of its column end */
int iMatch; /* Index of the next aMatch[] entry to look at while copying */
/* Throw away any snippet text left from an earlier call. */
free(pCursor->snippet.zSnippet);
pCursor->snippet.zSnippet = 0;
aMatch = pCursor->snippet.aMatch;
nMatch = pCursor->snippet.nMatch;
initStringBuffer(&sb);
/* Start with every match marked as optional. */
for(i=0; i<nMatch; i++){
aMatch[i].snStatus = SNIPPET_IGNORE;
}
/* For each query term, flag its first match as one we want to show
** and count how many such matches there are. */
nDesired = 0;
for(i=0; i<pCursor->q.nTerms; i++){
for(j=0; j<nMatch; j++){
if( aMatch[j].iTerm==i ){
aMatch[j].snStatus = SNIPPET_DESIRED;
nDesired++;
break;
}
}
}
}
/* NOTE(review): the line below opens a second snippetText() with the
** older (Snippet*,Query*) signature and is never closed; it appears to
** be a leftover of the previous implementation - confirm. */
static void snippetText(Snippet *p, Query *pQ){
/* Emit one excerpt per desired match, in aMatch[] order, until none
** are left. */
iMatch = 0;
for(i=0; i<nMatch && nDesired>0; i++){
if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
nDesired--;
iCol = aMatch[i].iCol;
/* NOTE(review): the statement column index is iCol+1; presumably
** column 0 of pStmt is not a content column - confirm.  zDoc is used
** without a NULL check although sqlite3_column_text() can return
** NULL - verify that cannot happen here. */
zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
/* Aim to begin 40 bytes before the match, nudged to a better break. */
iStart = aMatch[i].iStart - 40;
iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
if( iStart<=10 ){
/* Close enough to the beginning: start at 0, no leading ellipsis. */
iStart = 0;
wantEllipsis = 0;
}else{
wantEllipsis = 1;
}
/* If this excerpt is in the same column as the previous one and would
** begin no more than 20 bytes past where that one ended, continue from
** the previous end instead so the two read as one, with no ellipsis. */
if( iCol==tailCol && iStart<=tailOffset+20 ){
iStart = tailOffset;
wantEllipsis = 0;
tailEllipsis = 0;
}
if( wantEllipsis || tailEllipsis ){
append(&sb, zEllipsis);
}
/* Aim to stop 40 bytes after the match, nudged to a better break. */
iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
if( iEnd>=nDoc-10 ){
/* Close enough to the end: run to the end, no trailing ellipsis. */
iEnd = nDoc;
tailEllipsis = 0;
}else{
tailEllipsis = 1;
}
/* Skip matches that belong to earlier columns. */
while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
/* Copy zDoc[iStart..iEnd-1] into sb, wrapping each match that starts
** inside that range in zStartMark / zEndMark. */
while( iStart<iEnd ){
while( iMatch<nMatch && aMatch[iMatch].iStart<iStart ){ iMatch++; }
if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd ){
/* Plain text up to the match, then the marked-up match itself. */
nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
iStart = aMatch[iMatch].iStart;
append(&sb, zStartMark);
nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
append(&sb, zEndMark);
iStart += aMatch[iMatch].nByte;
/* This term has now been shown, so any later match of the same term
** that is still flagged as desired no longer needs its own excerpt. */
for(j=iMatch+1; j<nMatch; j++){
if( aMatch[j].iTerm==aMatch[iMatch].iTerm
&& aMatch[j].snStatus==SNIPPET_DESIRED ){
nDesired--;
aMatch[j].snStatus = SNIPPET_IGNORE;
}
}
}else{
/* No further match starts before iEnd: copy the rest as plain text. */
nappend(&sb, &zDoc[iStart], iEnd - iStart);
iStart = iEnd;
}
}
/* Remember where this excerpt ended for the merge test above. */
tailCol = iCol;
tailOffset = iEnd;
}
/* The last excerpt stopped short of the end of its column. */
if( tailEllipsis ){
append(&sb, zEllipsis);
}
/* Hand the accumulated buffer over to the cursor. */
pCursor->snippet.zSnippet = sb.s;
pCursor->snippet.nSnippet = sb.len;
}
@ -2847,8 +2984,23 @@ static void snippetFunc(
sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
}else{
const char *zStart = "<b>";
const char *zEnd = "</b>";
const char *zEllipsis = "<b>...</b>";
memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
/* TODO: Return the snippet */
if( argc>=2 ){
zStart = (const char*)sqlite3_value_text(argv[1]);
if( argc>=3 ){
zEnd = (const char*)sqlite3_value_text(argv[2]);
if( argc>=4 ){
zEllipsis = (const char*)sqlite3_value_text(argv[3]);
}
}
}
snippetAllOffsets(pCursor);
snippetText(pCursor, zStart, zEnd, zEllipsis);
sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
pCursor->snippet.nSnippet, SQLITE_STATIC);
}
}

View File

@ -1,5 +1,5 @@
C Fixed\sa\sbuild\sproblem\sin\ssqlite3_extension_init().\s(CVS\s3430)
D 2006-09-18T21:14:40
C Implementation\sof\sthe\ssnippet()\sfunction\sfor\sFTS1.\s\sIncludes\sa\sfew\nsimple\stest\scases\sbut\smore\stesting\sis\sneeded.\s(CVS\s3431)
D 2006-09-21T02:03:09
F Makefile.in cabd42d34340f49260bc2a7668c38eba8d4cfd99
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -21,7 +21,7 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1
F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5
F ext/fts1/fts1.c 50770451c8d3c693f7819dad33d397246f44ea90
F ext/fts1/fts1.c 02c5b614ff8055b374b88acaf5cae3a834da3150
F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6
F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114
F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089
@ -192,7 +192,7 @@ F test/fkey1.test 153004438d51e6769fb1ce165f6313972d6263ce
F test/format4.test bf3bed3b13c63abfb3cfec232597a319a31d0bcc
F test/fts1a.test 54fd9451c00fb91074d5abdc207b05dcba6d2d65
F test/fts1b.test 5d8a01aefbecc8b7442b36c94c05eb7a845462d5
F test/fts1c.test 4d84cfcacce229e4802fd676462f4616fabadad3
F test/fts1c.test a57cb192d59ddacba64d17c326ff99393c181dc6
F test/func.test 0ed54b5aeaad319f68016c033acfebef56f5874a
F test/hook.test 7e7645fd9a033f79cce8fdff151e32715e7ec50a
F test/in.test 369cb2aa1eab02296b4ec470732fe8c131260b1d
@ -399,7 +399,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
P cd4e1de896ef715c444071f758b74dbb607e0572
R cdcaff3d2acee8f53c20d70e6e102e11
U adamd
Z 00727001b33fedbb26b4905a3830ed9f
P bb2e1871cb10b470f96c793bb137c043ef30e1da
R 2733a08fd53c5688fa09fb4ef51647ba
U drh
Z 50f7084cc6542485b2db3e3ffa3bb7c3

View File

@ -1 +1 @@
bb2e1871cb10b470f96c793bb137c043ef30e1da
c7ee60d00976efab25a830e7416538010c734129

View File

@ -11,7 +11,7 @@
# This file implements regression tests for SQLite library. The
# focus of this script is testing the FTS1 module.
#
# $Id: fts1c.test,v 1.6 2006/09/18 02:12:48 drh Exp $
# $Id: fts1c.test,v 1.7 2006/09/21 02:03:11 drh Exp $
#
set testdir [file dirname $argv0]
@ -1118,4 +1118,44 @@ do_test fts1c-3.2 {
}
} {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}}
# Snippet generator tests
#
# Each test runs snippet() against the email table for one MATCH query
# and compares the generated text with a literal expected string.

# fts1c-4.1: snippet() with no markup arguments.  The expected text
# wraps both matched words in <b>...</b> and contains no ellipsis.
do_test fts1c-4.1 {
execsql {
SELECT snippet(email) FROM email
WHERE email MATCH 'subject:gas reminder'
}
} {{Alert Posted 10:00 AM November 20,2000: E-<b>GAS</b> Request <b>Reminder</b>}}
# fts1c-4.2: the expected text begins and ends with the <b>...</b>
# ellipsis marker, with both matched words highlighted in between.
do_test fts1c-4.2 {
execsql {
SELECT snippet(email) FROM email
WHERE email MATCH 'christmas candlelight'
}
} {{<b>...</b>place.? What do you think about going here <b>Christmas</b>
eve?? They have an 11:00 a.m. service and a <b>candlelight</b> service at 5:00 p.m.,
among others.
<b>...</b>}}
# fts1c-4.3: four query terms.  The expected text has an ellipsis marker
# between two groups of highlighted words and another one at the end.
do_test fts1c-4.3 {
execsql {
SELECT snippet(email) FROM email
WHERE email MATCH 'deal sheet potential reuse'
}
} {{EOL-Accenture <b>Deal</b> <b>Sheet</b><b>...</b>intent
Review Enron asset base for <b>potential</b> <b>reuse</b>/ licensing
Contract negotiations
<b>...</b>}}
# fts1c-4.4: same query as fts1c-4.3 but with explicit start mark '<<<',
# end mark '>>>' and a single space as the ellipsis.
do_test fts1c-4.4 {
execsql {
SELECT snippet(email,'<<<','>>>',' ') FROM email
WHERE email MATCH 'deal sheet potential reuse'
}
} {{EOL-Accenture <<<Deal>>> <<<Sheet>>> intent
Review Enron asset base for <<<potential>>> <<<reuse>>>/ licensing
Contract negotiations
}}
finish_test