Implementation of the snippet() function for FTS1. Includes a few
simple test cases but more testing is needed. (CVS 3431) FossilOrigin-Name: c7ee60d00976efab25a830e7416538010c734129
This commit is contained in:
parent
d47522807e
commit
8b62817797
196
ext/fts1/fts1.c
196
ext/fts1/fts1.c
@ -53,8 +53,7 @@ void initStringBuffer(StringBuffer *sb){
|
||||
sb->s[0] = '\0';
|
||||
}
|
||||
|
||||
void append(StringBuffer *sb, const char *zFrom){
|
||||
int nFrom = strlen(zFrom);
|
||||
void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
|
||||
if( sb->len + nFrom >= sb->alloced ){
|
||||
sb->alloced = sb->len + nFrom + 100;
|
||||
sb->s = realloc(sb->s, sb->alloced+1);
|
||||
@ -63,8 +62,12 @@ void append(StringBuffer *sb, const char *zFrom){
|
||||
return;
|
||||
}
|
||||
}
|
||||
strcpy(sb->s + sb->len, zFrom);
|
||||
memcpy(sb->s + sb->len, zFrom, nFrom);
|
||||
sb->len += nFrom;
|
||||
sb->s[sb->len] = 0;
|
||||
}
|
||||
void append(StringBuffer *sb, const char *zFrom){
  /* zFrom must be NUL-terminated: its length is measured here and the
  ** actual copy is delegated to nappend(). */
  int nFrom = strlen(zFrom);
  nappend(sb, zFrom, nFrom);
}
|
||||
|
||||
/* We encode variable-length integers in little-endian order using seven bits
|
||||
@ -909,16 +912,16 @@ typedef struct Snippet {
|
||||
int nMatch; /* Total number of matches */
|
||||
int nAlloc; /* Space allocated for aMatch[] */
|
||||
struct snippetMatch { /* One entry for each matching term */
|
||||
char exemplar; /* True if this match should be shown in the snippet */
|
||||
char snStatus; /* Status flag for use while constructing snippets */
|
||||
short int iCol; /* The column that contains the match */
|
||||
short int iTerm; /* The index in Query.pTerms[] of the matching term */
|
||||
short int nByte; /* Number of bytes in the term */
|
||||
short int nContext; /* Number of bytes of context for this match */
|
||||
int iStart; /* The offset to the first character of the term */
|
||||
int iContext; /* Start of the context */
|
||||
} *aMatch; /* Points to space obtained from malloc */
|
||||
char *zOffset; /* Text rendering of aMatch[] */
|
||||
int nOffset; /* strlen(zOffset) */
|
||||
char *zSnippet; /* Snippet text */
|
||||
int nSnippet; /* strlen(zSnippet) */
|
||||
} Snippet;
|
||||
|
||||
|
||||
@ -2002,6 +2005,7 @@ static void queryClear(Query *q){
|
||||
static void snippetClear(Snippet *p){
  /* Release each heap buffer the snippet holds.  free(NULL) is a no-op,
  ** so fields that were never populated need no guard. */
  free(p->zSnippet);
  free(p->zOffset);
  free(p->aMatch);

  /* Zero every field so the freed pointers are not left dangling and the
  ** counters (nMatch, nAlloc, nOffset, nSnippet) read as empty. */
  memset(p, 0, sizeof(Snippet));
}
|
||||
/*
|
||||
@ -2025,7 +2029,6 @@ static void snippetAppendMatch(
|
||||
}
|
||||
i = p->nMatch++;
|
||||
pMatch = &p->aMatch[i];
|
||||
pMatch->exemplar = 0;
|
||||
pMatch->iCol = iCol;
|
||||
pMatch->iTerm = iTerm;
|
||||
pMatch->iStart = iStart;
|
||||
@ -2166,28 +2169,162 @@ static void snippetOffsetText(Snippet *p){
|
||||
}
|
||||
|
||||
/*
|
||||
** Scan all matches in Snippet and mark the exemplars. Exemplars are
|
||||
** matches that we definitely want to include in the snippet.
|
||||
** zDoc[0..nDoc-1] is a phrase of text. aMatch[0..nMatch-1] are a set
|
||||
** of matching words some of which might be in zDoc. zDoc is column
|
||||
** number iCol.
|
||||
**
|
||||
** Generally speaking, each keyword in the search phrase will have
|
||||
** a single exemplar. When a keyword matches at multiple points
|
||||
** within the document, the trick is figuring which of these matches
|
||||
** should be the exemplar.
|
||||
** iBreak is a suggested spot in zDoc where we could begin or end an
|
||||
** excerpt. Return a value similar to iBreak but possibly adjusted
|
||||
** to be a little left or right so that the break point is better.
|
||||
*/
|
||||
static void snippetFindExemplars(Snippet *p, Query *pQ){
|
||||
static int wordBoundary(
|
||||
int iBreak, /* The suggested break point */
|
||||
const char *zDoc, /* Document text */
|
||||
int nDoc, /* Number of bytes in zDoc[] */
|
||||
struct snippetMatch *aMatch, /* Matching words */
|
||||
int nMatch, /* Number of entries in aMatch[] */
|
||||
int iCol /* The column number for zDoc[] */
|
||||
){
|
||||
int i;
|
||||
if( iBreak<=10 ){
|
||||
return 0;
|
||||
}
|
||||
if( iBreak>=nDoc-10 ){
|
||||
return nDoc;
|
||||
}
|
||||
for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
|
||||
while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
|
||||
if( i<nMatch ){
|
||||
if( aMatch[i].iStart<iBreak+10 ){
|
||||
return aMatch[i].iStart;
|
||||
}
|
||||
if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
|
||||
return aMatch[i-1].iStart;
|
||||
}
|
||||
}
|
||||
for(i=1; i<=10; i++){
|
||||
if( isspace(zDoc[iBreak-i]) ){
|
||||
return iBreak - i + 1;
|
||||
}
|
||||
if( isspace(zDoc[iBreak+i]) ){
|
||||
return iBreak + i + 1;
|
||||
}
|
||||
}
|
||||
return iBreak;
|
||||
}
|
||||
|
||||
/*
|
||||
** Allowed values for Snippet.aMatch[].snStatus
|
||||
*/
|
||||
#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
|
||||
#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
|
||||
|
||||
/*
|
||||
** Generate the text of a snippet.
|
||||
*/
|
||||
static void snippetText(
|
||||
fulltext_cursor *pCursor, /* The cursor we need the snippet for */
|
||||
const char *zStartMark, /* Markup to appear before each match */
|
||||
const char *zEndMark, /* Markup to appear after each match */
|
||||
const char *zEllipsis /* Ellipsis mark */
|
||||
){
|
||||
int i, j;
|
||||
for(i=0; i<pQ->nTerms; i++){
|
||||
for(j=0; j<p->nMatch; j++){
|
||||
if( p->aMatch[j].iTerm==i ){
|
||||
p->aMatch[j].exemplar = 1;
|
||||
struct snippetMatch *aMatch;
|
||||
int nMatch;
|
||||
int nDesired;
|
||||
StringBuffer sb;
|
||||
int tailCol = -1;
|
||||
int tailOffset = -1;
|
||||
int iCol;
|
||||
int nDoc;
|
||||
const char *zDoc;
|
||||
int iStart, iEnd;
|
||||
int wantEllipsis;
|
||||
int tailEllipsis = 0;
|
||||
int iMatch;
|
||||
|
||||
|
||||
free(pCursor->snippet.zSnippet);
|
||||
pCursor->snippet.zSnippet = 0;
|
||||
aMatch = pCursor->snippet.aMatch;
|
||||
nMatch = pCursor->snippet.nMatch;
|
||||
initStringBuffer(&sb);
|
||||
|
||||
for(i=0; i<nMatch; i++){
|
||||
aMatch[i].snStatus = SNIPPET_IGNORE;
|
||||
}
|
||||
nDesired = 0;
|
||||
for(i=0; i<pCursor->q.nTerms; i++){
|
||||
for(j=0; j<nMatch; j++){
|
||||
if( aMatch[j].iTerm==i ){
|
||||
aMatch[j].snStatus = SNIPPET_DESIRED;
|
||||
nDesired++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void snippetText(Snippet *p, Query *pQ){
|
||||
|
||||
iMatch = 0;
|
||||
for(i=0; i<nMatch && nDesired>0; i++){
|
||||
if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
|
||||
nDesired--;
|
||||
iCol = aMatch[i].iCol;
|
||||
zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
|
||||
nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
|
||||
iStart = aMatch[i].iStart - 40;
|
||||
iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
|
||||
if( iStart<=10 ){
|
||||
iStart = 0;
|
||||
wantEllipsis = 0;
|
||||
}else{
|
||||
wantEllipsis = 1;
|
||||
}
|
||||
if( iCol==tailCol && iStart<=tailOffset+20 ){
|
||||
iStart = tailOffset;
|
||||
wantEllipsis = 0;
|
||||
tailEllipsis = 0;
|
||||
}
|
||||
if( wantEllipsis || tailEllipsis ){
|
||||
append(&sb, zEllipsis);
|
||||
}
|
||||
iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
|
||||
iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
|
||||
if( iEnd>=nDoc-10 ){
|
||||
iEnd = nDoc;
|
||||
tailEllipsis = 0;
|
||||
}else{
|
||||
tailEllipsis = 1;
|
||||
}
|
||||
while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
|
||||
while( iStart<iEnd ){
|
||||
while( iMatch<nMatch && aMatch[iMatch].iStart<iStart ){ iMatch++; }
|
||||
if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd ){
|
||||
nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
|
||||
iStart = aMatch[iMatch].iStart;
|
||||
append(&sb, zStartMark);
|
||||
nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
|
||||
append(&sb, zEndMark);
|
||||
iStart += aMatch[iMatch].nByte;
|
||||
for(j=iMatch+1; j<nMatch; j++){
|
||||
if( aMatch[j].iTerm==aMatch[iMatch].iTerm
|
||||
&& aMatch[j].snStatus==SNIPPET_DESIRED ){
|
||||
nDesired--;
|
||||
aMatch[j].snStatus = SNIPPET_IGNORE;
|
||||
}
|
||||
}
|
||||
}else{
|
||||
nappend(&sb, &zDoc[iStart], iEnd - iStart);
|
||||
iStart = iEnd;
|
||||
}
|
||||
}
|
||||
tailCol = iCol;
|
||||
tailOffset = iEnd;
|
||||
}
|
||||
if( tailEllipsis ){
|
||||
append(&sb, zEllipsis);
|
||||
}
|
||||
pCursor->snippet.zSnippet = sb.s;
|
||||
pCursor->snippet.nSnippet = sb.len;
|
||||
}
|
||||
|
||||
|
||||
@ -2847,8 +2984,23 @@ static void snippetFunc(
|
||||
sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
|
||||
sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
|
||||
}else{
|
||||
const char *zStart = "<b>";
|
||||
const char *zEnd = "</b>";
|
||||
const char *zEllipsis = "<b>...</b>";
|
||||
memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
|
||||
/* TODO: Return the snippet */
|
||||
if( argc>=2 ){
|
||||
zStart = (const char*)sqlite3_value_text(argv[1]);
|
||||
if( argc>=3 ){
|
||||
zEnd = (const char*)sqlite3_value_text(argv[2]);
|
||||
if( argc>=4 ){
|
||||
zEllipsis = (const char*)sqlite3_value_text(argv[3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
snippetAllOffsets(pCursor);
|
||||
snippetText(pCursor, zStart, zEnd, zEllipsis);
|
||||
sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
|
||||
pCursor->snippet.nSnippet, SQLITE_STATIC);
|
||||
}
|
||||
}
|
||||
|
||||
|
16
manifest
16
manifest
@ -1,5 +1,5 @@
|
||||
C Fixed\sa\sbuild\sproblem\sin\ssqlite3_extension_init().\s(CVS\s3430)
|
||||
D 2006-09-18T21:14:40
|
||||
C Implementation\sof\sthe\ssnippet()\sfunction\sfor\sFTS1.\s\sIncludes\sa\sfew\nsimple\stest\scases\sbut\smore\stesting\sis\sneeded.\s(CVS\s3431)
|
||||
D 2006-09-21T02:03:09
|
||||
F Makefile.in cabd42d34340f49260bc2a7668c38eba8d4cfd99
|
||||
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
|
||||
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
|
||||
@ -21,7 +21,7 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1
|
||||
F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
|
||||
F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
|
||||
F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5
|
||||
F ext/fts1/fts1.c 50770451c8d3c693f7819dad33d397246f44ea90
|
||||
F ext/fts1/fts1.c 02c5b614ff8055b374b88acaf5cae3a834da3150
|
||||
F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6
|
||||
F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114
|
||||
F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089
|
||||
@ -192,7 +192,7 @@ F test/fkey1.test 153004438d51e6769fb1ce165f6313972d6263ce
|
||||
F test/format4.test bf3bed3b13c63abfb3cfec232597a319a31d0bcc
|
||||
F test/fts1a.test 54fd9451c00fb91074d5abdc207b05dcba6d2d65
|
||||
F test/fts1b.test 5d8a01aefbecc8b7442b36c94c05eb7a845462d5
|
||||
F test/fts1c.test 4d84cfcacce229e4802fd676462f4616fabadad3
|
||||
F test/fts1c.test a57cb192d59ddacba64d17c326ff99393c181dc6
|
||||
F test/func.test 0ed54b5aeaad319f68016c033acfebef56f5874a
|
||||
F test/hook.test 7e7645fd9a033f79cce8fdff151e32715e7ec50a
|
||||
F test/in.test 369cb2aa1eab02296b4ec470732fe8c131260b1d
|
||||
@ -399,7 +399,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
|
||||
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
|
||||
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
|
||||
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
|
||||
P cd4e1de896ef715c444071f758b74dbb607e0572
|
||||
R cdcaff3d2acee8f53c20d70e6e102e11
|
||||
U adamd
|
||||
Z 00727001b33fedbb26b4905a3830ed9f
|
||||
P bb2e1871cb10b470f96c793bb137c043ef30e1da
|
||||
R 2733a08fd53c5688fa09fb4ef51647ba
|
||||
U drh
|
||||
Z 50f7084cc6542485b2db3e3ffa3bb7c3
|
||||
|
@ -1 +1 @@
|
||||
bb2e1871cb10b470f96c793bb137c043ef30e1da
|
||||
c7ee60d00976efab25a830e7416538010c734129
|
@ -11,7 +11,7 @@
|
||||
# This file implements regression tests for SQLite library. The
|
||||
# focus of this script is testing the FTS1 module.
|
||||
#
|
||||
# $Id: fts1c.test,v 1.6 2006/09/18 02:12:48 drh Exp $
|
||||
# $Id: fts1c.test,v 1.7 2006/09/21 02:03:11 drh Exp $
|
||||
#
|
||||
|
||||
set testdir [file dirname $argv0]
|
||||
@ -1118,4 +1118,44 @@ do_test fts1c-3.2 {
|
||||
}
|
||||
} {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}}
|
||||
|
||||
# Snippet generator tests
|
||||
#
|
||||
do_test fts1c-4.1 {
|
||||
execsql {
|
||||
SELECT snippet(email) FROM email
|
||||
WHERE email MATCH 'subject:gas reminder'
|
||||
}
|
||||
} {{Alert Posted 10:00 AM November 20,2000: E-<b>GAS</b> Request <b>Reminder</b>}}
|
||||
do_test fts1c-4.2 {
|
||||
execsql {
|
||||
SELECT snippet(email) FROM email
|
||||
WHERE email MATCH 'christmas candlelight'
|
||||
}
|
||||
} {{<b>...</b>place.? What do you think about going here <b>Christmas</b>
|
||||
eve?? They have an 11:00 a.m. service and a <b>candlelight</b> service at 5:00 p.m.,
|
||||
among others.
|
||||
|
||||
<b>...</b>}}
|
||||
|
||||
do_test fts1c-4.3 {
|
||||
execsql {
|
||||
SELECT snippet(email) FROM email
|
||||
WHERE email MATCH 'deal sheet potential reuse'
|
||||
}
|
||||
} {{EOL-Accenture <b>Deal</b> <b>Sheet</b><b>...</b>intent
|
||||
Review Enron asset base for <b>potential</b> <b>reuse</b>/ licensing
|
||||
Contract negotiations
|
||||
|
||||
<b>...</b>}}
|
||||
do_test fts1c-4.4 {
|
||||
execsql {
|
||||
SELECT snippet(email,'<<<','>>>',' ') FROM email
|
||||
WHERE email MATCH 'deal sheet potential reuse'
|
||||
}
|
||||
} {{EOL-Accenture <<<Deal>>> <<<Sheet>>> intent
|
||||
Review Enron asset base for <<<potential>>> <<<reuse>>>/ licensing
|
||||
Contract negotiations
|
||||
|
||||
}}
|
||||
|
||||
finish_test
|
||||
|
Loading…
x
Reference in New Issue
Block a user