diff --git a/ext/fts1/fts1.c b/ext/fts1/fts1.c index ddde412e89..bfbc483a24 100644 --- a/ext/fts1/fts1.c +++ b/ext/fts1/fts1.c @@ -53,8 +53,7 @@ void initStringBuffer(StringBuffer *sb){ sb->s[0] = '\0'; } -void append(StringBuffer *sb, const char *zFrom){ - int nFrom = strlen(zFrom); +void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ if( sb->len + nFrom >= sb->alloced ){ sb->alloced = sb->len + nFrom + 100; sb->s = realloc(sb->s, sb->alloced+1); @@ -63,8 +62,12 @@ void append(StringBuffer *sb, const char *zFrom){ return; } } - strcpy(sb->s + sb->len, zFrom); + memcpy(sb->s + sb->len, zFrom, nFrom); sb->len += nFrom; + sb->s[sb->len] = 0; +} +void append(StringBuffer *sb, const char *zFrom){ + nappend(sb, zFrom, strlen(zFrom)); } /* We encode variable-length integers in little-endian order using seven bits @@ -909,16 +912,16 @@ typedef struct Snippet { int nMatch; /* Total number of matches */ int nAlloc; /* Space allocated for aMatch[] */ struct snippetMatch { /* One entry for each matching term */ - char exemplar; /* True if this match should be shown in the snippet */ + char snStatus; /* Status flag for use while constructing snippets */ short int iCol; /* The column that contains the match */ short int iTerm; /* The index in Query.pTerms[] of the matching term */ short int nByte; /* Number of bytes in the term */ - short int nContext; /* Number of bytes of context for this match */ int iStart; /* The offset to the first character of the term */ - int iContext; /* Start of the context */ } *aMatch; /* Points to space obtained from malloc */ char *zOffset; /* Text rendering of aMatch[] */ int nOffset; /* strlen(zOffset) */ + char *zSnippet; /* Snippet text */ + int nSnippet; /* strlen(zSnippet) */ } Snippet; @@ -2002,6 +2005,7 @@ static void queryClear(Query *q){ static void snippetClear(Snippet *p){ free(p->aMatch); free(p->zOffset); + free(p->zSnippet); memset(p, 0, sizeof(*p)); } /* @@ -2025,7 +2029,6 @@ static void snippetAppendMatch( } i = p->nMatch++; pMatch = &p->aMatch[i]; - pMatch->exemplar = 0; pMatch->iCol = iCol; pMatch->iTerm = iTerm; pMatch->iStart = iStart; @@ -2166,28 +2169,162 @@ static void snippetOffsetText(Snippet *p){ } /* -** Scan all matches in Snippet and mark the exemplars. Exemplars are -** matches that we definitely want to include in the snippet. +** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set +** of matching words some of which might be in zDoc. zDoc is column +** number iCol. ** -** Generally speaking, each keyword in the search phrase will have -** a single exemplar. When a keyword matches at multiple points -** within the document, the trick is figuring which of these matches -** should be the examplar. +** iBreak is suggested spot in zDoc where we could begin or end an +** excerpt. Return a value similar to iBreak but possibly adjusted +** to be a little left or right so that the break point is better. */ -static void snippetFindExemplars(Snippet *p, Query *pQ){ +static int wordBoundary( + int iBreak, /* The suggested break point */ + const char *zDoc, /* Document text */ + int nDoc, /* Number of bytes in zDoc[] */ + struct snippetMatch *aMatch, /* Matching words */ + int nMatch, /* Number of entries in aMatch[] */ + int iCol /* The column number for zDoc[] */ +){ + int i; + if( iBreak<=10 ){ + return 0; + } + if( iBreak>=nDoc-10 ){ + return nDoc; + } + for(i=0; i0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ + return aMatch[i-1].iStart; + } + } + for(i=1; i<=10; i++){ + if( isspace(zDoc[iBreak-i]) ){ + return iBreak - i + 1; + } + if( isspace(zDoc[iBreak+i]) ){ + return iBreak + i + 1; + } + } + return iBreak; +} + +/* +** Allowed values for Snippet.aMatch[].snStatus +*/ +#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */ +#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */ + +/* +** Generate the text of a snippet. +*/ +static void snippetText( + fulltext_cursor *pCursor, /* The cursor we need the snippet for */ + const char *zStartMark, /* Markup to appear before each match */ + const char *zEndMark, /* Markup to appear after each match */ + const char *zEllipsis /* Ellipsis mark */ +){ int i, j; - for(i=0; inTerms; i++){ - for(j=0; jnMatch; j++){ - if( p->aMatch[j].iTerm==i ){ - p->aMatch[j].exemplar = 1; + struct snippetMatch *aMatch; + int nMatch; + int nDesired; + StringBuffer sb; + int tailCol = -1; + int tailOffset = -1; + int iCol; + int nDoc; + const char *zDoc; + int iStart, iEnd; + int wantEllipsis; + int tailEllipsis = 0; + int iMatch; + + + free(pCursor->snippet.zSnippet); + pCursor->snippet.zSnippet = 0; + aMatch = pCursor->snippet.aMatch; + nMatch = pCursor->snippet.nMatch; + initStringBuffer(&sb); + + for(i=0; iq.nTerms; i++){ + for(j=0; j0; i++){ + if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue; + nDesired--; + iCol = aMatch[i].iCol; + zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1); + nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1); + iStart = aMatch[i].iStart - 40; + iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol); + if( iStart<=10 ){ + iStart = 0; + wantEllipsis = 0; + }else{ + wantEllipsis = 1; + } + if( iCol==tailCol && iStart<=tailOffset+20 ){ + iStart = tailOffset; + wantEllipsis = 0; + tailEllipsis = 0; + } + if( wantEllipsis || tailEllipsis ){ + append(&sb, zEllipsis); + } + iEnd = aMatch[i].iStart + aMatch[i].nByte + 40; + iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol); + if( iEnd>=nDoc-10 ){ + iEnd = nDoc; + tailEllipsis = 0; + }else{ + tailEllipsis = 1; + } + while( iMatchsnippet.zSnippet = sb.s; + pCursor->snippet.nSnippet = sb.len; } @@ -2847,8 +2984,23 @@ static void snippetFunc( sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); }else{ + const char *zStart = ""; + const char *zEnd = ""; + const char *zEllipsis = "..."; memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); - /* TODO: Return the snippet */ + if( argc>=2 ){ + zStart = (const char*)sqlite3_value_text(argv[1]); + if( argc>=3 ){ + zEnd = (const char*)sqlite3_value_text(argv[2]); + if( argc>=4 ){ + zEllipsis = (const char*)sqlite3_value_text(argv[3]); + } + } + } + snippetAllOffsets(pCursor); + snippetText(pCursor, zStart, zEnd, zEllipsis); + sqlite3_result_text(pContext, pCursor->snippet.zSnippet, + pCursor->snippet.nSnippet, SQLITE_STATIC); } } diff --git a/manifest b/manifest index dc35284ee2..10e6648b20 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Fixed\sa\sbuild\sproblem\sin\ssqlite3_extension_init().\s(CVS\s3430) -D 2006-09-18T21:14:40 +C Implementation\sof\sthe\ssnippet()\sfunction\sfor\sFTS1.\s\sIncludes\sa\sfew\nsimple\stest\scases\sbut\smore\stesting\sis\sneeded.\s(CVS\s3431) +D 2006-09-21T02:03:09 F Makefile.in cabd42d34340f49260bc2a7668c38eba8d4cfd99 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -21,7 +21,7 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1 F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5 -F ext/fts1/fts1.c 50770451c8d3c693f7819dad33d397246f44ea90 +F ext/fts1/fts1.c 02c5b614ff8055b374b88acaf5cae3a834da3150 F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6 F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114 F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089 @@ -192,7 +192,7 @@ F test/fkey1.test 153004438d51e6769fb1ce165f6313972d6263ce F test/format4.test bf3bed3b13c63abfb3cfec232597a319a31d0bcc F test/fts1a.test 54fd9451c00fb91074d5abdc207b05dcba6d2d65 F test/fts1b.test 5d8a01aefbecc8b7442b36c94c05eb7a845462d5 -F test/fts1c.test 4d84cfcacce229e4802fd676462f4616fabadad3 +F test/fts1c.test a57cb192d59ddacba64d17c326ff99393c181dc6 F test/func.test 0ed54b5aeaad319f68016c033acfebef56f5874a F test/hook.test 7e7645fd9a033f79cce8fdff151e32715e7ec50a F test/in.test 369cb2aa1eab02296b4ec470732fe8c131260b1d @@ -399,7 +399,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P cd4e1de896ef715c444071f758b74dbb607e0572 -R cdcaff3d2acee8f53c20d70e6e102e11 -U adamd -Z 00727001b33fedbb26b4905a3830ed9f +P bb2e1871cb10b470f96c793bb137c043ef30e1da +R 2733a08fd53c5688fa09fb4ef51647ba +U drh +Z 50f7084cc6542485b2db3e3ffa3bb7c3 diff --git a/manifest.uuid b/manifest.uuid index 2a73078c93..3f692ce1af 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -bb2e1871cb10b470f96c793bb137c043ef30e1da \ No newline at end of file +c7ee60d00976efab25a830e7416538010c734129 \ No newline at end of file diff --git a/test/fts1c.test b/test/fts1c.test index b62fc5ff18..3003f5fc64 100644 --- a/test/fts1c.test +++ b/test/fts1c.test @@ -11,7 +11,7 @@ # This file implements regression tests for SQLite library. The # focus of this script is testing the FTS1 module. # -# $Id: fts1c.test,v 1.6 2006/09/18 02:12:48 drh Exp $ +# $Id: fts1c.test,v 1.7 2006/09/21 02:03:11 drh Exp $ # set testdir [file dirname $argv0] @@ -1118,4 +1118,44 @@ do_test fts1c-3.2 { } } {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}} +# Snippet generator tests +# +do_test fts1c-4.1 { + execsql { + SELECT snippet(email) FROM email + WHERE email MATCH 'subject:gas reminder' + } +} {{Alert Posted 10:00 AM November 20,2000: E-GAS Request Reminder}} +do_test fts1c-4.2 { + execsql { + SELECT snippet(email) FROM email + WHERE email MATCH 'christmas candlelight' + } +} {{...place.? What do you think about going here Christmas +eve?? They have an 11:00 a.m. service and a candlelight service at 5:00 p.m., +among others. + +...}} + +do_test fts1c-4.3 { + execsql { + SELECT snippet(email) FROM email + WHERE email MATCH 'deal sheet potential reuse' + } +} {{EOL-Accenture Deal Sheet...intent + Review Enron asset base for potential reuse/ licensing + Contract negotiations + +...}} +do_test fts1c-4.4 { + execsql { + SELECT snippet(email,'<<<','>>>',' ') FROM email + WHERE email MATCH 'deal sheet potential reuse' + } +} {{EOL-Accenture <<>> <<>> intent + Review Enron asset base for <<>> <<>>/ licensing + Contract negotiations + + }} + finish_test