Fix a problem with the fts5 highlight() and snippet() functions when used with tokenizers like "trigram" that output overlapping tokens. Forum post [forum:/forumpost/63735293ec|63735293ec].
FossilOrigin-Name: d570aa02f79b1d7d3889e33f9eebab1b7edcf5231b1357451eed9a538607de54
This commit is contained in:
parent
eb9882d7d1
commit
d548f74024
@ -110,15 +110,19 @@ static int fts5CInstIterInit(
|
||||
*/
|
||||
typedef struct HighlightContext HighlightContext;
|
||||
struct HighlightContext {
|
||||
CInstIter iter; /* Coalesced Instance Iterator */
|
||||
int iPos; /* Current token offset in zIn[] */
|
||||
/* Constant parameters to fts5HighlightCb() */
|
||||
int iRangeStart; /* First token to include */
|
||||
int iRangeEnd; /* If non-zero, last token to include */
|
||||
const char *zOpen; /* Opening highlight */
|
||||
const char *zClose; /* Closing highlight */
|
||||
const char *zIn; /* Input text */
|
||||
int nIn; /* Size of input text in bytes */
|
||||
int iOff; /* Current offset within zIn[] */
|
||||
|
||||
/* Variables modified by fts5HighlightCb() */
|
||||
CInstIter iter; /* Coalesced Instance Iterator */
|
||||
int iPos; /* Current token offset in zIn[] */
|
||||
int iOff; /* Have copied up to this offset in zIn[] */
|
||||
int bOpen; /* True if highlight is open */
|
||||
char *zOut; /* Output value */
|
||||
};
|
||||
|
||||
@ -151,8 +155,8 @@ static int fts5HighlightCb(
|
||||
int tflags, /* Mask of FTS5_TOKEN_* flags */
|
||||
const char *pToken, /* Buffer containing token */
|
||||
int nToken, /* Size of token in bytes */
|
||||
int iStartOff, /* Start offset of token */
|
||||
int iEndOff /* End offset of token */
|
||||
int iStartOff, /* Start byte offset of token */
|
||||
int iEndOff /* End byte offset of token */
|
||||
){
|
||||
HighlightContext *p = (HighlightContext*)pContext;
|
||||
int rc = SQLITE_OK;
|
||||
@ -168,30 +172,47 @@ static int fts5HighlightCb(
|
||||
if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
|
||||
}
|
||||
|
||||
if( iPos==p->iter.iStart ){
|
||||
/* If the parenthesis is open, and this token is not part of the current
|
||||
** phrase, and the starting byte offset of this token is past the point
|
||||
** that has currently been copied into the output buffer, close the
|
||||
** parenthesis. */
|
||||
if( p->bOpen
|
||||
&& (iPos<=p->iter.iStart || p->iter.iStart<0)
|
||||
&& iStartOff>p->iOff
|
||||
){
|
||||
fts5HighlightAppend(&rc, p, p->zClose, -1);
|
||||
p->bOpen = 0;
|
||||
}
|
||||
|
||||
/* If this is the start of a new phrase, and the highlight is not open:
|
||||
**
|
||||
** * copy text from the input up to the start of the phrase, and
|
||||
** * open the highlight.
|
||||
*/
|
||||
if( iPos==p->iter.iStart && p->bOpen==0 ){
|
||||
fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
|
||||
fts5HighlightAppend(&rc, p, p->zOpen, -1);
|
||||
p->iOff = iStartOff;
|
||||
p->bOpen = 1;
|
||||
}
|
||||
|
||||
if( iPos==p->iter.iEnd ){
|
||||
if( p->iRangeEnd>=0 && p->iter.iStart<p->iRangeStart ){
|
||||
if( p->bOpen==0 ){
|
||||
assert( p->iRangeEnd>=0 );
|
||||
fts5HighlightAppend(&rc, p, p->zOpen, -1);
|
||||
p->bOpen = 1;
|
||||
}
|
||||
fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
|
||||
fts5HighlightAppend(&rc, p, p->zClose, -1);
|
||||
p->iOff = iEndOff;
|
||||
|
||||
if( rc==SQLITE_OK ){
|
||||
rc = fts5CInstIterNext(&p->iter);
|
||||
}
|
||||
}
|
||||
|
||||
if( p->iRangeEnd>=0 && iPos==p->iRangeEnd ){
|
||||
if( iPos==p->iRangeEnd ){
|
||||
fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
|
||||
p->iOff = iEndOff;
|
||||
if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){
|
||||
fts5HighlightAppend(&rc, p, p->zClose, -1);
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -232,6 +253,9 @@ static void fts5HighlightFunction(
|
||||
if( rc==SQLITE_OK ){
|
||||
rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
|
||||
}
|
||||
if( ctx.bOpen ){
|
||||
fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
|
||||
}
|
||||
fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
|
||||
|
||||
if( rc==SQLITE_OK ){
|
||||
@ -510,6 +534,9 @@ static void fts5SnippetFunction(
|
||||
if( rc==SQLITE_OK ){
|
||||
rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
|
||||
}
|
||||
if( ctx.bOpen ){
|
||||
fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
|
||||
}
|
||||
if( ctx.iRangeEnd>=(nColSize-1) ){
|
||||
fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
|
||||
}else{
|
||||
|
@ -215,4 +215,42 @@ do_execsql_test 7.2 {
|
||||
SELECT rowid FROM f WHERE filename GLOB '*ир*';
|
||||
} {20}
|
||||
|
||||
|
||||
#-------------------------------------------------------------------------
|
||||
reset_db
|
||||
do_execsql_test 8.0 {
|
||||
CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram);
|
||||
INSERT INTO t1 VALUES('abcdefghijklm');
|
||||
}
|
||||
|
||||
foreach {tn match res} {
|
||||
1 "abc ghi" "(abc)def(ghi)jklm"
|
||||
2 "def ghi" "abc(defghi)jklm"
|
||||
3 "efg ghi" "abcd(efghi)jklm"
|
||||
4 "efghi" "abcd(efghi)jklm"
|
||||
5 "abcd jklm" "(abcd)efghi(jklm)"
|
||||
6 "ijkl jklm" "abcdefgh(ijklm)"
|
||||
7 "ijk ijkl hijk" "abcdefg(hijkl)m"
|
||||
|
||||
} {
|
||||
do_execsql_test 8.1.$tn {
|
||||
SELECT highlight(t1, 0, '(', ')') FROM t1($match)
|
||||
} $res
|
||||
}
|
||||
|
||||
do_execsql_test 8.2 {
|
||||
CREATE VIRTUAL TABLE ft2 USING fts5(a, tokenize="trigram");
|
||||
INSERT INTO ft2 VALUES('abc x cde');
|
||||
INSERT INTO ft2 VALUES('abc cde');
|
||||
INSERT INTO ft2 VALUES('abcde');
|
||||
}
|
||||
|
||||
do_execsql_test 8.3 {
|
||||
SELECT highlight(ft2, 0, '[', ']') FROM ft2 WHERE ft2 MATCH 'abc AND cde';
|
||||
} {
|
||||
{[abc] x [cde]}
|
||||
{[abc] [cde]}
|
||||
{[abcde]}
|
||||
}
|
||||
|
||||
finish_test
|
||||
|
19
manifest
19
manifest
@ -1,5 +1,5 @@
|
||||
C Fix\sa\sharmless\scompiler\swarning\sin\sthe\sexpert\sextension.
|
||||
D 2023-10-24T09:57:54.656
|
||||
C Fix\sa\sproblem\swith\sthe\sfts5\shighlight()\sand\ssnippet()\sfunctions\swhen\sused\swith\stokenizers\slike\s"trigram"\sthat\soutput\soverlapping\stokens.\sForum\spost\s[forum:/forumpost/63735293ec|63735293ec].
|
||||
D 2023-10-24T15:53:02.462
|
||||
F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
|
||||
F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
|
||||
F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724
|
||||
@ -89,7 +89,7 @@ F ext/fts3/unicode/parseunicode.tcl a981bd6466d12dd17967515801c3ff23f74a281be1a0
|
||||
F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0
|
||||
F ext/fts5/fts5.h 05501612cc655504c5dce8ba765ab621d50fc478490089beaa0d75e00b23e520
|
||||
F ext/fts5/fts5Int.h 78a63cc0795186cde5384816a9403a68c65774b35d952e05b81a1b4b158e07c8
|
||||
F ext/fts5/fts5_aux.c 572d5ec92ba7301df2fea3258576332f2f4d2dfd66d8263afd157d9deceac480
|
||||
F ext/fts5/fts5_aux.c 35c4101613eff86902877a4dedd9400b07922e412cbdd637b45041dce2fd5388
|
||||
F ext/fts5/fts5_buffer.c 3001fbabb585d6de52947b44b455235072b741038391f830d6b729225eeaf6a5
|
||||
F ext/fts5/fts5_config.c 054359543566cbff1ba65a188330660a5457299513ac71c53b3a07d934c7b081
|
||||
F ext/fts5/fts5_expr.c bd3b81ce669c4104e34ffe66570af1999a317b142c15fccb112de9fb0caa57a6
|
||||
@ -216,7 +216,7 @@ F ext/fts5/test/fts5synonym2.test 8f891fc49cc1e8daed727051e77e1f42849c784a6a54be
|
||||
F ext/fts5/test/fts5tok1.test 1f7817499f5971450d8c4a652114b3d833393c8134e32422d0af27884ffe9cef
|
||||
F ext/fts5/test/fts5tok2.test dcacb32d4a2a3f0dd3215d4a3987f78ae4be21a2
|
||||
F ext/fts5/test/fts5tokenizer.test ac3c9112b263a639fb0508ae73a3ee886bf4866d2153771a8e8a20c721305a43
|
||||
F ext/fts5/test/fts5trigram.test c76acc1913a06182e791a0dfdae285b9cdd67327a1a35b34cabf0a6aa09cf05e
|
||||
F ext/fts5/test/fts5trigram.test 6c4e37864f3e7d90673db5563d9736d7e40080ab94d10ebdffa94c1b77941da0
|
||||
F ext/fts5/test/fts5ubsan.test 783d5a8d13ebfa169e634940228db54540780e3ba7a87ad1e4510e61440bf64b
|
||||
F ext/fts5/test/fts5umlaut.test a42fe2fe6387c40c49ab27ccbd070e1ae38e07f38d05926482cc0bccac9ad602
|
||||
F ext/fts5/test/fts5unicode.test 17056f4efe6b0a5d4f41fdf7a7dc9af2873004562eaa899d40633b93dc95f5a9
|
||||
@ -2138,8 +2138,11 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93
|
||||
F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
|
||||
F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
|
||||
F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
|
||||
P 9d388267e4e6724e2df333fe09d509e87defcfe984c5c2ebe031152d320812d0
|
||||
R c93c65b4d457eea3e32fda50bd8711c7
|
||||
U drh
|
||||
Z 3370819568316647d482b2f4e035059a
|
||||
P 0c4907ddf9abd1ebfef31c1a53d702c4dcaa015c0032b8b52774c9e260b6cfd7
|
||||
R 41175df0065ba91132d9e078adcc9517
|
||||
T *branch * fts5-trigram-snippet-fix
|
||||
T *sym-fts5-trigram-snippet-fix *
|
||||
T -sym-trunk *
|
||||
U dan
|
||||
Z fb6ef8c426c83834a5c6c815bfcd6614
|
||||
# Remove this line to create a well-formed Fossil manifest.
|
||||
|
@ -1 +1 @@
|
||||
0c4907ddf9abd1ebfef31c1a53d702c4dcaa015c0032b8b52774c9e260b6cfd7
|
||||
d570aa02f79b1d7d3889e33f9eebab1b7edcf5231b1357451eed9a538607de54
|
Loading…
Reference in New Issue
Block a user