Improve headline generation. Now a headline can contain
several fragments, a la Google. Sushant Sinha <sushant354@gmail.com>

This commit is contained in:
parent 906b7e5f6c
commit 2a0083ede8

doc/src/sgml/textsearch.sgml

@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.45 2008/09/23 09:20:34 heikki Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.46 2008/10/17 18:05:19 teodor Exp $ -->

 <chapter id="textsearch">
 <title id="textsearch-title">Full Text Search</title>
@@ -1098,6 +1098,29 @@ ORDER BY rank DESC LIMIT 10;
 value of three eliminates the English articles.
 </para>
 </listitem>
+<listitem>
+<para>
+<literal>MaxFragments</literal>: maximum number of text excerpts
+or fragments that match the query words. It also triggers a
+different headline generation function than the default one. This
+function finds text fragments with as many query words as possible and
+stretches those fragments around the query words. As a result,
+query words are close to the middle of each fragment and have words on
+each side. Each fragment will contain at most MaxWords words and will
+not have words of length less than or equal to ShortWord at the start
+or end of a fragment. If not all query words are found in the document,
+then a single fragment of MinWords words will be displayed.
+</para>
+</listitem>
+<listitem>
+<para>
+<literal>FragmentDelimiter</literal>: When more than one fragment is
+displayed, the fragments will be separated by this delimiter. This
+option takes effect only if MaxFragments is greater than 1 and more
+than one fragment is actually displayed. It has no effect on the
+default headline generation function.
+</para>
+</listitem>
 <listitem>
 <para>
 <literal>HighlightAll</literal>: Boolean flag; if
@@ -1109,7 +1132,7 @@ ORDER BY rank DESC LIMIT 10;
 Any unspecified options receive these defaults:

 <programlisting>
-StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
+StartSel=<b>, StopSel=</b>, MaxFragments=0, FragmentDelimiter=" ... ", MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
 </programlisting>
 </para>
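
For context, a minimal usage sketch of the options documented above; the documents table and its body column are hypothetical names, not part of this commit:

-- Hypothetical example: return up to three excerpts built around the matched
-- query words, joined by the FragmentDelimiter string.
SELECT ts_headline('english', body,
                   to_tsquery('english', 'fragment & headline'),
                   'MaxFragments=3, MaxWords=20, MinWords=5, FragmentDelimiter=" ... "')
  FROM documents
 WHERE to_tsvector('english', body) @@ to_tsquery('english', 'fragment & headline');
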

src/backend/tsearch/ts_parse.c

@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $
+* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.9 2008/10/17 18:05:19 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -583,8 +583,11 @@ text *
 generateHeadline(HeadlineParsedText *prs)
 {
     text       *out;
-    int         len = 128;
     char       *ptr;
+    int         len = 128;
+    int         numfragments = 0;
+    int2        infrag = 0;
+
     HeadlineWordEntry *wrd = prs->words;

     out = (text *) palloc(len);
@@ -592,7 +595,7 @@ generateHeadline(HeadlineParsedText *prs)

     while (wrd - prs->words < prs->curwords)
     {
-        while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
+        while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
         {
             int         dist = ptr - ((char *) out);

@@ -603,6 +606,20 @@ generateHeadline(HeadlineParsedText *prs)

         if (wrd->in && !wrd->repeated)
         {
+            if (!infrag)
+            {
+                /* start of a new fragment */
+                infrag = 1;
+                numfragments++;
+                /* add a fragment delimiter if this is not the first fragment */
+                if (numfragments > 1)
+                {
+                    memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
+                    ptr += prs->fragdelimlen;
+                }
+            }
             if (wrd->replace)
             {
                 *ptr = ' ';
@@ -625,7 +642,11 @@ generateHeadline(HeadlineParsedText *prs)
             }
         }
         else if (!wrd->repeated)
+        {
+            if (infrag)
+                infrag = 0;
             pfree(wrd->word);
+        }

         wrd++;
     }

src/backend/tsearch/wparser_def.c

@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.16 2008/10/17 17:27:46 teodor Exp $
+* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.17 2008/10/17 18:05:19 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -1684,18 +1684,247 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
     return false;
 }

-Datum
-prsd_headline(PG_FUNCTION_ARGS)
+static void
+mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
 {
-    HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
-    List       *prsoptions = (List *) PG_GETARG_POINTER(1);
-    TSQuery     query = PG_GETARG_TSQUERY(2);
+    int         i;

-    /* from opt + start and end tag */
-    int         min_words = 15;
-    int         max_words = 35;
-    int         shortword = 3;
+    for (i = startpos; i <= endpos; i++)
+    {
+        if (prs->words[i].item)
+            prs->words[i].selected = 1;
+        if (highlight == 0)
+        {
+            if (HLIDIGNORE(prs->words[i].type))
+                prs->words[i].replace = 1;
+        }
+        else
+        {
+            if (XMLHLIDIGNORE(prs->words[i].type))
+                prs->words[i].replace = 1;
+        }
+
+        prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
+    }
+}
+
+typedef struct
+{
+    int4        startpos;
+    int4        endpos;
+    int4        poslen;
+    int4        curlen;
+    int2        in;
+    int2        excluded;
+} CoverPos;
+
+static void
+get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
+                  int *curlen, int *poslen, int max_words)
+{
+    int         i;
+
+    /*
+     * Objective: generate a fragment of words between startpos and endpos
+     * such that it has at most max_words and both ends have query words.
+     * If startpos and endpos are the endpoints of the cover and the cover
+     * has fewer words than max_words, then this function should just
+     * return the cover.
+     */
+    /* first move startpos to an item */
+    for (i = *startpos; i <= *endpos; i++)
+    {
+        *startpos = i;
+        if (prs->words[i].item && !prs->words[i].repeated)
+            break;
+    }
+    /* cut endpos to have only max_words */
+    *curlen = 0;
+    *poslen = 0;
+    for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
+    {
+        if (!NONWORDTOKEN(prs->words[i].type))
+            *curlen += 1;
+        if (prs->words[i].item && !prs->words[i].repeated)
+            *poslen += 1;
+    }
+    /* if the cover was cut then move endpos back to a query item */
+    if (*endpos > i)
+    {
+        *endpos = i;
+        for (i = *endpos; i >= *startpos; i--)
+        {
+            *endpos = i;
+            if (prs->words[i].item && !prs->words[i].repeated)
+                break;
+            if (!NONWORDTOKEN(prs->words[i].type))
+                *curlen -= 1;
+        }
+    }
+}
+
+static void
+mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
+                  int shortword, int min_words,
+                  int max_words, int max_fragments)
+{
+    int4        poslen, curlen, i, f, num_f = 0;
+    int4        stretch, maxstretch, posmarker;
+
+    int4        startpos = 0,
+                endpos = 0,
+                p = 0,
+                q = 0;
+
+    int4        numcovers = 0,
+                maxcovers = 32;
+
+    int4        minI, minwords, maxitems;
+    CoverPos   *covers;
+
+    covers = palloc(maxcovers * sizeof(CoverPos));
+
+    /* get all covers */
+    while (hlCover(prs, query, &p, &q))
+    {
+        startpos = p;
+        endpos = q;
+
+        /*
+         * Break the cover into smaller fragments such that each fragment
+         * has at most max_words. Also ensure that each end of the fragment
+         * is a query word. This will allow us to stretch the fragment in
+         * either direction.
+         */
+        while (startpos <= endpos)
+        {
+            get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
+            if (numcovers >= maxcovers)
+            {
+                maxcovers *= 2;
+                covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
+            }
+            covers[numcovers].startpos = startpos;
+            covers[numcovers].endpos = endpos;
+            covers[numcovers].curlen = curlen;
+            covers[numcovers].poslen = poslen;
+            covers[numcovers].in = 0;
+            covers[numcovers].excluded = 0;
+            numcovers++;
+            startpos = endpos + 1;
+            endpos = q;
+        }
+        /* move p to generate the next cover */
+        p++;
+    }
+
+    /* choose the best covers */
+    for (f = 0; f < max_fragments; f++)
+    {
+        maxitems = 0;
+        minwords = 0x7fffffff;
+        minI = -1;
+        /*
+         * Choose the cover that contains the most query items. In case of
+         * a tie, choose the one with the smaller number of words.
+         */
+        for (i = 0; i < numcovers; i++)
+        {
+            if (!covers[i].in && !covers[i].excluded &&
+                (maxitems < covers[i].poslen ||
+                 (maxitems == covers[i].poslen && minwords > covers[i].curlen)))
+            {
+                maxitems = covers[i].poslen;
+                minwords = covers[i].curlen;
+                minI = i;
+            }
+        }
+        /* if a cover was found, mark it */
+        if (minI >= 0)
+        {
+            covers[minI].in = 1;
+            /* adjust the size of the cover */
+            startpos = covers[minI].startpos;
+            endpos = covers[minI].endpos;
+            curlen = covers[minI].curlen;
+            /* stretch the cover if its size is lower than max_words */
+            if (curlen < max_words)
+            {
+                /* divide the stretch between both sides of the cover */
+                maxstretch = (max_words - curlen) / 2;
+                /*
+                 * first stretch the startpos; stop stretching if
+                 * 1. we hit the beginning of the document
+                 * 2. we exceed maxstretch
+                 * 3. we hit an already marked fragment
+                 */
+                stretch = 0;
+                posmarker = startpos;
+                for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
+                {
+                    if (!NONWORDTOKEN(prs->words[i].type))
+                    {
+                        curlen++;
+                        stretch++;
+                    }
+                    posmarker = i;
+                }
+                /* cut back startpos until we find a non-short token */
+                for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
+                {
+                    if (!NONWORDTOKEN(prs->words[i].type))
+                        curlen--;
+                }
+                startpos = i;
+                /* now stretch the endpos as much as possible */
+                posmarker = endpos;
+                for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
+                {
+                    if (!NONWORDTOKEN(prs->words[i].type))
+                        curlen++;
+                    posmarker = i;
+                }
+                /* cut back endpos until we find a non-short token */
+                for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
+                {
+                    if (!NONWORDTOKEN(prs->words[i].type))
+                        curlen--;
+                }
+                endpos = i;
+            }
+            covers[minI].startpos = startpos;
+            covers[minI].endpos = endpos;
+            covers[minI].curlen = curlen;
+            /* mark the chosen fragment (cover) */
+            mark_fragment(prs, highlight, startpos, endpos);
+            num_f++;
+            /* exclude overlapping covers */
+            for (i = 0; i < numcovers; i++)
+            {
+                if (i != minI &&
+                    ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) ||
+                     (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
+                    covers[i].excluded = 1;
+            }
+        }
+        else
+            break;
+    }
+
+    /* show at least min_words if we have not marked anything */
+    if (num_f <= 0)
+    {
+        startpos = endpos = curlen = 0;
+        for (i = 0; i < prs->curwords && curlen < min_words; i++)
+        {
+            if (!NONWORDTOKEN(prs->words[i].type))
+                curlen++;
+            endpos = i;
+        }
+        mark_fragment(prs, highlight, startpos, endpos);
+    }
+    pfree(covers);
+}
+
+static void
+mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
+              int shortword, int min_words, int max_words)
+{
     int         p = 0,
                 q = 0;
     int         bestb = -1,
@@ -1707,56 +1936,9 @@ prsd_headline(PG_FUNCTION_ARGS)
                 curlen;

     int         i;
-    int         highlight = 0;
-    ListCell   *l;
-
-    /* config */
-    prs->startsel = NULL;
-    prs->stopsel = NULL;
-    foreach(l, prsoptions)
-    {
-        DefElem    *defel = (DefElem *) lfirst(l);
-        char       *val = defGetString(defel);
-
-        if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
-            max_words = pg_atoi(val, sizeof(int32), 0);
-        else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
-            min_words = pg_atoi(val, sizeof(int32), 0);
-        else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
-            shortword = pg_atoi(val, sizeof(int32), 0);
-        else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
-            prs->startsel = pstrdup(val);
-        else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
-            prs->stopsel = pstrdup(val);
-        else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
-            highlight = (pg_strcasecmp(val, "1") == 0 ||
-                         pg_strcasecmp(val, "on") == 0 ||
-                         pg_strcasecmp(val, "true") == 0 ||
-                         pg_strcasecmp(val, "t") == 0 ||
-                         pg_strcasecmp(val, "y") == 0 ||
-                         pg_strcasecmp(val, "yes") == 0);
-        else
-            ereport(ERROR,
-                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                     errmsg("unrecognized headline parameter: \"%s\"",
-                            defel->defname)));
-    }

     if (highlight == 0)
     {
-        if (min_words >= max_words)
-            ereport(ERROR,
-                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                     errmsg("MinWords should be less than MaxWords")));
-        if (min_words <= 0)
-            ereport(ERROR,
-                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                     errmsg("MinWords should be positive")));
-        if (shortword < 0)
-            ereport(ERROR,
-                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                     errmsg("ShortWord should be >= 0")));
-
         while (hlCover(prs, query, &p, &q))
         {
             /* find cover len in words */
@@ -1877,12 +2059,95 @@ prsd_headline(PG_FUNCTION_ARGS)
                 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
         }

+}
+
+Datum
+prsd_headline(PG_FUNCTION_ARGS)
+{
+    HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+    List       *prsoptions = (List *) PG_GETARG_POINTER(1);
+    TSQuery     query = PG_GETARG_TSQUERY(2);
+
+    /* from opt + start and end tag */
+    int         min_words = 15;
+    int         max_words = 35;
+    int         shortword = 3;
+    int         max_fragments = 0;
+    int         highlight = 0;
+    ListCell   *l;
+
+    /* config */
+    prs->startsel = NULL;
+    prs->stopsel = NULL;
+    foreach(l, prsoptions)
+    {
+        DefElem    *defel = (DefElem *) lfirst(l);
+        char       *val = defGetString(defel);
+
+        if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+            max_words = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+            min_words = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+            shortword = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+            max_fragments = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+            prs->startsel = pstrdup(val);
+        else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+            prs->stopsel = pstrdup(val);
+        else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
+            prs->fragdelim = pstrdup(val);
+        else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+            highlight = (pg_strcasecmp(val, "1") == 0 ||
+                         pg_strcasecmp(val, "on") == 0 ||
+                         pg_strcasecmp(val, "true") == 0 ||
+                         pg_strcasecmp(val, "t") == 0 ||
+                         pg_strcasecmp(val, "y") == 0 ||
+                         pg_strcasecmp(val, "yes") == 0);
+        else
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("unrecognized headline parameter: \"%s\"",
+                            defel->defname)));
+    }
+
+    if (highlight == 0)
+    {
+        if (min_words >= max_words)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("MinWords should be less than MaxWords")));
+        if (min_words <= 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("MinWords should be positive")));
+        if (shortword < 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("ShortWord should be >= 0")));
+        if (max_fragments < 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("MaxFragments should be >= 0")));
+    }
+
+    if (max_fragments == 0)
+        /* call the default headline generator */
+        mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+    else
+        mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+
     if (!prs->startsel)
         prs->startsel = pstrdup("<b>");
     if (!prs->stopsel)
         prs->stopsel = pstrdup("</b>");
+    if (!prs->fragdelim)
+        prs->fragdelim = pstrdup(" ... ");
     prs->startsellen = strlen(prs->startsel);
     prs->stopsellen = strlen(prs->stopsel);
+    prs->fragdelimlen = strlen(prs->fragdelim);

     PG_RETURN_POINTER(prs);
 }
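
As the dispatch in prsd_headline above shows, MaxFragments=0 (the default) keeps the pre-existing whole-cover headline via mark_hl_words, while any positive value switches to the new fragment generator. A hedged illustration of the two paths (again, documents and body are hypothetical names):

-- Default path: a single contiguous headline (mark_hl_words).
SELECT ts_headline('english', body, to_tsquery('english', 'water & drink'))
  FROM documents;

-- Fragment path: up to two excerpts (mark_hl_fragments), separated by the
-- default " ... " delimiter unless FragmentDelimiter is given.
SELECT ts_headline('english', body, to_tsquery('english', 'water & drink'),
                   'MaxFragments=2')
  FROM documents;
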

src/include/tsearch/ts_public.h

@@ -6,7 +6,7 @@
 *
 * Copyright (c) 1998-2008, PostgreSQL Global Development Group
 *
-* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.10 2008/06/18 18:42:54 momjian Exp $
+* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.11 2008/10/17 18:05:19 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -52,8 +52,10 @@ typedef struct
     int4        curwords;
     char       *startsel;
     char       *stopsel;
+    char       *fragdelim;
     int2        startsellen;
     int2        stopsellen;
+    int2        fragdelimlen;
 } HeadlineParsedText;

 /*

src/test/regress/expected/tsearch.out

@@ -632,6 +632,98 @@ to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
 </html>
 (1 row)

+--Check if headline fragments work
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'ocean'), 'MaxFragments=1');
+            ts_headline
+------------------------------------
+ after day,
+ We stuck, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted <b>Ocean</b>.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where,
+ Nor any drop
+(1 row)
+
+--Check if more than one fragment is displayed
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
+                 ts_headline
+----------------------------------------------
+ after day, day after day,
+ We <b>stuck</b>, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where ... drop to drink.
+ S. T. <b>Coleridge</b>
+(1 row)
+
+--Fragments when not all query words are in the document
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
+            ts_headline
+------------------------------------
+
+ Day after day, day after day,
+ We stuck, nor breath nor motion,
+ As idle as
+(1 row)
+
+--FragmentDelimiter option
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
+                ts_headline
+--------------------------------------------
+ after day, day after day,
+ We <b>stuck</b>, nor breath nor motion,
+ As idle as a painted Ship
+ Upon a painted Ocean.
+ Water, water, every where
+ And all the boards did shrink;
+ Water, water, every where***drop to drink.
+ S. T. <b>Coleridge</b>
+(1 row)
+
 --Rewrite sub system
 CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
 \set ECHO none

src/test/regress/sql/tsearch.sql

@@ -208,6 +208,58 @@ ff-bg
 </html>',
 to_tsquery('english', 'sea&foo'), 'HighlightAll=true');

+--Check if headline fragments work
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'ocean'), 'MaxFragments=1');
+
+--Check if more than one fragment is displayed
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
+
+--Fragments when not all query words are in the document
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
+
+--FragmentDelimiter option
+SELECT ts_headline('english', '
+Day after day, day after day,
+We stuck, nor breath nor motion,
+As idle as a painted Ship
+Upon a painted Ocean.
+Water, water, every where
+And all the boards did shrink;
+Water, water, every where,
+Nor any drop to drink.
+S. T. Coleridge (1772-1834)
+', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
+
 --Rewrite sub system

 CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);