Cosmetic improvements for default text search parser's ts_headline code.
This code was woefully unreadable and under-commented. Try to improve matters by adding comments, using some macros to make complicated if-tests more readable, using boolean type where appropriate, etc. There are a couple of tiny coding improvements too, but this commit includes (I hope) no behavioral change. Nonetheless, back-patch as far as 9.6, because a followup bug-fixing commit depends on this. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
This commit is contained in:
parent
e92e4a2b68
commit
b10f8bb9fd
@ -1915,6 +1915,12 @@ prsd_end(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_VOID();
|
PG_RETURN_VOID();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ts_headline support begins here
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* token type classification macros */
|
||||||
#define LEAVETOKEN(x) ( (x)==SPACE )
|
#define LEAVETOKEN(x) ( (x)==SPACE )
|
||||||
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
|
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
|
||||||
#define ENDPUNCTOKEN(x) ( (x)==SPACE )
|
#define ENDPUNCTOKEN(x) ( (x)==SPACE )
|
||||||
@ -1926,23 +1932,54 @@ prsd_end(PG_FUNCTION_ARGS)
|
|||||||
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
|
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
|
||||||
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
|
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Macros useful in headline selection. These rely on availability of
|
||||||
|
* "HeadlineParsedText *prs" describing some text, and "int shortword"
|
||||||
|
* describing the "short word" length parameter.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Interesting words are non-repeated search terms */
|
||||||
|
#define INTERESTINGWORD(j) \
|
||||||
|
(prs->words[j].item && !prs->words[j].repeated)
|
||||||
|
|
||||||
|
/* Don't want to end at a non-word or a short word */
|
||||||
|
#define BADENDPOINT(j) \
|
||||||
|
(NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
|
/* one cover (well, really one fragment) for mark_hl_fragments */
|
||||||
|
int32 startpos; /* fragment's starting word index */
|
||||||
|
int32 endpos; /* ending word index (inclusive) */
|
||||||
|
int32 poslen; /* number of interesting words */
|
||||||
|
int32 curlen; /* total number of words */
|
||||||
|
bool chosen; /* chosen? */
|
||||||
|
bool excluded; /* excluded? */
|
||||||
|
} CoverPos;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
/* callback data for checkcondition_HL */
|
||||||
HeadlineWordEntry *words;
|
HeadlineWordEntry *words;
|
||||||
int len;
|
int len;
|
||||||
} hlCheck;
|
} hlCheck;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* TS_execute callback for matching a tsquery operand to headline words
|
||||||
|
*/
|
||||||
static bool
|
static bool
|
||||||
checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
|
checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
hlCheck *checkval = (hlCheck *) opaque;
|
hlCheck *checkval = (hlCheck *) opaque;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/* scan words array for matching items */
|
||||||
for (i = 0; i < checkval->len; i++)
|
for (i = 0; i < checkval->len; i++)
|
||||||
{
|
{
|
||||||
if (checkval->words[i].item == val)
|
if (checkval->words[i].item == val)
|
||||||
{
|
{
|
||||||
/* don't need to find all positions */
|
/* if data == NULL, don't need to report positions */
|
||||||
if (!data)
|
if (!data)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
@ -2038,8 +2075,14 @@ hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Apply suitable highlight marking to words selected by headline selector
|
||||||
|
*
|
||||||
|
* The words from startpos to endpos inclusive are marked per highlightall
|
||||||
|
*/
|
||||||
static void
|
static void
|
||||||
mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
|
mark_fragment(HeadlineParsedText *prs, bool highlightall,
|
||||||
|
int startpos, int endpos)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
@ -2047,7 +2090,7 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
|
|||||||
{
|
{
|
||||||
if (prs->words[i].item)
|
if (prs->words[i].item)
|
||||||
prs->words[i].selected = 1;
|
prs->words[i].selected = 1;
|
||||||
if (highlight == 0)
|
if (!highlightall)
|
||||||
{
|
{
|
||||||
if (HLIDREPLACE(prs->words[i].type))
|
if (HLIDREPLACE(prs->words[i].type))
|
||||||
prs->words[i].replace = 1;
|
prs->words[i].replace = 1;
|
||||||
@ -2064,16 +2107,15 @@ mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef struct
|
/*
|
||||||
{
|
* split a cover substring into fragments not longer than max_words
|
||||||
int32 startpos;
|
*
|
||||||
int32 endpos;
|
* At entry, *startpos and *endpos are the (remaining) bounds of the cover
|
||||||
int32 poslen;
|
* substring. They are updated to hold the bounds of the next fragment.
|
||||||
int32 curlen;
|
*
|
||||||
int16 in;
|
* *curlen and *poslen are set to the fragment's length, in words and
|
||||||
int16 excluded;
|
* interesting words respectively.
|
||||||
} CoverPos;
|
*/
|
||||||
|
|
||||||
static void
|
static void
|
||||||
get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
|
get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
|
||||||
int *curlen, int *poslen, int max_words)
|
int *curlen, int *poslen, int max_words)
|
||||||
@ -2081,17 +2123,17 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
|
|||||||
int i;
|
int i;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Objective: Generate a fragment of words between startpos and endpos
|
* Objective: select a fragment of words between startpos and endpos such
|
||||||
* such that it has at most max_words and both ends has query words. If
|
* that it has at most max_words and both ends have query words. If the
|
||||||
* the startpos and endpos are the endpoints of the cover and the cover
|
* startpos and endpos are the endpoints of the cover and the cover has
|
||||||
* has fewer words than max_words, then this function should just return
|
* fewer words than max_words, then this function should just return the
|
||||||
* the cover
|
* cover
|
||||||
*/
|
*/
|
||||||
/* first move startpos to an item */
|
/* first move startpos to an item */
|
||||||
for (i = *startpos; i <= *endpos; i++)
|
for (i = *startpos; i <= *endpos; i++)
|
||||||
{
|
{
|
||||||
*startpos = i;
|
*startpos = i;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* cut endpos to have only max_words */
|
/* cut endpos to have only max_words */
|
||||||
@ -2101,7 +2143,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
|
|||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
*curlen += 1;
|
*curlen += 1;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
*poslen += 1;
|
*poslen += 1;
|
||||||
}
|
}
|
||||||
/* if the cover was cut then move back endpos to a query item */
|
/* if the cover was cut then move back endpos to a query item */
|
||||||
@ -2111,7 +2153,7 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
|
|||||||
for (i = *endpos; i >= *startpos; i--)
|
for (i = *endpos; i >= *startpos; i--)
|
||||||
{
|
{
|
||||||
*endpos = i;
|
*endpos = i;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
break;
|
break;
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
*curlen -= 1;
|
*curlen -= 1;
|
||||||
@ -2119,8 +2161,14 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Headline selector used when MaxFragments > 0
|
||||||
|
*
|
||||||
|
* Note: in this mode, highlightall is disregarded for phrase selection;
|
||||||
|
* it only controls presentation details.
|
||||||
|
*/
|
||||||
static void
|
static void
|
||||||
mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
|
||||||
int shortword, int min_words,
|
int shortword, int min_words,
|
||||||
int max_words, int max_fragments)
|
int max_words, int max_fragments)
|
||||||
{
|
{
|
||||||
@ -2156,7 +2204,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Break the cover into smaller fragments such that each fragment has
|
* Break the cover into smaller fragments such that each fragment has
|
||||||
* at most max_words. Also ensure that each end of the fragment is a
|
* at most max_words. Also ensure that each end of each fragment is a
|
||||||
* query word. This will allow us to stretch the fragment in either
|
* query word. This will allow us to stretch the fragment in either
|
||||||
* direction
|
* direction
|
||||||
*/
|
*/
|
||||||
@ -2173,12 +2221,13 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
covers[numcovers].endpos = endpos;
|
covers[numcovers].endpos = endpos;
|
||||||
covers[numcovers].curlen = curlen;
|
covers[numcovers].curlen = curlen;
|
||||||
covers[numcovers].poslen = poslen;
|
covers[numcovers].poslen = poslen;
|
||||||
covers[numcovers].in = 0;
|
covers[numcovers].chosen = false;
|
||||||
covers[numcovers].excluded = 0;
|
covers[numcovers].excluded = false;
|
||||||
numcovers++;
|
numcovers++;
|
||||||
startpos = endpos + 1;
|
startpos = endpos + 1;
|
||||||
endpos = q;
|
endpos = q;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* move p to generate the next cover */
|
/* move p to generate the next cover */
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
@ -2196,9 +2245,10 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
*/
|
*/
|
||||||
for (i = 0; i < numcovers; i++)
|
for (i = 0; i < numcovers; i++)
|
||||||
{
|
{
|
||||||
if (!covers[i].in && !covers[i].excluded &&
|
if (!covers[i].chosen && !covers[i].excluded &&
|
||||||
(maxitems < covers[i].poslen || (maxitems == covers[i].poslen
|
(maxitems < covers[i].poslen ||
|
||||||
&& minwords > covers[i].curlen)))
|
(maxitems == covers[i].poslen &&
|
||||||
|
minwords > covers[i].curlen)))
|
||||||
{
|
{
|
||||||
maxitems = covers[i].poslen;
|
maxitems = covers[i].poslen;
|
||||||
minwords = covers[i].curlen;
|
minwords = covers[i].curlen;
|
||||||
@ -2208,7 +2258,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
/* if a cover was found mark it */
|
/* if a cover was found mark it */
|
||||||
if (minI >= 0)
|
if (minI >= 0)
|
||||||
{
|
{
|
||||||
covers[minI].in = 1;
|
covers[minI].chosen = true;
|
||||||
/* adjust the size of cover */
|
/* adjust the size of cover */
|
||||||
startpos = covers[minI].startpos;
|
startpos = covers[minI].startpos;
|
||||||
endpos = covers[minI].endpos;
|
endpos = covers[minI].endpos;
|
||||||
@ -2235,8 +2285,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
}
|
}
|
||||||
posmarker = i;
|
posmarker = i;
|
||||||
}
|
}
|
||||||
/* cut back startpos till we find a non short token */
|
/* cut back startpos till we find a good endpoint */
|
||||||
for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
|
for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
|
||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
curlen--;
|
curlen--;
|
||||||
@ -2250,8 +2300,8 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
curlen++;
|
curlen++;
|
||||||
posmarker = i;
|
posmarker = i;
|
||||||
}
|
}
|
||||||
/* cut back endpos till we find a non-short token */
|
/* cut back endpos till we find a good endpoint */
|
||||||
for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
|
for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
|
||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
curlen--;
|
curlen--;
|
||||||
@ -2262,20 +2312,24 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
covers[minI].endpos = endpos;
|
covers[minI].endpos = endpos;
|
||||||
covers[minI].curlen = curlen;
|
covers[minI].curlen = curlen;
|
||||||
/* Mark the chosen fragments (covers) */
|
/* Mark the chosen fragments (covers) */
|
||||||
mark_fragment(prs, highlight, startpos, endpos);
|
mark_fragment(prs, highlightall, startpos, endpos);
|
||||||
num_f++;
|
num_f++;
|
||||||
/* exclude overlapping covers */
|
/* exclude overlapping covers */
|
||||||
for (i = 0; i < numcovers; i++)
|
for (i = 0; i < numcovers; i++)
|
||||||
{
|
{
|
||||||
if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
|
if (i != minI &&
|
||||||
covers[i].excluded = 1;
|
((covers[i].startpos >= covers[minI].startpos &&
|
||||||
|
covers[i].startpos <= covers[minI].endpos) ||
|
||||||
|
(covers[i].endpos >= covers[minI].startpos &&
|
||||||
|
covers[i].endpos <= covers[minI].endpos)))
|
||||||
|
covers[i].excluded = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* show at least min_words we have not marked anything */
|
/* show at least min_words if we have not marked anything */
|
||||||
if (num_f <= 0)
|
if (num_f <= 0)
|
||||||
{
|
{
|
||||||
startpos = endpos = curlen = 0;
|
startpos = endpos = curlen = 0;
|
||||||
@ -2285,13 +2339,17 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
curlen++;
|
curlen++;
|
||||||
endpos = i;
|
endpos = i;
|
||||||
}
|
}
|
||||||
mark_fragment(prs, highlight, startpos, endpos);
|
mark_fragment(prs, highlightall, startpos, endpos);
|
||||||
}
|
}
|
||||||
|
|
||||||
pfree(covers);
|
pfree(covers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Headline selector used when MaxFragments == 0
|
||||||
|
*/
|
||||||
static void
|
static void
|
||||||
mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
|
mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
|
||||||
int shortword, int min_words, int max_words)
|
int shortword, int min_words, int max_words)
|
||||||
{
|
{
|
||||||
int p = 0,
|
int p = 0,
|
||||||
@ -2299,66 +2357,81 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
int bestb = -1,
|
int bestb = -1,
|
||||||
beste = -1;
|
beste = -1;
|
||||||
int bestlen = -1;
|
int bestlen = -1;
|
||||||
int pose = 0,
|
int pose,
|
||||||
posb,
|
posb,
|
||||||
poslen,
|
poslen,
|
||||||
curlen;
|
curlen;
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (highlight == 0)
|
if (!highlightall)
|
||||||
{
|
{
|
||||||
|
/* examine all covers, select a headline using the best one */
|
||||||
while (hlCover(prs, query, &p, &q))
|
while (hlCover(prs, query, &p, &q))
|
||||||
{
|
{
|
||||||
/* find cover len in words */
|
/*
|
||||||
|
* Count words (curlen) and interesting words (poslen) within
|
||||||
|
* cover, but stop once we reach max_words. This step doesn't
|
||||||
|
* consider whether that's a good stopping point. posb and pose
|
||||||
|
* are set to the start and end indexes of the possible headline.
|
||||||
|
*/
|
||||||
curlen = 0;
|
curlen = 0;
|
||||||
poslen = 0;
|
poslen = 0;
|
||||||
|
posb = pose = p;
|
||||||
for (i = p; i <= q && curlen < max_words; i++)
|
for (i = p; i <= q && curlen < max_words; i++)
|
||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
curlen++;
|
curlen++;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
poslen++;
|
poslen++;
|
||||||
pose = i;
|
pose = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
|
/* XXX this optimization seems unnecessary and wrong */
|
||||||
|
if (poslen < bestlen && !BADENDPOINT(beste))
|
||||||
{
|
{
|
||||||
/* best already found, so try one more cover */
|
/* better cover already found, so try next cover */
|
||||||
p++;
|
p++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
posb = p;
|
|
||||||
if (curlen < max_words)
|
if (curlen < max_words)
|
||||||
{ /* find good end */
|
{
|
||||||
|
/*
|
||||||
|
* We have room to lengthen the headline, so search forward
|
||||||
|
* until it's full or we find a good stopping point. We'll
|
||||||
|
* reconsider the word at "q", then move forward.
|
||||||
|
*/
|
||||||
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
|
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
|
||||||
{
|
{
|
||||||
if (i != q)
|
if (i > q)
|
||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
curlen++;
|
curlen++;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
poslen++;
|
poslen++;
|
||||||
}
|
}
|
||||||
pose = i;
|
pose = i;
|
||||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
if (BADENDPOINT(i))
|
||||||
continue;
|
continue;
|
||||||
if (curlen >= min_words)
|
if (curlen >= min_words)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (curlen < min_words && i >= prs->curwords)
|
if (curlen < min_words)
|
||||||
{ /* got end of text and our cover is shorter
|
{
|
||||||
* than min_words */
|
/*
|
||||||
|
* Reached end of text and our headline is still shorter
|
||||||
|
* than min_words, so try to extend it to the left.
|
||||||
|
*/
|
||||||
for (i = p - 1; i >= 0; i--)
|
for (i = p - 1; i >= 0; i--)
|
||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
curlen++;
|
curlen++;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
poslen++;
|
poslen++;
|
||||||
if (curlen >= max_words)
|
if (curlen >= max_words)
|
||||||
break;
|
break;
|
||||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
if (BADENDPOINT(i))
|
||||||
continue;
|
continue;
|
||||||
if (curlen >= min_words)
|
if (curlen >= min_words)
|
||||||
break;
|
break;
|
||||||
@ -2367,34 +2440,48 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{ /* shorter cover :((( */
|
{
|
||||||
|
/*
|
||||||
|
* Can't make headline longer, so consider making it shorter
|
||||||
|
* if needed to avoid a bad endpoint.
|
||||||
|
*/
|
||||||
if (i > q)
|
if (i > q)
|
||||||
i = q;
|
i = q;
|
||||||
for (; curlen > min_words; i--)
|
for (; curlen > min_words; i--)
|
||||||
{
|
{
|
||||||
if (!NONWORDTOKEN(prs->words[i].type))
|
if (!NONWORDTOKEN(prs->words[i].type))
|
||||||
curlen--;
|
curlen--;
|
||||||
if (prs->words[i].item && !prs->words[i].repeated)
|
if (INTERESTINGWORD(i))
|
||||||
poslen--;
|
poslen--;
|
||||||
pose = i;
|
pose = i;
|
||||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
if (!BADENDPOINT(i))
|
||||||
continue;
|
break;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
|
/*
|
||||||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
|
* Adopt this headline if it's the first, or if it has more
|
||||||
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
|
* interesting words and isn't ending at a bad endpoint, or if it
|
||||||
|
* replaces a bad endpoint with a good one (XXX even if it has
|
||||||
|
* fewer interesting words? Really?)
|
||||||
|
*/
|
||||||
|
if (bestlen < 0 ||
|
||||||
|
(poslen > bestlen && !BADENDPOINT(pose)) ||
|
||||||
|
(!BADENDPOINT(pose) && BADENDPOINT(beste)))
|
||||||
{
|
{
|
||||||
bestb = posb;
|
bestb = posb;
|
||||||
beste = pose;
|
beste = pose;
|
||||||
bestlen = poslen;
|
bestlen = poslen;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* move p to generate the next cover */
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we found nothing acceptable, select min_words words starting at
|
||||||
|
* the beginning.
|
||||||
|
*/
|
||||||
if (bestlen < 0)
|
if (bestlen < 0)
|
||||||
{
|
{
|
||||||
curlen = 0;
|
curlen = 0;
|
||||||
@ -2410,32 +2497,17 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
/* highlightall mode: headline is whole document */
|
||||||
bestb = 0;
|
bestb = 0;
|
||||||
beste = prs->curwords - 1;
|
beste = prs->curwords - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = bestb; i <= beste; i++)
|
mark_fragment(prs, highlightall, bestb, beste);
|
||||||
{
|
|
||||||
if (prs->words[i].item)
|
|
||||||
prs->words[i].selected = 1;
|
|
||||||
if (highlight == 0)
|
|
||||||
{
|
|
||||||
if (HLIDREPLACE(prs->words[i].type))
|
|
||||||
prs->words[i].replace = 1;
|
|
||||||
else if (HLIDSKIP(prs->words[i].type))
|
|
||||||
prs->words[i].skip = 1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (XMLHLIDSKIP(prs->words[i].type))
|
|
||||||
prs->words[i].skip = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Default parser's prsheadline function
|
||||||
|
*/
|
||||||
Datum
|
Datum
|
||||||
prsd_headline(PG_FUNCTION_ARGS)
|
prsd_headline(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
@ -2443,17 +2515,18 @@ prsd_headline(PG_FUNCTION_ARGS)
|
|||||||
List *prsoptions = (List *) PG_GETARG_POINTER(1);
|
List *prsoptions = (List *) PG_GETARG_POINTER(1);
|
||||||
TSQuery query = PG_GETARG_TSQUERY(2);
|
TSQuery query = PG_GETARG_TSQUERY(2);
|
||||||
|
|
||||||
/* from opt + start and end tag */
|
/* default option values: */
|
||||||
int min_words = 15;
|
int min_words = 15;
|
||||||
int max_words = 35;
|
int max_words = 35;
|
||||||
int shortword = 3;
|
int shortword = 3;
|
||||||
int max_fragments = 0;
|
int max_fragments = 0;
|
||||||
int highlight = 0;
|
bool highlightall = false;
|
||||||
ListCell *l;
|
ListCell *l;
|
||||||
|
|
||||||
/* config */
|
/* Extract configuration option values */
|
||||||
prs->startsel = NULL;
|
prs->startsel = NULL;
|
||||||
prs->stopsel = NULL;
|
prs->stopsel = NULL;
|
||||||
|
prs->fragdelim = NULL;
|
||||||
foreach(l, prsoptions)
|
foreach(l, prsoptions)
|
||||||
{
|
{
|
||||||
DefElem *defel = (DefElem *) lfirst(l);
|
DefElem *defel = (DefElem *) lfirst(l);
|
||||||
@ -2474,12 +2547,12 @@ prsd_headline(PG_FUNCTION_ARGS)
|
|||||||
else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
|
else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
|
||||||
prs->fragdelim = pstrdup(val);
|
prs->fragdelim = pstrdup(val);
|
||||||
else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
|
else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
|
||||||
highlight = (pg_strcasecmp(val, "1") == 0 ||
|
highlightall = (pg_strcasecmp(val, "1") == 0 ||
|
||||||
pg_strcasecmp(val, "on") == 0 ||
|
pg_strcasecmp(val, "on") == 0 ||
|
||||||
pg_strcasecmp(val, "true") == 0 ||
|
pg_strcasecmp(val, "true") == 0 ||
|
||||||
pg_strcasecmp(val, "t") == 0 ||
|
pg_strcasecmp(val, "t") == 0 ||
|
||||||
pg_strcasecmp(val, "y") == 0 ||
|
pg_strcasecmp(val, "y") == 0 ||
|
||||||
pg_strcasecmp(val, "yes") == 0);
|
pg_strcasecmp(val, "yes") == 0);
|
||||||
else
|
else
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||||
@ -2487,7 +2560,8 @@ prsd_headline(PG_FUNCTION_ARGS)
|
|||||||
defel->defname)));
|
defel->defname)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (highlight == 0)
|
/* in HighlightAll mode these parameters are ignored */
|
||||||
|
if (!highlightall)
|
||||||
{
|
{
|
||||||
if (min_words >= max_words)
|
if (min_words >= max_words)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
@ -2507,18 +2581,23 @@ prsd_headline(PG_FUNCTION_ARGS)
|
|||||||
errmsg("MaxFragments should be >= 0")));
|
errmsg("MaxFragments should be >= 0")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Apply appropriate headline selector */
|
||||||
if (max_fragments == 0)
|
if (max_fragments == 0)
|
||||||
/* call the default headline generator */
|
mark_hl_words(prs, query, highlightall, shortword,
|
||||||
mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
|
min_words, max_words);
|
||||||
else
|
else
|
||||||
mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
|
mark_hl_fragments(prs, query, highlightall, shortword,
|
||||||
|
min_words, max_words, max_fragments);
|
||||||
|
|
||||||
|
/* Fill in default values for string options */
|
||||||
if (!prs->startsel)
|
if (!prs->startsel)
|
||||||
prs->startsel = pstrdup("<b>");
|
prs->startsel = pstrdup("<b>");
|
||||||
if (!prs->stopsel)
|
if (!prs->stopsel)
|
||||||
prs->stopsel = pstrdup("</b>");
|
prs->stopsel = pstrdup("</b>");
|
||||||
if (!prs->fragdelim)
|
if (!prs->fragdelim)
|
||||||
prs->fragdelim = pstrdup(" ... ");
|
prs->fragdelim = pstrdup(" ... ");
|
||||||
|
|
||||||
|
/* Caller will need these lengths, too */
|
||||||
prs->startsellen = strlen(prs->startsel);
|
prs->startsellen = strlen(prs->startsel);
|
||||||
prs->stopsellen = strlen(prs->stopsel);
|
prs->stopsellen = strlen(prs->stopsel);
|
||||||
prs->fragdelimlen = strlen(prs->fragdelim);
|
prs->fragdelimlen = strlen(prs->fragdelim);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user