1 Eliminate duplicate field HLWORD->skip
2 Rework support for html tags in parser
3 Add HighlightAll to headline function for generating highlighted whole text with saved html tags
This commit is contained in:
parent
e48cfacb84
commit
bb89237531
@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
|
||||
12 |
|
||||
1 | asdf
|
||||
12 |
|
||||
13 |
|
||||
13 | <fr>
|
||||
1 | qwer
|
||||
12 |
|
||||
1 | jf
|
||||
12 |
|
||||
1 | sdjk
|
||||
13 |
|
||||
13 | <we hjwer <werrwe>
|
||||
12 |
|
||||
3 | ewr1
|
||||
12 | >
|
||||
12 |
|
||||
3 | ewri2
|
||||
12 |
|
||||
13 |
|
||||
13 | <a href="qwe<qwe>">
|
||||
12 |
|
||||
|
||||
19 | /usr/local/fff
|
||||
@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
|
||||
22 | 234
|
||||
12 |
|
||||
|
||||
13 |
|
||||
13 | <i <b>
|
||||
12 |
|
||||
1 | wow
|
||||
12 |
|
||||
@ -2130,6 +2130,35 @@ A thousand years to trace
|
||||
The granite features of this cliff
|
||||
(1 row)
|
||||
|
||||
select headline('
|
||||
<html>
|
||||
<!-- some comment -->
|
||||
<body>
|
||||
Sea view wow <u>foo bar</u> <i>qq</i>
|
||||
<a href="http://www.google.com/foo.bar.html" target="_blank">YES </a>
|
||||
ff-bg
|
||||
<script>
|
||||
document.write(15);
|
||||
</script>
|
||||
</body>
|
||||
</html>',
|
||||
to_tsquery('sea&foo'), 'HighlightAll=true');
|
||||
headline
|
||||
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
<html>
|
||||
<!-- some comment -->
|
||||
<body>
|
||||
<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i>
|
||||
<a href="http://www.google.com/foo.bar.html" target="_blank">YES </a>
|
||||
ff-bg
|
||||
<script>
|
||||
document.write(15);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
(1 row)
|
||||
|
||||
--check debug
|
||||
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
|
||||
ts_name | tok_type | description | token | dict_name | tsvector
|
||||
|
@ -253,6 +253,20 @@ The sculpture of these granite seams,
|
||||
Upon a woman s face. E. J. Pratt (1882 1964)
|
||||
', to_tsquery('sea'));
|
||||
|
||||
|
||||
select headline('
|
||||
<html>
|
||||
<!-- some comment -->
|
||||
<body>
|
||||
Sea view wow <u>foo bar</u> <i>qq</i>
|
||||
<a href="http://www.google.com/foo.bar.html" target="_blank">YES </a>
|
||||
ff-bg
|
||||
<script>
|
||||
document.write(15);
|
||||
</script>
|
||||
</body>
|
||||
</html>',
|
||||
to_tsquery('sea&foo'), 'HighlightAll=true');
|
||||
--check debug
|
||||
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
|
||||
|
||||
|
@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs)
|
||||
ptr = ((char *) out) + dist;
|
||||
}
|
||||
|
||||
if (wrd->in && !wrd->skip && !wrd->repeated)
|
||||
if (wrd->in && !wrd->repeated)
|
||||
{
|
||||
if (wrd->replace)
|
||||
{
|
||||
@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs)
|
||||
ptr += prs->stopsellen;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else
|
||||
|
||||
if (!wrd->repeated)
|
||||
pfree(wrd->word);
|
||||
|
@ -46,13 +46,13 @@ typedef struct
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint16 len;
|
||||
uint8 selected:1,
|
||||
uint32 selected:1,
|
||||
in:1,
|
||||
skip:1,
|
||||
replace:1,
|
||||
repeated:1;
|
||||
uint8 type;
|
||||
repeated:1,
|
||||
unused:4,
|
||||
type:8,
|
||||
len:16;
|
||||
char *word;
|
||||
ITEM *item;
|
||||
} HLWORD;
|
||||
|
@ -10,10 +10,48 @@
|
||||
|
||||
char *token = NULL; /* pointer to token */
|
||||
int tokenlen;
|
||||
char *s = NULL; /* to return WHOLE hyphenated-word */
|
||||
static char *s = NULL; /* to return WHOLE hyphenated-word */
|
||||
|
||||
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
|
||||
|
||||
/*
 * Growable byte buffer used to accumulate the text of the tag (or
 * comment/script section) currently being scanned, so it can be handed
 * back to the caller as a single token.
 */
typedef struct {
	int		tlen;	/* number of bytes allocated for str */
	int		clen;	/* number of bytes stored in str, excluding the trailing '\0' */
	char   *str;	/* heap buffer; NULL until startTag() first allocates it */
} TagStorage;

/* Single file-local accumulator, reused across tags. */
static TagStorage ts = { 0, 0, NULL };
|
||||
|
||||
static void
|
||||
addTag() {
|
||||
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
|
||||
ts.tlen*=2;
|
||||
ts.str=realloc(ts.str,ts.tlen);
|
||||
if (!ts.str)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
}
|
||||
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
|
||||
ts.clen+=tsearch2_yyleng;
|
||||
ts.str[ts.clen]='\0';
|
||||
}
|
||||
|
||||
static void
|
||||
startTag() {
|
||||
if ( ts.str==NULL ) {
|
||||
ts.tlen=tsearch2_yyleng+1;
|
||||
ts.str=malloc(ts.tlen);
|
||||
if (!ts.str)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
}
|
||||
ts.clen=0;
|
||||
ts.str[0]='\0';
|
||||
addTag();
|
||||
}
|
||||
|
||||
%}
|
||||
|
||||
%option 8bit
|
||||
@ -46,47 +84,46 @@ URI [-_[:alnum:]/%,\.;=&?#]+
|
||||
|
||||
%%
|
||||
|
||||
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
|
||||
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
|
||||
|
||||
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
|
||||
BEGIN INITIAL;
|
||||
*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
"<!--" { BEGIN INCOMMENT; }
|
||||
|
||||
<INCOMMENT>"-->" {
|
||||
BEGIN INITIAL;
|
||||
*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
|
||||
"<"[\![:alpha:]] { BEGIN INTAG; }
|
||||
|
||||
"</"[[:alpha:]] { BEGIN INTAG; }
|
||||
|
||||
<INTAG>"\"" { BEGIN QINTAG; }
|
||||
|
||||
<QINTAG>"\\\"" ;
|
||||
|
||||
<QINTAG>"\"" { BEGIN INTAG; }
|
||||
|
||||
<INTAG>">" {
|
||||
BEGIN INITIAL;
|
||||
token = tsearch2_yytext;
|
||||
*tsearch2_yytext=' ';
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = 1;
|
||||
addTag();
|
||||
token = ts.str;
|
||||
tokenlen = ts.clen;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
|
||||
"<!--" { BEGIN INCOMMENT; startTag(); }
|
||||
|
||||
<INCOMMENT>"-->" {
|
||||
BEGIN INITIAL;
|
||||
addTag();
|
||||
token = ts.str;
|
||||
tokenlen = ts.clen;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
|
||||
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
|
||||
|
||||
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
|
||||
|
||||
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
|
||||
|
||||
<QINTAG>"\\\"" { addTag(); }
|
||||
|
||||
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
|
||||
|
||||
<INTAG>">" {
|
||||
BEGIN INITIAL;
|
||||
addTag();
|
||||
token = ts.str;
|
||||
tokenlen = ts.clen;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
|
||||
|
||||
\&(quot|amp|nbsp|lt|gt)\; {
|
||||
token = tsearch2_yytext;
|
||||
@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) {
|
||||
tsearch2_yy_switch_to_buffer( buf );
|
||||
BEGIN INITIAL;
|
||||
}
|
||||
|
||||
|
@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
 * Classification of parser token type ids.  The ids are raw numbers here;
 * NOTE(review): 12 and 13 appear to be the blank and html-tag types, judging
 * by the parser regression output -- confirm against the parser's type list.
 */

/* token types 12, 13, 14 and 23 */
#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )

/* same set as HLIDIGNORE but without type 13; used when HighlightAll is on */
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )

/* token types whose words get the `replace` flag in ordinary headline mode */
#define HLIDIGNORE(x) ( (x)==13 || HTMLHLIDIGNORE(x) )

/* tokens that do not count as words when measuring a cover's length */
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )

/* tokens the headline code treats as unsuitable for ending a fragment */
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
|
||||
|
||||
@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS)
|
||||
curlen;
|
||||
|
||||
int i;
|
||||
int highlight=0;
|
||||
|
||||
/* config */
|
||||
prs->startsel = NULL;
|
||||
@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS)
|
||||
prs->startsel = pstrdup(mptr->value);
|
||||
else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
|
||||
prs->stopsel = pstrdup(mptr->value);
|
||||
else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
|
||||
highlight = (
|
||||
pg_strcasecmp(mptr->value, "1")==0 ||
|
||||
pg_strcasecmp(mptr->value, "on")==0 ||
|
||||
pg_strcasecmp(mptr->value, "true")==0 ||
|
||||
pg_strcasecmp(mptr->value, "t")==0 ||
|
||||
pg_strcasecmp(mptr->value, "y")==0 ||
|
||||
pg_strcasecmp(mptr->value, "yes")==0 ) ?
|
||||
1 : 0;
|
||||
|
||||
pfree(mptr->key);
|
||||
pfree(mptr->value);
|
||||
@ -228,124 +239,133 @@ prsd_headline(PG_FUNCTION_ARGS)
|
||||
}
|
||||
pfree(map);
|
||||
|
||||
if (min_words >= max_words)
|
||||
ereport(ERROR,
|
||||
if (highlight==0) {
|
||||
if (min_words >= max_words)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("MinWords should be less than MaxWords")));
|
||||
if (min_words <= 0)
|
||||
ereport(ERROR,
|
||||
if (min_words <= 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("MinWords should be positive")));
|
||||
if (shortword < 0)
|
||||
ereport(ERROR,
|
||||
if (shortword < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("ShortWord should be >= 0")));
|
||||
}
|
||||
}
|
||||
|
||||
while (hlCover(prs, query, &p, &q))
|
||||
{
|
||||
/* find cover len in words */
|
||||
curlen = 0;
|
||||
poslen = 0;
|
||||
for (i = p; i <= q && curlen < max_words; i++)
|
||||
if (highlight==0) {
|
||||
while (hlCover(prs, query, &p, &q))
|
||||
{
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen++;
|
||||
pose = i;
|
||||
}
|
||||
|
||||
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
|
||||
{
|
||||
/* best already finded, so try one more cover */
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
posb=p;
|
||||
if (curlen < max_words)
|
||||
{ /* find good end */
|
||||
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
|
||||
/* find cover len in words */
|
||||
curlen = 0;
|
||||
poslen = 0;
|
||||
for (i = p; i <= q && curlen < max_words; i++)
|
||||
{
|
||||
if (i != q)
|
||||
{
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen++;
|
||||
}
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen++;
|
||||
pose = i;
|
||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
||||
continue;
|
||||
if (curlen >= min_words)
|
||||
break;
|
||||
}
|
||||
if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
|
||||
for(i=p; i>= 0; i--) {
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen++;
|
||||
|
||||
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
|
||||
{
|
||||
/* best already finded, so try one more cover */
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
posb=p;
|
||||
if (curlen < max_words)
|
||||
{ /* find good end */
|
||||
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
|
||||
{
|
||||
if (i != q)
|
||||
{
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen++;
|
||||
}
|
||||
pose = i;
|
||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
||||
continue;
|
||||
if (curlen >= min_words)
|
||||
break;
|
||||
}
|
||||
posb=(i>=0) ? i : 0;
|
||||
if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
|
||||
for(i=p; i>= 0; i--) {
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen++;
|
||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
||||
continue;
|
||||
if (curlen >= min_words)
|
||||
break;
|
||||
}
|
||||
posb=(i>=0) ? i : 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{ /* shorter cover :((( */
|
||||
for (; curlen > min_words; i--)
|
||||
{
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen--;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen--;
|
||||
pose = i;
|
||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
||||
continue;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
|
||||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
|
||||
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
|
||||
{
|
||||
bestb = posb;
|
||||
beste = pose;
|
||||
bestlen = poslen;
|
||||
}
|
||||
|
||||
p++;
|
||||
}
|
||||
else
|
||||
{ /* shorter cover :((( */
|
||||
for (; curlen > min_words; i--)
|
||||
|
||||
if (bestlen < 0)
|
||||
{
|
||||
curlen = 0;
|
||||
for (i = 0; i < prs->curwords && curlen < min_words; i++)
|
||||
{
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen--;
|
||||
if (prs->words[i].item && !prs->words[i].repeated)
|
||||
poslen--;
|
||||
curlen++;
|
||||
pose = i;
|
||||
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
||||
continue;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
|
||||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
|
||||
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
|
||||
{
|
||||
bestb = posb;
|
||||
bestb = 0;
|
||||
beste = pose;
|
||||
bestlen = poslen;
|
||||
}
|
||||
|
||||
p++;
|
||||
}
|
||||
|
||||
if (bestlen < 0)
|
||||
{
|
||||
curlen = 0;
|
||||
poslen = 0;
|
||||
for (i = 0; i < prs->curwords && curlen < min_words; i++)
|
||||
{
|
||||
if (!NONWORDTOKEN(prs->words[i].type))
|
||||
curlen++;
|
||||
pose = i;
|
||||
}
|
||||
bestb = 0;
|
||||
beste = pose;
|
||||
} else {
|
||||
bestb=0;
|
||||
beste=prs->curwords-1;
|
||||
}
|
||||
|
||||
for (i = bestb; i <= beste; i++)
|
||||
{
|
||||
if (prs->words[i].item)
|
||||
prs->words[i].selected = 1;
|
||||
if (prs->words[i].repeated)
|
||||
prs->words[i].skip = 1;
|
||||
if (HLIDIGNORE(prs->words[i].type))
|
||||
prs->words[i].replace = 1;
|
||||
if ( highlight==0 ) {
|
||||
if (HLIDIGNORE(prs->words[i].type))
|
||||
prs->words[i].replace = 1;
|
||||
} else {
|
||||
if (HTMLHLIDIGNORE(prs->words[i].type))
|
||||
prs->words[i].replace = 1;
|
||||
}
|
||||
|
||||
prs->words[i].in = 1;
|
||||
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
|
||||
}
|
||||
|
||||
if (!prs->startsel)
|
||||
|
Loading…
Reference in New Issue
Block a user