1. Eliminate the duplicate field HLWORD->skip.

2. Rework support for HTML tags in the parser.
3. Add a HighlightAll option to the headline function for generating
   highlighted whole text with the HTML tags preserved.
This commit is contained in:
Teodor Sigaev 2004-06-28 16:19:09 +00:00
parent e48cfacb84
commit bb89237531
6 changed files with 234 additions and 133 deletions

View File

@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
12 |
1 | asdf
12 |
13 |
13 | <fr>
1 | qwer
12 |
1 | jf
12 |
1 | sdjk
13 |
13 | <we hjwer <werrwe>
12 |
3 | ewr1
12 | >
12 |
3 | ewri2
12 |
13 |
13 | <a href="qwe<qwe>">
12 |
19 | /usr/local/fff
@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
22 | 234
12 |
13 |
13 | <i <b>
12 |
1 | wow
12 |
@ -2130,6 +2130,35 @@ A thousand years to trace
The granite features of this cliff
(1 row)
select headline('
<html>
<!-- some comment -->
<body>
Sea view wow <u>foo bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>',
to_tsquery('sea&foo'), 'HighlightAll=true');
headline
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
<html>
<!-- some comment -->
<body>
<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>
(1 row)
--check debug
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
ts_name | tok_type | description | token | dict_name | tsvector

View File

@ -253,6 +253,20 @@ The sculpture of these granite seams,
Upon a woman s face. E. J. Pratt (1882 1964)
', to_tsquery('sea'));
select headline('
<html>
<!-- some comment -->
<body>
Sea view wow <u>foo bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>',
to_tsquery('sea&foo'), 'HighlightAll=true');
--check debug
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');

View File

@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs)
ptr = ((char *) out) + dist;
}
if (wrd->in && !wrd->skip && !wrd->repeated)
if (wrd->in && !wrd->repeated)
{
if (wrd->replace)
{
@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs)
ptr += prs->stopsellen;
}
}
}
} else
if (!wrd->repeated)
pfree(wrd->word);

View File

@ -46,13 +46,13 @@ typedef struct
typedef struct
{
uint16 len;
uint8 selected:1,
uint32 selected:1,
in:1,
skip:1,
replace:1,
repeated:1;
uint8 type;
repeated:1,
unused:4,
type:8,
len:16;
char *word;
ITEM *item;
} HLWORD;

View File

@ -10,10 +10,48 @@
char *token = NULL; /* pointer to token */
int tokenlen;
char *s = NULL; /* to return WHOLE hyphenated-word */
static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
/*
 * Growable, NUL-terminated buffer in which the lexer accumulates the raw
 * text of the tag (or comment / script section) currently being scanned.
 * It is filled by startTag()/addTag() and handed out as the token text.
 */
typedef struct {
/* allocated size of str, in bytes */
int tlen;
/* number of bytes currently stored in str, not counting the trailing '\0' */
int clen;
/* buffer obtained from malloc()/realloc(); NULL until the first tag is seen */
char *str;
} TagStorage;
/* single file-local accumulator, initially empty and unallocated */
static TagStorage ts={0,0,NULL};
/*
 * Append the lexer's current match (tsearch2_yyleng bytes starting at
 * tsearch2_yytext) to the tag accumulator ts, growing the buffer when
 * necessary and keeping it NUL-terminated.
 *
 * The capacity is doubled until the match plus the terminator fits.
 * If the buffer cannot be grown, an out-of-memory ERROR is reported; the
 * previously accumulated buffer is kept in ts rather than being leaked.
 */
static void
addTag() {
	/* bytes required after the append, including the trailing '\0' */
	int		need = ts.clen + tsearch2_yyleng + 1;

	if (need > ts.tlen) {
		/*
		 * Start doubling from at least 1: with an empty accumulator
		 * (tlen == 0) doubling zero would never make progress.
		 */
		int		newlen = (ts.tlen > 0) ? ts.tlen : 1;
		char   *grown;

		while (newlen < need)
			newlen *= 2;

		/*
		 * Use a temporary so that a failed realloc() does not overwrite
		 * (and thereby leak) the only pointer to the existing buffer.
		 */
		grown = realloc(ts.str, newlen);
		if (!grown)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		ts.str = grown;
		ts.tlen = newlen;
	}

	memcpy(ts.str + ts.clen, tsearch2_yytext, tsearch2_yyleng);
	ts.clen += tsearch2_yyleng;
	ts.str[ts.clen] = '\0';
}
/*
 * Begin accumulating a new tag: allocate the accumulator on first use,
 * reset it to the empty string, then store the lexer's current match as
 * the first piece of the tag text.
 *
 * Reports an out-of-memory ERROR if the initial allocation fails.
 */
static void
startTag() {
	if (!ts.str) {
		/* first use: room for the current match plus the trailing '\0' */
		ts.tlen = 1 + tsearch2_yyleng;
		ts.str = malloc(ts.tlen);
		if (ts.str == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	/* discard whatever a previous tag left behind */
	*ts.str = '\0';
	ts.clen = 0;

	addTag();
}
%}
%option 8bit
@ -46,47 +84,46 @@ URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
"<!--" { BEGIN INCOMMENT; }
<INCOMMENT>"-->" {
BEGIN INITIAL;
*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
"<"[\![:alpha:]] { BEGIN INTAG; }
"</"[[:alpha:]] { BEGIN INTAG; }
<INTAG>"\"" { BEGIN QINTAG; }
<QINTAG>"\\\"" ;
<QINTAG>"\"" { BEGIN INTAG; }
<INTAG>">" {
BEGIN INITIAL;
token = tsearch2_yytext;
*tsearch2_yytext=' ';
token = tsearch2_yytext;
tokenlen = 1;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
"<!--" { BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
<QINTAG>"\\\"" { addTag(); }
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
<INTAG>">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch2_yytext;
@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) {
tsearch2_yy_switch_to_buffer( buf );
BEGIN INITIAL;
}

View File

@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS)
#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS)
curlen;
int i;
int highlight=0;
/* config */
prs->startsel = NULL;
@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS)
prs->startsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
prs->stopsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
highlight = (
pg_strcasecmp(mptr->value, "1")==0 ||
pg_strcasecmp(mptr->value, "on")==0 ||
pg_strcasecmp(mptr->value, "true")==0 ||
pg_strcasecmp(mptr->value, "t")==0 ||
pg_strcasecmp(mptr->value, "y")==0 ||
pg_strcasecmp(mptr->value, "yes")==0 ) ?
1 : 0;
pfree(mptr->key);
pfree(mptr->value);
@ -228,124 +239,133 @@ prsd_headline(PG_FUNCTION_ARGS)
}
pfree(map);
if (min_words >= max_words)
ereport(ERROR,
if (highlight==0) {
if (min_words >= max_words)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be less than MaxWords")));
if (min_words <= 0)
ereport(ERROR,
if (min_words <= 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be positive")));
if (shortword < 0)
ereport(ERROR,
if (shortword < 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ShortWord should be >= 0")));
}
}
while (hlCover(prs, query, &p, &q))
{
/* find cover len in words */
curlen = 0;
poslen = 0;
for (i = p; i <= q && curlen < max_words; i++)
if (highlight==0) {
while (hlCover(prs, query, &p, &q))
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
pose = i;
}
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
{
/* best already found, so try one more cover */
p++;
continue;
}
posb=p;
if (curlen < max_words)
{ /* find good end */
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
/* find cover len in words */
curlen = 0;
poslen = 0;
for (i = p; i <= q && curlen < max_words; i++)
{
if (i != q)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
}
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
for(i=p; i>= 0; i--) {
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
{
/* best already found, so try one more cover */
p++;
continue;
}
posb=p;
if (curlen < max_words)
{ /* find good end */
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
{
if (i != q)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
}
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
posb=(i>=0) ? i : 0;
if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
for(i=p; i>= 0; i--) {
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
posb=(i>=0) ? i : 0;
}
}
else
{ /* shorter cover :((( */
for (; curlen > min_words; i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
if (prs->words[i].item && !prs->words[i].repeated)
poslen--;
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
break;
}
}
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
{
bestb = posb;
beste = pose;
bestlen = poslen;
}
p++;
}
else
{ /* shorter cover :((( */
for (; curlen > min_words; i--)
if (bestlen < 0)
{
curlen = 0;
for (i = 0; i < prs->curwords && curlen < min_words; i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
if (prs->words[i].item && !prs->words[i].repeated)
poslen--;
curlen++;
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
break;
}
}
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
{
bestb = posb;
bestb = 0;
beste = pose;
bestlen = poslen;
}
p++;
}
if (bestlen < 0)
{
curlen = 0;
poslen = 0;
for (i = 0; i < prs->curwords && curlen < min_words; i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
pose = i;
}
bestb = 0;
beste = pose;
} else {
bestb=0;
beste=prs->curwords-1;
}
for (i = bestb; i <= beste; i++)
{
if (prs->words[i].item)
prs->words[i].selected = 1;
if (prs->words[i].repeated)
prs->words[i].skip = 1;
if (HLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
if ( highlight==0 ) {
if (HLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
} else {
if (HTMLHLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
}
prs->words[i].in = 1;
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}
if (!prs->startsel)