From bb89237531439816c3f8d59b7b1891735e5f1f47 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 28 Jun 2004 16:19:09 +0000 Subject: [PATCH] 1 Eliminate duplicate field HLWORD->skip 2 Rework support for html tags in parser 3 add HighlightAll to headline function for generating highlighted whole text with saved html tags --- contrib/tsearch2/expected/tsearch2.out | 37 ++++- contrib/tsearch2/sql/tsearch2.sql | 14 ++ contrib/tsearch2/ts_cfg.c | 4 +- contrib/tsearch2/ts_cfg.h | 10 +- contrib/tsearch2/wordparser/parser.l | 110 +++++++++----- contrib/tsearch2/wparser_def.c | 192 ++++++++++++++----------- 6 files changed, 234 insertions(+), 133 deletions(-) diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out index fb836c087a..93fc11dad1 100644 --- a/contrib/tsearch2/expected/tsearch2.out +++ b/contrib/tsearch2/expected/tsearch2.out @@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc 12 | 1 | asdf 12 | - 13 | + 13 | 1 | qwer 12 | 1 | jf 12 | 1 | sdjk - 13 | + 13 | 12 | 3 | ewr1 12 | > 12 | 3 | ewri2 12 | - 13 | + 13 | 12 | 19 | /usr/local/fff @@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc 22 | 234 12 | - 13 | + 13 | 12 | 1 | wow 12 | @@ -2130,6 +2130,35 @@ A thousand years to trace The granite features of this cliff (1 row) +select headline(' + + + +Sea view wow foo bar qq +YES   +ff-bg + + +', +to_tsquery('sea&foo'), 'HighlightAll=true'); + headline +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + + + +Sea view wow foo bar qq +YES   + ff-bg + + + +(1 row) + --check debug select * from ts_debug('Tsearch module for PostgreSQL 7.3.3'); ts_name | tok_type | description | token | dict_name | tsvector diff --git a/contrib/tsearch2/sql/tsearch2.sql b/contrib/tsearch2/sql/tsearch2.sql index 231ddaebe5..0a980608f7 100644 --- a/contrib/tsearch2/sql/tsearch2.sql +++ b/contrib/tsearch2/sql/tsearch2.sql @@ -253,6 +253,20 @@ The sculpture of these granite seams, Upon a woman s face. E. J. Pratt (1882 1964) ', to_tsquery('sea')); + +select headline(' + + + +Sea view wow foo bar qq +YES   +ff-bg + + +', +to_tsquery('sea&foo'), 'HighlightAll=true'); --check debug select * from ts_debug('Tsearch module for PostgreSQL 7.3.3'); diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c index efd79a1e32..4e0a0bb904 100644 --- a/contrib/tsearch2/ts_cfg.c +++ b/contrib/tsearch2/ts_cfg.c @@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs) ptr = ((char *) out) + dist; } - if (wrd->in && !wrd->skip && !wrd->repeated) + if (wrd->in && !wrd->repeated) { if (wrd->replace) { @@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs) ptr += prs->stopsellen; } } - } + } else if (!wrd->repeated) pfree(wrd->word); diff --git a/contrib/tsearch2/ts_cfg.h b/contrib/tsearch2/ts_cfg.h index 9bf65144b2..e000233178 100644 --- a/contrib/tsearch2/ts_cfg.h +++ b/contrib/tsearch2/ts_cfg.h @@ -46,13 +46,13 @@ typedef struct typedef struct { - uint16 len; - uint8 selected:1, + uint32 selected:1, in:1, - skip:1, replace:1, - repeated:1; - uint8 type; + repeated:1, + unused:4, + type:8, + len:16; char *word; ITEM *item; } HLWORD; diff --git a/contrib/tsearch2/wordparser/parser.l b/contrib/tsearch2/wordparser/parser.l index e80f5fea90..8c46edf7b8 100644 --- a/contrib/tsearch2/wordparser/parser.l +++ b/contrib/tsearch2/wordparser/parser.l @@ -10,10 +10,48 @@ char *token = NULL; /* pointer to token */ int tokenlen; -char *s = NULL; /* to return WHOLE hyphenated-word */ +static char *s = NULL; /* to return WHOLE hyphenated-word */ YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */ +typedef struct { + int tlen; + int clen; + char *str; +} TagStorage; + +static TagStorage ts={0,0,NULL}; + +static void +addTag() { + while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) { + ts.tlen*=2; + ts.str=realloc(ts.str,ts.tlen); + if (!ts.str) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng); + ts.clen+=tsearch2_yyleng; + ts.str[ts.clen]='\0'; +} + +static void +startTag() { + if ( ts.str==NULL ) { + ts.tlen=tsearch2_yyleng+1; + ts.str=malloc(ts.tlen); + if (!ts.str) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + ts.clen=0; + ts.str[0]='\0'; + addTag(); +} + %} %option 8bit @@ -46,47 +84,46 @@ URI [-_[:alnum:]/%,\.;=&?#]+ %% -"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; } +"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); } "" { BEGIN INITIAL; - *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; - token = tsearch2_yytext; - tokenlen = tsearch2_yyleng; - return SPACE; -} - -"" { - BEGIN INITIAL; - *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; - token = tsearch2_yytext; - tokenlen = tsearch2_yyleng; - return SPACE; -} - - -"<"[\![:alpha:]] { BEGIN INTAG; } - -""\"" { BEGIN QINTAG; } - -"\\\"" ; - -"\"" { BEGIN INTAG; } - -">" { - BEGIN INITIAL; - token = tsearch2_yytext; - *tsearch2_yytext=' '; - token = tsearch2_yytext; - tokenlen = 1; + addTag(); + token = ts.str; + tokenlen = ts.clen; return TAG; } -.|\n ; +"" { + BEGIN INITIAL; + addTag(); + token = ts.str; + tokenlen = ts.clen; + return TAG; +} + + +"<"[\![:alpha:]] { BEGIN INTAG; startTag(); } + +""\"" { BEGIN QINTAG; addTag(); } + +"\\\"" { addTag(); } + +"\"" { BEGIN INTAG; addTag(); } + +">" { + BEGIN INITIAL; + addTag(); + token = ts.str; + tokenlen = ts.clen; + return TAG; +} + +.|\n { addTag(); } \&(quot|amp|nbsp|lt|gt)\; { token = tsearch2_yytext; @@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) { tsearch2_yy_switch_to_buffer( buf ); BEGIN INITIAL; } + diff --git a/contrib/tsearch2/wparser_def.c b/contrib/tsearch2/wparser_def.c index a3d6112628..035e5f2495 100644 --- a/contrib/tsearch2/wparser_def.c +++ b/contrib/tsearch2/wparser_def.c @@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS) #define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 ) #define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 ) +#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 ) #define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) ) #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) ) @@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS) curlen; int i; + int highlight=0; /* config */ prs->startsel = NULL; @@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS) prs->startsel = pstrdup(mptr->value); else if (pg_strcasecmp(mptr->key, "StopSel") == 0) prs->stopsel = pstrdup(mptr->value); + else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0) + highlight = ( + pg_strcasecmp(mptr->value, "1")==0 || + pg_strcasecmp(mptr->value, "on")==0 || + pg_strcasecmp(mptr->value, "true")==0 || + pg_strcasecmp(mptr->value, "t")==0 || + pg_strcasecmp(mptr->value, "y")==0 || + pg_strcasecmp(mptr->value, "yes")==0 ) ? + 1 : 0; pfree(mptr->key); pfree(mptr->value); @@ -228,124 +239,133 @@ prsd_headline(PG_FUNCTION_ARGS) } pfree(map); - if (min_words >= max_words) - ereport(ERROR, + if (highlight==0) { + if (min_words >= max_words) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("MinWords should be less than MaxWords"))); - if (min_words <= 0) - ereport(ERROR, + if (min_words <= 0) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("MinWords should be positive"))); - if (shortword < 0) - ereport(ERROR, + if (shortword < 0) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ShortWord should be >= 0"))); + } } - while (hlCover(prs, query, &p, &q)) - { - /* find cover len in words */ - curlen = 0; - poslen = 0; - for (i = p; i <= q && curlen < max_words; i++) + if (highlight==0) { + while (hlCover(prs, query, &p, &q)) { - if (!NONWORDTOKEN(prs->words[i].type)) - curlen++; - if (prs->words[i].item && !prs->words[i].repeated) - poslen++; - pose = i; - } - - if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)) - { - /* best already finded, so try one more cover */ - p++; - continue; - } - - posb=p; - if (curlen < max_words) - { /* find good end */ - for (i = i - 1; i < prs->curwords && curlen < max_words; i++) + /* find cover len in words */ + curlen = 0; + poslen = 0; + for (i = p; i <= q && curlen < max_words; i++) { - if (i != q) - { - if (!NONWORDTOKEN(prs->words[i].type)) - curlen++; - if (prs->words[i].item && !prs->words[i].repeated) - poslen++; - } + if (!NONWORDTOKEN(prs->words[i].type)) + curlen++; + if (prs->words[i].item && !prs->words[i].repeated) + poslen++; pose = i; - if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) - continue; - if (curlen >= min_words) - break; } - if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */ - for(i=p; i>= 0; i--) { - if (!NONWORDTOKEN(prs->words[i].type)) - curlen++; - if (prs->words[i].item && !prs->words[i].repeated) - poslen++; + + if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)) + { + /* best already finded, so try one more cover */ + p++; + continue; + } + + posb=p; + if (curlen < max_words) + { /* find good end */ + for (i = i - 1; i < prs->curwords && curlen < max_words; i++) + { + if (i != q) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen++; + if (prs->words[i].item && !prs->words[i].repeated) + poslen++; + } + pose = i; if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) continue; if (curlen >= min_words) break; } - posb=(i>=0) ? i : 0; + if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */ + for(i=p; i>= 0; i--) { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen++; + if (prs->words[i].item && !prs->words[i].repeated) + poslen++; + if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) + continue; + if (curlen >= min_words) + break; + } + posb=(i>=0) ? i : 0; + } } + else + { /* shorter cover :((( */ + for (; curlen > min_words; i--) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen--; + if (prs->words[i].item && !prs->words[i].repeated) + poslen--; + pose = i; + if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) + continue; + break; + } + } + + if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) || + (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) && + (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))) + { + bestb = posb; + beste = pose; + bestlen = poslen; + } + + p++; } - else - { /* shorter cover :((( */ - for (; curlen > min_words; i--) + + if (bestlen < 0) + { + curlen = 0; + for (i = 0; i < prs->curwords && curlen < min_words; i++) { if (!NONWORDTOKEN(prs->words[i].type)) - curlen--; - if (prs->words[i].item && !prs->words[i].repeated) - poslen--; + curlen++; pose = i; - if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword) - continue; - break; } - } - - if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) || - (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) && - (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))) - { - bestb = posb; + bestb = 0; beste = pose; - bestlen = poslen; } - - p++; - } - - if (bestlen < 0) - { - curlen = 0; - poslen = 0; - for (i = 0; i < prs->curwords && curlen < min_words; i++) - { - if (!NONWORDTOKEN(prs->words[i].type)) - curlen++; - pose = i; - } - bestb = 0; - beste = pose; + } else { + bestb=0; + beste=prs->curwords-1; } for (i = bestb; i <= beste; i++) { if (prs->words[i].item) prs->words[i].selected = 1; - if (prs->words[i].repeated) - prs->words[i].skip = 1; - if (HLIDIGNORE(prs->words[i].type)) - prs->words[i].replace = 1; + if ( highlight==0 ) { + if (HLIDIGNORE(prs->words[i].type)) + prs->words[i].replace = 1; + } else { + if (HTMLHLIDIGNORE(prs->words[i].type)) + prs->words[i].replace = 1; + } - prs->words[i].in = 1; + prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; } if (!prs->startsel)