1. Eliminate duplicate field HLWORD->skip

2. Rework support for HTML tags in the parser
3. Add HighlightAll option to the headline function for generating the whole text highlighted, with HTML tags preserved
2004-06-28 16:19:09 +00:00 · 2004-06-28 16:19:09 +00:00 · bb89237531
commit bb89237531
parent e48cfacb84
6 changed files with 234 additions and 133 deletions
--- a/contrib/tsearch2/expected/tsearch2.out
+++ b/contrib/tsearch2/expected/tsearch2.out
@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
    12 |  
     1 | asdf
    12 |  
-    13 |  
+    13 | <fr>
     1 | qwer
    12 |  
     1 | jf
    12 |  
     1 | sdjk
-    13 |  
+    13 | <we hjwer <werrwe>
    12 |  
     3 | ewr1
    12 | >
    12 |  
     3 | ewri2
    12 |  
-    13 |  
+    13 | <a href="qwe<qwe>">
    12 | 

    19 | /usr/local/fff
@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
    22 | 234
    12 |  

-    13 |  
+    13 | <i <b>
    12 |  
     1 | wow
    12 |   
@ -2130,6 +2130,35 @@ A thousand years to trace
 The granite features of this cliff
 (1 row)

+select headline('
+<html>
+<!-- some comment -->
+<body>
+Sea view wow <u>foo bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ff-bg
+<script>
+	document.write(15);
+</script>
+</body>
+</html>', 
+to_tsquery('sea&foo'), 'HighlightAll=true');
+                                                                                                              headline                                                                                                               
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 
+<html>
+<!-- some comment -->
+<body>
+<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ ff-bg
+<script>
+	document.write(15);
+</script>
+</body>
+</html>
+(1 row)
+
 --check debug
 select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
 ts_name | tok_type | description |   token    | dict_name |   tsvector   
--- a/contrib/tsearch2/sql/tsearch2.sql
+++ b/contrib/tsearch2/sql/tsearch2.sql
@ -253,6 +253,20 @@ The sculpture of these granite seams,
 Upon a woman s face. E.  J.  Pratt  (1882 1964)
 ', to_tsquery('sea'));

+
+select headline('
+<html>
+<!-- some comment -->
+<body>
+Sea view wow <u>foo bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ff-bg
+<script>
+	document.write(15);
+</script>
+</body>
+</html>', 
+to_tsquery('sea&foo'), 'HighlightAll=true');
 --check debug
 select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');

--- a/contrib/tsearch2/ts_cfg.c
+++ b/contrib/tsearch2/ts_cfg.c
@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs)
 			ptr = ((char *) out) + dist;
 		}

-		if (wrd->in && !wrd->skip && !wrd->repeated)
+		if (wrd->in && !wrd->repeated)
 		{
 			if (wrd->replace)
 			{
@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs)
 					ptr += prs->stopsellen;
 				}
 			}
-		}
+		} else

 		if (!wrd->repeated)
 			pfree(wrd->word);
--- a/contrib/tsearch2/ts_cfg.h
+++ b/contrib/tsearch2/ts_cfg.h
@ -46,13 +46,13 @@ typedef struct

 typedef struct
 {
-	uint16		len;
-	uint8		selected:1,
+	uint32		selected:1,
 				in:1,
-				skip:1,
 				replace:1,
-				repeated:1;
-	uint8		type;
+				repeated:1,
+				unused:4,
+				type:8,
+				len:16;
 	char	   *word;
 	ITEM	   *item;
 }	HLWORD;
--- a/contrib/tsearch2/wordparser/parser.l
+++ b/contrib/tsearch2/wordparser/parser.l
@ -10,10 +10,48 @@

 char *token = NULL;  /* pointer to token */
 int tokenlen;
-char *s     = NULL;  /* to return WHOLE hyphenated-word */
+static char *s     = NULL;  /* to return WHOLE hyphenated-word */

 YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */

+typedef struct {
+	int tlen;
+	int clen;
+	char *str;
+} TagStorage;
+
+static TagStorage ts={0,0,NULL};
+
+static void
+addTag() {
+	while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
+		ts.tlen*=2;
+		ts.str=realloc(ts.str,ts.tlen);
+		if (!ts.str)
+                	ereport(ERROR,
+                               	(errcode(ERRCODE_OUT_OF_MEMORY),
+                               	 errmsg("out of memory")));
+        }
+        memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
+        ts.clen+=tsearch2_yyleng;
+	ts.str[ts.clen]='\0';
+}
+
+static void
+startTag() {
+	if ( ts.str==NULL ) {
+		ts.tlen=tsearch2_yyleng+1;
+		ts.str=malloc(ts.tlen);
+		if (!ts.str)
+                	ereport(ERROR,
+                                (errcode(ERRCODE_OUT_OF_MEMORY),
+                                 errmsg("out of memory")));
+	}
+	ts.clen=0;
+	ts.str[0]='\0';
+	addTag();
+}
+
 %}

 %option 8bit
@ -46,47 +84,46 @@ URI		[-_[:alnum:]/%,\.;=&?#]+

 %%

-"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }

 <INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
 	BEGIN INITIAL; 
-	*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; 
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return SPACE;
-}
-
-"<!--"	{ BEGIN INCOMMENT; }
-
-<INCOMMENT>"-->"	{ 
-	BEGIN INITIAL;
-	*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; 
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return SPACE;
-}
-
-
-"<"[\![:alpha:]]	{ BEGIN INTAG; }
-
-"</"[[:alpha:]]	{ BEGIN INTAG; }
-
-<INTAG>"\""	{ BEGIN QINTAG; }
-
-<QINTAG>"\\\""	;
-
-<QINTAG>"\""	{ BEGIN INTAG; }
-
-<INTAG>">"	{ 
-	BEGIN INITIAL;
-	token = tsearch2_yytext;
-	*tsearch2_yytext=' '; 
-	token = tsearch2_yytext;
-	tokenlen = 1;
+	addTag();
+	token = ts.str;
+	tokenlen = ts.clen;
 	return TAG;
 }

-<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n 	;
+"<!--"	{ BEGIN INCOMMENT; startTag(); }
+
+<INCOMMENT>"-->"	{ 
+	BEGIN INITIAL;
+	addTag();
+	token = ts.str;
+	tokenlen = ts.clen;
+	return TAG;
+}
+
+
+"<"[\![:alpha:]]	{ BEGIN INTAG; startTag(); }
+
+"</"[[:alpha:]]	{ BEGIN INTAG; startTag(); }
+
+<INTAG>"\""	{ BEGIN QINTAG; addTag(); }
+
+<QINTAG>"\\\""	{ addTag(); }
+
+<QINTAG>"\""	{ BEGIN INTAG; addTag(); }
+
+<INTAG>">"	{ 
+	BEGIN INITIAL;
+	addTag();
+	token = ts.str;
+	tokenlen = ts.clen;
+	return TAG;
+}
+
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }	

 \&(quot|amp|nbsp|lt|gt)\;   {
 	token = tsearch2_yytext;
@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) {
 	tsearch2_yy_switch_to_buffer( buf );
 	BEGIN INITIAL;
 }
+
--- a/contrib/tsearch2/wparser_def.c
+++ b/contrib/tsearch2/wparser_def.c
@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS)

 #define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
 #define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
+#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
 #define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
 #define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )

@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS)
 				curlen;

 	int			i;
+	int 			highlight=0;

 	/* config */
 	prs->startsel = NULL;
@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS)
 				prs->startsel = pstrdup(mptr->value);
 			else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
 				prs->stopsel = pstrdup(mptr->value);
+			else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
+				highlight = (
+					pg_strcasecmp(mptr->value, "1")==0 || 
+					pg_strcasecmp(mptr->value, "on")==0 || 
+					pg_strcasecmp(mptr->value, "true")==0 || 
+					pg_strcasecmp(mptr->value, "t")==0 || 
+					pg_strcasecmp(mptr->value, "y")==0 || 
+					pg_strcasecmp(mptr->value, "yes")==0 ) ?
+				1 : 0;

 			pfree(mptr->key);
 			pfree(mptr->value);
@ -228,124 +239,133 @@ prsd_headline(PG_FUNCTION_ARGS)
 		}
 		pfree(map);

-		if (min_words >= max_words)
-			ereport(ERROR,
+		if (highlight==0) {
+			if (min_words >= max_words)
+				ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("MinWords should be less than MaxWords")));
-		if (min_words <= 0)
-			ereport(ERROR,
+			if (min_words <= 0)
+				ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("MinWords should be positive")));
-		if (shortword < 0)
-			ereport(ERROR,
+			if (shortword < 0)
+				ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("ShortWord should be >= 0")));
+		}
 	}

-	while (hlCover(prs, query, &p, &q))
-	{
-		/* find cover len in words */
-		curlen = 0;
-		poslen = 0;
-		for (i = p; i <= q && curlen < max_words; i++)
+	if (highlight==0) {
+		while (hlCover(prs, query, &p, &q))
 		{
-			if (!NONWORDTOKEN(prs->words[i].type))
-				curlen++;
-			if (prs->words[i].item && !prs->words[i].repeated)
-				poslen++;
-			pose = i;
-		}
-
-		if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
-		{
-			/* best already finded, so try one more cover */
-			p++;
-			continue;
-		}
-
-		posb=p;
-		if (curlen < max_words)
-		{						/* find good end */
-			for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
+			/* find cover len in words */
+			curlen = 0;
+			poslen = 0;
+			for (i = p; i <= q && curlen < max_words; i++)
 			{
-				if (i != q)
-				{
-					if (!NONWORDTOKEN(prs->words[i].type))
-						curlen++;
-					if (prs->words[i].item && !prs->words[i].repeated)
-						poslen++;
-				}
+				if (!NONWORDTOKEN(prs->words[i].type))
+					curlen++;
+				if (prs->words[i].item && !prs->words[i].repeated)
+					poslen++;
 				pose = i;
-				if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
-					continue;
-				if (curlen >= min_words)
-					break;
 			}
-			if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
-				for(i=p; i>= 0; i--) {
-					if (!NONWORDTOKEN(prs->words[i].type))
-						curlen++;
-					if (prs->words[i].item && !prs->words[i].repeated)
-						poslen++;
+	
+			if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+			{
+				/* best already finded, so try one more cover */
+				p++;
+				continue;
+			}
+	
+			posb=p;
+			if (curlen < max_words)
+			{						/* find good end */
+				for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
+				{
+					if (i != q)
+					{
+						if (!NONWORDTOKEN(prs->words[i].type))
+							curlen++;
+						if (prs->words[i].item && !prs->words[i].repeated)
+							poslen++;
+					}
+					pose = i;
 					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
 						continue;
 					if (curlen >= min_words)
 						break;
 				}
-				posb=(i>=0) ? i : 0;
+				if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
+					for(i=p; i>= 0; i--) {
+						if (!NONWORDTOKEN(prs->words[i].type))
+							curlen++;
+						if (prs->words[i].item && !prs->words[i].repeated)
+							poslen++;
+						if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+							continue;
+						if (curlen >= min_words)
+							break;
+					}
+					posb=(i>=0) ? i : 0;
+				}
 			}
+			else
+			{						/* shorter cover :((( */
+				for (; curlen > min_words; i--)
+				{
+					if (!NONWORDTOKEN(prs->words[i].type))
+						curlen--;
+					if (prs->words[i].item && !prs->words[i].repeated)
+						poslen--;
+					pose = i;
+					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+						continue;
+					break;
+				}
+			}
+	
+			if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
+				(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
+				 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+			{
+				bestb = posb;
+				beste = pose;
+				bestlen = poslen;
+			}
+	
+			p++;
 		}
-		else
-		{						/* shorter cover :((( */
-			for (; curlen > min_words; i--)
+
+		if (bestlen < 0)
+		{
+			curlen = 0;
+			for (i = 0; i < prs->curwords && curlen < min_words; i++)
 			{
 				if (!NONWORDTOKEN(prs->words[i].type))
-					curlen--;
-				if (prs->words[i].item && !prs->words[i].repeated)
-					poslen--;
+					curlen++;
 				pose = i;
-				if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
-					continue;
-				break;
 			}
-		}
-
-		if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
-			(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
-			 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
-		{
-			bestb = posb;
+			bestb = 0;
 			beste = pose;
-			bestlen = poslen;
 		}
-
-		p++;
-	}
-
-	if (bestlen < 0)
-	{
-		curlen = 0;
-		poslen = 0;
-		for (i = 0; i < prs->curwords && curlen < min_words; i++)
-		{
-			if (!NONWORDTOKEN(prs->words[i].type))
-				curlen++;
-			pose = i;
-		}
-		bestb = 0;
-		beste = pose;
+	} else {
+		bestb=0;
+		beste=prs->curwords-1;
 	}

 	for (i = bestb; i <= beste; i++)
 	{
 		if (prs->words[i].item)
 			prs->words[i].selected = 1;
-		if (prs->words[i].repeated)
-			prs->words[i].skip = 1;
-		if (HLIDIGNORE(prs->words[i].type))
-			prs->words[i].replace = 1;
+		if ( highlight==0 ) { 
+			if (HLIDIGNORE(prs->words[i].type))
+				prs->words[i].replace = 1;
+		} else {
+			if (HTMLHLIDIGNORE(prs->words[i].type))
+				prs->words[i].replace = 1;
+		}

-		prs->words[i].in = 1;
+		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
 	}

 	if (!prs->startsel)