Have text search thesaurus files use "?" for stop words.

Throw an error when a thesaurus rule contains an actual stop word, rather than emitting a NOTICE. This fixes the problem of cache reloading causing the notice to be repeated. Re-enable the stop-word case in the regression tests, which Tom had disabled. Document "?" as an API change.
2007-11-10 15:39:34 +00:00 · 2007-11-10 15:39:34 +00:00 · d009992ba3
commit d009992ba3
parent 82748bc253
4 changed files with 56 additions and 51 deletions
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.30 2007/11/05 15:55:53 mha Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.31 2007/11/10 15:39:34 momjian Exp $ -->

 <chapter id="textsearch">
 <title id="textsearch-title">Full Text Search</title>
@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s)
   </para>

   <para>
-    Stop words recognized by the subdictionary are replaced by a <quote>stop
-    word placeholder</quote> to record their position. To illustrate this,
-    consider these phrases:
+    Stop words recognized by the subdictionary cannot be written
+    literally in a rule;  instead use <literal>?</> to mark each position
+    where any stop word can appear.  For example, assuming that <literal>a</> and
+    <literal>the</> are stop words according to the subdictionary:

 <programlisting>
-a one the two : swsw
-the one a two : swsw2
+? one ? two : swsw
 </programlisting>

-    Assuming that <literal>a</> and <literal>the</> are stop words according
-    to the subdictionary, these two phrases are identical to the thesaurus:
-    they both look like <replaceable>stopword</> <literal>one</>
-    <replaceable>stopword</> <literal>two</>.  Input matching this pattern
-    will be replaced by <literal>swsw2</>, according to the tie-breaking rule.
+    matches <literal>a one the two</> and <literal>the one a two</>;
+    both would be replaced by <literal>swsw</>.
   </para>

   <para>
@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default"
    </para>
   </listitem>

+   <listitem>
+    <para>
+     Thesaurus files now use <literal>?</> as the placeholder for stop words; listing an actual stop word is an error.
+    </para>
+   </listitem>
+
   <listitem>
    <para>
     What else?
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -412,47 +412,48 @@ compileTheLexeme(DictThesaurus * d)
 	{
 		TSLexeme   *ptr;

-		ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
-									   PointerGetDatum(d->subdict->dictData),
-										  PointerGetDatum(d->wrds[i].lexeme),
-									Int32GetDatum(strlen(d->wrds[i].lexeme)),
-													 PointerGetDatum(NULL)));
-
-		if (!ptr)
-			elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
-				 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-		else if (!(ptr->lexeme))
-		{
-			elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
-				 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-
+		if (strcmp(d->wrds[i].lexeme, "?") == 0)	/* Is stop word marker? */
 			newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
-		}
 		else
 		{
-			while (ptr->lexeme)
+			ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+										   PointerGetDatum(d->subdict->dictData),
+											  PointerGetDatum(d->wrds[i].lexeme),
+										Int32GetDatum(strlen(d->wrds[i].lexeme)),
+														 PointerGetDatum(NULL)));
+	
+			if (!ptr)
+				elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
+					 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+			else if (!(ptr->lexeme))
+				elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)",
+					 d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+			else
 			{
-				TSLexeme   *remptr = ptr + 1;
-				int			tnvar = 1;
-				int			curvar = ptr->nvariant;
-
-				/* compute n words in one variant */
-				while (remptr->lexeme)
+				while (ptr->lexeme)
 				{
-					if (remptr->nvariant != (remptr - 1)->nvariant)
-						break;
-					tnvar++;
-					remptr++;
+					TSLexeme   *remptr = ptr + 1;
+					int			tnvar = 1;
+					int			curvar = ptr->nvariant;
+	
+					/* compute n words in one variant */
+					while (remptr->lexeme)
+					{
+						if (remptr->nvariant != (remptr - 1)->nvariant)
+							break;
+						tnvar++;
+						remptr++;
+					}
+	
+					remptr = ptr;
+					while (remptr->lexeme && remptr->nvariant == curvar)
+					{
+						newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+						remptr++;
+					}
+	
+					ptr = remptr;
 				}
-
-				remptr = ptr;
-				while (remptr->lexeme && remptr->nvariant == curvar)
-				{
-					newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
-					remptr++;
-				}
-
-				ptr = remptr;
 			}
 		}

--- a/src/backend/tsearch/thesaurus_sample.ths
+++ b/src/backend/tsearch/thesaurus_sample.ths
@ -14,4 +14,5 @@ two : *2
 supernovae stars : *sn
 supernovae : *sn
 booking tickets : order invitation cards
-# booking the tickets : order invitation Cards
+booking ? tickets : order invitation Cards
+
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@ -311,8 +311,8 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall
 (1 row)

 SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
-                             to_tsvector                             
---------------------------------------------------------------------
- 'book':8 'card':3 'like':6 'look':5 'invit':2 'order':1 'ticket':10
+                      to_tsvector                      
+-------------------------------------------------------
+ 'card':3,10 'like':6 'look':5 'invit':2,9 'order':1,8
 (1 row)