Add an Accept parameter to "simple" dictionaries.

The default of true gives the old behavior; selecting false allows the dictionary to be used as a filter ahead of other dictionaries, because it will pass on rather than accept words that aren't in its stopword list.

Jan Urbanski
2007-11-14 18:36:37 +00:00 · 2007-11-14 18:36:37 +00:00 · ca450a07ee
commit ca450a07ee
parent a44c81d1b7
2 changed files with 67 additions and 9 deletions
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
 <chapter id="textsearch">
 <title id="textsearch-title">Full Text Search</title>
@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a
   <para>
    The <literal>simple</> dictionary template operates by converting the
    input token to lower case and checking it against a file of stop words.
-    If it is found in the file then <literal>NULL</> is returned, causing
+    If it is found in the file then an empty array is returned, causing
    the token to be discarded.  If not, the lower-cased form of the word
-    is returned as the normalized lexeme.
+    is returned as the normalized lexeme.  Alternatively, the dictionary
    can be configured to report non-stop-words as unrecognized, allowing
    them to be passed on to the next dictionary in the list.
   </para>
   <para>
@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The');
 </programlisting>
   </para>
   <para>
    We can also choose to return <literal>NULL</>, instead of the lower-cased
    word, if it is not found in the stop words file.  This behavior is
    selected by setting the dictionary's <literal>Accept</> parameter to
    <literal>false</>.  Continuing the example:
 <programlisting>
 ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );
 SELECT ts_lexize('public.simple_dict','YeS');
 ts_lexize
 -----------
 
 SELECT ts_lexize('public.simple_dict','The');
 ts_lexize
 -----------
 {}
 </programlisting>
   </para>
   <para>
    With the default setting of <literal>Accept</> = <literal>true</>,
    it is only useful to place a <literal>simple</> dictionary at the end
    of a list of dictionaries, since it will never pass on any token to
    a following dictionary.  Conversely, <literal>Accept</> = <literal>false</>
    is only useful when there is at least one following dictionary.
   </para>
   <caution>
    <para>
     Most types of dictionaries rely on configuration files, such as files of
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -23,6 +23,7 @@
 /*
  * Per-dictionary state for the "simple" dictionary: allocated and filled
  * in by dsimple_init(), then consulted by dsimple_lexize() for each token.
  */
 typedef struct
 {
 	/* Stop words loaded by readstoplist() from the configured file */
 	StopList	stoplist;
 	/*
 	 * If true (the default), a token that is not a stop word is returned
 	 * as a lexeme; if false, it is reported as unrecognized by returning
 	 * NULL.
 	 */
 	bool		accept;
 } DictSimple;
@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS)
 {
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 	DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
-	bool		stoploaded = false;
+	bool		stoploaded = false,
 				acceptloaded = false;
 	ListCell   *l;
 	d->accept = true;			/* default */
 	foreach(l, dictoptions)
 	{
 		DefElem    *defel = (DefElem *) lfirst(l);
@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS)
 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
 			stoploaded = true;
 		}
 		else if (pg_strcasecmp("Accept", defel->defname) == 0)
 		{
 			if (acceptloaded)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple Accept parameters")));
 			d->accept = defGetBoolean(defel);
 			acceptloaded = true;
 		}
 		else
 		{
 			ereport(ERROR,
@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS)
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32	   len = PG_GETARG_INT32(2);
 	char	   *txt;
-	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
+	TSLexeme   *res;
 	txt = lowerstr_with_len(in, len);
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
 		/* reject as stopword */
 		pfree(txt);
-	else
+		res = palloc0(sizeof(TSLexeme) * 2);
 		res[0].lexeme = txt;
 		PG_RETURN_POINTER(res);
 	}
 	else if (d->accept)
 	{
 		/* accept */
 		res = palloc0(sizeof(TSLexeme) * 2);
 		res[0].lexeme = txt;
 		PG_RETURN_POINTER(res);
 	}
 	else
 	{
 		/* report as unrecognized */
 		pfree(txt);
 		PG_RETURN_POINTER(NULL);
 	}
 }