diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 0ba401c2a4..31753791cd 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a The simple dictionary template operates by converting the input token to lower case and checking it against a file of stop words. - If it is found in the file then NULL is returned, causing + If it is found in the file then an empty array is returned, causing the token to be discarded. If not, the lower-cased form of the word - is returned as the normalized lexeme. + is returned as the normalized lexeme. Alternatively, the dictionary + can be configured to report non-stop-words as unrecognized, allowing + them to be passed on to the next dictionary in the list. @@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The'); + + We can also choose to return NULL, instead of the lower-cased + word, if it is not found in the stop words file. This behavior is + selected by setting the dictionary's Accept parameter to + false. Continuing the example: + + +ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false ); + +SELECT ts_lexize('public.simple_dict','YeS'); + ts_lexize +----------- + + +SELECT ts_lexize('public.simple_dict','The'); + ts_lexize +----------- + {} + + + + + With the default setting of Accept = true, + it is only useful to place a simple dictionary at the end + of a list of dictionaries, since it will never pass on any token to + a following dictionary. Conversely, Accept = false + is only useful when there is at least one following dictionary. + + Most types of dictionaries rely on configuration files, such as files of diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c index aea2c0963b..8248d3987d 100644 --- a/src/backend/tsearch/dict_simple.c +++ b/src/backend/tsearch/dict_simple.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -23,6 +23,7 @@ typedef struct { StopList stoplist; + bool accept; } DictSimple; @@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS) { List *dictoptions = (List *) PG_GETARG_POINTER(0); DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple)); - bool stoploaded = false; + bool stoploaded = false, + acceptloaded = false; ListCell *l; + d->accept = true; /* default */ + foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); @@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS) readstoplist(defGetString(defel), &d->stoplist, lowerstr); stoploaded = true; } + else if (pg_strcasecmp("Accept", defel->defname) == 0) + { + if (acceptloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Accept parameters"))); + d->accept = defGetBoolean(defel); + acceptloaded = true; + } else { ereport(ERROR, @@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS) char *in = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); char *txt; - TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); + TSLexeme *res; txt = lowerstr_with_len(in, len); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) + { + /* reject as stopword */ pfree(txt); - else + res = palloc0(sizeof(TSLexeme) * 2); + PG_RETURN_POINTER(res); + } + else if (d->accept) + { + /* accept */ + res = palloc0(sizeof(TSLexeme) * 2); res[0].lexeme = txt; - - PG_RETURN_POINTER(res); + PG_RETURN_POINTER(res); + } + else + { + /* report as unrecognized */ + pfree(txt); + PG_RETURN_POINTER(NULL); + } }