diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 26fdad0c6f..e556c6dd78 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s) - Stop words recognized by the subdictionary are replaced by a stop - word placeholder to record their position. To illustrate this, - consider these phrases: + Specific stop words recognized by the subdictionary cannot be + specified; instead use ? to mark the location where any + stop word can appear. For example, assuming that a and + the are stop words according to the subdictionary: -a one the two : swsw -the one a two : swsw2 +? one ? two : swsw - Assuming that a and the are stop words according - to the subdictionary, these two phrases are identical to the thesaurus: - they both look like stopword one - stopword two. Input matching this pattern - will be replaced by swsw2, according to the tie-breaking rule. + matches a one the two and the one a two; + both would be replaced by swsw. @@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default" + + + Thesaurus files now use ? for stop words. + + + What else? diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 7a0ae4afd3..31564a7899 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -412,47 +412,48 @@ compileTheLexeme(DictThesaurus * d) { TSLexeme *ptr; - ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), - PointerGetDatum(d->subdict->dictData), - PointerGetDatum(d->wrds[i].lexeme), - Int32GetDatum(strlen(d->wrds[i].lexeme)), - PointerGetDatum(NULL))); - - if (!ptr) - elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)", - d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); - else if (!(ptr->lexeme)) - { - elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)", - d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); - + if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */ newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0); - } else { - while (ptr->lexeme) + ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), + PointerGetDatum(d->subdict->dictData), + PointerGetDatum(d->wrds[i].lexeme), + Int32GetDatum(strlen(d->wrds[i].lexeme)), + PointerGetDatum(NULL))); + + if (!ptr) + elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)", + d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); + else if (!(ptr->lexeme)) + elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)", + d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); + else { - TSLexeme *remptr = ptr + 1; - int tnvar = 1; - int curvar = ptr->nvariant; - - /* compute n words in one variant */ - while (remptr->lexeme) + while (ptr->lexeme) { - if (remptr->nvariant != (remptr - 1)->nvariant) - break; - tnvar++; - remptr++; + TSLexeme *remptr = ptr + 1; + int tnvar = 1; + int curvar = ptr->nvariant; + + /* compute n words in one variant */ + while (remptr->lexeme) + { + if (remptr->nvariant != (remptr - 1)->nvariant) + break; + tnvar++; + remptr++; + } + + remptr = ptr; + while (remptr->lexeme && remptr->nvariant == curvar) + { + newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); + remptr++; + } + + ptr = remptr; } - - remptr = ptr; - while (remptr->lexeme && remptr->nvariant == curvar) - { - newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar); - remptr++; - } - - ptr = remptr; } } diff --git a/src/backend/tsearch/thesaurus_sample.ths b/src/backend/tsearch/thesaurus_sample.ths index 77a32a75d0..0b4857ec33 100644 --- a/src/backend/tsearch/thesaurus_sample.ths +++ b/src/backend/tsearch/thesaurus_sample.ths @@ -14,4 +14,5 @@ two : *2 supernovae stars : *sn supernovae : *sn booking tickets : order invitation cards -# booking the tickets : order invitation Cards +booking ? tickets : order invitation Cards + diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out index 3520baceac..4b8929361a 100644 --- a/src/test/regress/expected/tsdicts.out +++ b/src/test/regress/expected/tsdicts.out @@ -311,8 +311,8 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall (1 row) SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); - to_tsvector ---------------------------------------------------------------------- - 'book':8 'card':3 'like':6 'look':5 'invit':2 'order':1 'ticket':10 + to_tsvector +------------------------------------------------------- + 'card':3,10 'like':6 'look':5 'invit':2,9 'order':1,8 (1 row)