Have text search thesaurus files use "?" for stop words.
Throw an error, rather than a warning, when an actual stop word is used. This fixes problems with cache reloading causing warning messages to be emitted. Re-enable the stop-word case in the regression tests; it had been disabled by Tom. Document "?" as an API change.
This commit is contained in:
parent
82748bc253
commit
d009992ba3
@ -1,4 +1,4 @@
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.30 2007/11/05 15:55:53 mha Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.31 2007/11/10 15:39:34 momjian Exp $ -->
|
||||
|
||||
<chapter id="textsearch">
|
||||
<title id="textsearch-title">Full Text Search</title>
|
||||
@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s)
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Stop words recognized by the subdictionary are replaced by a <quote>stop
|
||||
word placeholder</quote> to record their position. To illustrate this,
|
||||
consider these phrases:
|
||||
Specific stop words recognized by the subdictionary cannot be
|
||||
specified; instead use <literal>?</> to mark the location where any
|
||||
stop word can appear. For example, assuming that <literal>a</> and
|
||||
<literal>the</> are stop words according to the subdictionary:
|
||||
|
||||
<programlisting>
|
||||
a one the two : swsw
|
||||
the one a two : swsw2
|
||||
? one ? two : swsw
|
||||
</programlisting>
|
||||
|
||||
Assuming that <literal>a</> and <literal>the</> are stop words according
|
||||
to the subdictionary, these two phrases are identical to the thesaurus:
|
||||
they both look like <replaceable>stopword</> <literal>one</>
|
||||
<replaceable>stopword</> <literal>two</>. Input matching this pattern
|
||||
will be replaced by <literal>swsw2</>, according to the tie-breaking rule.
|
||||
matches <literal>a one the two</> and <literal>the one a two</>;
|
||||
both would be replaced by <literal>swsw</>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default"
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Thesaurus files now use <literal>?</> for stop words.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
What else?
|
||||
|
@ -7,7 +7,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -412,47 +412,48 @@ compileTheLexeme(DictThesaurus * d)
|
||||
{
|
||||
TSLexeme *ptr;
|
||||
|
||||
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
||||
PointerGetDatum(d->subdict->dictData),
|
||||
PointerGetDatum(d->wrds[i].lexeme),
|
||||
Int32GetDatum(strlen(d->wrds[i].lexeme)),
|
||||
PointerGetDatum(NULL)));
|
||||
|
||||
if (!ptr)
|
||||
elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
|
||||
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
|
||||
else if (!(ptr->lexeme))
|
||||
{
|
||||
elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
|
||||
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
|
||||
|
||||
if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
|
||||
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
while (ptr->lexeme)
|
||||
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
||||
PointerGetDatum(d->subdict->dictData),
|
||||
PointerGetDatum(d->wrds[i].lexeme),
|
||||
Int32GetDatum(strlen(d->wrds[i].lexeme)),
|
||||
PointerGetDatum(NULL)));
|
||||
|
||||
if (!ptr)
|
||||
elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
|
||||
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
|
||||
else if (!(ptr->lexeme))
|
||||
elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)",
|
||||
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
|
||||
else
|
||||
{
|
||||
TSLexeme *remptr = ptr + 1;
|
||||
int tnvar = 1;
|
||||
int curvar = ptr->nvariant;
|
||||
|
||||
/* compute n words in one variant */
|
||||
while (remptr->lexeme)
|
||||
while (ptr->lexeme)
|
||||
{
|
||||
if (remptr->nvariant != (remptr - 1)->nvariant)
|
||||
break;
|
||||
tnvar++;
|
||||
remptr++;
|
||||
TSLexeme *remptr = ptr + 1;
|
||||
int tnvar = 1;
|
||||
int curvar = ptr->nvariant;
|
||||
|
||||
/* compute n words in one variant */
|
||||
while (remptr->lexeme)
|
||||
{
|
||||
if (remptr->nvariant != (remptr - 1)->nvariant)
|
||||
break;
|
||||
tnvar++;
|
||||
remptr++;
|
||||
}
|
||||
|
||||
remptr = ptr;
|
||||
while (remptr->lexeme && remptr->nvariant == curvar)
|
||||
{
|
||||
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
|
||||
remptr++;
|
||||
}
|
||||
|
||||
ptr = remptr;
|
||||
}
|
||||
|
||||
remptr = ptr;
|
||||
while (remptr->lexeme && remptr->nvariant == curvar)
|
||||
{
|
||||
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
|
||||
remptr++;
|
||||
}
|
||||
|
||||
ptr = remptr;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14,4 +14,5 @@ two : *2
|
||||
supernovae stars : *sn
|
||||
supernovae : *sn
|
||||
booking tickets : order invitation cards
|
||||
# booking the tickets : order invitation Cards
|
||||
booking ? tickets : order invitation Cards
|
||||
|
||||
|
@ -311,8 +311,8 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall
|
||||
(1 row)
|
||||
|
||||
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
|
||||
to_tsvector
|
||||
---------------------------------------------------------------------
|
||||
'book':8 'card':3 'like':6 'look':5 'invit':2 'order':1 'ticket':10
|
||||
to_tsvector
|
||||
-------------------------------------------------------
|
||||
'card':3,10 'like':6 'look':5 'invit':2,9 'order':1,8
|
||||
(1 row)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user