Add an Accept parameter to "simple" dictionaries. The default of true
gives the old behavior; selecting false allows the dictionary to be used as a filter ahead of other dictionaries, because it will pass on rather than accept words that aren't in its stopword list. Patch by Jan Urbanski.
This commit is contained in:
parent
a44c81d1b7
commit
ca450a07ee
@@ -1,4 +1,4 @@
|
|||||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ -->
|
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
|
||||||
|
|
||||||
<chapter id="textsearch">
|
<chapter id="textsearch">
|
||||||
<title id="textsearch-title">Full Text Search</title>
|
<title id="textsearch-title">Full Text Search</title>
|
||||||
@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a
|
|||||||
<para>
|
<para>
|
||||||
The <literal>simple</> dictionary template operates by converting the
|
The <literal>simple</> dictionary template operates by converting the
|
||||||
input token to lower case and checking it against a file of stop words.
|
input token to lower case and checking it against a file of stop words.
|
||||||
If it is found in the file then <literal>NULL</> is returned, causing
|
If it is found in the file then an empty array is returned, causing
|
||||||
the token to be discarded. If not, the lower-cased form of the word
|
the token to be discarded. If not, the lower-cased form of the word
|
||||||
is returned as the normalized lexeme.
|
is returned as the normalized lexeme. Alternatively, the dictionary
|
||||||
|
can be configured to report non-stop-words as unrecognized, allowing
|
||||||
|
them to be passed on to the next dictionary in the list.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The');
|
|||||||
</programlisting>
|
</programlisting>
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
We can also choose to return <literal>NULL</>, instead of the lower-cased
|
||||||
|
word, if it is not found in the stop words file. This behavior is
|
||||||
|
selected by setting the dictionary's <literal>Accept</> parameter to
|
||||||
|
<literal>false</>. Continuing the example:
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );
|
||||||
|
|
||||||
|
SELECT ts_lexize('public.simple_dict','YeS');
|
||||||
|
ts_lexize
|
||||||
|
-----------
|
||||||
|
|
||||||
|
|
||||||
|
SELECT ts_lexize('public.simple_dict','The');
|
||||||
|
ts_lexize
|
||||||
|
-----------
|
||||||
|
{}
|
||||||
|
</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
With the default setting of <literal>Accept</> = <literal>true</>,
|
||||||
|
it is only useful to place a <literal>simple</> dictionary at the end
|
||||||
|
of a list of dictionaries, since it will never pass on any token to
|
||||||
|
a following dictionary. Conversely, <literal>Accept</> = <literal>false</>
|
||||||
|
is only useful when there is at least one following dictionary.
|
||||||
|
</para>
|
||||||
|
|
||||||
<caution>
|
<caution>
|
||||||
<para>
|
<para>
|
||||||
Most types of dictionaries rely on configuration files, such as files of
|
Most types of dictionaries rely on configuration files, such as files of
|
||||||
|
@@ -7,7 +7,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@@ -23,6 +23,7 @@
|
|||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
StopList stoplist;
|
StopList stoplist;
|
||||||
|
bool accept;
|
||||||
} DictSimple;
|
} DictSimple;
|
||||||
|
|
||||||
|
|
||||||
@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS)
|
|||||||
{
|
{
|
||||||
List *dictoptions = (List *) PG_GETARG_POINTER(0);
|
List *dictoptions = (List *) PG_GETARG_POINTER(0);
|
||||||
DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
|
DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
|
||||||
bool stoploaded = false;
|
bool stoploaded = false,
|
||||||
|
acceptloaded = false;
|
||||||
ListCell *l;
|
ListCell *l;
|
||||||
|
|
||||||
|
d->accept = true; /* default */
|
||||||
|
|
||||||
foreach(l, dictoptions)
|
foreach(l, dictoptions)
|
||||||
{
|
{
|
||||||
DefElem *defel = (DefElem *) lfirst(l);
|
DefElem *defel = (DefElem *) lfirst(l);
|
||||||
@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS)
|
|||||||
readstoplist(defGetString(defel), &d->stoplist, lowerstr);
|
readstoplist(defGetString(defel), &d->stoplist, lowerstr);
|
||||||
stoploaded = true;
|
stoploaded = true;
|
||||||
}
|
}
|
||||||
|
else if (pg_strcasecmp("Accept", defel->defname) == 0)
|
||||||
|
{
|
||||||
|
if (acceptloaded)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||||
|
errmsg("multiple Accept parameters")));
|
||||||
|
d->accept = defGetBoolean(defel);
|
||||||
|
acceptloaded = true;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS)
|
|||||||
char *in = (char *) PG_GETARG_POINTER(1);
|
char *in = (char *) PG_GETARG_POINTER(1);
|
||||||
int32 len = PG_GETARG_INT32(2);
|
int32 len = PG_GETARG_INT32(2);
|
||||||
char *txt;
|
char *txt;
|
||||||
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
|
TSLexeme *res;
|
||||||
|
|
||||||
txt = lowerstr_with_len(in, len);
|
txt = lowerstr_with_len(in, len);
|
||||||
|
|
||||||
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
|
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
|
||||||
|
{
|
||||||
|
/* reject as stopword */
|
||||||
pfree(txt);
|
pfree(txt);
|
||||||
else
|
res = palloc0(sizeof(TSLexeme) * 2);
|
||||||
res[0].lexeme = txt;
|
|
||||||
|
|
||||||
PG_RETURN_POINTER(res);
|
PG_RETURN_POINTER(res);
|
||||||
}
|
}
|
||||||
|
else if (d->accept)
|
||||||
|
{
|
||||||
|
/* accept */
|
||||||
|
res = palloc0(sizeof(TSLexeme) * 2);
|
||||||
|
res[0].lexeme = txt;
|
||||||
|
PG_RETURN_POINTER(res);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* report as unrecognized */
|
||||||
|
pfree(txt);
|
||||||
|
PG_RETURN_POINTER(NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user