mirror of https://github.com/postgres/postgres
Add matchorig, matchsynonyms, and keepsynonyms options to contrib/dict_xsyn.
Sergey Karpov
This commit is contained in:
parent
23dc89d2c3
commit
25bd9ce31b
|
@ -6,7 +6,7 @@
|
|||
* Copyright (c) 2007-2009, PostgreSQL Global Development Group
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.6 2009/01/01 17:23:32 momjian Exp $
|
||||
* $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.7 2009/08/05 18:06:49 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -33,7 +33,10 @@ typedef struct
|
|||
int len;
|
||||
Syn *syn;
|
||||
|
||||
bool matchorig;
|
||||
bool keeporig;
|
||||
bool matchsynonyms;
|
||||
bool keepsynonyms;
|
||||
} DictSyn;
|
||||
|
||||
|
||||
|
@ -88,7 +91,8 @@ read_dictionary(DictSyn *d, char *filename)
|
|||
{
|
||||
char *value;
|
||||
char *key;
|
||||
char *end = NULL;
|
||||
char *pos;
|
||||
char *end;
|
||||
|
||||
if (*line == '\0')
|
||||
continue;
|
||||
|
@ -96,26 +100,36 @@ read_dictionary(DictSyn *d, char *filename)
|
|||
value = lowerstr(line);
|
||||
pfree(line);
|
||||
|
||||
key = find_word(value, &end);
|
||||
if (!key)
|
||||
pos = value;
|
||||
while ((key = find_word(pos, &end)) != NULL)
|
||||
{
|
||||
pfree(value);
|
||||
continue;
|
||||
/* Enlarge syn structure if full */
|
||||
if (cur == d->len)
|
||||
{
|
||||
d->len = (d->len > 0) ? 2 * d->len : 16;
|
||||
if (d->syn)
|
||||
d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
|
||||
else
|
||||
d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
|
||||
}
|
||||
|
||||
/* Save first word only if we will match it */
|
||||
if (pos != value || d->matchorig)
|
||||
{
|
||||
d->syn[cur].key = pnstrdup(key, end - key);
|
||||
d->syn[cur].value = pstrdup(value);
|
||||
|
||||
cur++;
|
||||
}
|
||||
|
||||
pos = end;
|
||||
|
||||
/* Don't bother scanning synonyms if we will not match them */
|
||||
if (!d->matchsynonyms)
|
||||
break;
|
||||
}
|
||||
|
||||
if (cur == d->len)
|
||||
{
|
||||
d->len = (d->len > 0) ? 2 * d->len : 16;
|
||||
if (d->syn)
|
||||
d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
|
||||
else
|
||||
d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
|
||||
}
|
||||
|
||||
d->syn[cur].key = pnstrdup(key, end - key);
|
||||
d->syn[cur].value = value;
|
||||
|
||||
cur++;
|
||||
pfree(value);
|
||||
}
|
||||
|
||||
tsearch_readline_end(&trst);
|
||||
|
@ -133,23 +147,40 @@ dxsyn_init(PG_FUNCTION_ARGS)
|
|||
List *dictoptions = (List *) PG_GETARG_POINTER(0);
|
||||
DictSyn *d;
|
||||
ListCell *l;
|
||||
char *filename = NULL;
|
||||
|
||||
d = (DictSyn *) palloc0(sizeof(DictSyn));
|
||||
d->len = 0;
|
||||
d->syn = NULL;
|
||||
d->matchorig = true;
|
||||
d->keeporig = true;
|
||||
d->matchsynonyms = false;
|
||||
d->keepsynonyms = true;
|
||||
|
||||
foreach(l, dictoptions)
|
||||
{
|
||||
DefElem *defel = (DefElem *) lfirst(l);
|
||||
|
||||
if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
|
||||
if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0)
|
||||
{
|
||||
d->matchorig = defGetBoolean(defel);
|
||||
}
|
||||
else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
|
||||
{
|
||||
d->keeporig = defGetBoolean(defel);
|
||||
}
|
||||
else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0)
|
||||
{
|
||||
d->matchsynonyms = defGetBoolean(defel);
|
||||
}
|
||||
else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0)
|
||||
{
|
||||
d->keepsynonyms = defGetBoolean(defel);
|
||||
}
|
||||
else if (pg_strcasecmp(defel->defname, "RULES") == 0)
|
||||
{
|
||||
read_dictionary(d, defGetString(defel));
|
||||
/* we can't read the rules before parsing all options! */
|
||||
filename = defGetString(defel);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -160,6 +191,9 @@ dxsyn_init(PG_FUNCTION_ARGS)
|
|||
}
|
||||
}
|
||||
|
||||
if (filename)
|
||||
read_dictionary(d, filename);
|
||||
|
||||
PG_RETURN_POINTER(d);
|
||||
}
|
||||
|
||||
|
@ -194,41 +228,33 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
|
|||
|
||||
/* Parse string of synonyms and return array of words */
|
||||
{
|
||||
char *value = pstrdup(found->value);
|
||||
int value_length = strlen(value);
|
||||
char *pos = value;
|
||||
char *value = found->value;
|
||||
char *syn;
|
||||
char *pos;
|
||||
char *end;
|
||||
int nsyns = 0;
|
||||
bool is_first = true;
|
||||
|
||||
res = palloc(0);
|
||||
res = palloc(sizeof(TSLexeme));
|
||||
|
||||
while (pos < value + value_length)
|
||||
pos = value;
|
||||
while ((syn = find_word(pos, &end)) != NULL)
|
||||
{
|
||||
char *end;
|
||||
char *syn = find_word(pos, &end);
|
||||
|
||||
if (!syn)
|
||||
break;
|
||||
*end = '\0';
|
||||
|
||||
res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
|
||||
res[nsyns].lexeme = NULL;
|
||||
|
||||
/* first word is added to result only if KEEPORIG flag is set */
|
||||
if (d->keeporig || !is_first)
|
||||
/* The first word is output only if keeporig=true */
|
||||
if (pos != value || d->keeporig)
|
||||
{
|
||||
res[nsyns].lexeme = pstrdup(syn);
|
||||
res[nsyns + 1].lexeme = NULL;
|
||||
|
||||
res[nsyns].lexeme = pnstrdup(syn, end - syn);
|
||||
nsyns++;
|
||||
}
|
||||
|
||||
is_first = false;
|
||||
pos = end;
|
||||
|
||||
pos = end + 1;
|
||||
/* Stop if we are not to output the synonyms */
|
||||
if (!d->keepsynonyms)
|
||||
break;
|
||||
}
|
||||
|
||||
pfree(value);
|
||||
res[nsyns].lexeme = NULL;
|
||||
}
|
||||
|
||||
PG_RETURN_POINTER(res);
|
||||
|
|
|
@ -5,10 +5,76 @@
|
|||
SET client_min_messages = warning;
|
||||
\set ECHO none
|
||||
RESET client_min_messages;
|
||||
--configuration
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
|
||||
-- default configuration - match first word and return it among with all synonyms
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
|
||||
--lexize
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
--------------------------
|
||||
{supernova,sn,sne,1987a}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- the same, but return only synonyms
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
----------------
|
||||
{sn,sne,1987a}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- match any word and return all words
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
--------------------------
|
||||
{supernova,sn,sne,1987a}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
--------------------------
|
||||
{supernova,sn,sne,1987a}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- match any word and return all words except first one
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
----------------
|
||||
{sn,sne,1987a}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
----------------
|
||||
{sn,sne,1987a}
|
||||
|
@ -20,3 +86,63 @@ SELECT ts_lexize('xsyn', 'grb');
|
|||
|
||||
(1 row)
|
||||
|
||||
-- match any synonym but not first word, and return first word instead
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
-------------
|
||||
{supernova}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- do not match or return anything
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
-- match any word but return nothing
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
ts_lexize
|
||||
-----------
|
||||
{}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
ts_lexize
|
||||
-----------
|
||||
{}
|
||||
(1 row)
|
||||
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
ts_lexize
|
||||
-----------
|
||||
|
||||
(1 row)
|
||||
|
||||
|
|
|
@ -8,9 +8,46 @@ SET client_min_messages = warning;
|
|||
\set ECHO all
|
||||
RESET client_min_messages;
|
||||
|
||||
--configuration
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
|
||||
-- default configuration - match first word and return it among with all synonyms
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
|
||||
|
||||
--lexize
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
||||
-- the same, but return only synonyms
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
||||
-- match any word and return all words
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
||||
-- match any word and return all words except first one
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
||||
-- match any synonym but not first word, and return first word instead
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
||||
-- do not match or return anything
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
||||
-- match any word but return nothing
|
||||
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
|
||||
SELECT ts_lexize('xsyn', 'supernova');
|
||||
SELECT ts_lexize('xsyn', 'sn');
|
||||
SELECT ts_lexize('xsyn', 'grb');
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<!-- $PostgreSQL: pgsql/doc/src/sgml/dict-xsyn.sgml,v 1.2 2007/12/06 04:12:10 tgl Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/dict-xsyn.sgml,v 1.3 2009/08/05 18:06:49 tgl Exp $ -->
|
||||
|
||||
<sect1 id="dict-xsyn">
|
||||
<title>dict_xsyn</title>
|
||||
|
@ -23,9 +23,26 @@
|
|||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<literal>keeporig</> controls whether the original word is included (if
|
||||
<literal>true</>), or only its synonyms (if <literal>false</>). Default
|
||||
is <literal>true</>.
|
||||
<literal>matchorig</> controls whether the original word is accepted by
|
||||
the dictionary. Default is <literal>true</>.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<literal>matchsynonyms</> controls whether the synonyms are
|
||||
accepted by the dictionary. Default is <literal>false</>.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<literal>keeporig</> controls whether the original word is included in
|
||||
the dictionary's output. Default is <literal>true</>.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<literal>keepsynonyms</> controls whether the synonyms are included in
|
||||
the dictionary's output. Default is <literal>true</>.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
|
@ -87,13 +104,37 @@ ALTER TEXT SEARCH DICTIONARY
|
|||
To test the dictionary, you can try
|
||||
|
||||
<programlisting>
|
||||
mydb=# SELECT ts_lexize('xsyn', 'word');
|
||||
ts_lexize
|
||||
-----------------------
|
||||
{syn1,syn2,syn3}
|
||||
|
||||
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true);
|
||||
ALTER TEXT SEARCH DICTIONARY
|
||||
|
||||
mydb=# SELECT ts_lexize('xsyn', 'word');
|
||||
ts_lexize
|
||||
-----------------------
|
||||
{word,syn1,syn2,syn3}
|
||||
|
||||
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true);
|
||||
ALTER TEXT SEARCH DICTIONARY
|
||||
|
||||
mydb=# SELECT ts_lexize('xsyn', 'syn1');
|
||||
ts_lexize
|
||||
-----------------------
|
||||
{syn1,syn2,syn3}
|
||||
|
||||
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false);
|
||||
ALTER TEXT SEARCH DICTIONARY
|
||||
|
||||
mydb=# SELECT ts_lexize('xsyn', 'syn1');
|
||||
ts_lexize
|
||||
-----------------------
|
||||
{word}
|
||||
</programlisting>
|
||||
|
||||
but real-world usage will involve including it in a text search
|
||||
Real-world usage will involve including it in a text search
|
||||
configuration as described in <xref linkend="textsearch">.
|
||||
That might look like this:
|
||||
|
||||
|
|
Loading…
Reference in New Issue