Add sample text search dictionary templates and parsers, to replace the

hard-to-maintain textual examples currently in the SGML docs. From Sergey Karpov.
2007-10-15 21:36:50 +00:00 · 2007-10-15 21:36:50 +00:00 · 5fcb079858
commit 5fcb079858
parent fb631dba2a
24 changed files with 1324 additions and 9 deletions
--- a/contrib/Makefile
+++ b/contrib/Makefile
@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.80 2007/10/13 22:59:43 tgl Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.81 2007/10/15 21:36:49 tgl Exp $

 subdir = contrib
 top_builddir = ..
@ -10,6 +10,8 @@ WANTED_DIRS = \
 		chkpass		\
 		cube		\
 		dblink		\
+		dict_int	\
+		dict_xsyn	\
 		earthdistance	\
 		fuzzystrmatch	\
 		hstore		\
@ -31,6 +33,7 @@ WANTED_DIRS = \
 		seg		\
 		spi		\
 		tablefunc	\
+		test_parser	\
 		vacuumlo

 ifeq ($(with_openssl),yes)
--- a/contrib/README
+++ b/contrib/README
@ -1,4 +1,3 @@
-
 The PostgreSQL contrib tree
 ---------------------------

@ -29,8 +28,8 @@ adminpack -
 	by Dave Page <dpage@vale-housing.co.uk>

 btree_gist -
-      Support for emulating BTREE indexing in GiST
-      by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
+	Support for emulating BTREE indexing in GiST
+	by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>

 chkpass -
 	An auto-encrypted password datatype
@ -44,8 +43,16 @@ dblink -
 	Allows remote query execution
 	by Joe Conway <mail@joeconway.com>

+dict_int -
+	Text search dictionary template for integers
+	by Sergey Karpov <karpov@sao.ru>
+
+dict_xsyn -
+	Text search dictionary template for extended synonym processing
+	by Sergey Karpov <karpov@sao.ru>
+
 earthdistance -
-	Operator for computing earth distance for two points
+	Operator for computing earth distance between two points
 	by Hal Snyder <hal@vailsys.com>

 fuzzystrmatch -
@ -53,8 +60,8 @@ fuzzystrmatch -
 	by Joe Conway <mail@joeconway.com>, Joel Burton <jburton@scw.org>

 hstore -
-	Hstore - module for storing (key,value) pairs
-    by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>
+	Module for storing (key, value) pairs
+	by Oleg Bartunov <oleg@sai.msu.su> and Teodor Sigaev <teodor@sigaev.ru>

 intagg -
 	Integer aggregator
@ -92,6 +99,10 @@ pg_freespacemap -
 	Displays the contents of the free space map (FSM)
 	by Mark Kirkwood <markir@paradise.net.nz>

+pg_standby -
+	Sample archive_command for warm standby operation
+	by Simon Riggs <simon@2ndquadrant.com>
+
 pg_trgm -
 	Functions for determining the similarity of text based on trigram
 	matching.
@ -110,7 +121,7 @@ pgrowlocks -
 	by Tatsuo Ishii <ishii@sraoss.co.jp>

 pgstattuple -
-	A function to return statistics about "dead" tuples and free
+	Functions to return statistics about "dead" tuples and free
 	space within a table
 	by Tatsuo Ishii <ishii@sraoss.co.jp>

@ -126,12 +137,16 @@ sslinfo -
 	by Victor Wagner <vitus@cryptocom.ru>

 start-scripts - 
-	Scripts for starting the server at boot time.
+	Scripts for starting the server at boot time on various platforms.

 tablefunc -
 	Examples of functions returning tables
 	by Joe Conway <mail@joeconway.com>

+test_parser -
+	Sample text search parser
+	by Sergey Karpov <karpov@sao.ru>
+
 tsearch2 -
 	Full-text-index support using GiST
 	by Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov
--- a/contrib/dict_int/Makefile
+++ b/contrib/dict_int/Makefile
@ -0,0 +1,19 @@
+# $PostgreSQL: pgsql/contrib/dict_int/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+# Module name, object files, SQL scripts, documentation and regression
+# test list for the dict_int contrib module.
+MODULE_big = dict_int
+OBJS = dict_int.o
+DATA_built = dict_int.sql
+DATA = uninstall_dict_int.sql
+DOCS = README.dict_int
+REGRESS = dict_int
+
+# With USE_PGXS set, build against an installed server located through
+# pg_config; otherwise build in place inside the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/dict_int
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
--- a/contrib/dict_int/README.dict_int
+++ b/contrib/dict_int/README.dict_int
@ -0,0 +1,41 @@
+Dictionary for integers
+=======================
+
+The motivation for this example dictionary is to control the indexing of
+integers (signed and unsigned) and, consequently, to minimize the number of
+unique words, which greatly affects search performance.
+
+* Configuration
+
+The dictionary accepts two options: 
+
+  - The MAXLEN parameter specifies the maximum length (number of digits)
+    allowed in an integer word.  The default value is 6.
+
+  - The REJECTLONG parameter specifies if an overlength integer should be
+    truncated or ignored. If REJECTLONG=FALSE (default), the dictionary returns
+    the first MAXLEN digits of the integer. If REJECTLONG=TRUE, the
+    dictionary treats an overlength integer as a stop word, so that it will
+    not be indexed.
+
+* Usage
+
+1. Compile and install
+
+2. Load dictionary
+
+   psql mydb < dict_int.sql
+
+3. Test it
+ 
+   mydb# select ts_lexize('intdict', '12345678');
+    ts_lexize
+   -----------
+    {123456}
+
+4. Change its options as you wish
+
+   mydb# ALTER TEXT SEARCH DICTIONARY intdict (MAXLEN = 4, REJECTLONG = true);
+   ALTER TEXT SEARCH DICTIONARY
+
+That's all.
--- a/contrib/dict_int/dict_int.c
+++ b/contrib/dict_int/dict_int.c
@ -0,0 +1,99 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_int.c
+ *	  Text search dictionary for integers
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/dict_int/dict_int.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "tsearch/ts_public.h"
+
+PG_MODULE_MAGIC;
+
+
+/* Per-dictionary settings: built by dintdict_init, read by dintdict_lexize */
+typedef struct {
+	int     maxlen;		/* MAXLEN option; initialized to 6 */
+	bool    rejectlong;	/* REJECTLONG option; initialized to false */
+} DictInt;
+
+
+PG_FUNCTION_INFO_V1(dintdict_init);
+Datum dintdict_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dintdict_lexize);
+Datum dintdict_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * dintdict_init
+ *		Build the DictInt settings for one dictionary from its option list.
+ *
+ * Argument 0 is a List of DefElem.  The options MAXLEN (integer, default 6)
+ * and REJECTLONG (boolean, default false) are recognized case-insensitively;
+ * any other option name, or a MAXLEN below 1, raises
+ * ERRCODE_INVALID_PARAMETER_VALUE.  Returns a palloc'd DictInt.
+ */
+Datum
+dintdict_init(PG_FUNCTION_ARGS)
+{
+	List		*dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictInt 	*d;
+	ListCell	*l;
+
+	d = (DictInt *) palloc0(sizeof(DictInt));
+	d->maxlen = 6;
+	d->rejectlong = false;
+
+	foreach(l, dictoptions)
+	{
+		DefElem *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp(defel->defname, "MAXLEN") == 0)
+		{
+			d->maxlen = atoi(defGetString(defel));
+
+			/*
+			 * dintdict_lexize stores a terminator at txt[maxlen].  A
+			 * negative value would write before the start of the buffer,
+			 * and zero (which is also what atoi() yields for non-numeric
+			 * input) would turn every long integer into an empty lexeme.
+			 */
+			if (d->maxlen < 1)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("MAXLEN value has to be >= 1")));
+		}
+		else if (pg_strcasecmp(defel->defname, "REJECTLONG") == 0)
+		{
+			d->rejectlong = defGetBoolean(defel);
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized intdict parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+	
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * dintdict_lexize
+ *		Normalize one integer token.
+ *
+ * Argument 0 is the DictInt built by dintdict_init, argument 1 points to
+ * the token text and argument 2 is its length.
+ *
+ * Returns a palloc'd two-element TSLexeme array whose last element has a
+ * NULL lexeme.  A token no longer than maxlen is returned unchanged.  A
+ * longer token is either cut to its first maxlen characters or, when
+ * rejectlong is set, dropped by returning an array whose first lexeme is
+ * NULL.
+ */
+Datum
+dintdict_lexize(PG_FUNCTION_ARGS)
+{
+	DictInt *d = (DictInt*)PG_GETARG_POINTER(0);
+	char       *in = (char*)PG_GETARG_POINTER(1);
+	int			len = PG_GETARG_INT32(2);
+	char *txt = pnstrdup(in, len);
+
+	/*
+	 * Zero the whole array so that any TSLexeme member this function does
+	 * not assign explicitly is zero rather than uninitialized memory.
+	 */
+	TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
+
+	res[1].lexeme = NULL;
+	if (len > d->maxlen)
+	{
+		if ( d->rejectlong )
+		{
+			/* reject by returning void array */
+			pfree(txt);
+			res[0].lexeme = NULL;
+		}
+		else
+		{
+			/* trim integer */
+			txt[d->maxlen] = '\0';
+			res[0].lexeme = txt;
+		}
+	}
+	else
+	{
+		res[0].lexeme = txt;
+	}
+
+	PG_RETURN_POINTER(res);
+}
--- a/contrib/dict_int/dict_int.sql.in
+++ b/contrib/dict_int/dict_int.sql.in
@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_int/dict_int.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+-- Everything below is created in one transaction.
+BEGIN;
+
+-- C function used as the INIT method of the template below.
+CREATE FUNCTION dintdict_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+-- C function used as the LEXIZE method of the template below.
+CREATE FUNCTION dintdict_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+-- Template binding the two C functions together.
+CREATE TEXT SEARCH TEMPLATE intdict_template (
+        LEXIZE = dintdict_lexize,
+	INIT   = dintdict_init
+);
+
+-- Dictionary built from the template; no options are given here, so the
+-- defaults set in dintdict_init apply.
+CREATE TEXT SEARCH DICTIONARY intdict (
+	TEMPLATE = intdict_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'dictionary for integers';
+
+END;
--- a/contrib/dict_int/expected/dict_int.out
+++ b/contrib/dict_int/expected/dict_int.out
@ -0,0 +1,308 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+--lexize
+select ts_lexize('intdict', '511673');
+ ts_lexize 
+-----------
+ {511673}
+(1 row)
+
+select ts_lexize('intdict', '129');
+ ts_lexize 
+-----------
+ {129}
+(1 row)
+
+select ts_lexize('intdict', '40865854');
+ ts_lexize 
+-----------
+ {408658}
+(1 row)
+
+select ts_lexize('intdict', '952');
+ ts_lexize 
+-----------
+ {952}
+(1 row)
+
+select ts_lexize('intdict', '654980341');
+ ts_lexize 
+-----------
+ {654980}
+(1 row)
+
+select ts_lexize('intdict', '09810106');
+ ts_lexize 
+-----------
+ {098101}
+(1 row)
+
+select ts_lexize('intdict', '14262713');
+ ts_lexize 
+-----------
+ {142627}
+(1 row)
+
+select ts_lexize('intdict', '6532082986');
+ ts_lexize 
+-----------
+ {653208}
+(1 row)
+
+select ts_lexize('intdict', '0150061');
+ ts_lexize 
+-----------
+ {015006}
+(1 row)
+
+select ts_lexize('intdict', '7778');
+ ts_lexize 
+-----------
+ {7778}
+(1 row)
+
+select ts_lexize('intdict', '9547');
+ ts_lexize 
+-----------
+ {9547}
+(1 row)
+
+select ts_lexize('intdict', '753395478');
+ ts_lexize 
+-----------
+ {753395}
+(1 row)
+
+select ts_lexize('intdict', '647652');
+ ts_lexize 
+-----------
+ {647652}
+(1 row)
+
+select ts_lexize('intdict', '6988655574');
+ ts_lexize 
+-----------
+ {698865}
+(1 row)
+
+select ts_lexize('intdict', '1279');
+ ts_lexize 
+-----------
+ {1279}
+(1 row)
+
+select ts_lexize('intdict', '1266645909');
+ ts_lexize 
+-----------
+ {126664}
+(1 row)
+
+select ts_lexize('intdict', '7594193969');
+ ts_lexize 
+-----------
+ {759419}
+(1 row)
+
+select ts_lexize('intdict', '16928207');
+ ts_lexize 
+-----------
+ {169282}
+(1 row)
+
+select ts_lexize('intdict', '196850350328');
+ ts_lexize 
+-----------
+ {196850}
+(1 row)
+
+select ts_lexize('intdict', '22026985592');
+ ts_lexize 
+-----------
+ {220269}
+(1 row)
+
+select ts_lexize('intdict', '2063765');
+ ts_lexize 
+-----------
+ {206376}
+(1 row)
+
+select ts_lexize('intdict', '242387310');
+ ts_lexize 
+-----------
+ {242387}
+(1 row)
+
+select ts_lexize('intdict', '93595');
+ ts_lexize 
+-----------
+ {93595}
+(1 row)
+
+select ts_lexize('intdict', '9374');
+ ts_lexize 
+-----------
+ {9374}
+(1 row)
+
+select ts_lexize('intdict', '996969');
+ ts_lexize 
+-----------
+ {996969}
+(1 row)
+
+select ts_lexize('intdict', '353595982');
+ ts_lexize 
+-----------
+ {353595}
+(1 row)
+
+select ts_lexize('intdict', '925860');
+ ts_lexize 
+-----------
+ {925860}
+(1 row)
+
+select ts_lexize('intdict', '11848378337');
+ ts_lexize 
+-----------
+ {118483}
+(1 row)
+
+select ts_lexize('intdict', '333');
+ ts_lexize 
+-----------
+ {333}
+(1 row)
+
+select ts_lexize('intdict', '799287416765');
+ ts_lexize 
+-----------
+ {799287}
+(1 row)
+
+select ts_lexize('intdict', '745939');
+ ts_lexize 
+-----------
+ {745939}
+(1 row)
+
+select ts_lexize('intdict', '67601305734');
+ ts_lexize 
+-----------
+ {676013}
+(1 row)
+
+select ts_lexize('intdict', '3361113');
+ ts_lexize 
+-----------
+ {336111}
+(1 row)
+
+select ts_lexize('intdict', '9033778607');
+ ts_lexize 
+-----------
+ {903377}
+(1 row)
+
+select ts_lexize('intdict', '7507648');
+ ts_lexize 
+-----------
+ {750764}
+(1 row)
+
+select ts_lexize('intdict', '1166');
+ ts_lexize 
+-----------
+ {1166}
+(1 row)
+
+select ts_lexize('intdict', '9360498');
+ ts_lexize 
+-----------
+ {936049}
+(1 row)
+
+select ts_lexize('intdict', '917795');
+ ts_lexize 
+-----------
+ {917795}
+(1 row)
+
+select ts_lexize('intdict', '9387894');
+ ts_lexize 
+-----------
+ {938789}
+(1 row)
+
+select ts_lexize('intdict', '42764329');
+ ts_lexize 
+-----------
+ {427643}
+(1 row)
+
+select ts_lexize('intdict', '564062');
+ ts_lexize 
+-----------
+ {564062}
+(1 row)
+
+select ts_lexize('intdict', '5413377');
+ ts_lexize 
+-----------
+ {541337}
+(1 row)
+
+select ts_lexize('intdict', '060965');
+ ts_lexize 
+-----------
+ {060965}
+(1 row)
+
+select ts_lexize('intdict', '08273593');
+ ts_lexize 
+-----------
+ {082735}
+(1 row)
+
+select ts_lexize('intdict', '593556010144');
+ ts_lexize 
+-----------
+ {593556}
+(1 row)
+
+select ts_lexize('intdict', '17988843352');
+ ts_lexize 
+-----------
+ {179888}
+(1 row)
+
+select ts_lexize('intdict', '252281774');
+ ts_lexize 
+-----------
+ {252281}
+(1 row)
+
+select ts_lexize('intdict', '313425');
+ ts_lexize 
+-----------
+ {313425}
+(1 row)
+
+select ts_lexize('intdict', '641439323669');
+ ts_lexize 
+-----------
+ {641439}
+(1 row)
+
+select ts_lexize('intdict', '314532610153');
+ ts_lexize 
+-----------
+ {314532}
+(1 row)
+
--- a/contrib/dict_int/sql/dict_int.sql
+++ b/contrib/dict_int/sql/dict_int.sql
@ -0,0 +1,61 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i dict_int.sql
+\set ECHO all
+RESET client_min_messages;
+
+--lexize
+select ts_lexize('intdict', '511673');
+select ts_lexize('intdict', '129');
+select ts_lexize('intdict', '40865854');
+select ts_lexize('intdict', '952');
+select ts_lexize('intdict', '654980341');
+select ts_lexize('intdict', '09810106');
+select ts_lexize('intdict', '14262713');
+select ts_lexize('intdict', '6532082986');
+select ts_lexize('intdict', '0150061');
+select ts_lexize('intdict', '7778');
+select ts_lexize('intdict', '9547');
+select ts_lexize('intdict', '753395478');
+select ts_lexize('intdict', '647652');
+select ts_lexize('intdict', '6988655574');
+select ts_lexize('intdict', '1279');
+select ts_lexize('intdict', '1266645909');
+select ts_lexize('intdict', '7594193969');
+select ts_lexize('intdict', '16928207');
+select ts_lexize('intdict', '196850350328');
+select ts_lexize('intdict', '22026985592');
+select ts_lexize('intdict', '2063765');
+select ts_lexize('intdict', '242387310');
+select ts_lexize('intdict', '93595');
+select ts_lexize('intdict', '9374');
+select ts_lexize('intdict', '996969');
+select ts_lexize('intdict', '353595982');
+select ts_lexize('intdict', '925860');
+select ts_lexize('intdict', '11848378337');
+select ts_lexize('intdict', '333');
+select ts_lexize('intdict', '799287416765');
+select ts_lexize('intdict', '745939');
+select ts_lexize('intdict', '67601305734');
+select ts_lexize('intdict', '3361113');
+select ts_lexize('intdict', '9033778607');
+select ts_lexize('intdict', '7507648');
+select ts_lexize('intdict', '1166');
+select ts_lexize('intdict', '9360498');
+select ts_lexize('intdict', '917795');
+select ts_lexize('intdict', '9387894');
+select ts_lexize('intdict', '42764329');
+select ts_lexize('intdict', '564062');
+select ts_lexize('intdict', '5413377');
+select ts_lexize('intdict', '060965');
+select ts_lexize('intdict', '08273593');
+select ts_lexize('intdict', '593556010144');
+select ts_lexize('intdict', '17988843352');
+select ts_lexize('intdict', '252281774');
+select ts_lexize('intdict', '313425');
+select ts_lexize('intdict', '641439323669');
+select ts_lexize('intdict', '314532610153');
--- a/contrib/dict_int/uninstall_dict_int.sql
+++ b/contrib/dict_int/uninstall_dict_int.sql
@ -0,0 +1,9 @@
+-- Removes the objects created by dict_int.sql, in the reverse of the order
+-- in which that script creates them.
+SET search_path = public;
+
+DROP TEXT SEARCH DICTIONARY intdict;
+
+DROP TEXT SEARCH TEMPLATE intdict_template;
+
+DROP FUNCTION dintdict_init(internal);
+
+DROP FUNCTION dintdict_lexize(internal,internal,internal,internal);
--- a/contrib/dict_xsyn/Makefile
+++ b/contrib/dict_xsyn/Makefile
@ -0,0 +1,38 @@
+# $PostgreSQL: pgsql/contrib/dict_xsyn/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+# Module name, object files, SQL scripts, documentation and regression
+# test list for the dict_xsyn contrib module.
+MODULE_big = dict_xsyn
+OBJS = dict_xsyn.o
+DATA_built = dict_xsyn.sql
+DATA = uninstall_dict_xsyn.sql
+DOCS = README.dict_xsyn
+REGRESS = dict_xsyn
+
+# Sample rules files, installed into $(datadir)/$(DICTDIR).
+DICTDIR = tsearch_data
+DICTFILES = xsyn_sample.rules
+
+# With USE_PGXS set, build against an installed server located through
+# pg_config; otherwise build in place inside the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/dict_xsyn
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Hook the rules files into the normal install target.
+install: install-data
+
+.PHONY: install-data
+install-data: $(DICTFILES)
+	for i in $(DICTFILES); \
+		do $(INSTALL_DATA) $(srcdir)/$$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
+	done
+
+# Hook removal of the rules files into the normal uninstall target.
+uninstall: uninstall-data
+
+# DICTFILES are plain files, so a non-recursive "rm -f" is sufficient and
+# cannot remove a directory tree by accident.
+.PHONY: uninstall-data
+uninstall-data:
+	for i in $(DICTFILES); \
+		do rm -f '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
+	done
--- a/contrib/dict_xsyn/README.dict_xsyn
+++ b/contrib/dict_xsyn/README.dict_xsyn
@ -0,0 +1,52 @@
+Extended Synonym dictionary
+===========================
+
+This is a simple synonym dictionary. It replaces words with groups of their
+synonyms, and so makes it possible to search for a word using any of its
+synonyms.
+
+* Configuration
+
+It accepts the following options:
+ 
+ - KEEPORIG controls whether the original word is included, or only its
+   synonyms. Default is 'true'.
+
+ - RULES is the base name of the file containing the list of synonyms.
+   This file must be in $(prefix)/share/tsearch_data/, and its name must
+   end in ".rules" (which is not included in the RULES parameter).
+
+The rules file has the following format:
+
+ - Each line represents a group of synonyms for a single word, which is
+   given first on the line. Synonyms are separated by whitespace:
+   
+   word syn1 syn2 syn3
+
+ - The sharp ('#') sign is a comment delimiter. It is recognized at the
+   start of a line or after whitespace; the rest of the line is skipped.
+
+Look at xsyn_sample.rules, which is installed in $(prefix)/share/tsearch_data/,
+for an example.
+
+* Usage
+
+1. Compile and install
+
+2. Load dictionary
+
+   psql mydb < dict_xsyn.sql
+
+3. Test it
+ 
+   mydb=# SELECT ts_lexize('xsyn','word');
+   ts_lexize
+   ----------------
+   {word,syn1,syn2,syn3}
+
+4. Change the dictionary options as you wish
+
+   mydb# ALTER TEXT SEARCH DICTIONARY xsyn (KEEPORIG=false);
+   ALTER TEXT SEARCH DICTIONARY
+
+That's all.
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_xsyn.c
+ *	  Extended synonym dictionary
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "storage/fd.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+PG_MODULE_MAGIC;
+
+/* One rule read from the rules file: a word and the line it heads */
+typedef struct
+{
+	char *key; /* Word */
+	char *value; /* Unparsed list of synonyms, including the word itself */
+}	Syn;
+
+/* Dictionary state: the rule table plus the KEEPORIG setting */
+typedef struct
+{
+	int len;	/* number of entries in syn once read_dictionary has returned */
+	Syn *syn;	/* rule array, sorted by key by read_dictionary; NULL if no rules */
+
+	bool keeporig;	/* KEEPORIG option; initialized to true */
+}	DictSyn;
+
+
+PG_FUNCTION_INFO_V1(dxsyn_init);
+Datum dxsyn_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dxsyn_lexize);
+Datum dxsyn_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * Locate the next whitespace-delimited word in "in".
+ *
+ * Returns a pointer to the first character of the word and stores the
+ * position just past the word in *end.  Returns NULL, leaving *end NULL,
+ * when nothing but whitespace remains or when the next word starts with
+ * '#' (a comment).
+ */
+static char *
+find_word(char *in, char **end)
+{
+	char *cursor = in;
+	char *word;
+
+	*end = NULL;
+
+	/* step over leading whitespace, one (possibly multibyte) character at a time */
+	for (; *cursor && t_isspace(cursor); cursor += pg_mblen(cursor))
+		;
+
+	if (*cursor == '\0' || *cursor == '#')
+		return NULL;
+
+	/* the word runs up to the next whitespace character or end of string */
+	word = cursor;
+	for (; *cursor && !t_isspace(cursor); cursor += pg_mblen(cursor))
+		;
+
+	*end = cursor;
+	return word;
+}
+
+/* qsort/bsearch comparator: orders Syn entries by strcmp() of their keys */
+static int
+compare_syn(const void *a, const void *b)
+{
+	const Syn *left = (const Syn *) a;
+	const Syn *right = (const Syn *) b;
+
+	return strcmp(left->key, right->key);
+}
+
+/*
+ * Load the rules file "filename" (resolved by get_tsearch_config_filename
+ * with the "rules" extension) into d->syn.
+ *
+ * Every non-empty line is lower-cased.  Its first word becomes the entry's
+ * key and the whole lower-cased line is kept as the entry's value.  Lines
+ * on which find_word() finds no word are skipped.  On return d->len is the
+ * number of stored entries and the array is sorted by key so that it can
+ * be searched with bsearch().  Raises ERRCODE_CONFIG_FILE_ERROR if the file
+ * cannot be opened.
+ */
+static void
+read_dictionary(DictSyn *d, char *filename)
+{
+	char *path = get_tsearch_config_filename(filename, "rules");
+	FILE *fp = AllocateFile(path, "r");
+	char *raw;
+	int nentries = 0;
+
+	if (fp == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open synonym file \"%s\": %m",
+						path)));
+
+	for (raw = t_readline(fp); raw != NULL; raw = t_readline(fp))
+	{
+		char *lowered;
+		char *first;
+		char *first_end = NULL;
+
+		if (*raw == '\0')
+			continue;
+
+		lowered = lowerstr(raw);
+		pfree(raw);
+
+		first = find_word(lowered, &first_end);
+		if (first == NULL)
+		{
+			pfree(lowered);
+			continue;
+		}
+
+		/* array full: start with 16 slots, double afterwards */
+		if (nentries == d->len)
+		{
+			d->len = (d->len > 0) ? 2 * d->len : 16;
+			d->syn = (d->syn != NULL)
+				? (Syn *) repalloc(d->syn, sizeof(Syn) * d->len)
+				: (Syn *) palloc(sizeof(Syn) * d->len);
+		}
+
+		d->syn[nentries].key = pnstrdup(first, first_end - first);
+		d->syn[nentries].value = lowered;
+		nentries++;
+	}
+
+	FreeFile(fp);
+
+	/* d->len served as the allocated capacity while loading; now the count */
+	d->len = nentries;
+	if (nentries > 1)
+		qsort(d->syn, d->len, sizeof(Syn), compare_syn);
+
+	pfree(path);
+}
+
+/*
+ * dxsyn_init
+ *		Build the DictSyn state for one dictionary from its option list.
+ *
+ * Argument 0 is a List of DefElem.  KEEPORIG (boolean, default true) and
+ * RULES (name of the rules file, loaded via read_dictionary) are recognized
+ * case-insensitively; any other option name raises
+ * ERRCODE_INVALID_PARAMETER_VALUE.  Returns a palloc'd DictSyn.
+ */
+Datum
+dxsyn_init(PG_FUNCTION_ARGS)
+{
+	List *options = (List *) PG_GETARG_POINTER(0);
+	DictSyn *dict = (DictSyn *) palloc0(sizeof(DictSyn));
+	ListCell *cell;
+
+	/* defaults, in effect until an option overrides them */
+	dict->keeporig = true;
+	dict->syn = NULL;
+	dict->len = 0;
+
+	foreach(cell, options)
+	{
+		DefElem *opt = (DefElem *) lfirst(cell);
+
+		if (pg_strcasecmp(opt->defname, "KEEPORIG") == 0)
+			dict->keeporig = defGetBoolean(opt);
+		else if (pg_strcasecmp(opt->defname, "RULES") == 0)
+			read_dictionary(dict, defGetString(opt));
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized xsyn parameter: \"%s\"",
+							opt->defname)));
+	}
+
+	PG_RETURN_POINTER(dict);
+}
+
+/*
+ * dxsyn_lexize
+ *		Replace a token by the synonyms listed for it in the rules file.
+ *
+ * Argument 0 is the DictSyn built by dxsyn_init, argument 1 points to the
+ * token text and argument 2 is its length.
+ *
+ * Returns NULL when the token is empty, when no rules are loaded, or when
+ * the lower-cased token is not the key of any rule.  Otherwise returns a
+ * palloc'd TSLexeme array, terminated by an element with a NULL lexeme,
+ * holding the words of the matching rule line.  The first word (the token
+ * itself) is included only when KEEPORIG is set.
+ */
+Datum
+dxsyn_lexize(PG_FUNCTION_ARGS)
+{
+	DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
+	char *in = (char *) PG_GETARG_POINTER(1);
+	int length = PG_GETARG_INT32(2);
+	Syn word;
+	Syn *found;
+	TSLexeme *res = NULL;
+
+	if (!length || d->len == 0)
+		PG_RETURN_POINTER(NULL);
+
+	/* Create search pattern */
+	{
+		char *temp = pnstrdup(in, length);
+
+		word.key = lowerstr(temp);
+		pfree(temp);
+		word.value = NULL;
+	}
+
+	/* Look for matching syn */
+	found = (Syn *)bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
+	pfree(word.key);
+
+	if (!found)
+		PG_RETURN_POINTER(NULL);
+
+	/* Parse string of synonyms and return array of words */
+	{
+		char *value = pstrdup(found->value);
+		int value_length = strlen(value);
+		char *pos = value;
+		int nsyns = 0;
+		bool is_first = true;
+
+		/*
+		 * Start with a single zeroed element so that the result carries a
+		 * terminator even if the loop below never adds a word.
+		 */
+		res = palloc0(sizeof(TSLexeme));
+
+		while(pos < value + value_length)
+		{
+			char *end;
+			char *syn = find_word(pos, &end);
+
+			if (!syn)
+				break;
+			*end = '\0';
+
+			/*
+			 * Make room for this word plus the terminator.  repalloc()
+			 * does not clear the new space, so zero both slots; that
+			 * leaves every TSLexeme member we do not assign explicitly
+			 * at zero and res[nsyns + 1] as a valid terminator.
+			 */
+			res = repalloc(res, sizeof(TSLexeme)*(nsyns + 2));
+			memset(&res[nsyns], 0, sizeof(TSLexeme) * 2);
+
+			/* first word is added to result only if KEEPORIG flag is set */
+			if(d->keeporig || !is_first)
+			{
+				res[nsyns].lexeme = pstrdup(syn);
+				nsyns++;
+			}
+
+			is_first = false;
+
+			/* continue after the terminator just written at *end */
+			pos = end + 1;
+		}
+
+		pfree(value);
+	}
+
+	PG_RETURN_POINTER(res);
+}
--- a/contrib/dict_xsyn/dict_xsyn.sql.in
+++ b/contrib/dict_xsyn/dict_xsyn.sql.in
@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+-- Everything below is created in one transaction.
+BEGIN;
+
+-- C function used as the INIT method of the template below.
+CREATE FUNCTION dxsyn_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+-- C function used as the LEXIZE method of the template below.
+CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+-- Template binding the two C functions together.
+CREATE TEXT SEARCH TEMPLATE xsyn_template (
+        LEXIZE = dxsyn_lexize,
+	INIT   = dxsyn_init
+);
+
+-- Dictionary built from the template.  No RULES option is given here, so
+-- no rules file is read until the dictionary is altered to name one.
+CREATE TEXT SEARCH DICTIONARY xsyn (
+	TEMPLATE = xsyn_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary';
+
+END;
--- a/contrib/dict_xsyn/expected/dict_xsyn.out
+++ b/contrib/dict_xsyn/expected/dict_xsyn.out
@ -0,0 +1,22 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+--configuration
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+--lexize
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize    
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
--- a/contrib/dict_xsyn/sql/dict_xsyn.sql
+++ b/contrib/dict_xsyn/sql/dict_xsyn.sql
@ -0,0 +1,16 @@
+--
+-- first, define the datatype.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i dict_xsyn.sql
+\set ECHO all
+RESET client_min_messages;
+
+--configuration
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+
+--lexize
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'grb');
--- a/contrib/dict_xsyn/uninstall_dict_xsyn.sql
+++ b/contrib/dict_xsyn/uninstall_dict_xsyn.sql
@ -0,0 +1,9 @@
+-- Removes the objects created by dict_xsyn.sql, in the reverse of the order
+-- in which that script creates them.
+SET search_path = public;
+
+DROP TEXT SEARCH DICTIONARY xsyn;
+
+DROP TEXT SEARCH TEMPLATE xsyn_template;
+
+DROP FUNCTION dxsyn_init(internal);
+
+DROP FUNCTION dxsyn_lexize(internal,internal,internal,internal);
--- a/contrib/dict_xsyn/xsyn_sample.rules
+++ b/contrib/dict_xsyn/xsyn_sample.rules
@ -0,0 +1,6 @@
+# Sample rules file for eXtended Synonym (xsyn) dictionary
+# format is as follows:
+#
+# word synonym1 synonym2 ...
+#
+supernova sn sne 1987a
--- a/contrib/test_parser/Makefile
+++ b/contrib/test_parser/Makefile
@ -0,0 +1,19 @@
+# $PostgreSQL: pgsql/contrib/test_parser/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+# Module name, object files, SQL scripts, documentation and regression
+# test list for the test_parser contrib module.
+MODULE_big = test_parser
+OBJS = test_parser.o
+DATA_built = test_parser.sql
+DATA = uninstall_test_parser.sql
+DOCS = README.test_parser
+REGRESS = test_parser
+
+# With USE_PGXS set, build against an installed server located through
+# pg_config; otherwise build in place inside the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/test_parser
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
--- a/contrib/test_parser/README.test_parser
+++ b/contrib/test_parser/README.test_parser
@ -0,0 +1,52 @@
+Example parser
+==============
+
+This is an example of a custom parser for full text search.
+
+It recognizes space-delimited words and returns only two token types:
+
+ - 3,  word,  Word
+
+ - 12, blank, Space symbols
+
+The token numbers have been chosen to keep compatibility with the default
+ts_headline() function, since we do not want to implement our own version.
+
+* Configuration
+
+The parser has no user-configurable parameters.
+
+* Usage
+
+1. Compile and install
+
+2. Load parser
+
+   psql mydb < test_parser.sql
+
+3. Test it
+
+   mydb# SELECT * FROM ts_parse('testparser','That''s my first own parser');
+    tokid | token
+   -------+--------
+        3 | That's
+       12 |
+        3 | my
+       12 |
+        3 | first
+       12 |
+        3 | own
+       12 |
+        3 | parser
+
+   mydb# CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+   mydb# ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+
+   mydb# SELECT to_tsvector('testcfg','That''s my first own parser');
+   to_tsvector
+   -------------------------------------------------
+   'my':2 'own':4 'first':3 'parser':5 'that''s':1
+   
+   mydb# SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'stars'));
+   ts_headline
+   -----------------------------------------------------------------
+   Supernovae <b>stars</b> are the brightest phenomena in galaxies
+   
+That's all.
--- a/contrib/test_parser/expected/test_parser.out
+++ b/contrib/test_parser/expected/test_parser.out
@ -0,0 +1,50 @@
+--
+-- first, define the parser.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+-- make test configuration using parser
+CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+-- ts_parse
+SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
+ tokid |         token         
+-------+-----------------------
+     3 | That's
+    12 |  
+     3 | simple
+    12 |  
+     3 | parser
+    12 |  
+     3 | can't
+    12 |  
+     3 | parse
+    12 |  
+     3 | urls
+    12 |  
+     3 | like
+    12 |  
+     3 | http://some.url/here/
+(15 rows)
+
+SELECT to_tsvector('testcfg','That''s my first own parser');
+                   to_tsvector                   
+-------------------------------------------------
+ 'my':2 'own':4 'first':3 'parser':5 'that''s':1
+(1 row)
+
+SELECT to_tsquery('testcfg', 'star');
+ to_tsquery 
+------------
+ 'star'
+(1 row)
+
+SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', 
+       to_tsquery('testcfg', 'stars'));
+                           ts_headline                           
+-----------------------------------------------------------------
+ Supernovae <b>stars</b> are the brightest phenomena in galaxies
+(1 row)
+
--- a/contrib/test_parser/sql/test_parser.sql
+++ b/contrib/test_parser/sql/test_parser.sql
@ -0,0 +1,26 @@
+--
+-- first, define the parser.  Turn off echoing so that expected file
+-- does not depend on contents of this file.
+--
+SET client_min_messages = warning;
+\set ECHO none
+\i test_parser.sql
+\set ECHO all
+RESET client_min_messages;
+
+-- make test configuration using parser
+
+CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
+
+ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+
+-- ts_parse
+
+SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
+
+SELECT to_tsvector('testcfg','That''s my first own parser');
+
+SELECT to_tsquery('testcfg', 'star');
+
+SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', 
+       to_tsquery('testcfg', 'stars'));
--- a/contrib/test_parser/test_parser.c
+++ b/contrib/test_parser/test_parser.c
@ -0,0 +1,130 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_parser.c
+ *	  Simple example of a text search parser
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/test_parser/test_parser.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+
+PG_MODULE_MAGIC;
+
+
+/*
+ * types
+ */
+
+/*
+ * Parser state private to this module: allocated by testprs_start, advanced
+ * by testprs_getlexeme and freed by testprs_end.
+ */
+typedef struct {
+	char *	buffer; /* text to parse */
+	int		len;	/* length of the text in buffer */
+	int		pos;	/* position of the parser */
+} ParserState;
+
+/*
+ * Description of one token type, as returned by testprs_lextype.
+ * copy-paste from wparser.h of tsearch2
+ */
+typedef struct {
+	int		lexid;	/* token type number; 0 marks the end of the array */
+	char	*alias;	/* short name of the token type, e.g. "word" */
+	char	*descr;	/* description of the token type, e.g. "Space symbols" */
+} LexDescr;
+
+/*
+ * prototypes
+ */
+PG_FUNCTION_INFO_V1(testprs_start);
+Datum testprs_start(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_getlexeme);
+Datum testprs_getlexeme(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_end);
+Datum testprs_end(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_lextype);
+Datum testprs_lextype(PG_FUNCTION_ARGS);
+
+/*
+ * functions
+ */
+/*
+ * testprs_start
+ *		Allocate and return the parser state for the text given by
+ *		argument 0 (pointer) and argument 1 (length).  The text itself
+ *		is referenced, not copied.
+ */
+Datum testprs_start(PG_FUNCTION_ARGS)
+{
+	char		*input = (char *) PG_GETARG_POINTER(0);
+	int			inputlen = PG_GETARG_INT32(1);
+	ParserState *state = (ParserState *) palloc0(sizeof(ParserState));
+
+	/* scanning starts at the first character */
+	state->pos = 0;
+	state->len = inputlen;
+	state->buffer = input;
+
+	PG_RETURN_POINTER(state);
+}
+
+/*
+ * testprs_getlexeme
+ *		Return the next token of the text being parsed.
+ *
+ * Argument 0 is the ParserState from testprs_start.  Arguments 1 and 2 are
+ * output pointers that receive the token's start address and its length.
+ *
+ * Returns 12 for a run of spaces, 3 for a run of non-space characters, and
+ * 0 (with a zero length) once the whole buffer has been consumed.
+ */
+Datum testprs_getlexeme(PG_FUNCTION_ARGS)
+{
+	ParserState *pst   = (ParserState *) PG_GETARG_POINTER(0);
+	char		**t	   = (char **) PG_GETARG_POINTER(1);
+	int			*tlen  = (int *) PG_GETARG_POINTER(2);
+	int			startpos = pst->pos;
+	int			type;
+
+	*t = pst->buffer +	pst->pos;
+
+	/*
+	 * The text is described by (pointer, length), so nothing guarantees a
+	 * readable byte at buffer[len].  Always check pos against len BEFORE
+	 * looking at the character at pos.
+	 */
+	if (pst->pos < pst->len &&
+		(pst->buffer)[pst->pos] == ' ')
+	{
+		/* blank type */
+		type = 12;
+		/* go to the next non-white-space character */
+		while (pst->pos < pst->len &&
+			   (pst->buffer)[pst->pos] == ' ')
+			(pst->pos)++;
+	} else {
+		/* word type */
+		type = 3;
+		/* go to the next white-space character */
+		while (pst->pos < pst->len &&
+			   (pst->buffer)[pst->pos] != ' ')
+			(pst->pos)++;
+	}
+
+	*tlen = pst->pos - startpos;
+
+	/* we are finished if (*tlen == 0) */
+	if (*tlen == 0)
+		type=0;
+
+	PG_RETURN_INT32(type);
+}
+
+/*
+ * testprs_end
+ *		Release the parser state passed as argument 0.
+ */
+Datum testprs_end(PG_FUNCTION_ARGS)
+{
+	ParserState *state = (ParserState *) PG_GETARG_POINTER(0);
+
+	pfree(state);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * testprs_lextype
+ *		Describe the token types this parser can return.
+ *
+ * Returns a palloc'd LexDescr array with one entry per token type, followed
+ * by an entry whose lexid is 0.
+ *
+ * Blanks are reported as tokens because headline generation needs them.
+ * The ids 3 and 12 are the ones the default word parser uses for these
+ * types, which lets this parser reuse the default parser's headline
+ * function.
+ */
+Datum testprs_lextype(PG_FUNCTION_ARGS)
+{
+	/* two real token types plus the terminating entry */
+	LexDescr *types = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));
+	LexDescr *entry = types;
+
+	entry->lexid = 3;
+	entry->alias = pstrdup("word");
+	entry->descr = pstrdup("Word");
+	entry++;
+
+	entry->lexid = 12;
+	entry->alias = pstrdup("blank");
+	entry->descr = pstrdup("Space symbols");
+	entry++;
+
+	/* terminator: only lexid is set */
+	entry->lexid = 0;
+
+	PG_RETURN_POINTER(types);
+}
--- a/contrib/test_parser/test_parser.sql.in
+++ b/contrib/test_parser/test_parser.sql.in
@ -0,0 +1,36 @@
+-- $PostgreSQL: pgsql/contrib/test_parser/test_parser.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+-- Everything below is created in one transaction.
+BEGIN;
+
+-- C function used as the parser's START method.
+CREATE FUNCTION testprs_start(internal, int4)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- C function used as the parser's GETTOKEN method.
+CREATE FUNCTION testprs_getlexeme(internal, internal, internal)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- C function used as the parser's END method.
+CREATE FUNCTION testprs_end(internal)
+    RETURNS void
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- C function used as the parser's LEXTYPES method.
+CREATE FUNCTION testprs_lextype(internal)
+    RETURNS internal
+    AS 'MODULE_PATHNAME'
+    LANGUAGE C STRICT;
+
+-- The parser itself.  HEADLINE is not implemented by this module; it points
+-- at pg_catalog.prsd_headline instead.
+CREATE TEXT SEARCH PARSER testparser (
+    START    = testprs_start,
+    GETTOKEN = testprs_getlexeme,
+    END      = testprs_end,
+    HEADLINE = pg_catalog.prsd_headline,
+    LEXTYPES = testprs_lextype
+);
+
+END;
--- a/contrib/test_parser/uninstall_test_parser.sql
+++ b/contrib/test_parser/uninstall_test_parser.sql
@ -0,0 +1,11 @@
+-- Removes the objects created by test_parser.sql: first the parser, then
+-- the C functions it refers to.
+SET search_path = public;
+
+DROP TEXT SEARCH PARSER testparser;
+
+DROP FUNCTION testprs_start(internal, int4);
+
+DROP FUNCTION testprs_getlexeme(internal, internal, internal);
+
+DROP FUNCTION testprs_end(internal);
+
+DROP FUNCTION testprs_lextype(internal);