From 324300bc7ccba6988f16915468ee2b870ef3ae5f Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Tue, 25 Jan 2005 15:24:38 +0000
Subject: [PATCH] improve support of agglutinative languages (query with
 compound words).

regression=# select to_tsquery( '\'fotballklubber\'');
                   to_tsquery
------------------------------------------------
 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb'
(1 row)

So, the interface to dictionaries has changed: the lexize method of a dictionary
should return a pointer to an array of TSLexeme structs instead of char**. The
last element should have TSLexeme->lexeme == NULL.

typedef struct {
        /* number of the split variant of the word, for example
                Word 'fotballklubber' (norwegian) has two variants of splitting:
                ( fotball, klubb ) and ( fot, ball, klubb ). So, dictionary
                should return:
                nvariant        lexeme
                1               fotball
                1               klubb
                2               fot
                2               ball
                2               klubb

        */
        uint16  nvariant;

        /* currently unused */
        uint16  flags;

        /* C-string */
        char    *lexeme;
} TSLexeme;
---
 contrib/tsearch2/dict.c                 | 16 +++----
 contrib/tsearch2/dict.h                 | 23 +++++++++
 contrib/tsearch2/dict_ex.c              |  8 ++--
 contrib/tsearch2/dict_ispell.c          | 19 ++++----
 contrib/tsearch2/dict_snowball.c        |  8 ++--
 contrib/tsearch2/dict_syn.c             |  9 ++--
 contrib/tsearch2/gendict/dict_tmpl.c.IN | 10 ++--
 contrib/tsearch2/ispell/spell.c         | 62 ++++++++++++++++---------
 contrib/tsearch2/ispell/spell.h         |  7 +--
 contrib/tsearch2/query.c                | 43 +++++++++++++----
 contrib/tsearch2/ts_cfg.c               | 25 +++++-----
 contrib/tsearch2/ts_cfg.h               |  1 +
 12 files changed, 146 insertions(+), 85 deletions(-)

diff --git a/contrib/tsearch2/dict.c b/contrib/tsearch2/dict.c
index 357097681e..7a3626b3bc 100644
--- a/contrib/tsearch2/dict.c
+++ b/contrib/tsearch2/dict.c
@@ -183,15 +183,15 @@ lexize(PG_FUNCTION_ARGS)
 {
 	text	   *in = PG_GETARG_TEXT_P(1);
 	DictInfo   *dict;
-	char	  **res,
-			  **ptr;
+	TSLexeme	  *res,
+			  *ptr;
 	Datum	   *da;
 	ArrayType  *a;
 
 	SET_FUNCOID();
 	dict = finddict(PG_GETARG_OID(0));
 
-	ptr = res = (char **) DatumGetPointer(
+	ptr = res = (TSLexeme *) DatumGetPointer(
 									  FunctionCall3(&(dict->lexize_info),
 									   PointerGetDatum(dict->dictionary),
 											PointerGetDatum(VARDATA(in)),
@@ -207,13 +207,13 @@ lexize(PG_FUNCTION_ARGS)
 			PG_RETURN_NULL();
 	}
 
-	while (*ptr)
+	while (ptr->lexeme)
 		ptr++;
 	da = (Datum *) palloc(sizeof(Datum) * (ptr - res + 1));
 	ptr = res;
-	while (*ptr)
+	while (ptr->lexeme)
 	{
-		da[ptr - res] = PointerGetDatum(char2text(*ptr));
+		da[ptr - res] = PointerGetDatum(char2text(ptr->lexeme));
 		ptr++;
 	}
 
@@ -227,10 +227,10 @@ lexize(PG_FUNCTION_ARGS)
 		);
 
 	ptr = res;
-	while (*ptr)
+	while (ptr->lexeme)
 	{
 		pfree(DatumGetPointer(da[ptr - res]));
-		pfree(*ptr);
+		pfree(ptr->lexeme);
 		ptr++;
 	}
 	pfree(res);
diff --git a/contrib/tsearch2/dict.h b/contrib/tsearch2/dict.h
index 86ea42263e..a21086a49d 100644
--- a/contrib/tsearch2/dict.h
+++ b/contrib/tsearch2/dict.h
@@ -38,4 +38,27 @@ typedef struct
 
 void		parse_cfgdict(text *in, Map ** m);
 
+/* element of the array returned by any lexize function; the array ends with an entry whose lexeme is NULL */
+typedef struct {
+	/* number of the split variant this lexeme belongs to, for example
+		Word 'fotballklubber' (norwegian) has two variants of splitting:
+		( fotball, klubb ) and ( fot, ball, klubb ). So, dictionary
+		should return:
+		nvariant	lexeme
+		1		fotball
+		1		klubb
+		2		fot
+		2		ball
+		2		klubb
+		Lexemes sharing the same nvariant together form one variant.
+	*/
+	uint16	nvariant;
+
+	/* currently unused */
+	uint16	flags;
+
+	/* NUL-terminated C-string; NULL marks the end of the array */
+	char	*lexeme;
+} TSLexeme;
+
 #endif
diff --git a/contrib/tsearch2/dict_ex.c b/contrib/tsearch2/dict_ex.c
index a8fb20453b..241161a5c2 100644
--- a/contrib/tsearch2/dict_ex.c
+++ b/contrib/tsearch2/dict_ex.c
@@ -54,16 +54,16 @@ dex_lexize(PG_FUNCTION_ARGS)
 	DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	char	   *txt = pnstrdup(in, PG_GETARG_INT32(2));
-	char	  **res = palloc(sizeof(char *) * 2);
+	TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);
+
+	memset(res,0,sizeof(TSLexeme) * 2);
 
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
 		pfree(txt);
-		res[0] = NULL;
 	}
 	else
-		res[0] = txt;
-	res[1] = NULL;
+		res[0].lexeme = txt;
 
 	PG_RETURN_POINTER(res);
 }
diff --git a/contrib/tsearch2/dict_ispell.c b/contrib/tsearch2/dict_ispell.c
index 5725c8fb36..9af11edf8e 100644
--- a/contrib/tsearch2/dict_ispell.c
+++ b/contrib/tsearch2/dict_ispell.c
@@ -159,14 +159,13 @@ spell_lexize(PG_FUNCTION_ARGS)
 	DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	char	   *txt;
-	char	  **res;
-	char	  **ptr,
-			  **cptr;
+	TSLexeme	  *res;
+	TSLexeme	  *ptr,
+			  *cptr;
 
 	if (!PG_GETARG_INT32(2))
 		PG_RETURN_POINTER(NULL);
 
-	res = palloc(sizeof(char *) * 2);
 	txt = pnstrdup(in, PG_GETARG_INT32(2));
 	res = NINormalizeWord(&(d->obj), txt);
 	pfree(txt);
@@ -175,22 +174,22 @@ spell_lexize(PG_FUNCTION_ARGS)
 		PG_RETURN_POINTER(NULL);
 
 	ptr = cptr = res;
-	while (*ptr)
+	while (ptr->lexeme)
 	{
-		if (searchstoplist(&(d->stoplist), *ptr))
+		if (searchstoplist(&(d->stoplist), ptr->lexeme))
 		{
-			pfree(*ptr);
-			*ptr = NULL;
+			pfree(ptr->lexeme);
+			ptr->lexeme = NULL;
 			ptr++;
 		}
 		else
 		{
-			*cptr = *ptr;
+			memcpy(cptr, ptr, sizeof(TSLexeme));
 			cptr++;
 			ptr++;
 		}
 	}
-	*cptr = NULL;
+	cptr->lexeme = NULL;
 
 	PG_RETURN_POINTER(res);
 }
diff --git a/contrib/tsearch2/dict_snowball.c b/contrib/tsearch2/dict_snowball.c
index 51dba04449..03850b33ea 100644
--- a/contrib/tsearch2/dict_snowball.c
+++ b/contrib/tsearch2/dict_snowball.c
@@ -105,12 +105,12 @@ snb_lexize(PG_FUNCTION_ARGS)
 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	char	   *txt = pnstrdup(in, PG_GETARG_INT32(2));
-	char	  **res = palloc(sizeof(char *) * 2);
+	TSLexeme	  *res = palloc(sizeof(TSLexeme) * 2);
 
+	memset(res, 0, sizeof(TSLexeme) * 2);
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
 		pfree(txt);
-		res[0] = NULL;
 	}
 	else
 	{
@@ -122,10 +122,8 @@ snb_lexize(PG_FUNCTION_ARGS)
 			memcpy(txt, d->z->p, d->z->l);
 			txt[d->z->l] = '\0';
 		}
-		res[0] = txt;
+		res->lexeme = txt;
 	}
-	res[1] = NULL;
-
 
 	PG_RETURN_POINTER(res);
 }
diff --git a/contrib/tsearch2/dict_syn.c b/contrib/tsearch2/dict_syn.c
index 046a594903..6e3ed86434 100644
--- a/contrib/tsearch2/dict_syn.c
+++ b/contrib/tsearch2/dict_syn.c
@@ -162,7 +162,7 @@ syn_lexize(PG_FUNCTION_ARGS)
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	Syn			key,
 			   *found;
-	char	  **res = NULL;
+	TSLexeme	  *res = NULL;
 
 	if (!PG_GETARG_INT32(2))
 		PG_RETURN_POINTER(NULL);
@@ -176,10 +176,9 @@ syn_lexize(PG_FUNCTION_ARGS)
 	if (!found)
 		PG_RETURN_POINTER(NULL);
 
-	res = palloc(sizeof(char *) * 2);
-
-	res[0] = pstrdup(found->out);
-	res[1] = NULL;
+	res = palloc(sizeof(TSLexeme) * 2);
+	memset(res,0,sizeof(TSLexeme) * 2);
+	res[0].lexeme = pstrdup(found->out);
 
 	PG_RETURN_POINTER(res);
 }
diff --git a/contrib/tsearch2/gendict/dict_tmpl.c.IN b/contrib/tsearch2/gendict/dict_tmpl.c.IN
index deafdcead8..e534ed30a7 100644
--- a/contrib/tsearch2/gendict/dict_tmpl.c.IN
+++ b/contrib/tsearch2/gendict/dict_tmpl.c.IN
@@ -52,15 +52,15 @@ dlexize_CFG_MODNAME(PG_FUNCTION_ARGS) {
 HASINIT 	DictExample *d = (DictExample*)PG_GETARG_POINTER(0);
 	char       *in = (char*)PG_GETARG_POINTER(1);
 	char *txt = pnstrdup(in, PG_GETARG_INT32(2));
-	char	**res=palloc(sizeof(char*)*2);
+	TSLexeme	*res=palloc(sizeof(TSLexeme)*2);	/* two TSLexeme structs (result + NULL-lexeme terminator), not two pointers */
 
-	/* Your INIT dictionary code */
+	/* Your LEXIZE dictionary code */
 HASINIT 	if ( *txt=='\0' || searchstoplist(&(d->stoplist),txt) ) {
 HASINIT 		pfree(txt);
-HASINIT 		res[0]=NULL;
+HASINIT 		res[0].lexeme=NULL;
 HASINIT 	} else 
-		res[0]=txt;
-	res[1]=NULL;
+		res[0].lexeme=txt;
+	res[1].lexeme=NULL;
 
 	PG_RETURN_POINTER(res);
 }
diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c
index 54b01e8ed7..f9053c6301 100644
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -1119,17 +1119,32 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
 	return var;
 }
 
-char	  **
+TSLexeme *
 NINormalizeWord(IspellDict * Conf, char *word)
 {
 	char	  **res = NormalizeSubWord(Conf, word, 0);
+	TSLexeme *lcur=NULL, *lres=NULL;
+	uint16 NVariant=1;	/* same type as TSLexeme.nvariant; u_int16_t is a non-standard BSD typedef */
+
+	if (res) {
+		char **ptr = res;
+		lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
+		while(*ptr) {
+			lcur->lexeme=*ptr;
+			lcur->flags=0;
+			lcur->nvariant = NVariant++;
+			lcur++;
+			ptr++;
+		}
+		lcur->lexeme=NULL;
+		pfree(res);
+	}
 
 	if (Conf->compoundcontrol != '\t')
 	{
 		int			wordlen = strlen(word);
 		SplitVar   *ptr,
 				   *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
-		char	  **cur = res;
 		int			i;
 
 		while (var)
@@ -1140,30 +1155,31 @@ NINormalizeWord(IspellDict * Conf, char *word)
 
 				if (subres)
 				{
-					char	  **ptr = subres;
+					char	  **subptr = subres;
 
-					if (cur)
-					{
-						while (*cur)
-							cur++;
-					}
-					else
-						res = cur = (char **) palloc(MAX_NORM * sizeof(char *));
+					if ( !lcur )
+						lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
+		
+					while(*subptr) {
+						for(i=0;i<var->nstem-1;i++) {
+							lcur->lexeme=(subptr==subres) ? var->stem[ i ] : pstrdup(var->stem[ i ]);
+							lcur->flags=0;
+							lcur->nvariant = NVariant;
+							lcur++;
+						}
 
-					for (i = 0; i < var->nstem - 1; i++)
-					{
-						*cur = var->stem[i];
-						cur++;
-					}
-					while (*ptr)
-					{
-						*cur = *ptr;
-						cur++;
-						ptr++;
-					}
-					*cur = NULL;
+						lcur->lexeme=*subptr;
+						lcur->flags=0;
+						lcur->nvariant = NVariant;
+						lcur++;
+						subptr++;
+						NVariant++;
+					}	
+
+					lcur->lexeme=NULL;
 					pfree(subres);
 					var->stem[0] = NULL;
+					pfree( var->stem[ var->nstem-1 ] );	
 				}
 			}
 
@@ -1175,7 +1191,7 @@ NINormalizeWord(IspellDict * Conf, char *word)
 			var = ptr;
 		}
 	}
-	return res;
+	return lres;
 }
 
 
diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h
index cc7935fd74..a3695113a1 100644
--- a/contrib/tsearch2/ispell/spell.h
+++ b/contrib/tsearch2/ispell/spell.h
@@ -3,10 +3,11 @@
 
 #include <sys/types.h>
 #include "regex/regex.h"
-#include "regis.h"
 #include "c.h"
 
-
+#include "regis.h"
+#include "dict.h"
+ 
 struct SPNode;
 
 
@@ -116,7 +117,7 @@ typedef struct
 
 }	IspellDict;
 
-char	  **NINormalizeWord(IspellDict * Conf, char *word);
+TSLexeme	  *NINormalizeWord(IspellDict * Conf, char *word);
 int			NIImportAffixes(IspellDict * Conf, const char *filename);
 int			NIImportDictionary(IspellDict * Conf, const char *filename);
 
diff --git a/contrib/tsearch2/query.c b/contrib/tsearch2/query.c
index 6787b63ae8..ee4f779d58 100644
--- a/contrib/tsearch2/query.c
+++ b/contrib/tsearch2/query.c
@@ -265,6 +265,7 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
 {
 	int4		count = 0;
 	PRSTEXT		prs;
+	uint32		variant, pos, cntvar=0, cntpos=0, cnt=0;
 
 	prs.lenwords = 32;
 	prs.curwords = 0;
@@ -273,17 +274,39 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
 
 	parsetext_v2(findcfg(state->cfg_id), &prs, strval, lenval);
 
-	for (count = 0; count < prs.curwords; count++)
-	{
-		pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
-		pfree(prs.words[count].word);
-		if (count)
-			pushquery(state, OPR, (int4) '&', 0, 0, 0);
-	}
-	pfree(prs.words);
+	if ( prs.curwords>0 ) {
 
-	/* XXX */
-	if (prs.curwords == 0)
+		while (count < prs.curwords) {
+			pos = prs.words[count].pos.pos;
+			cntvar=0;
+			while(count < prs.curwords && pos==prs.words[count].pos.pos) {
+				variant = prs.words[count].nvariant;
+
+				cnt=0;
+				while(count < prs.curwords && pos==prs.words[count].pos.pos && variant==prs.words[count].nvariant)	{
+					
+					pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
+					pfree(prs.words[count].word);
+					if ( cnt ) 
+						pushquery(state, OPR, (int4) '&', 0, 0, 0);
+					cnt++;
+					count++;
+				}
+
+				if ( cntvar ) 
+					pushquery(state, OPR, (int4) '|', 0, 0, 0);
+				cntvar++;
+			}
+
+			if (cntpos) 
+				pushquery(state, OPR, (int4) '&', 0, 0, 0);
+		
+			cntpos++;
+		}
+
+		pfree(prs.words);
+
+	} else
 		pushval_asis(state, VALSTOP, NULL, 0, 0);
 }
 
diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c
index afebb11319..79f25c43d9 100644
--- a/contrib/tsearch2/ts_cfg.c
+++ b/contrib/tsearch2/ts_cfg.c
@@ -321,10 +321,10 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 		for (i = 0; i < cfg->map[type].len; i++)
 		{
 			DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-			char	  **norms,
-					  **ptr;
+			TSLexeme	  *norms,
+					  *ptr;
 
-			norms = ptr = (char **) DatumGetPointer(
+			norms = ptr = (TSLexeme *) DatumGetPointer(
 													FunctionCall3(
 													&(dict->lexize_info),
 									   PointerGetDatum(dict->dictionary),
@@ -337,7 +337,7 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 
 			prs->pos++;			/* set pos */
 
-			while (*ptr)
+			while (ptr->lexeme)
 			{
 				if (prs->curwords == prs->lenwords)
 				{
@@ -345,8 +345,9 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
 					prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
 				}
 
-				prs->words[prs->curwords].len = strlen(*ptr);
-				prs->words[prs->curwords].word = *ptr;
+				prs->words[prs->curwords].len = strlen(ptr->lexeme);
+				prs->words[prs->curwords].word = ptr->lexeme;
+				prs->words[prs->curwords].nvariant = ptr->nvariant;
 				prs->words[prs->curwords].alen = 0;
 				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
 				ptr++;
@@ -458,10 +459,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
 		for (i = 0; i < cfg->map[type].len; i++)
 		{
 			DictInfo   *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
-			char	  **norms,
-					  **ptr;
+			TSLexeme	  *norms,
+					  *ptr;
 
-			norms = ptr = (char **) DatumGetPointer(
+			norms = ptr = (TSLexeme *) DatumGetPointer(
 													FunctionCall3(
 													&(dict->lexize_info),
 									   PointerGetDatum(dict->dictionary),
@@ -472,10 +473,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
 			if (!norms)			/* dictionary doesn't know this lexem */
 				continue;
 
-			while (*ptr)
+			while (ptr->lexeme)
 			{
-				hlfinditem(prs, query, *ptr, strlen(*ptr));
-				pfree(*ptr);
+				hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
+				pfree(ptr->lexeme);
 				ptr++;
 			}
 			pfree(norms);
diff --git a/contrib/tsearch2/ts_cfg.h b/contrib/tsearch2/ts_cfg.h
index e000233178..7bffdbcdd6 100644
--- a/contrib/tsearch2/ts_cfg.h
+++ b/contrib/tsearch2/ts_cfg.h
@@ -27,6 +27,7 @@ void		reset_cfg(void);
 typedef struct
 {
 	uint16		len;
+	uint16		nvariant;
 	union
 	{
 		uint16		pos;