From 324300bc7ccba6988f16915468ee2b870ef3ae5f Mon Sep 17 00:00:00 2001 From: Teodor Sigaev <teodor@sigaev.ru> Date: Tue, 25 Jan 2005 15:24:38 +0000 Subject: [PATCH] improve support of agglutinative languages (query with compound words). regression=# select to_tsquery( '\'fotballklubber\''); to_tsquery ------------------------------------------------ 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb' (1 row) So, the interface to dictionaries has changed: the lexize method of a dictionary should return a pointer to an array of TSLexeme structs instead of char**. The last element should have TSLexeme->lexeme == NULL. typedef struct { /* number of variant of split word , for example Word 'fotballklubber' (norwegian) has two varian to split: ( fotball, klubb ) and ( fot, ball, klubb ). So, dictionary should return: nvariant lexeme 1 fotball 1 klubb 2 fot 2 ball 2 klubb */ uint16 nvariant; /* currently unused */ uint16 flags; /* C-string */ char *lexeme; } TSLexeme; --- contrib/tsearch2/dict.c | 16 +++---- contrib/tsearch2/dict.h | 23 +++++++++ contrib/tsearch2/dict_ex.c | 8 ++-- contrib/tsearch2/dict_ispell.c | 19 ++++---- contrib/tsearch2/dict_snowball.c | 8 ++-- contrib/tsearch2/dict_syn.c | 9 ++-- contrib/tsearch2/gendict/dict_tmpl.c.IN | 10 ++-- contrib/tsearch2/ispell/spell.c | 62 ++++++++++++++++--------- contrib/tsearch2/ispell/spell.h | 7 +-- contrib/tsearch2/query.c | 43 +++++++++++++---- contrib/tsearch2/ts_cfg.c | 25 +++++----- contrib/tsearch2/ts_cfg.h | 1 + 12 files changed, 146 insertions(+), 85 deletions(-) diff --git a/contrib/tsearch2/dict.c b/contrib/tsearch2/dict.c index 357097681e..7a3626b3bc 100644 --- a/contrib/tsearch2/dict.c +++ b/contrib/tsearch2/dict.c @@ -183,15 +183,15 @@ lexize(PG_FUNCTION_ARGS) { text *in = PG_GETARG_TEXT_P(1); DictInfo *dict; - char **res, - **ptr; + TSLexeme *res, + *ptr; Datum *da; ArrayType *a; SET_FUNCOID(); dict = finddict(PG_GETARG_OID(0)); - ptr = res = (char **) DatumGetPointer( + ptr = res = (TSLexeme *) DatumGetPointer( 
FunctionCall3(&(dict->lexize_info), PointerGetDatum(dict->dictionary), PointerGetDatum(VARDATA(in)), @@ -207,13 +207,13 @@ lexize(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - while (*ptr) + while (ptr->lexeme) ptr++; da = (Datum *) palloc(sizeof(Datum) * (ptr - res + 1)); ptr = res; - while (*ptr) + while (ptr->lexeme) { - da[ptr - res] = PointerGetDatum(char2text(*ptr)); + da[ptr - res] = PointerGetDatum(char2text(ptr->lexeme)); ptr++; } @@ -227,10 +227,10 @@ lexize(PG_FUNCTION_ARGS) ); ptr = res; - while (*ptr) + while (ptr->lexeme) { pfree(DatumGetPointer(da[ptr - res])); - pfree(*ptr); + pfree(ptr->lexeme); ptr++; } pfree(res); diff --git a/contrib/tsearch2/dict.h b/contrib/tsearch2/dict.h index 86ea42263e..a21086a49d 100644 --- a/contrib/tsearch2/dict.h +++ b/contrib/tsearch2/dict.h @@ -38,4 +38,27 @@ typedef struct void parse_cfgdict(text *in, Map ** m); +/* return struct for any lexize function */ +typedef struct { + /* number of variant of split word , for example + Word 'fotballklubber' (norwegian) has two varian to split: + ( fotball, klubb ) and ( fot, ball, klubb ). 
So, dictionary + should return: + nvariant lexeme + 1 fotball + 1 klubb + 2 fot + 2 ball + 2 klubb + + */ + uint16 nvariant; + + /* currently unused */ + uint16 flags; + + /* C-string */ + char *lexeme; +} TSLexeme; + #endif diff --git a/contrib/tsearch2/dict_ex.c b/contrib/tsearch2/dict_ex.c index a8fb20453b..241161a5c2 100644 --- a/contrib/tsearch2/dict_ex.c +++ b/contrib/tsearch2/dict_ex.c @@ -54,16 +54,16 @@ dex_lexize(PG_FUNCTION_ARGS) DictExample *d = (DictExample *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); char *txt = pnstrdup(in, PG_GETARG_INT32(2)); - char **res = palloc(sizeof(char *) * 2); + TSLexeme *res = palloc(sizeof(TSLexeme) * 2); + + memset(res,0,sizeof(TSLexeme) * 2); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) { pfree(txt); - res[0] = NULL; } else - res[0] = txt; - res[1] = NULL; + res[0].lexeme = txt; PG_RETURN_POINTER(res); } diff --git a/contrib/tsearch2/dict_ispell.c b/contrib/tsearch2/dict_ispell.c index 5725c8fb36..9af11edf8e 100644 --- a/contrib/tsearch2/dict_ispell.c +++ b/contrib/tsearch2/dict_ispell.c @@ -159,14 +159,13 @@ spell_lexize(PG_FUNCTION_ARGS) DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); char *txt; - char **res; - char **ptr, - **cptr; + TSLexeme *res; + TSLexeme *ptr, + *cptr; if (!PG_GETARG_INT32(2)) PG_RETURN_POINTER(NULL); - res = palloc(sizeof(char *) * 2); txt = pnstrdup(in, PG_GETARG_INT32(2)); res = NINormalizeWord(&(d->obj), txt); pfree(txt); @@ -175,22 +174,22 @@ spell_lexize(PG_FUNCTION_ARGS) PG_RETURN_POINTER(NULL); ptr = cptr = res; - while (*ptr) + while (ptr->lexeme) { - if (searchstoplist(&(d->stoplist), *ptr)) + if (searchstoplist(&(d->stoplist), ptr->lexeme)) { - pfree(*ptr); - *ptr = NULL; + pfree(ptr->lexeme); + ptr->lexeme = NULL; ptr++; } else { - *cptr = *ptr; + memcpy(cptr, ptr, sizeof(TSLexeme)); cptr++; ptr++; } } - *cptr = NULL; + cptr->lexeme = NULL; PG_RETURN_POINTER(res); } diff --git 
a/contrib/tsearch2/dict_snowball.c b/contrib/tsearch2/dict_snowball.c index 51dba04449..03850b33ea 100644 --- a/contrib/tsearch2/dict_snowball.c +++ b/contrib/tsearch2/dict_snowball.c @@ -105,12 +105,12 @@ snb_lexize(PG_FUNCTION_ARGS) DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); char *txt = pnstrdup(in, PG_GETARG_INT32(2)); - char **res = palloc(sizeof(char *) * 2); + TSLexeme *res = palloc(sizeof(TSLexeme) * 2); + memset(res, 0, sizeof(TSLexeme) * 2); if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) { pfree(txt); - res[0] = NULL; } else { @@ -122,10 +122,8 @@ snb_lexize(PG_FUNCTION_ARGS) memcpy(txt, d->z->p, d->z->l); txt[d->z->l] = '\0'; } - res[0] = txt; + res->lexeme = txt; } - res[1] = NULL; - PG_RETURN_POINTER(res); } diff --git a/contrib/tsearch2/dict_syn.c b/contrib/tsearch2/dict_syn.c index 046a594903..6e3ed86434 100644 --- a/contrib/tsearch2/dict_syn.c +++ b/contrib/tsearch2/dict_syn.c @@ -162,7 +162,7 @@ syn_lexize(PG_FUNCTION_ARGS) char *in = (char *) PG_GETARG_POINTER(1); Syn key, *found; - char **res = NULL; + TSLexeme *res = NULL; if (!PG_GETARG_INT32(2)) PG_RETURN_POINTER(NULL); @@ -176,10 +176,9 @@ syn_lexize(PG_FUNCTION_ARGS) if (!found) PG_RETURN_POINTER(NULL); - res = palloc(sizeof(char *) * 2); - - res[0] = pstrdup(found->out); - res[1] = NULL; + res = palloc(sizeof(TSLexeme) * 2); + memset(res,0,sizeof(TSLexeme) * 2); + res[0].lexeme = pstrdup(found->out); PG_RETURN_POINTER(res); } diff --git a/contrib/tsearch2/gendict/dict_tmpl.c.IN b/contrib/tsearch2/gendict/dict_tmpl.c.IN index deafdcead8..e534ed30a7 100644 --- a/contrib/tsearch2/gendict/dict_tmpl.c.IN +++ b/contrib/tsearch2/gendict/dict_tmpl.c.IN @@ -52,15 +52,15 @@ dlexize_CFG_MODNAME(PG_FUNCTION_ARGS) { HASINIT DictExample *d = (DictExample*)PG_GETARG_POINTER(0); char *in = (char*)PG_GETARG_POINTER(1); char *txt = pnstrdup(in, PG_GETARG_INT32(2)); - char **res=palloc(sizeof(char*)*2); + TSLexeme 
*res=palloc(sizeof(TSLexeme*)*2); - /* Your INIT dictionary code */ + /* Your LEXIZE dictionary code */ HASINIT if ( *txt=='\0' || searchstoplist(&(d->stoplist),txt) ) { HASINIT pfree(txt); -HASINIT res[0]=NULL; +HASINIT res[0].lexeme=NULL; HASINIT } else - res[0]=txt; - res[1]=NULL; + res[0].lexeme=txt; + res[1].lexeme=NULL; PG_RETURN_POINTER(res); } diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c index 54b01e8ed7..f9053c6301 100644 --- a/contrib/tsearch2/ispell/spell.c +++ b/contrib/tsearch2/ispell/spell.c @@ -1119,17 +1119,32 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, return var; } -char ** +TSLexeme * NINormalizeWord(IspellDict * Conf, char *word) { char **res = NormalizeSubWord(Conf, word, 0); + TSLexeme *lcur=NULL, *lres=NULL; + u_int16_t NVariant=1; + + if (res) { + char **ptr = res; + lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) ); + while(*ptr) { + lcur->lexeme=*ptr; + lcur->flags=0; + lcur->nvariant = NVariant++; + lcur++; + ptr++; + } + lcur->lexeme=NULL; + pfree(res); + } if (Conf->compoundcontrol != '\t') { int wordlen = strlen(word); SplitVar *ptr, *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1); - char **cur = res; int i; while (var) @@ -1140,30 +1155,31 @@ NINormalizeWord(IspellDict * Conf, char *word) if (subres) { - char **ptr = subres; + char **subptr = subres; - if (cur) - { - while (*cur) - cur++; - } - else - res = cur = (char **) palloc(MAX_NORM * sizeof(char *)); + if ( !lcur ) + lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) ); + + while(*subptr) { + for(i=0;i<var->nstem-1;i++) { + lcur->lexeme=(subptr==subres) ? 
var->stem[ i ] : pstrdup(var->stem[ i ]); + lcur->flags=0; + lcur->nvariant = NVariant; + lcur++; + } - for (i = 0; i < var->nstem - 1; i++) - { - *cur = var->stem[i]; - cur++; - } - while (*ptr) - { - *cur = *ptr; - cur++; - ptr++; - } - *cur = NULL; + lcur->lexeme=*subptr; + lcur->flags=0; + lcur->nvariant = NVariant; + lcur++; + subptr++; + NVariant++; + } + + lcur->lexeme=NULL; pfree(subres); var->stem[0] = NULL; + pfree( var->stem[ var->nstem-1 ] ); } } @@ -1175,7 +1191,7 @@ NINormalizeWord(IspellDict * Conf, char *word) var = ptr; } } - return res; + return lres; } diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h index cc7935fd74..a3695113a1 100644 --- a/contrib/tsearch2/ispell/spell.h +++ b/contrib/tsearch2/ispell/spell.h @@ -3,10 +3,11 @@ #include <sys/types.h> #include "regex/regex.h" -#include "regis.h" #include "c.h" - +#include "regis.h" +#include "dict.h" + struct SPNode; @@ -116,7 +117,7 @@ typedef struct } IspellDict; -char **NINormalizeWord(IspellDict * Conf, char *word); +TSLexeme *NINormalizeWord(IspellDict * Conf, char *word); int NIImportAffixes(IspellDict * Conf, const char *filename); int NIImportDictionary(IspellDict * Conf, const char *filename); diff --git a/contrib/tsearch2/query.c b/contrib/tsearch2/query.c index 6787b63ae8..ee4f779d58 100644 --- a/contrib/tsearch2/query.c +++ b/contrib/tsearch2/query.c @@ -265,6 +265,7 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we { int4 count = 0; PRSTEXT prs; + uint32 variant, pos, cntvar=0, cntpos=0, cnt=0; prs.lenwords = 32; prs.curwords = 0; @@ -273,17 +274,39 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we parsetext_v2(findcfg(state->cfg_id), &prs, strval, lenval); - for (count = 0; count < prs.curwords; count++) - { - pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight); - pfree(prs.words[count].word); - if (count) - pushquery(state, OPR, (int4) '&', 0, 0, 0); - } - 
pfree(prs.words); + if ( prs.curwords>0 ) { - /* XXX */ - if (prs.curwords == 0) + while (count < prs.curwords) { + pos = prs.words[count].pos.pos; + cntvar=0; + while(count < prs.curwords && pos==prs.words[count].pos.pos) { + variant = prs.words[count].nvariant; + + cnt=0; + while(count < prs.curwords && pos==prs.words[count].pos.pos && variant==prs.words[count].nvariant) { + + pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight); + pfree(prs.words[count].word); + if ( cnt ) + pushquery(state, OPR, (int4) '&', 0, 0, 0); + cnt++; + count++; + } + + if ( cntvar ) + pushquery(state, OPR, (int4) '|', 0, 0, 0); + cntvar++; + } + + if (cntpos) + pushquery(state, OPR, (int4) '&', 0, 0, 0); + + cntpos++; + } + + pfree(prs.words); + + } else pushval_asis(state, VALSTOP, NULL, 0, 0); } diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c index afebb11319..79f25c43d9 100644 --- a/contrib/tsearch2/ts_cfg.c +++ b/contrib/tsearch2/ts_cfg.c @@ -321,10 +321,10 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) for (i = 0; i < cfg->map[type].len; i++) { DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i])); - char **norms, - **ptr; + TSLexeme *norms, + *ptr; - norms = ptr = (char **) DatumGetPointer( + norms = ptr = (TSLexeme *) DatumGetPointer( FunctionCall3( &(dict->lexize_info), PointerGetDatum(dict->dictionary), @@ -337,7 +337,7 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) prs->pos++; /* set pos */ - while (*ptr) + while (ptr->lexeme) { if (prs->curwords == prs->lenwords) { @@ -345,8 +345,9 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD)); } - prs->words[prs->curwords].len = strlen(*ptr); - prs->words[prs->curwords].word = *ptr; + prs->words[prs->curwords].len = strlen(ptr->lexeme); + prs->words[prs->curwords].word = ptr->lexeme; + prs->words[prs->curwords].nvariant = 
ptr->nvariant; prs->words[prs->curwords].alen = 0; prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos); ptr++; @@ -458,10 +459,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 for (i = 0; i < cfg->map[type].len; i++) { DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i])); - char **norms, - **ptr; + TSLexeme *norms, + *ptr; - norms = ptr = (char **) DatumGetPointer( + norms = ptr = (TSLexeme *) DatumGetPointer( FunctionCall3( &(dict->lexize_info), PointerGetDatum(dict->dictionary), @@ -472,10 +473,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 if (!norms) /* dictionary doesn't know this lexem */ continue; - while (*ptr) + while (ptr->lexeme) { - hlfinditem(prs, query, *ptr, strlen(*ptr)); - pfree(*ptr); + hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme)); + pfree(ptr->lexeme); ptr++; } pfree(norms); diff --git a/contrib/tsearch2/ts_cfg.h b/contrib/tsearch2/ts_cfg.h index e000233178..7bffdbcdd6 100644 --- a/contrib/tsearch2/ts_cfg.h +++ b/contrib/tsearch2/ts_cfg.h @@ -27,6 +27,7 @@ void reset_cfg(void); typedef struct { uint16 len; + uint16 nvariant; union { uint16 pos;