From 46a25ce6a9f02b2ba9ef9266ac1729ba2dba10cf Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Thu, 9 Feb 2006 18:04:20 +0000 Subject: [PATCH] 1 Fix bug with very short word: prefix and suffix might be overlapped, sorry but fix can't be applyed to previous version: it's require refill tsvector... 2 Small optimize of load time for huge dictionaries 3 use palloc instead of malloc during load dict file --- contrib/tsearch2/ispell/spell.c | 107 +++++++++++++++++--------------- contrib/tsearch2/ispell/spell.h | 8 ++- 2 files changed, 62 insertions(+), 53 deletions(-) diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c index 69c91470f6..49b9e9dfbf 100644 --- a/contrib/tsearch2/ispell/spell.c +++ b/contrib/tsearch2/ispell/spell.c @@ -18,18 +18,19 @@ #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] ) #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T ) +static char *VoidString = ""; #define MEMOUT(X) if ( !(X) ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))) static int cmpspell(const void *s1, const void *s2) { - return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word)); + return (strcmp((*(const SPELL **) s1)->word, (*(const SPELL **) s2)->word)); } static int cmpspellaffix(const void *s1, const void *s2) { - return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag)); + return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag)); } static char * @@ -128,18 +129,17 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag) if (Conf->mspell) { Conf->mspell += 1024 * 20; - Conf->Spell = (SPELL *) realloc(Conf->Spell, Conf->mspell * sizeof(SPELL)); + Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL*)); } else { Conf->mspell = 1024 * 20; - Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL)); + Conf->Spell = (SPELL **) palloc(Conf->mspell * sizeof(SPELL*)); } - MEMOUT(Conf->Spell); } - Conf->Spell[Conf->nspell].word = strdup(word); - MEMOUT(Conf->Spell[Conf->nspell].word); - strncpy(Conf->Spell[Conf->nspell].p.flag, flag, 16); + Conf->Spell[Conf->nspell] = (SPELL*)palloc(SPELLHDRSZ + strlen(word) + 1); + strcpy( Conf->Spell[Conf->nspell]->word ,word ); + strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16); Conf->nspell++; return (0); } @@ -261,13 +261,13 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const { Conf->Affix[Conf->naffixes].issimple = 1; Conf->Affix[Conf->naffixes].isregis = 0; - Conf->Affix[Conf->naffixes].mask = strdup(""); + Conf->Affix[Conf->naffixes].mask = VoidString; } else if (RS_isRegis(mask)) { Conf->Affix[Conf->naffixes].issimple = 0; Conf->Affix[Conf->naffixes].isregis = 1; - Conf->Affix[Conf->naffixes].mask = strdup(mask); + Conf->Affix[Conf->naffixes].mask = (mask && *mask) ? strdup(mask) : VoidString; } else { @@ -287,11 +287,13 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const Conf->Affix[Conf->naffixes].flag = flag; Conf->Affix[Conf->naffixes].type = type; - Conf->Affix[Conf->naffixes].find = strdup(find); + Conf->Affix[Conf->naffixes].find = (find && *find) ? strdup(find) : VoidString; MEMOUT(Conf->Affix[Conf->naffixes].find); - Conf->Affix[Conf->naffixes].repl = strdup(repl); - MEMOUT(Conf->Affix[Conf->naffixes].repl); - Conf->Affix[Conf->naffixes].replen = strlen(repl); + if ( (Conf->Affix[Conf->naffixes].replen = strlen(repl)) > 0 ) { + Conf->Affix[Conf->naffixes].repl = strdup(repl); + MEMOUT(Conf->Affix[Conf->naffixes].repl); + } else + Conf->Affix[Conf->naffixes].repl = VoidString; Conf->naffixes++; return (0); } @@ -506,10 +508,10 @@ mkSPNode(IspellDict * Conf, int low, int high, int level) int lownew = low; for (i = low; i < high; i++) - if (Conf->Spell[i].p.d.len > level && lastchar != Conf->Spell[i].word[level]) + if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level]) { nchar++; - lastchar = Conf->Spell[i].word[level]; + lastchar = Conf->Spell[i]->word[level]; } if (!nchar) @@ -523,9 +525,9 @@ mkSPNode(IspellDict * Conf, int low, int high, int level) lastchar = '\0'; for (i = low; i < high; i++) - if (Conf->Spell[i].p.d.len > level) + if (Conf->Spell[i]->p.d.len > level) { - if (lastchar != Conf->Spell[i].word[level]) + if (lastchar != Conf->Spell[i]->word[level]) { if (lastchar) { @@ -533,24 +535,24 @@ mkSPNode(IspellDict * Conf, int low, int high, int level) lownew = i; data++; } - lastchar = Conf->Spell[i].word[level]; + lastchar = Conf->Spell[i]->word[level]; } - data->val = ((uint8 *) (Conf->Spell[i].word))[level]; - if (Conf->Spell[i].p.d.len == level + 1) + data->val = ((uint8 *) (Conf->Spell[i]->word))[level]; + if (Conf->Spell[i]->p.d.len == level + 1) { - if (data->isword && data->affix != Conf->Spell[i].p.d.affix) + if (data->isword && data->affix != Conf->Spell[i]->p.d.affix) { /* * fprintf(stderr,"Word already exists: %s (affixes: '%s' - * and '%s')\n", Conf->Spell[i].word, + * and '%s')\n", Conf->Spell[i]->word, * Conf->AffixData[data->affix], - * Conf->AffixData[Conf->Spell[i].p.d.affix] ); + * Conf->AffixData[Conf->Spell[i]->p.d.affix] ); */ /* MergeAffix called a few times */ - data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i].p.d.affix); + data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix); } else - data->affix = Conf->Spell[i].p.d.affix; + data->affix = Conf->Spell[i]->p.d.affix; data->isword = 1; if (strchr(Conf->AffixData[data->affix], Conf->compoundcontrol)) data->compoundallow = 1; @@ -562,8 +564,6 @@ mkSPNode(IspellDict * Conf, int low, int high, int level) return rs; } - - void NISortDictionary(IspellDict * Conf) { @@ -571,9 +571,9 @@ NISortDictionary(IspellDict * Conf) int naffix = 3; /* compress affixes */ - qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspellaffix); + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL*), cmpspellaffix); for (i = 1; i < Conf->nspell; i++) - if (strcmp(Conf->Spell[i].p.flag, Conf->Spell[i - 1].p.flag)) + if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag)) naffix++; Conf->AffixData = (char **) malloc(naffix * sizeof(char *)); @@ -582,28 +582,28 @@ NISortDictionary(IspellDict * Conf) naffix = 1; Conf->AffixData[0] = strdup(""); MEMOUT(Conf->AffixData[0]); - Conf->AffixData[1] = strdup(Conf->Spell[0].p.flag); + Conf->AffixData[1] = strdup(Conf->Spell[0]->p.flag); MEMOUT(Conf->AffixData[1]); - Conf->Spell[0].p.d.affix = 1; - Conf->Spell[0].p.d.len = strlen(Conf->Spell[0].word); + Conf->Spell[0]->p.d.affix = 1; + Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word); for (i = 1; i < Conf->nspell; i++) { - if (strcmp(Conf->Spell[i].p.flag, Conf->AffixData[naffix])) + if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix])) { naffix++; - Conf->AffixData[naffix] = strdup(Conf->Spell[i].p.flag); + Conf->AffixData[naffix] = strdup(Conf->Spell[i]->p.flag); MEMOUT(Conf->AffixData[naffix]); } - Conf->Spell[i].p.d.affix = naffix; - Conf->Spell[i].p.d.len = strlen(Conf->Spell[i].word); + Conf->Spell[i]->p.d.affix = naffix; + Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); } - qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell); + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL*), cmpspell); Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); for (i = 0; i < Conf->nspell; i++) - free(Conf->Spell[i].word); - free(Conf->Spell); + pfree(Conf->Spell[i]); + pfree(Conf->Spell); Conf->Spell = NULL; } @@ -724,7 +724,6 @@ NISortAffixes(IspellDict * Conf) if (Conf->naffixes > 1) qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); - Conf->CompoundAffix = ptr = (CMPDAffix *) malloc(sizeof(CMPDAffix) * Conf->naffixes); MEMOUT(Conf->CompoundAffix); ptr->affix = NULL; @@ -803,7 +802,7 @@ FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type } static char * -CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) +CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword, int *baselen) { if (flagflags & FF_COMPOUNDONLYAFX) @@ -821,9 +820,15 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne { strcpy(newword, word); strcpy(newword + len - Affix->replen, Affix->find); + if ( baselen ) /* store length of non-changed part of word */ + *baselen = len - Affix->replen; } else { + /* if prefix is a all non-chaged part's length then all word contains only prefix and suffix, + so out */ + if ( baselen && *baselen + strlen(Affix->find) <= Affix->replen ) + return NULL; strcpy(newword, Affix->find); strcat(newword, word + Affix->replen); } @@ -927,7 +932,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) break; for (j = 0; j < prefix->naff; j++) { - if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword)) + if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL)) { /* prefix success */ if (FindWord(Conf, newword, prefix->aff[j]->flag, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1)) @@ -948,6 +953,8 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) */ while (snode) { + int baselen=0; + /* find possible suffix */ suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); if (!suffix) @@ -955,7 +962,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) /* foreach suffix check affix */ for (i = 0; i < suffix->naff; i++) { - if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword)) + if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen)) { /* suffix success */ if (FindWord(Conf, newword, suffix->aff[i]->flag, flag & FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM - 1)) @@ -976,7 +983,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) break; for (j = 0; j < prefix->naff; j++) { - if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword)) + if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen)) { /* prefix success */ int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ? @@ -1323,15 +1330,15 @@ NIFree(IspellDict * Conf) else pg_regfree(&(Affix[i].reg.regex)); } - free(Affix[i].mask); - free(Affix[i].find); - free(Affix[i].repl); + if ( Affix[i].mask != VoidString ) free(Affix[i].mask); + if ( Affix[i].find != VoidString ) free(Affix[i].find); + if ( Affix[i].repl != VoidString ) free(Affix[i].repl); } if (Conf->Spell) { for (i = 0; i < Conf->nspell; i++) - free(Conf->Spell[i].word); - free(Conf->Spell); + pfree(Conf->Spell[i]->word); + pfree(Conf->Spell); } if (Conf->Affix) diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h index ee86eac985..fc3240a1d8 100644 --- a/contrib/tsearch2/ispell/spell.h +++ b/contrib/tsearch2/ispell/spell.h @@ -32,7 +32,6 @@ typedef struct SPNode typedef struct spell_struct { - char *word; union { char flag[16]; @@ -41,9 +40,12 @@ typedef struct spell_struct int affix; int len; } d; - } p; + } p; + char word[1]; } SPELL; +#define SPELLHDRSZ (offsetof(SPELL, word)) + typedef struct aff_struct { uint32 @@ -106,7 +108,7 @@ typedef struct int nspell; int mspell; - SPELL *Spell; + SPELL **Spell; AffixNode *Suffix; AffixNode *Prefix;