Add a thesaurus dictionary, which can replace N>0 lexemes with M>0 lexemes.
This required some changes in the lexize algorithm, but the dictionary interface stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
This commit is contained in:
parent
3b7ed9ba9c
commit
22505f4703
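A minimal usage sketch, assembled from the sample thesaurus file and the commented-out examples added by this commit (the en_stem subdictionary and the contrib/thesaurus path are just the values used in those examples, not requirements). Uncommenting a sample line such as

    one two : 12

makes the dictionary replace the two input lexemes "one two" with the single lexeme "12". The template is then wired into a configuration with:

    update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template';
    update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}';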
@ -1,13 +1,13 @@
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $

MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
	dict_snowball.o dict_ispell.o dict_syn.o \
	dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
	wparser.o wparser_def.o \
	ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
	tsvector_op.o rank.o ts_stat.o \
	query_util.o query_support.o query_rewrite.o query_gist.o \
	ts_locale.o ginidx.o
	ts_locale.o ts_lexize.o ginidx.o

SUBDIRS := snowball ispell wordparser
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)

@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS)

PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser

DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
DATA_built = tsearch2.sql untsearch2.sql
DOCS = README.tsearch2
REGRESS = tsearch2
@ -5,6 +5,7 @@
#include "catalog/pg_proc.h"
#include "catalog/pg_namespace.h"
#include "utils/syscache.h"
#include "miscadmin.h"

#include "ts_cfg.h"
#include "dict.h"

@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)

	return nspoid;
}

/* if path is relative, take it as relative to share dir */
char *
to_absfilename(char *filename) {
	if (!is_absolute_path(filename)) {
		char sharepath[MAXPGPATH];
		char *absfn;
#ifdef WIN32
		char delim = '\\';
#else
		char delim = '/';
#endif
		get_share_path(my_exec_path, sharepath);
		absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
		sprintf(absfn, "%s%c%s", sharepath, delim, filename);

		filename = absfn;
	}

	return filename;
}

@ -16,6 +16,8 @@ text *mtextdup(text *in);

int text_cmp(text *a, text *b);

char * to_absfilename(char *filename);

#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
#define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
@ -1,4 +1,4 @@
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */

/*
 * interface functions to dictionary
@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
		Datum opt;
		Oid oid = InvalidOid;

		/* setup dictlexize method */
		oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
		if (isnull || oid == InvalidOid)
			ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
		fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);

		/* setup and call dictinit method, optinally */
		oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
		if (!(isnull || oid == InvalidOid))
		{
			opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
			dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
		}
		oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
		if (isnull || oid == InvalidOid)
			ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
		fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
		dict->dict_id = id;
	}
	else
@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
	return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
}

static void
insertdict(Oid id) {
	DictInfo newdict;

	if (DList.len == DList.reallen)
	{
		DictInfo *tmp;
		int reallen = (DList.reallen) ? 2 * DList.reallen : 16;

		tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
		if (!tmp)
			ts_error(ERROR, "No memory");
		DList.reallen = reallen;
		DList.list = tmp;
	}
	init_dict(id, &newdict);

	DList.list[DList.len] = newdict;
	DList.len++;

	qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
}

DictInfo *
finddict(Oid id)
{
@ -117,23 +143,8 @@ finddict(Oid id)
		return DList.last_dict;
	}

	/* last chance */
	if (DList.len == DList.reallen)
	{
		DictInfo *tmp;
		int reallen = (DList.reallen) ? 2 * DList.reallen : 16;

		tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
		if (!tmp)
			ts_error(ERROR, "No memory");
		DList.reallen = reallen;
		DList.list = tmp;
	}
	DList.last_dict = &(DList.list[DList.len]);
	init_dict(id, DList.last_dict);

	DList.len++;
	qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
	/* insert new dictionary */
	insertdict(id);
	return finddict(id); /* qsort changed order!! */ ;
}

@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
				*ptr;
	Datum *da;
	ArrayType *a;
	DictSubState dstate = { false, false, NULL };

	SET_FUNCOID();
	dict = finddict(PG_GETARG_OID(0));

	ptr = res = (TSLexeme *) DatumGetPointer(
		FunctionCall3(&(dict->lexize_info),
			PointerGetDatum(dict->dictionary),
			PointerGetDatum(VARDATA(in)),
			Int32GetDatum(VARSIZE(in) - VARHDRSZ)
		FunctionCall4(&(dict->lexize_info),
			PointerGetDatum(dict->dictionary),
			PointerGetDatum(VARDATA(in)),
			Int32GetDatum(VARSIZE(in) - VARHDRSZ),
			PointerGetDatum(&dstate)
		)
	);

	if (dstate.getnext) {
		dstate.isend = true;
		ptr = res = (TSLexeme *) DatumGetPointer(
			FunctionCall4(&(dict->lexize_info),
				PointerGetDatum(dict->dictionary),
				PointerGetDatum(VARDATA(in)),
				Int32GetDatum(VARSIZE(in) - VARHDRSZ),
				PointerGetDatum(&dstate)
			)
		);
	}

	PG_FREE_IF_COPY(in, 1);
	if (!res)
	{
@ -1,9 +1,10 @@
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */

#ifndef __DICT_H__
#define __DICT_H__
#include "postgres.h"
#include "fmgr.h"
#include "ts_cfg.h"

typedef struct
{
@ -29,6 +30,11 @@ DictInfo *finddict(Oid id);
Oid name2id_dict(text *name);
void reset_dict(void);

typedef struct {
	bool isend;    /* in: marks for lexize_info about text end is reached */
	bool getnext;  /* out: dict wants next lexeme */
	void *private; /* internal dict state between calls with getnext == true */
} DictSubState;

/* simple parser of cfg string */
typedef struct
@ -45,17 +51,61 @@ typedef struct
	/*
	 * number of variant of split word , for example Word 'fotballklubber'
	 * (norwegian) has two varian to split: ( fotball, klubb ) and ( fot,
	 * ball, klubb ). So, dictionary should return: nvariant lexeme 1
	 * fotball 1 klubb 2 fot 2 ball 2 klubb
	 *
	 * ball, klubb ). So, dictionary should return:
	 *      nvariant    lexeme
	 *          1       fotball
	 *          1       klubb
	 *          2       fot
	 *          2       ball
	 *          2       klubb
	 */
	uint16 nvariant;

	/* currently unused */
	uint16 flags;

	/* C-string */
	char *lexeme;
} TSLexeme;

#define TSL_ADDPOS 0x01


/*
 * Lexize subsystem
 */

typedef struct ParsedLex {
	int type;
	char *lemm;
	int lenlemm;
	bool resfollow;
	struct ParsedLex *next;
} ParsedLex;

typedef struct ListParsedLex {
	ParsedLex *head;
	ParsedLex *tail;
} ListParsedLex;

typedef struct {
	TSCfgInfo *cfg;
	Oid curDictId;
	int posDict;
	DictSubState dictState;
	ParsedLex *curSub;
	ListParsedLex towork; /* current list to work */
	ListParsedLex waste;  /* list of lexemes that already lexized */

	/* fields to store last variant to lexize (basically, thesaurus
	   or similar to, which wants several lexemes */

	ParsedLex *lastRes;
	TSLexeme *tmpRes;
} LexizeData;


void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);

#endif
contrib/tsearch2/dict_thesaurus.c (new file, 743 lines)
@ -0,0 +1,743 @@
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.1 2006/05/31 14:05:31 teodor Exp $ */
|
||||
|
||||
/*
|
||||
* thesaurus
|
||||
* Teodor Sigaev <teodor@sigaev.ru>
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "executor/spi.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include "dict.h"
|
||||
#include "common.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
typedef struct LexemeInfo {
|
||||
uint16 idsubst; /* entry's number in DictThesaurus->subst */
|
||||
uint16 posinsubst; /* pos info in entry */
|
||||
uint16 tnvariant; /* total num lexemes in one variant */
|
||||
struct LexemeInfo *nextentry;
|
||||
struct LexemeInfo *nextvariant;
|
||||
} LexemeInfo;
|
||||
|
||||
typedef struct {
|
||||
char *lexeme;
|
||||
LexemeInfo *entries;
|
||||
} TheLexeme;
|
||||
|
||||
typedef struct {
|
||||
uint16 lastlexeme; /* number lexemes to substitute */
|
||||
uint16 reslen;
|
||||
TSLexeme *res; /* prepared substituted result */
|
||||
} TheSubstitute;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/* subdictionary to normalize lexemes */
|
||||
DictInfo subdict;
|
||||
|
||||
/* Array to search lexeme by exact match */
|
||||
TheLexeme *wrds;
|
||||
int nwrds;
|
||||
int ntwrds;
|
||||
|
||||
/* Storage of substituted result, n-th element is for
|
||||
n-th expression */
|
||||
TheSubstitute *subst;
|
||||
int nsubst;
|
||||
} DictThesaurus;
|
||||
|
||||
PG_FUNCTION_INFO_V1(thesaurus_init);
|
||||
Datum thesaurus_init(PG_FUNCTION_ARGS);
|
||||
|
||||
PG_FUNCTION_INFO_V1(thesaurus_lexize);
|
||||
Datum thesaurus_lexize(PG_FUNCTION_ARGS);
|
||||
|
||||
static void
|
||||
freeDictThesaurus(DictThesaurus * d)
|
||||
{
|
||||
free(d);
|
||||
}
|
||||
|
||||
static void
|
||||
newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst ) {
|
||||
TheLexeme *ptr;
|
||||
|
||||
if ( d->nwrds >= d->ntwrds ) {
|
||||
if ( d->ntwrds == 0 ) {
|
||||
d->ntwrds = 16;
|
||||
d->wrds = (TheLexeme*)malloc(sizeof(TheLexeme) * d->ntwrds);
|
||||
} else {
|
||||
d->ntwrds *= 2;
|
||||
d->wrds = (TheLexeme*)realloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
|
||||
}
|
||||
if (!d->wrds)
|
||||
elog(ERROR,"Out of memory");
|
||||
}
|
||||
|
||||
ptr = d->wrds + d->nwrds;
|
||||
d->nwrds++;
|
||||
|
||||
if ( (ptr->lexeme = malloc(e-b+1)) == NULL )
|
||||
elog(ERROR,"Out of memory");
|
||||
|
||||
memcpy(ptr->lexeme, b, e-b);
|
||||
ptr->lexeme[e-b] = '\0';
|
||||
|
||||
if ( (ptr->entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) ))==NULL )
|
||||
elog(ERROR,"Out of memory");
|
||||
|
||||
ptr->entries->nextentry=NULL;
|
||||
ptr->entries->idsubst = idsubst;
|
||||
ptr->entries->posinsubst = posinsubst;
|
||||
}
|
||||
|
||||
static void
|
||||
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
|
||||
static int nres=0;
|
||||
static int ntres = 0;
|
||||
TheSubstitute *ptr;
|
||||
|
||||
if ( nwrd == 0 ) {
|
||||
nres = ntres = 0;
|
||||
|
||||
if ( idsubst <= d->nsubst ) {
|
||||
if ( d->nsubst == 0 ) {
|
||||
d->nsubst = 16;
|
||||
d->subst = (TheSubstitute*)malloc(sizeof(TheSubstitute) * d->nsubst);
|
||||
} else {
|
||||
d->nsubst *= 2;
|
||||
d->subst = (TheSubstitute*)realloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
|
||||
}
|
||||
if (!d->subst)
|
||||
elog(ERROR,"Out of memory");
|
||||
}
|
||||
}
|
||||
|
||||
ptr = d->subst + idsubst;
|
||||
|
||||
ptr->lastlexeme = posinsubst-1;
|
||||
|
||||
if ( nres+1 >= ntres ) {
|
||||
if ( ntres == 0 ) {
|
||||
ntres = 2;
|
||||
ptr->res = (TSLexeme*)malloc( sizeof(TSLexeme) * ntres );
|
||||
} else {
|
||||
ntres *= 2;
|
||||
ptr->res = (TSLexeme*)realloc( ptr->res, sizeof(TSLexeme) * ntres );
|
||||
}
|
||||
|
||||
if ( !ptr->res )
|
||||
elog(ERROR,"Out of memory");
|
||||
}
|
||||
|
||||
if ( (ptr->res[ nres ].lexeme = malloc(e-b+1))==0 )
|
||||
elog(ERROR,"Out of memory");
|
||||
memcpy(ptr->res[ nres ].lexeme, b, e-b);
|
||||
ptr->res[ nres ].lexeme[e-b] = '\0';
|
||||
|
||||
ptr->res[ nres ].nvariant = nwrd;
|
||||
ptr->res[ nres ].flags = TSL_ADDPOS;
|
||||
|
||||
ptr->res[ ++nres ].lexeme = NULL;
|
||||
}
|
||||
|
||||
#define TR_WAITLEX 1
|
||||
#define TR_INLEX 2
|
||||
#define TR_WAITSUBS 3
|
||||
#define TR_INSUBS 4
|
||||
|
||||
static void
|
||||
thesaurusRead( char *filename, DictThesaurus *d ) {
|
||||
FILE *fh;
|
||||
char str[BUFSIZ];
|
||||
int lineno=0;
|
||||
uint16 idsubst = 0;
|
||||
|
||||
fh = fopen(to_absfilename(filename), "r");
|
||||
if (!fh)
|
||||
elog(ERROR,"Thesaurus: can't open '%s' file", filename);
|
||||
|
||||
while( fgets(str, sizeof(str), fh)) {
|
||||
char *ptr = str;
|
||||
int state = TR_WAITLEX;
|
||||
char *beginwrd = NULL;
|
||||
uint16 posinsubst=0;
|
||||
uint16 nwrd=0;
|
||||
|
||||
lineno++;
|
||||
|
||||
/* is it comment ? */
|
||||
while( t_isspace(ptr) )
|
||||
ptr += pg_mblen(ptr);
|
||||
if ( t_iseq(str, '#') || *str=='\0' || t_iseq(str, '\n') || t_iseq(str, '\r') )
|
||||
continue;
|
||||
|
||||
pg_verifymbstr(ptr, strlen(ptr), false);
|
||||
while(*ptr) {
|
||||
if ( state == TR_WAITLEX ) {
|
||||
if ( t_iseq(ptr, ':' ) ) {
|
||||
if ( posinsubst == 0 ) {
|
||||
fclose(fh);
|
||||
elog(ERROR, "Thesaurus: Unexpected delimiter at %d line", lineno);
|
||||
}
|
||||
state = TR_WAITSUBS;
|
||||
} else if ( !t_isspace(ptr) ) {
|
||||
beginwrd = ptr;
|
||||
state = TR_INLEX;
|
||||
}
|
||||
} else if ( state == TR_INLEX ) {
|
||||
if ( t_iseq(ptr, ':') ) {
|
||||
newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
|
||||
state = TR_WAITSUBS;
|
||||
} else if ( t_isspace(ptr) ) {
|
||||
newLexeme( d, beginwrd, ptr, idsubst, posinsubst++ );
|
||||
state = TR_WAITLEX;
|
||||
}
|
||||
} else if ( state == TR_WAITSUBS ) {
|
||||
if ( !t_isspace(ptr) ) {
|
||||
beginwrd = ptr;
|
||||
state = TR_INSUBS;
|
||||
}
|
||||
} else if ( state == TR_INSUBS ) {
|
||||
if ( t_isspace(ptr) ) {
|
||||
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
|
||||
state = TR_WAITSUBS;
|
||||
}
|
||||
} else
|
||||
elog(ERROR,"Thesaurus: Unknown state: %d", state);
|
||||
|
||||
ptr += pg_mblen(ptr);
|
||||
}
|
||||
|
||||
if ( state == TR_INSUBS )
|
||||
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
|
||||
|
||||
idsubst++;
|
||||
|
||||
if ( !(nwrd && posinsubst) ) {
|
||||
fclose(fh);
|
||||
elog(ERROR, "Thesaurus: Unexpected end of line at %d line", lineno);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
d->nsubst = idsubst;
|
||||
|
||||
fclose(fh);
|
||||
}
|
||||
|
||||
static TheLexeme*
|
||||
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo* src, uint16 tnvariant) {
|
||||
|
||||
if ( *nnw >= *tnm ) {
|
||||
*tnm *= 2;
|
||||
newwrds = (TheLexeme*)realloc( newwrds, sizeof(TheLexeme) * *tnm);
|
||||
if (!newwrds)
|
||||
elog(ERROR,"Out of memory");
|
||||
}
|
||||
|
||||
newwrds[ *nnw ].entries = (LexemeInfo*)malloc( sizeof(LexemeInfo) );
|
||||
if (!newwrds[ *nnw ].entries)
|
||||
elog(ERROR,"Out of memory");
|
||||
|
||||
if ( lexeme && lexeme->lexeme ) {
|
||||
newwrds[ *nnw ].lexeme = strdup( lexeme->lexeme );
|
||||
if ( !newwrds[ *nnw ].lexeme )
|
||||
elog(ERROR,"Out of memory");
|
||||
|
||||
newwrds[ *nnw ].entries->tnvariant = tnvariant;
|
||||
} else {
|
||||
newwrds[ *nnw ].lexeme = NULL;
|
||||
newwrds[ *nnw ].entries->tnvariant = 1;
|
||||
}
|
||||
|
||||
newwrds[ *nnw ].entries->idsubst = src->idsubst;
|
||||
newwrds[ *nnw ].entries->posinsubst = src->posinsubst;
|
||||
|
||||
newwrds[ *nnw ].entries->nextentry = NULL;
|
||||
|
||||
(*nnw)++;
|
||||
return newwrds;
|
||||
}
|
||||
|
||||
static int
|
||||
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b) {
|
||||
if ( a==NULL || b==NULL )
|
||||
return 0;
|
||||
|
||||
if ( a->idsubst == b->idsubst ) {
|
||||
if ( a->posinsubst == b->posinsubst ) {
|
||||
if ( a->tnvariant == b->tnvariant )
|
||||
return 0;
|
||||
|
||||
return ( a->tnvariant > b->tnvariant ) ? 1 : -1;
|
||||
}
|
||||
|
||||
return ( a->posinsubst > b->posinsubst ) ? 1 : -1;
|
||||
}
|
||||
|
||||
return ( a->idsubst > b->idsubst ) ? 1 : -1;
|
||||
}
|
||||
|
||||
static int
|
||||
cmpLexeme(TheLexeme *a, TheLexeme* b) {
|
||||
if ( a->lexeme == NULL ) {
|
||||
if ( b->lexeme == NULL )
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
} else if ( b->lexeme == NULL )
|
||||
return -1;
|
||||
|
||||
return strcmp( a->lexeme, b->lexeme );
|
||||
}
|
||||
|
||||
static int
|
||||
cmpLexemeQ(const void *a, const void *b) {
|
||||
return cmpLexeme( (TheLexeme*)a, (TheLexeme*)b );
|
||||
}
|
||||
|
||||
static int cmpTheLexeme(const void *a, const void *b) {
|
||||
TheLexeme *la = (TheLexeme*)a;
|
||||
TheLexeme *lb = (TheLexeme*)b;
|
||||
int res;
|
||||
|
||||
if ( (res=cmpLexeme(la, lb)) != 0 )
|
||||
return res;
|
||||
|
||||
return -cmpLexemeInfo(la->entries, lb->entries);
|
||||
}
|
||||
|
||||
static void
|
||||
compileTheLexeme(DictThesaurus *d) {
|
||||
int i,nnw=0, tnm=16;
|
||||
TheLexeme *newwrds = (TheLexeme*)malloc(sizeof(TheLexeme)*tnm), *ptrwrds;
|
||||
|
||||
if (!newwrds)
|
||||
elog(ERROR,"Out of memory");
|
||||
|
||||
for(i=0;i<d->nwrds;i++) {
|
||||
TSLexeme *ptr = (TSLexeme*) DatumGetPointer(
|
||||
FunctionCall4(
|
||||
&(d->subdict.lexize_info),
|
||||
PointerGetDatum(d->subdict.dictionary),
|
||||
PointerGetDatum(d->wrds[i].lexeme),
|
||||
Int32GetDatum(strlen(d->wrds[i].lexeme)),
|
||||
PointerGetDatum(NULL)
|
||||
)
|
||||
);
|
||||
|
||||
if ( !(ptr && ptr->lexeme) ) {
|
||||
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
||||
elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, assign any non-recognized word", d->wrds[i].lexeme);
|
||||
} else {
|
||||
while( ptr->lexeme ) {
|
||||
TSLexeme *remptr = ptr+1;
|
||||
int tnvar = 1;
|
||||
int curvar = ptr->nvariant;
|
||||
|
||||
/* compute n words in one variant */
|
||||
while( remptr->lexeme ) {
|
||||
if ( remptr->nvariant != (remptr-1)->nvariant )
|
||||
break;
|
||||
tnvar++;
|
||||
remptr++;
|
||||
}
|
||||
|
||||
remptr = ptr;
|
||||
while( remptr->lexeme && remptr->nvariant == curvar ) {
|
||||
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
|
||||
remptr++;
|
||||
}
|
||||
|
||||
ptr = remptr;
|
||||
}
|
||||
}
|
||||
|
||||
free( d->wrds[i].lexeme );
|
||||
free( d->wrds[i].entries );
|
||||
}
|
||||
|
||||
free( d->wrds );
|
||||
d->wrds = newwrds;
|
||||
d->nwrds = nnw;
|
||||
d->ntwrds = tnm;
|
||||
|
||||
if ( d->nwrds > 1 ) {
|
||||
qsort( d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme );
|
||||
|
||||
/* uniq */
|
||||
newwrds = d->wrds;
|
||||
ptrwrds = d->wrds + 1;
|
||||
while( ptrwrds - d->wrds < d->nwrds ) {
|
||||
if ( cmpLexeme( ptrwrds, newwrds ) == 0 ) {
|
||||
if ( cmpLexemeInfo(ptrwrds->entries, newwrds->entries) ) {
|
||||
ptrwrds->entries->nextentry = newwrds->entries;
|
||||
newwrds->entries = ptrwrds->entries;
|
||||
} else
|
||||
free( ptrwrds->entries );
|
||||
|
||||
if ( ptrwrds->lexeme )
|
||||
free( ptrwrds->lexeme );
|
||||
} else {
|
||||
newwrds++;
|
||||
*newwrds = *ptrwrds;
|
||||
}
|
||||
|
||||
ptrwrds++;
|
||||
}
|
||||
|
||||
d->nwrds = newwrds - d->wrds + 1;
|
||||
d->wrds = (TheLexeme*)realloc( d->wrds, sizeof(TheLexeme) * d->nwrds );
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
compileTheSubstitute(DictThesaurus *d) {
|
||||
int i;
|
||||
|
||||
for(i=0;i<d->nsubst;i++) {
|
||||
TSLexeme *rem = d->subst[i].res, *outptr, *inptr;
|
||||
int n=2;
|
||||
|
||||
outptr = d->subst[i].res = (TSLexeme*)malloc( sizeof(TSLexeme) * n );
|
||||
if ( d->subst[i].res == NULL )
|
||||
elog(ERROR,"Out of Memory");
|
||||
outptr->lexeme = NULL;
|
||||
inptr = rem;
|
||||
|
||||
while( inptr && inptr->lexeme ) {
|
||||
TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer(
|
||||
FunctionCall4(
|
||||
&(d->subdict.lexize_info),
|
||||
PointerGetDatum(d->subdict.dictionary),
|
||||
PointerGetDatum(inptr->lexeme),
|
||||
Int32GetDatum(strlen(inptr->lexeme)),
|
||||
PointerGetDatum(NULL)
|
||||
)
|
||||
);
|
||||
|
||||
reml = lexized;
|
||||
if ( lexized ) {
|
||||
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
|
||||
|
||||
while( lexized->lexeme ) {
|
||||
if ( outptr - d->subst[i].res + 1 >= n ) {
|
||||
int diff = outptr - d->subst[i].res;
|
||||
n *= 2;
|
||||
d->subst[i].res = (TSLexeme*)realloc( d->subst[i].res, sizeof(TSLexeme) * n );
|
||||
if ( d->subst[i].res == NULL )
|
||||
elog(ERROR,"Out of Memory");
|
||||
outptr = d->subst[i].res + diff;
|
||||
}
|
||||
|
||||
*outptr = *lexized;
|
||||
if ( (outptr->lexeme = strdup(lexized->lexeme)) == NULL )
|
||||
elog(ERROR,"Out of Memory");
|
||||
|
||||
outptr++;
|
||||
lexized++;
|
||||
}
|
||||
|
||||
if ( toset > 0)
|
||||
d->subst[i].res[toset].flags |= TSL_ADDPOS;
|
||||
}
|
||||
|
||||
if ( inptr->lexeme )
|
||||
free( inptr->lexeme );
|
||||
inptr++;
|
||||
}
|
||||
|
||||
d->subst[i].reslen = outptr - d->subst[i].res;
|
||||
|
||||
free(rem);
|
||||
}
|
||||
}
|
||||
|
||||
Datum
|
||||
thesaurus_init(PG_FUNCTION_ARGS)
|
||||
{
|
||||
DictThesaurus *d;
|
||||
Map *cfg,
|
||||
*pcfg;
|
||||
text *in, *subdictname=NULL;
|
||||
bool fileloaded = false;
|
||||
|
||||
if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("Thesaurus confguration error")));
|
||||
|
||||
d = (DictThesaurus *) malloc(sizeof(DictThesaurus));
|
||||
if (!d)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
memset(d, 0, sizeof(DictThesaurus));
|
||||
|
||||
in = PG_GETARG_TEXT_P(0);
|
||||
parse_cfgdict(in, &cfg);
|
||||
PG_FREE_IF_COPY(in, 0);
|
||||
pcfg = cfg;
|
||||
while (pcfg->key)
|
||||
{
|
||||
if (pg_strcasecmp("DictFile", pcfg->key) == 0)
|
||||
{
|
||||
if (fileloaded)
|
||||
{
|
||||
freeDictThesaurus(d);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Thesaurus file is already loaded")));
|
||||
}
|
||||
fileloaded = true;
|
||||
thesaurusRead( pcfg->value, d );
|
||||
}
|
||||
else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
|
||||
{
|
||||
if (subdictname)
|
||||
{
|
||||
freeDictThesaurus(d);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Thesaurus: SubDictionary is already defined")));
|
||||
}
|
||||
subdictname = char2text( pcfg->value );
|
||||
}
|
||||
else
|
||||
{
|
||||
freeDictThesaurus(d);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("unrecognized option: %s => %s",
|
||||
pcfg->key, pcfg->value)));
|
||||
}
|
||||
pfree(pcfg->key);
|
||||
pfree(pcfg->value);
|
||||
pcfg++;
|
||||
}
|
||||
pfree(cfg);
|
||||
|
||||
if (!fileloaded)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Thesaurus file isn't defined")));
|
||||
|
||||
if ( subdictname ) {
|
||||
DictInfo *subdictptr;
|
||||
/*
|
||||
* we already in SPI, but name2id_dict()/finddict()
|
||||
* invoke SPI_connect()
|
||||
*/
|
||||
SPI_push();
|
||||
|
||||
subdictptr = finddict( name2id_dict( subdictname ) );
|
||||
|
||||
SPI_pop();
|
||||
|
||||
d->subdict = *subdictptr;
|
||||
} else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("Thesaurus: SubDictionary isn't defined")));
|
||||
|
||||
compileTheLexeme( d );
|
||||
compileTheSubstitute(d);
|
||||
|
||||
PG_RETURN_POINTER(d);
|
||||
}
|
||||
|
||||
static LexemeInfo*
|
||||
findTheLexeme(DictThesaurus *d, char * lexeme) {
|
||||
TheLexeme key = { lexeme, NULL }, *res;
|
||||
|
||||
if ( d->nwrds == 0 )
|
||||
return NULL;
|
||||
|
||||
res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
|
||||
|
||||
if ( res == NULL )
|
||||
return NULL;
|
||||
return res->entries;
|
||||
}
|
||||
|
||||
static bool
|
||||
matchIdSubst(LexemeInfo *stored, uint16 idsubst) {
|
||||
bool res = true;
|
||||
|
||||
if (stored) {
|
||||
res = false;
|
||||
|
||||
for(; stored; stored=stored->nextvariant)
|
||||
if ( stored->idsubst == idsubst ) {
|
||||
res = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static LexemeInfo*
|
||||
findVariant( LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn) {
|
||||
for(;;) {
|
||||
int i;
|
||||
LexemeInfo *ptr = newin[0];
|
||||
|
||||
for(i=0; i<newn; i++) {
|
||||
while(newin[i] && newin[i]->idsubst < ptr->idsubst)
|
||||
newin[i] = newin[i]->nextentry;
|
||||
|
||||
if ( newin[i] == NULL )
|
||||
return in;
|
||||
|
||||
if ( newin[i]->idsubst > ptr->idsubst ) {
|
||||
ptr = newin[i];
|
||||
i=-1;
|
||||
continue;
|
||||
}
|
||||
|
||||
while(newin[i]->idsubst == ptr->idsubst) {
|
||||
if ( newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn ) {
|
||||
ptr = newin[i];
|
||||
break;
|
||||
}
|
||||
|
||||
newin[i] = newin[i]->nextentry;
|
||||
if ( newin[i] == NULL )
|
||||
return in;
|
||||
}
|
||||
|
||||
if ( newin[i]->idsubst != ptr->idsubst ) {
|
||||
ptr = newin[i];
|
||||
i=-1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if ( i==newn && matchIdSubst(stored, ptr->idsubst) && (in==NULL || !matchIdSubst(in, ptr->idsubst)) ) { /* found */
|
||||
|
||||
ptr->nextvariant = in;
|
||||
in = ptr;
|
||||
}
|
||||
|
||||
/* step forward */
|
||||
for(i=0; i<newn; i++)
|
||||
newin[i] = newin[i]->nextentry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static TSLexeme*
|
||||
copyTSLexeme( TheSubstitute *ts ) {
|
||||
TSLexeme *res;
|
||||
uint16 i;
|
||||
|
||||
res = (TSLexeme*)palloc( sizeof(TSLexeme) * (ts->reslen+1) );
|
||||
for(i=0;i<ts->reslen;i++) {
|
||||
res[i] = ts->res[i];
|
||||
res[i].lexeme = pstrdup( ts->res[i].lexeme );
|
||||
}
|
||||
|
||||
res[ts->reslen].lexeme = NULL;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static TSLexeme*
|
||||
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres) {
|
||||
*moreres = false;
|
||||
while(info) {
|
||||
Assert( info->idsubst < d->nsubst );
|
||||
if ( info->nextvariant )
|
||||
*moreres = true;
|
||||
if ( d->subst[ info->idsubst ].lastlexeme == curpos )
|
||||
return copyTSLexeme( d->subst + info->idsubst );
|
||||
info = info->nextvariant;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Datum
|
||||
thesaurus_lexize(PG_FUNCTION_ARGS)
|
||||
{
|
||||
DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
|
||||
DictSubState *dstate = (DictSubState*)PG_GETARG_POINTER(3);
|
||||
TSLexeme *res=NULL;
|
||||
LexemeInfo *stored, *info = NULL;
|
||||
uint16 curpos = 0;
|
||||
bool moreres = false;
|
||||
|
||||
if ( dstate == NULL || PG_NARGS() < 4 )
|
||||
elog(ERROR,"Forbidden call of thesaurus or nested call");
|
||||
|
||||
if ( dstate->isend )
|
||||
PG_RETURN_POINTER(NULL);
|
||||
stored = (LexemeInfo*) dstate->private;
|
||||
|
||||
if (stored)
|
||||
curpos = stored->posinsubst+1;
|
||||
|
||||
res =(TSLexeme*) DatumGetPointer (
|
||||
FunctionCall4(
|
||||
&(d->subdict.lexize_info),
|
||||
PointerGetDatum(d->subdict.dictionary),
|
||||
PG_GETARG_DATUM(1),
|
||||
PG_GETARG_INT32(2),
|
||||
PointerGetDatum(NULL)
|
||||
)
|
||||
);
|
||||
|
||||
if ( res && res->lexeme ) {
|
||||
TSLexeme *ptr = res , *basevar;
|
||||
|
||||
while( ptr->lexeme ) {
|
||||
uint16 nv = ptr->nvariant;
|
||||
uint16 i,nlex = 0;
|
||||
LexemeInfo **infos;
|
||||
|
||||
basevar = ptr;
|
||||
while( ptr->lexeme && nv == ptr->nvariant ) {
|
||||
nlex++;
|
||||
ptr++;
|
||||
}
|
||||
|
||||
infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
|
||||
for(i=0;i<nlex;i++)
|
||||
if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
|
||||
break;
|
||||
|
||||
if ( i<nlex ) {
|
||||
/* no chance to find */
|
||||
pfree( infos );
|
||||
continue;
|
||||
}
|
||||
|
||||
info = findVariant( info, stored, curpos, infos, nlex);
|
||||
}
|
||||
|
||||
} else {
|
||||
LexemeInfo *infos = findTheLexeme(d, NULL);
|
||||
info = findVariant( NULL, stored, curpos, &infos, 1);
|
||||
}
|
||||
|
||||
dstate->private = (void*)info;
|
||||
|
||||
if ( !info ) {
|
||||
dstate->getnext = false;
|
||||
PG_RETURN_POINTER(NULL);
|
||||
}
|
||||
|
||||
if ( (res=checkMatch(d, info, curpos,&moreres)) != NULL ) {
|
||||
dstate->getnext = moreres;
|
||||
PG_RETURN_POINTER(res);
|
||||
}
|
||||
|
||||
dstate->getnext = true;
|
||||
|
||||
PG_RETURN_POINTER(NULL);
|
||||
}
|
@ -4,21 +4,21 @@
--
\set ECHO none
psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
psql:tsearch2.sql:158: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:257: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:264: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:370: NOTICE: type "tsvector" is not yet defined
psql:tsearch2.sql:177: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:276: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:283: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:389: NOTICE: type "tsvector" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:375: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:429: NOTICE: type "tsquery" is not yet defined
psql:tsearch2.sql:394: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:448: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:434: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:592: NOTICE: type "gtsvector" is not yet defined
psql:tsearch2.sql:453: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:611: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:597: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1087: NOTICE: type "gtsq" is not yet defined
psql:tsearch2.sql:616: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1106: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:1092: NOTICE: argument type gtsq is only a shell
psql:tsearch2.sql:1111: NOTICE: argument type gtsq is only a shell
--tsvector
SELECT '1'::tsvector;
 tsvector
@ -4,8 +4,6 @@
 */
#include "postgres.h"

#include "miscadmin.h"

#include "common.h"
#include "dict.h"
#include "ts_locale.h"
@ -36,30 +34,11 @@ readstoplist(text *in, StopList * s)
	s->len = 0;
	if (in && VARSIZE(in) - VARHDRSZ > 0)
	{
		char *filename = text2char(in);
		char *filename = to_absfilename(text2char(in));
		FILE *hin;
		char buf[STOPBUFLEN];
		int reallen = 0;

		/* if path is relative, take it as relative to share dir */
		if (!is_absolute_path(filename))
		{
			char sharepath[MAXPGPATH];
			char *absfn;
#ifdef WIN32
			char delim = '\\';
#else
			char delim = '/';
#endif

			get_share_path(my_exec_path, sharepath);
			absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
			sprintf(absfn, "%s%c%s", sharepath, delim, filename);

			pfree(filename);
			filename = absfn;
		}

		if ((hin = fopen(filename, "r")) == NULL)
			ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),

contrib/tsearch2/thesaurus (new file, 19 lines)
@ -0,0 +1,19 @@
#
# Theasurus config file. Character ':' splits
# string to part:
#      to be substituted string
#      substituting string
#

#one two three : 123
#one two : 12
#one : 1
#two : 2

#foo bar : blah blah
#f bar : fbar
#e bar : ebar
#g bar bar : gbarbar
#asd:sdffff
#qwerty:qwer wert erty
@ -281,15 +281,15 @@ name2id_cfg(text *name)
	return id;
}


void
parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
{
	int type,
		lenlemm,
		i;
		lenlemm;
	char *lemm = NULL;
	WParserInfo *prsobj = findprs(cfg->prs_id);
	LexizeData ldata;
	TSLexeme *norms;

	prsobj->prs = (void *) DatumGetPointer(
		FunctionCall2(
@ -299,14 +299,16 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
		)
	);

	while ((type = DatumGetInt32(FunctionCall3(
	LexizeInit(&ldata, cfg);

	do {
		type = DatumGetInt32(FunctionCall3(
			&(prsobj->getlexeme_info),
			PointerGetDatum(prsobj->prs),
			PointerGetDatum(&lemm),
			PointerGetDatum(&lenlemm)))) != 0)
	{
			PointerGetDatum(&lenlemm)));

		if (lenlemm >= MAXSTRLEN)
		if (type>0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
@ -320,25 +322,11 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
#endif
		}

		if (type >= cfg->len) /* skip this type of lexeme */
			continue;
		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		for (i = 0; i < cfg->map[type].len; i++)
		while( (norms = LexizeExec(&ldata, NULL)) != NULL )
		{
			DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
			TSLexeme *norms,
					 *ptr;

			norms = ptr = (TSLexeme *) DatumGetPointer(
				FunctionCall3(
					&(dict->lexize_info),
					PointerGetDatum(dict->dictionary),
					PointerGetDatum(lemm),
					PointerGetDatum(lenlemm)
				)
			);
			if (!norms) /* dictionary doesn't know this lexeme */
				continue;
			TSLexeme *ptr = norms;

			prs->pos++; /* set pos */

@ -350,6 +338,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
				prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
			}

			if ( ptr->flags & TSL_ADDPOS )
				prs->pos++;
			prs->words[prs->curwords].len = strlen(ptr->lexeme);
			prs->words[prs->curwords].word = ptr->lexeme;
			prs->words[prs->curwords].nvariant = ptr->nvariant;
@ -359,9 +349,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
				prs->curwords++;
			}
			pfree(norms);
			break; /* lexeme already normalized or is stop word */
		}
	}
	} while(type>0);

	FunctionCall1(
		&(prsobj->end_info),
@ -417,14 +406,47 @@ hlfinditem(HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int buflen)
	}
}

static void
addHLParsedLex(HLPRSTEXT *prs, QUERYTYPE * query, ParsedLex *lexs, TSLexeme *norms) {
	ParsedLex *tmplexs;
	TSLexeme *ptr;

	while( lexs ) {

		if ( lexs->type > 0 )
			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

		ptr = norms;
		while( ptr && ptr->lexeme ) {
			hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
			ptr++;
		}

		tmplexs = lexs->next;
		pfree( lexs );
		lexs = tmplexs;
	}

	if ( norms ) {
		ptr = norms;
		while( ptr->lexeme ) {
			pfree( ptr->lexeme );
			ptr++;
		}
		pfree(norms);
	}
}

void
hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
{
	int type,
		lenlemm,
		i;
		lenlemm;
	char *lemm = NULL;
	WParserInfo *prsobj = findprs(cfg->prs_id);
	LexizeData ldata;
	TSLexeme *norms;
	ParsedLex *lexs;

	prsobj->prs = (void *) DatumGetPointer(
		FunctionCall2(
@ -434,14 +456,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
		)
	);

	while ((type = DatumGetInt32(FunctionCall3(
	LexizeInit(&ldata, cfg);

	do {
		type = DatumGetInt32(FunctionCall3(
			&(prsobj->getlexeme_info),
			PointerGetDatum(prsobj->prs),
			PointerGetDatum(&lemm),
			PointerGetDatum(&lenlemm)))) != 0)
	{
			PointerGetDatum(&lenlemm)));

		if (lenlemm >= MAXSTRLEN)
		if (type>0 && lenlemm >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
@ -455,38 +479,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
#endif
		}

		hladdword(prs, lemm, lenlemm, type);
		LexizeAddLemm(&ldata, type, lemm, lenlemm);

		if (type >= cfg->len)
			continue;
		do {
			if ( (norms = LexizeExec(&ldata,&lexs)) != NULL )
				addHLParsedLex(prs, query, lexs, norms);
			else
				addHLParsedLex(prs, query, lexs, NULL);
		} while( norms );

		for (i = 0; i < cfg->map[type].len; i++)
		{
			DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
			TSLexeme *norms,
					 *ptr;

			norms = ptr = (TSLexeme *) DatumGetPointer(
				FunctionCall3(
					&(dict->lexize_info),
					PointerGetDatum(dict->dictionary),
					PointerGetDatum(lemm),
					PointerGetDatum(lenlemm)
				)
			);
			if (!norms) /* dictionary doesn't know this lexeme */
				continue;

			while (ptr->lexeme)
			{
				hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
				pfree(ptr->lexeme);
				ptr++;
			}
			pfree(norms);
			break; /* lexeme already normalized or is stop word */
		}
	}
	} while( type>0 );

	FunctionCall1(
		&(prsobj->end_info),
contrib/tsearch2/ts_lexize.c (new file, 261 lines)
@ -0,0 +1,261 @@
/*
 * lexize stream of lexemes
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include "postgres.h"

#include <ctype.h>
#include <locale.h>

#include "ts_cfg.h"
#include "dict.h"

void
LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
	ld->cfg = cfg;
	ld->curDictId = InvalidOid;
	ld->posDict = 0;
	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
	ld->waste.head = ld->waste.tail = NULL;
	ld->lastRes=NULL;
	ld->tmpRes=NULL;
}

static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
	if ( list->tail ) {
		list->tail->next = newpl;
		list->tail = newpl;
	} else
		list->head = list->tail = newpl;
	newpl->next = NULL;
}

static ParsedLex*
LPLRemoveHead(ListParsedLex *list) {
	ParsedLex *res = list->head;

	if ( list->head )
		list->head = list->head->next;

	if ( list->head == NULL )
		list->tail = NULL;

	return res;
}

void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
	ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );

	newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
	newpl->type = type;
	newpl->lemm = lemm;
	newpl->lenlemm = lenlemm;
	LPLAddTail(&ld->towork, newpl);
	ld->curSub = ld->towork.tail;
}

static void
RemoveHead(LexizeData *ld) {
	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

	ld->posDict = 0;
}

static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
	if ( correspondLexem ) {
		*correspondLexem = ld->waste.head;
	} else {
		ParsedLex *tmp, *ptr = ld->waste.head;

		while(ptr) {
			tmp = ptr->next;
			pfree(ptr);
			ptr = tmp;
		}
	}
	ld->waste.head = ld->waste.tail = NULL;
}

static void
moveToWaste(LexizeData *ld, ParsedLex *stop) {
	bool go = true;

	while( ld->towork.head && go) {
		if (ld->towork.head == stop) {
			ld->curSub = stop->next;
			go = false;
		}
		RemoveHead(ld);
	}
}

static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
	if ( ld->tmpRes ) {
		TSLexeme *ptr;
		for( ptr=ld->tmpRes; ptr->lexeme; ptr++ )
			pfree( ptr->lexeme );
		pfree( ld->tmpRes );
	}
	ld->tmpRes = res;
	ld->lastRes = lex;
}

TSLexeme*
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
	int i;
	ListDictionary *map;
	DictInfo *dict;
	TSLexeme *res;

	if ( ld->curDictId == InvalidOid ) {
		/*
		 * usial mode: dictionary wants only one word,
		 * but we should keep in mind that we should go through
		 * all stack
		 */

		while( ld->towork.head ) {
			ParsedLex *curVal = ld->towork.head;

			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			for (i = ld->posDict; i < map->len; i++) {
				dict = finddict(DatumGetObjectId(map->dict_id[i]));

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private = NULL;
				res = (TSLexeme *) DatumGetPointer( FunctionCall4(
					&(dict->lexize_info),
					PointerGetDatum(dict->dictionary),
					PointerGetDatum(curVal->lemm),
					Int32GetDatum(curVal->lenlemm),
					PointerGetDatum(&ld->dictState)
				));

				if ( ld->dictState.getnext ) {
					/*
					 * dictinary wants next word, so setup and store
					 * current position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dict_id[i]);
					ld->posDict = i+1;
					ld->curSub = curVal->next;
					if ( res )
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res) /* dictionary doesn't know this lexeme */
					continue;

				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			RemoveHead(ld);
		}
	} else { /* curDictId is valid */
		dict = finddict(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while( ld->curSub ) {
			ParsedLex *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0) {
				bool dictExists = false;

				if (curVal->type >= ld->cfg->len || map->len == 0 ) {
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that current type of lexeme is recognized by
				 * our dictinonary: we just check is it exist in
				 * list of dictionaries ?
				 */
				for(i=0;i < map->len && !dictExists; i++)
					if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
						dictExists = true;

				if ( !dictExists ) {
					/*
					 * Dictionary can't work with current tpe of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			ld->dictState.isend = (curVal->type==0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer( FunctionCall4(
				&(dict->lexize_info),
				PointerGetDatum(dict->dictionary),
				PointerGetDatum(curVal->lemm),
				Int32GetDatum(curVal->lenlemm),
				PointerGetDatum(&ld->dictState)
			));

			if ( ld->dictState.getnext ) {
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if ( res )
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if ( res || ld->tmpRes ) {
				/*
				 * Dictionary normalizes lexemes,
				 * so we remove from stack all used lexemes ,
				 * return to basic mode and redo end of stack (if it exists)
				 */
				if ( res ) {
					moveToWaste( ld, ld->curSub );
				} else {
					res = ld->tmpRes;
					moveToWaste( ld, ld->lastRes );
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* Dict don't want next lexem and didn't recognize anything,
			   redo from ld->towork.head */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	setCorrLex(ld, correspondLexem);
	return NULL;
}
@ -146,6 +146,25 @@ insert into pg_ts_dict select
	'Example of synonym dictionary'
;

CREATE FUNCTION thesaurus_init(internal)
	RETURNS internal
	as 'MODULE_PATHNAME'
	LANGUAGE C;

CREATE FUNCTION thesaurus_lexize(internal,internal,int4,internal)
	RETURNS internal
	as 'MODULE_PATHNAME'
	LANGUAGE C
	RETURNS NULL ON NULL INPUT;

insert into pg_ts_dict select
	'thesaurus_template',
	'thesaurus_init(internal)',
	null,
	'thesaurus_lexize(internal,internal,int4,internal)',
	'Thesaurus template, must be pointed Dictionary and DictFile'
;

--dict conf
CREATE TABLE pg_ts_parser (
	prs_name	text not null primary key,
@ -1193,7 +1212,11 @@ AS

--example of ISpell dictionary
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
--example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;

--example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_name='synonym';

--example of thesaurus dict
--update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template';
--update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}';
END;
@ -41,6 +41,8 @@ DROP FUNCTION snb_lexize(internal,internal,int4);
DROP FUNCTION snb_ru_init(internal);
DROP FUNCTION spell_init(internal);
DROP FUNCTION spell_lexize(internal,internal,int4);
DROP FUNCTION thesaurus_init(internal);
DROP FUNCTION thesaurus_lexize(internal,internal,int4);
DROP FUNCTION syn_init(internal);
DROP FUNCTION syn_lexize(internal,internal,int4);
DROP FUNCTION set_curprs(int);