This required some changes to the lexize algorithm, but the interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
262 lines
5.8 KiB
262 lines
5.8 KiB
* lexize stream of lexemes
* Teodor Sigaev <teodor@sigaev.ru>
#include "postgres.h"
#include <ctype.h>
#include <locale.h>
#include "ts_cfg.h"
#include "dict.h"
/*
 * LexizeInit: put a LexizeData into its initial, empty state.
 *
 * ld  - state to initialize
 * cfg - configuration stored in ld->cfg; LexizeExec() reads its map and len
 *
 * After this call both lexeme lists (towork, waste) are empty, no
 * dictionary is selected (curDictId == InvalidOid) and no pending
 * dictionary result is stored.
 */
void
LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
	ld->cfg = cfg;
	ld->curDictId = InvalidOid;
	ld->posDict = 0;
	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
	ld->waste.head = ld->waste.tail = NULL;

	/*
	 * setNewTmpRes() and LexizeExec() test tmpRes before anything else
	 * assigns it, so the pending-result fields must start out as NULL.
	 */
	ld->lastRes = NULL;
	ld->tmpRes = NULL;
}
/*
 * LPLAddTail: append newpl to the end of list.
 *
 * newpl becomes the new tail (and also the head when the list was empty);
 * its next pointer is cleared.
 */
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
	ParsedLex *oldtail = list->tail;

	if ( oldtail == NULL )
		list->head = newpl;			/* first element of an empty list */
	else
		oldtail->next = newpl;		/* chain after the previous tail */

	list->tail = newpl;
	newpl->next = NULL;
}
/*
 * LPLRemoveHead: detach and return the first element of list.
 *
 * Returns NULL when the list is empty.  The tail pointer is cleared once
 * the list has no elements left.  The next pointer of the returned
 * element is left untouched.
 */
static ParsedLex*
LPLRemoveHead(ListParsedLex *list) {
	ParsedLex *first = list->head;

	if ( first != NULL ) {
		list->head = first->next;
	}

	if ( !list->head ) {
		list->tail = NULL;
	}

	return first;
}
/*
 * LexizeAddLemm: queue one parsed lexeme at the tail of ld->towork.
 *
 * ld      - lexize state that receives the lexeme
 * type    - lexeme type stored in the new node
 * lemm    - lexeme text; only the pointer is stored, the text is not copied
 * lenlemm - length stored alongside lemm
 *
 * The freshly added node also becomes ld->curSub.
 */
void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
	/* exactly one node is allocated per queued lexeme */
	ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );

	newpl->type = type;
	newpl->lemm = lemm;
	newpl->lenlemm = lenlemm;
	LPLAddTail(&ld->towork, newpl);
	ld->curSub = ld->towork.tail;
}
/*
 * RemoveHead: take the first lexeme off ld->towork, append it to
 * ld->waste and restart the dictionary position (posDict) at 0.
 */
static void
RemoveHead(LexizeData *ld) {
	ParsedLex *done = LPLRemoveHead(&ld->towork);

	LPLAddTail(&ld->waste, done);
	ld->posDict = 0;
}
/*
 * setCorrLex: dispose of the lexemes collected in ld->waste.
 *
 * ld              - lexize state owning the waste list
 * correspondLexem - when non-NULL, receives the head of the waste list and
 *                   the nodes are handed over to the caller; when NULL,
 *                   every node of the waste list is released here
 *
 * In both cases ld->waste is left empty.
 */
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
	if ( correspondLexem ) {
		*correspondLexem = ld->waste.head;
	} else {
		ParsedLex *tmp, *ptr = ld->waste.head;

		while(ptr) {
			/* fetch the successor first: ptr must not be read after pfree */
			tmp = ptr->next;
			pfree(ptr);
			ptr = tmp;
		}
	}
	ld->waste.head = ld->waste.tail = NULL;
}
/*
 * moveToWaste: move lexemes from the head of ld->towork to ld->waste,
 * up to and including stop.
 *
 * ld   - lexize state
 * stop - last lexeme to move; when it is reached ld->curSub is set to the
 *        lexeme that followed it.  If stop is never met, the whole
 *        towork list is moved.
 */
static void
moveToWaste(LexizeData *ld, ParsedLex *stop) {
	bool go = true;

	while( ld->towork.head && go) {
		if (ld->towork.head == stop) {
			/*
			 * Read stop->next before the node is moved: appending it to
			 * the waste list clears its next pointer.
			 */
			ld->curSub = stop->next;
			go = false;
		}
		/* each iteration must consume the head, otherwise the loop cannot end */
		RemoveHead(ld);
	}
}
/*
 * setNewTmpRes: replace the pending dictionary result stored in ld.
 *
 * Any previously stored result is released first: each lexeme string of
 * the array (the array ends at the entry whose lexeme is NULL) and then
 * the array itself.  Afterwards res is remembered in ld->tmpRes and lex
 * in ld->lastRes.
 */
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
	TSLexeme *stale = ld->tmpRes;

	if ( stale != NULL ) {
		TSLexeme *cur = stale;

		while ( cur->lexeme ) {
			pfree( cur->lexeme );
			cur++;
		}
		pfree( stale );
	}

	ld->tmpRes = res;
	ld->lastRes = lex;
}
/*
 * LexizeExec: feed the lexemes queued in ld->towork to the configured
 * dictionaries and return the result produced by a dictionary, or NULL
 * when the end of the function is reached without a result.
 *
 * ld              - lexize state (queues, current dictionary, pending result)
 * correspondLexem - passed through to setCorrLex() whenever a result (or
 *                   NULL) is about to be returned
 *
 * Two modes are visible:
 *   - basic mode (ld->curDictId == InvalidOid): the head of towork is
 *     offered to the dictionaries mapped to its type, starting at
 *     ld->posDict.  If a dictionary sets dictState.getnext, its id and the
 *     following position are remembered, any result is stashed with
 *     setNewTmpRes() and the function re-enters itself in multiword mode.
 *   - multiword mode (curDictId valid): the lexemes starting at ld->curSub
 *     are offered to that one dictionary.  When it produces a result, or a
 *     stashed result exists, the used lexemes are moved to the waste list,
 *     the state is reset and the result is returned.  Otherwise the
 *     function falls back to basic mode by re-entering itself.
 *
 * NOTE(review): this listing looks truncated.  The return type line, the
 * FunctionCall4 argument lists, several closing braces and the comment
 * delimiters around the starred text lines are not visible here, so the
 * exact nesting of the statements below should be confirmed against the
 * complete file before relying on this description.
 */
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
int i;
ListDictionary *map;
DictInfo *dict;
TSLexeme *res;
/* basic mode: no dictionary is currently asking for more words */
if ( ld->curDictId == InvalidOid ) {
* usial mode: dictionary wants only one word,
* but we should keep in mind that we should go through
* all stack
while( ld->towork.head ) {
ParsedLex *curVal = ld->towork.head;
/* list of dictionaries configured for the type of this lexeme */
map = ld->cfg->map + curVal->type;
if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {
/* skip this type of lexeme */
/* resume at posDict, which is set below to the position after the dictionary that asked for more */
for (i = ld->posDict; i < map->len; i++) {
dict = finddict(DatumGetObjectId(map->dict_id[i]));
ld->dictState.isend = ld->dictState.getnext = false;
ld->dictState.private = NULL;
res = (TSLexeme *) DatumGetPointer( FunctionCall4(
if ( ld->dictState.getnext ) {
* dictinary wants next word, so setup and store
* current position and go to multiword mode
ld->curDictId = DatumGetObjectId(map->dict_id[i]);
ld->posDict = i+1;
ld->curSub = curVal->next;
if ( res )
setNewTmpRes(ld, curVal, res);
return LexizeExec(ld, correspondLexem);
if (!res) /* dictionary doesn't know this lexeme */
setCorrLex(ld, correspondLexem);
return res;
} else { /* curDictId is valid */
dict = finddict(ld->curDictId);
* Dictionary ld->curDictId asks us about following words
while( ld->curSub ) {
ParsedLex *curVal = ld->curSub;
map = ld->cfg->map + curVal->type;
if (curVal->type != 0) {
bool dictExists = false;
if (curVal->type >= ld->cfg->len || map->len == 0 ) {
/* skip this type of lexeme */
ld->curSub = curVal->next;
* We should be sure that current type of lexeme is recognized by
* our dictinonary: we just check is it exist in
* list of dictionaries ?
for(i=0;i < map->len && !dictExists; i++)
if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
dictExists = true;
if ( !dictExists ) {
* Dictionary can't work with current tpe of lexeme,
* return to basic mode and redo all stored lexemes
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
ld->dictState.isend = (curVal->type==0) ? true : false;
ld->dictState.getnext = false;
res = (TSLexeme *) DatumGetPointer( FunctionCall4(
if ( ld->dictState.getnext ) {
/* Dictionary wants one more */
ld->curSub = curVal->next;
if ( res )
setNewTmpRes(ld, curVal, res);
if ( res || ld->tmpRes ) {
* Dictionary normalizes lexemes,
* so we remove from stack all used lexemes ,
* return to basic mode and redo end of stack (if it exists)
if ( res ) {
moveToWaste( ld, ld->curSub );
} else {
res = ld->tmpRes;
moveToWaste( ld, ld->lastRes );
/* reset to initial state */
ld->curDictId = InvalidOid;
ld->posDict = 0;
ld->lastRes = NULL;
ld->tmpRes = NULL;
setCorrLex(ld, correspondLexem);
return res;
/* Dict don't want next lexem and didn't recognize anything,
redo from ld->towork.head */
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
setCorrLex(ld, correspondLexem);
return NULL;