360 lines
7.5 KiB
C
360 lines
7.5 KiB
C
/*
|
|
* default word parser
|
|
* Teodor Sigaev <teodor@sigaev.ru>
|
|
*/
|
|
#include <errno.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "postgres.h"
|
|
#include "utils/builtins.h"
|
|
|
|
#include "dict.h"
|
|
#include "wparser.h"
|
|
#include "common.h"
|
|
#include "ts_cfg.h"
|
|
#include "wordparser/parser.h"
|
|
#include "wordparser/deflex.h"
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_lextype);
|
|
Datum prsd_lextype(PG_FUNCTION_ARGS);
|
|
|
|
Datum
|
|
prsd_lextype(PG_FUNCTION_ARGS)
|
|
{
|
|
LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
|
|
int i;
|
|
|
|
for (i = 1; i <= LASTNUM; i++)
|
|
{
|
|
descr[i - 1].lexid = i;
|
|
descr[i - 1].alias = pstrdup(tok_alias[i]);
|
|
descr[i - 1].descr = pstrdup(lex_descr[i]);
|
|
}
|
|
|
|
descr[LASTNUM].lexid = 0;
|
|
|
|
PG_RETURN_POINTER(descr);
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_start);
|
|
Datum prsd_start(PG_FUNCTION_ARGS);
|
|
Datum
|
|
prsd_start(PG_FUNCTION_ARGS)
|
|
{
|
|
tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
|
|
PG_RETURN_POINTER(NULL);
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_getlexeme);
|
|
Datum prsd_getlexeme(PG_FUNCTION_ARGS);
|
|
Datum
|
|
prsd_getlexeme(PG_FUNCTION_ARGS)
|
|
{
|
|
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
|
|
char **t = (char **) PG_GETARG_POINTER(1);
|
|
int *tlen = (int *) PG_GETARG_POINTER(2);
|
|
int type = tsearch2_yylex();
|
|
|
|
*t = token;
|
|
*tlen = tokenlen;
|
|
PG_RETURN_INT32(type);
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_end);
|
|
Datum prsd_end(PG_FUNCTION_ARGS);
|
|
Datum
|
|
prsd_end(PG_FUNCTION_ARGS)
|
|
{
|
|
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
|
|
tsearch2_end_parse();
|
|
PG_RETURN_VOID();
|
|
}
|
|
|
|
#define LEAVETOKEN(x) ( (x)==12 )
|
|
#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
|
|
#define ENDPUNCTOKEN(x) ( (x)==12 )
|
|
|
|
|
|
#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
|
|
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
|
|
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
|
|
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
|
|
|
|
typedef struct
|
|
{
|
|
HLWORD *words;
|
|
int len;
|
|
} hlCheck;
|
|
|
|
static bool
|
|
checkcondition_HL(void *checkval, ITEM * val)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < ((hlCheck *) checkval)->len; i++)
|
|
{
|
|
if (((hlCheck *) checkval)->words[i].item == val)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
static bool
|
|
hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
|
|
{
|
|
int i,
|
|
j;
|
|
ITEM *item = GETQUERY(query);
|
|
int pos = *p;
|
|
|
|
*q = 0;
|
|
*p = 0x7fffffff;
|
|
|
|
for (j = 0; j < query->size; j++)
|
|
{
|
|
if (item->type != VAL)
|
|
{
|
|
item++;
|
|
continue;
|
|
}
|
|
for (i = pos; i < prs->curwords; i++)
|
|
{
|
|
if (prs->words[i].item == item)
|
|
{
|
|
if (i > *q)
|
|
*q = i;
|
|
break;
|
|
}
|
|
}
|
|
item++;
|
|
}
|
|
|
|
if (*q == 0)
|
|
return false;
|
|
|
|
item = GETQUERY(query);
|
|
for (j = 0; j < query->size; j++)
|
|
{
|
|
if (item->type != VAL)
|
|
{
|
|
item++;
|
|
continue;
|
|
}
|
|
for (i = *q; i >= pos; i--)
|
|
{
|
|
if (prs->words[i].item == item)
|
|
{
|
|
if (i < *p)
|
|
*p = i;
|
|
break;
|
|
}
|
|
}
|
|
item++;
|
|
}
|
|
|
|
if (*p <= *q)
|
|
{
|
|
hlCheck ch;
|
|
|
|
ch.words = &(prs->words[*p]);
|
|
ch.len = *q - *p + 1;
|
|
if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
|
|
return true;
|
|
else
|
|
{
|
|
(*p)++;
|
|
return hlCover(prs, query, p, q);
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_headline);
|
|
Datum prsd_headline(PG_FUNCTION_ARGS);
|
|
Datum
|
|
prsd_headline(PG_FUNCTION_ARGS)
|
|
{
|
|
HLPRSTEXT *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
|
|
text *opt = (text *) PG_GETARG_POINTER(1); /* can't be toasted */
|
|
QUERYTYPE *query = (QUERYTYPE *) PG_GETARG_POINTER(2); /* can't be toasted */
|
|
|
|
/* from opt + start and and tag */
|
|
int min_words = 15;
|
|
int max_words = 35;
|
|
int shortword = 3;
|
|
|
|
int p = 0,
|
|
q = 0;
|
|
int bestb = -1,
|
|
beste = -1;
|
|
int bestlen = -1;
|
|
int pose = 0, posb,
|
|
poslen,
|
|
curlen;
|
|
|
|
int i;
|
|
|
|
/* config */
|
|
prs->startsel = NULL;
|
|
prs->stopsel = NULL;
|
|
if (opt)
|
|
{
|
|
Map *map,
|
|
*mptr;
|
|
|
|
parse_cfgdict(opt, &map);
|
|
mptr = map;
|
|
|
|
while (mptr && mptr->key)
|
|
{
|
|
if (strcasecmp(mptr->key, "MaxWords") == 0)
|
|
max_words = pg_atoi(mptr->value, 4, 1);
|
|
else if (strcasecmp(mptr->key, "MinWords") == 0)
|
|
min_words = pg_atoi(mptr->value, 4, 1);
|
|
else if (strcasecmp(mptr->key, "ShortWord") == 0)
|
|
shortword = pg_atoi(mptr->value, 4, 1);
|
|
else if (strcasecmp(mptr->key, "StartSel") == 0)
|
|
prs->startsel = pstrdup(mptr->value);
|
|
else if (strcasecmp(mptr->key, "StopSel") == 0)
|
|
prs->stopsel = pstrdup(mptr->value);
|
|
|
|
pfree(mptr->key);
|
|
pfree(mptr->value);
|
|
|
|
mptr++;
|
|
}
|
|
pfree(map);
|
|
|
|
if (min_words >= max_words)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("MinWords should be less than MaxWords")));
|
|
if (min_words <= 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("MinWords should be positive")));
|
|
if (shortword < 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("ShortWord should be >= 0")));
|
|
}
|
|
|
|
while (hlCover(prs, query, &p, &q))
|
|
{
|
|
/* find cover len in words */
|
|
curlen = 0;
|
|
poslen = 0;
|
|
for (i = p; i <= q && curlen < max_words; i++)
|
|
{
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
curlen++;
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
poslen++;
|
|
pose = i;
|
|
}
|
|
|
|
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
|
|
{
|
|
/* best already finded, so try one more cover */
|
|
p++;
|
|
continue;
|
|
}
|
|
|
|
posb=p;
|
|
if (curlen < max_words)
|
|
{ /* find good end */
|
|
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
|
|
{
|
|
if (i != q)
|
|
{
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
curlen++;
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
poslen++;
|
|
}
|
|
pose = i;
|
|
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
|
continue;
|
|
if (curlen >= min_words)
|
|
break;
|
|
}
|
|
if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
|
|
for(i=p; i>= 0; i--) {
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
curlen++;
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
poslen++;
|
|
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
|
continue;
|
|
if (curlen >= min_words)
|
|
break;
|
|
}
|
|
posb=(i>=0) ? i : 0;
|
|
}
|
|
}
|
|
else
|
|
{ /* shorter cover :((( */
|
|
for (; curlen > min_words; i--)
|
|
{
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
curlen--;
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
poslen--;
|
|
pose = i;
|
|
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
|
continue;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
|
|
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
|
|
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
|
|
{
|
|
bestb = posb;
|
|
beste = pose;
|
|
bestlen = poslen;
|
|
}
|
|
|
|
p++;
|
|
}
|
|
|
|
if (bestlen < 0)
|
|
{
|
|
curlen = 0;
|
|
poslen = 0;
|
|
for (i = 0; i < prs->curwords && curlen < min_words; i++)
|
|
{
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
curlen++;
|
|
pose = i;
|
|
}
|
|
bestb = 0;
|
|
beste = pose;
|
|
}
|
|
|
|
for (i = bestb; i <= beste; i++)
|
|
{
|
|
if (prs->words[i].item)
|
|
prs->words[i].selected = 1;
|
|
if (prs->words[i].repeated)
|
|
prs->words[i].skip = 1;
|
|
if (HLIDIGNORE(prs->words[i].type))
|
|
prs->words[i].replace = 1;
|
|
|
|
prs->words[i].in = 1;
|
|
}
|
|
|
|
if (!prs->startsel)
|
|
prs->startsel = pstrdup("<b>");
|
|
if (!prs->stopsel)
|
|
prs->stopsel = pstrdup("</b>");
|
|
prs->startsellen = strlen(prs->startsel);
|
|
prs->stopsellen = strlen(prs->stopsel);
|
|
|
|
PG_RETURN_POINTER(prs);
|
|
}
|