mirror of https://github.com/postgres/postgres
507 lines
12 KiB
C
507 lines
12 KiB
C
/*
|
|
* In/Out definitions for txtidx type
|
|
* Internal structure:
|
|
* a string holding the lexeme values, plus an array giving each lexeme's position in that string and its length
|
|
* Teodor Sigaev <teodor@stack.net>
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/gist.h"
|
|
#include "access/itup.h"
|
|
#include "utils/elog.h"
|
|
#include "utils/palloc.h"
|
|
#include "utils/builtins.h"
|
|
#include "storage/bufpage.h"
|
|
#include "executor/spi.h"
|
|
#include "commands/trigger.h"
|
|
|
|
#include "utils/pg_locale.h"
|
|
|
|
#include <ctype.h> /* tolower */
|
|
#include "txtidx.h"
|
|
#include "query.h"
|
|
|
|
#include "deflex.h"
|
|
#include "parser.h"
|
|
|
|
#include "morph.h"
|
|
|
|
PG_FUNCTION_INFO_V1(txtidx_in);
|
|
Datum txtidx_in(PG_FUNCTION_ARGS);
|
|
PG_FUNCTION_INFO_V1(txtidx_out);
|
|
Datum txtidx_out(PG_FUNCTION_ARGS);
|
|
|
|
PG_FUNCTION_INFO_V1(txt2txtidx);
|
|
Datum txt2txtidx(PG_FUNCTION_ARGS);
|
|
|
|
PG_FUNCTION_INFO_V1(tsearch);
|
|
Datum tsearch(PG_FUNCTION_ARGS);
|
|
|
|
PG_FUNCTION_INFO_V1(txtidxsize);
|
|
Datum txtidxsize(PG_FUNCTION_ARGS);
|
|
|
|
/*
|
|
* in/out text index type
|
|
*/
|
|
static char *BufferStr;
|
|
static int
|
|
compareentry( const void * a, const void * b ) {
|
|
if ( ((WordEntry*)a)->len == ((WordEntry*)b)->len ) {
|
|
return strncmp(
|
|
&BufferStr[((WordEntry*)a)->pos],
|
|
&BufferStr[((WordEntry*)b)->pos],
|
|
((WordEntry*)b)->len );
|
|
}
|
|
return ( ((WordEntry*)a)->len > ((WordEntry*)b)->len ) ? 1 : -1;
|
|
}
|
|
|
|
static int
|
|
uniqueentry( WordEntry* a, int4 l, char *buf, int4 *outbuflen ) {
|
|
WordEntry *ptr, *res;
|
|
|
|
res = a;
|
|
*outbuflen = res->len;
|
|
if ( l == 1 )
|
|
return l;
|
|
|
|
ptr = a+1;
|
|
BufferStr = buf;
|
|
qsort((void*)a, l, sizeof(int4), compareentry );
|
|
*outbuflen = res->len;
|
|
|
|
while (ptr - a < l) {
|
|
if ( ! (ptr->len == res->len &&
|
|
strncmp(&buf[ ptr->pos ], &buf[ res->pos ],res->len) == 0 ) ) {
|
|
res++;
|
|
res->len = ptr->len;
|
|
res->pos = ptr->pos;
|
|
*outbuflen += res->len;
|
|
|
|
}
|
|
ptr++;
|
|
}
|
|
return res + 1 - a;
|
|
}
|
|
|
|
/* States of the tokenizer implemented by gettoken_txtidx() */
#define WAITWORD		1		/* looking for the first character of a word */
#define WAITENDWORD		2		/* collecting an unquoted word */
#define WAITNEXTCHAR	3		/* a backslash was seen; next char is literal */
#define WAITENDCMPLX	4		/* collecting a word enclosed in single quotes */
|
|
|
|
/*
 * RESIZEPRSBUF
 *
 * Expects a variable named "state" in scope with fields word, curpos and
 * len.  When the write position has reached the end of the word buffer
 * (curpos - word == len), the buffer is doubled with repalloc() and
 * curpos is re-based onto the new allocation.
 *
 * Wrapped in do { } while (0) so that "RESIZEPRSBUF;" is a single
 * statement and can safely be used as the body of an if/else.
 */
#define RESIZEPRSBUF \
	do { \
		if ( state->curpos - state->word == state->len ) { \
			int4 clen = state->curpos - state->word; \
			state->len *= 2; \
			state->word = (char*)repalloc( (void*)state->word, state->len ); \
			state->curpos = state->word + clen; \
		} \
	} while (0)
|
|
|
|
int4
|
|
gettoken_txtidx( TI_IN_STATE *state ) {
|
|
int4 oldstate = 0;
|
|
state->curpos = state->word;
|
|
state->state = WAITWORD;
|
|
|
|
while( 1 ) {
|
|
if ( state->state == WAITWORD ) {
|
|
if ( *(state->prsbuf) == '\0' ) {
|
|
return 0;
|
|
} else if ( *(state->prsbuf) == '\'' ) {
|
|
state->state = WAITENDCMPLX;
|
|
} else if ( *(state->prsbuf) == '\\' ) {
|
|
state->state = WAITNEXTCHAR;
|
|
oldstate = WAITENDWORD;
|
|
} else if ( state->oprisdelim && ISOPERATOR( *(state->prsbuf) ) ) {
|
|
elog(ERROR, "Syntax error");
|
|
} else if ( *(state->prsbuf) != ' ' ) {
|
|
*(state->curpos) = *(state->prsbuf);
|
|
state->curpos++;
|
|
state->state = WAITENDWORD;
|
|
}
|
|
} else if ( state->state == WAITNEXTCHAR ) {
|
|
if ( *(state->prsbuf) == '\0' ) {
|
|
elog(ERROR,"There is no escaped character");
|
|
} else {
|
|
RESIZEPRSBUF;
|
|
*(state->curpos) = *(state->prsbuf);
|
|
state->curpos++;
|
|
state->state = oldstate;
|
|
}
|
|
} else if ( state->state == WAITENDWORD ) {
|
|
if ( *(state->prsbuf) == '\\' ) {
|
|
state->state = WAITNEXTCHAR;
|
|
oldstate = WAITENDWORD;
|
|
} else if ( *(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
|
|
( state->oprisdelim && ISOPERATOR( *(state->prsbuf) ) ) ) {
|
|
RESIZEPRSBUF;
|
|
if ( state->curpos == state->word )
|
|
elog(ERROR, "Syntax error");
|
|
*(state->curpos) = '\0';
|
|
return 1;
|
|
} else {
|
|
RESIZEPRSBUF;
|
|
*(state->curpos) = *(state->prsbuf);
|
|
state->curpos++;
|
|
}
|
|
} else if ( state->state == WAITENDCMPLX ) {
|
|
if ( *(state->prsbuf) == '\'' ) {
|
|
RESIZEPRSBUF;
|
|
*(state->curpos) = '\0';
|
|
if ( state->curpos == state->word )
|
|
elog(ERROR, "Syntax error");
|
|
state->prsbuf++;
|
|
return 1;
|
|
} else if ( *(state->prsbuf) == '\\' ) {
|
|
state->state = WAITNEXTCHAR;
|
|
oldstate = WAITENDCMPLX;
|
|
} else if ( *(state->prsbuf) == '\0' ) {
|
|
elog(ERROR,"Syntax error");
|
|
} else {
|
|
RESIZEPRSBUF;
|
|
*(state->curpos) = *(state->prsbuf);
|
|
state->curpos++;
|
|
}
|
|
} else {
|
|
elog(ERROR, "Inner bug :(");
|
|
}
|
|
state->prsbuf++;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
Datum
|
|
txtidx_in(PG_FUNCTION_ARGS) {
|
|
char *buf = (char*)PG_GETARG_POINTER(0);
|
|
TI_IN_STATE state;
|
|
WordEntry *arr;
|
|
int4 len=0, totallen = 64;
|
|
txtidx *in;
|
|
char *tmpbuf, *cur;
|
|
int4 i,buflen = 256;
|
|
|
|
state.prsbuf = buf;
|
|
state.len=32;
|
|
state.word = (char*)palloc( state.len );
|
|
state.oprisdelim = false;
|
|
|
|
arr = (WordEntry*)palloc( sizeof(WordEntry) * totallen );
|
|
cur = tmpbuf = (char*)palloc( buflen );
|
|
while( gettoken_txtidx( &state ) ) {
|
|
if ( len == totallen ) {
|
|
totallen *= 2;
|
|
arr = (WordEntry*)repalloc( (void*)arr, sizeof(int4)*totallen );
|
|
}
|
|
while ( cur-tmpbuf + state.curpos - state.word >= buflen ) {
|
|
int4 dist = cur-tmpbuf;
|
|
buflen *= 2;
|
|
tmpbuf = (char*)repalloc( (void*)tmpbuf, buflen );
|
|
cur = tmpbuf+dist;
|
|
}
|
|
if ( state.curpos - state.word > 0xffff )
|
|
elog(ERROR,"Word is too long");
|
|
arr[len].len = state.curpos - state.word;
|
|
if ( cur - tmpbuf > 0xffff )
|
|
elog(ERROR,"Too long value");
|
|
arr[len].pos = cur - tmpbuf;
|
|
memcpy( (void*)cur, (void*)state.word, arr[len].len );
|
|
cur += arr[len].len;
|
|
len++;
|
|
}
|
|
pfree(state.word);
|
|
|
|
if ( !len )
|
|
elog(ERROR,"Void value");
|
|
|
|
len = uniqueentry( arr, len, tmpbuf, &buflen );
|
|
totallen = CALCDATASIZE( len, buflen );
|
|
in = (txtidx*)palloc( totallen );
|
|
in->len = totallen;
|
|
in->size = len;
|
|
cur = STRPTR(in);
|
|
for(i=0;i<len;i++) {
|
|
memcpy( (void*)cur, (void*)&tmpbuf[ arr[i].pos ], arr[i].len );
|
|
arr[i].pos = cur - STRPTR(in);
|
|
cur += arr[i].len;
|
|
}
|
|
pfree(tmpbuf);
|
|
memcpy( (void*)ARRPTR(in), (void*)arr, sizeof(int4)*len );
|
|
pfree( arr );
|
|
PG_RETURN_POINTER( in );
|
|
}
|
|
|
|
Datum
|
|
txtidxsize(PG_FUNCTION_ARGS) {
|
|
txtidx *in=(txtidx*)DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0)));
|
|
int4 ret = in->size;
|
|
PG_FREE_IF_COPY(in,0);
|
|
PG_RETURN_INT32( ret );
|
|
}
|
|
|
|
Datum
|
|
txtidx_out(PG_FUNCTION_ARGS) {
|
|
txtidx *out=(txtidx*)DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0)));
|
|
char *outbuf;
|
|
int4 i,j,lenbuf = STRSIZE(out) + 1 /* \0 */ + out->size*2 /* '' */ + out->size - 1 /* space */;
|
|
WordEntry *ptr = ARRPTR(out);
|
|
char *curin, *curout;
|
|
|
|
curout = outbuf = (char*) palloc( lenbuf );
|
|
for(i=0;i<out->size;i++) {
|
|
curin = STRPTR(out) + ptr->pos;
|
|
if ( i!= 0 )
|
|
*curout++ = ' ';
|
|
*curout++ = '\'';
|
|
j = ptr->len;
|
|
while( j-- ) {
|
|
if ( *curin == '\'' ) {
|
|
int4 pos = curout - outbuf;
|
|
outbuf = (char*)repalloc((void*)outbuf, ++lenbuf );
|
|
curout = outbuf + pos;
|
|
*curout++ = '\\';
|
|
}
|
|
*curout++ = *curin++;
|
|
}
|
|
*curout++ = '\'';
|
|
ptr++;
|
|
}
|
|
outbuf[ lenbuf-1 ] = '\0';
|
|
PG_FREE_IF_COPY(out,0);
|
|
PG_RETURN_POINTER( outbuf );
|
|
}
|
|
|
|
typedef struct {
|
|
uint16 len;
|
|
char* word;
|
|
} WORD;
|
|
|
|
typedef struct {
|
|
WORD *words;
|
|
int4 lenwords;
|
|
int4 curwords;
|
|
} PRSTEXT;
|
|
|
|
/*
|
|
* Parse text to lexems
|
|
*/
|
|
static void
|
|
parsetext( PRSTEXT *prs, char *buf, int4 buflen ) {
|
|
int type,lenlemm;
|
|
char *ptr,*ptrw;
|
|
char *lemm;
|
|
|
|
start_parse_str( buf, buflen );
|
|
while( (type=tsearch_yylex()) != 0 ) {
|
|
if ( prs->curwords == prs->lenwords ) {
|
|
prs->lenwords *= 2;
|
|
prs->words = (WORD*)repalloc( (void*)prs->words, prs->lenwords * sizeof(WORD) );
|
|
}
|
|
if ( tokenlen>0xffff ) {
|
|
end_parse();
|
|
elog(ERROR, "Word is too long");
|
|
}
|
|
|
|
lenlemm = tokenlen;
|
|
lemm = lemmatize( token, &lenlemm, type );
|
|
|
|
if ( ! lemm )
|
|
continue;
|
|
|
|
if ( lemm != token ) {
|
|
prs->words[ prs->curwords ].len = lenlemm;
|
|
prs->words[ prs->curwords ].word = lemm;
|
|
} else {
|
|
prs->words[ prs->curwords ].len = lenlemm;
|
|
ptrw = prs->words[ prs->curwords ].word = (char*)palloc( lenlemm );
|
|
ptr = token;
|
|
while( ptr-token < lenlemm ) {
|
|
*ptrw = tolower( (unsigned char) *ptr );
|
|
ptr++; ptrw++;
|
|
}
|
|
}
|
|
prs->curwords++;
|
|
}
|
|
end_parse();
|
|
}
|
|
|
|
static int
|
|
compareWORD( const void * a, const void * b ) {
|
|
if ( ((WORD*)a)->len == ((WORD*)b)->len )
|
|
return strncmp(
|
|
((WORD*)a)->word,
|
|
((WORD*)b)->word,
|
|
((WORD*)b)->len );
|
|
return ( ((WORD*)a)->len > ((WORD*)b)->len ) ? 1 : -1;
|
|
}
|
|
|
|
static int
|
|
uniqueWORD( WORD* a, int4 l ) {
|
|
WORD *ptr, *res;
|
|
|
|
if ( l == 1 )
|
|
return l;
|
|
|
|
res = a;
|
|
ptr = a + 1;
|
|
|
|
qsort((void*)a, l, sizeof(WORD), compareWORD );
|
|
|
|
while (ptr - a < l) {
|
|
if ( ! (ptr->len == res->len &&
|
|
strncmp(ptr->word, res->word ,res->len) == 0 ) ) {
|
|
res++;
|
|
res->len = ptr->len;
|
|
res->word = ptr->word;
|
|
} else {
|
|
pfree(ptr->word);
|
|
}
|
|
ptr++;
|
|
}
|
|
|
|
return res + 1 - a;
|
|
}
|
|
|
|
/*
|
|
* make value of txtidx
|
|
*/
|
|
static txtidx *
|
|
makevalue( PRSTEXT *prs ) {
|
|
int4 i, lenstr=0, totallen;
|
|
txtidx *in;
|
|
WordEntry *ptr;
|
|
char *str,*cur;
|
|
|
|
prs->curwords = uniqueWORD( prs->words, prs->curwords );
|
|
for(i=0;i<prs->curwords;i++)
|
|
lenstr += prs->words[i].len;
|
|
|
|
totallen = CALCDATASIZE( prs->curwords, lenstr );
|
|
in = (txtidx*)palloc( totallen );
|
|
in->len = totallen;
|
|
in->size = prs->curwords;
|
|
|
|
ptr = ARRPTR(in);
|
|
cur = str = STRPTR(in);
|
|
for(i=0;i<prs->curwords;i++) {
|
|
ptr->len = prs->words[i].len;
|
|
if ( cur-str > 0xffff )
|
|
elog(ERROR,"Value is too big");
|
|
ptr->pos = cur-str;
|
|
ptr++;
|
|
memcpy( (void*)cur, (void*)prs->words[i].word, prs->words[i].len );
|
|
pfree(prs->words[i].word);
|
|
cur += prs->words[i].len;
|
|
}
|
|
pfree(prs->words);
|
|
return in;
|
|
}
|
|
|
|
Datum
|
|
txt2txtidx(PG_FUNCTION_ARGS) {
|
|
text *in = (text*)DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0)));
|
|
PRSTEXT prs;
|
|
txtidx *out = NULL;
|
|
|
|
prs.lenwords = 32;
|
|
prs.curwords = 0;
|
|
prs.words = (WORD*)palloc(sizeof(WORD)*prs.lenwords);
|
|
|
|
initmorph();
|
|
parsetext( &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ );
|
|
PG_FREE_IF_COPY(in,0);
|
|
|
|
if ( prs.curwords ) {
|
|
out = makevalue( &prs );
|
|
PG_RETURN_POINTER( out );
|
|
}
|
|
pfree(prs.words);
|
|
PG_RETURN_NULL();
|
|
}
|
|
|
|
/*
|
|
* Trigger
|
|
*/
|
|
Datum
|
|
tsearch(PG_FUNCTION_ARGS) {
|
|
TriggerData *trigdata;
|
|
Trigger *trigger;
|
|
Relation rel;
|
|
HeapTuple rettuple = NULL;
|
|
int numidxattr,i;
|
|
PRSTEXT prs;
|
|
Datum datum = (Datum)0;
|
|
|
|
|
|
if (!CALLED_AS_TRIGGER(fcinfo))
|
|
elog(ERROR, "TSearch: Not fired by trigger manager");
|
|
|
|
trigdata = (TriggerData *) fcinfo->context;
|
|
if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
|
|
elog(ERROR, "TSearch: Can't process STATEMENT events");
|
|
if (TRIGGER_FIRED_AFTER(trigdata->tg_event))
|
|
elog(ERROR, "TSearch: Must be fired BEFORE event");
|
|
|
|
if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
|
|
rettuple = trigdata->tg_trigtuple;
|
|
else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
|
|
rettuple = trigdata->tg_newtuple;
|
|
else
|
|
elog(ERROR, "TSearch: Unknown event");
|
|
|
|
trigger = trigdata->tg_trigger;
|
|
rel = trigdata->tg_relation;
|
|
|
|
if ( trigger->tgnargs < 2 )
|
|
elog(ERROR,"TSearch: format tsearch(txtidx_field, text_field1,...)");
|
|
|
|
numidxattr = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
|
|
if ( numidxattr < 0 )
|
|
elog(ERROR,"TSearch: Can not find txtidx_field");
|
|
|
|
prs.lenwords = 32;
|
|
prs.curwords = 0;
|
|
prs.words = (WORD*)palloc(sizeof(WORD)*prs.lenwords);
|
|
|
|
initmorph();
|
|
/* find all words in indexable column */
|
|
for(i=1; i<trigger->tgnargs; i++) {
|
|
int4 numattr;
|
|
text *txt_toasted, *txt;
|
|
bool isnull;
|
|
Oid oidtype;
|
|
|
|
numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
|
|
oidtype = SPI_gettypeid(rel->rd_att, numattr);
|
|
if ( numattr<0 || ( ! ( oidtype==TEXTOID || oidtype==VARCHAROID ) ) ) {
|
|
elog(NOTICE, "TSearch: can not find field '%s'", trigger->tgargs[i]);
|
|
continue;
|
|
}
|
|
txt_toasted = (text*)DatumGetPointer( SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull ) );
|
|
if ( isnull )
|
|
continue;
|
|
txt = (text*)DatumGetPointer( PG_DETOAST_DATUM( PointerGetDatum ( txt_toasted ) ) );
|
|
|
|
parsetext( &prs, VARDATA(txt), VARSIZE(txt) - VARHDRSZ );
|
|
if ( txt != txt_toasted )
|
|
pfree(txt);
|
|
}
|
|
|
|
/* make txtidx value */
|
|
if (prs.curwords) {
|
|
datum = PointerGetDatum( makevalue( &prs ) );
|
|
rettuple = SPI_modifytuple( rel, rettuple, 1, &numidxattr,
|
|
&datum, NULL );
|
|
pfree(DatumGetPointer(datum));
|
|
} else {
|
|
char nulls = 'n';
|
|
pfree( prs.words );
|
|
rettuple = SPI_modifytuple( rel, rettuple, 1, &numidxattr,
|
|
&datum, &nulls );
|
|
}
|
|
|
|
if (rettuple == NULL)
|
|
elog(ERROR, "TSearch: %d returned by SPI_modifytuple", SPI_result);
|
|
|
|
return PointerGetDatum( rettuple );
|
|
}
|
|
|