Text parser rewritten:

- supports multibyte encodings
        - stricter rules for lexemes
        - flex is no longer used
Add:
        - tsquery plainto_tsquery(text)
          Function makes tsquery from plain text.
        - &&, ||, !! operators for tsquery, for combining
          a tsquery from its parts:  'foo & bar' || 'asd' => 'foo & bar | asd'
This commit is contained in:
Teodor Sigaev 2005-11-21 12:27:57 +00:00
parent b91e6ed93e
commit c52795d18a
15 changed files with 1613 additions and 424 deletions

View File

@ -1,4 +1,4 @@
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
MODULE_big = tsearch2 MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
wparser.o wparser_def.o \ wparser.o wparser_def.o \
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \ ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
tsvector_op.o rank.o ts_stat.o \ tsvector_op.o rank.o ts_stat.o \
query_util.o query_support.o query_rewrite.o query_gist.o query_util.o query_support.o query_rewrite.o query_gist.o \
ts_locale.o
SUBDIRS := snowball ispell wordparser SUBDIRS := snowball ispell wordparser
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o) SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)

View File

@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:544: NOTICE: type "gtsvector" is not yet defined psql:tsearch2.sql:559: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:549: NOTICE: argument type gtsvector is only a shell psql:tsearch2.sql:564: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:998: NOTICE: type "gtsq" is not yet defined psql:tsearch2.sql:1054: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:1003: NOTICE: argument type gtsq is only a shell psql:tsearch2.sql:1059: NOTICE: argument type gtsq is only a shell
--tsvector --tsvector
SELECT '1'::tsvector; SELECT '1'::tsvector;
tsvector tsvector
@ -653,7 +653,7 @@ select * from token_type('default');
11 | lpart_hword | Latin part of hyphenated word 11 | lpart_hword | Latin part of hyphenated word
12 | blank | Space symbols 12 | blank | Space symbols
13 | tag | HTML Tag 13 | tag | HTML Tag
14 | http | HTTP head 14 | protocol | Protocol head
15 | hword | Hyphenated word 15 | hword | Hyphenated word
16 | lhword | Latin hyphenated word 16 | lhword | Latin hyphenated word
17 | nlhword | Non-latin hyphenated word 17 | nlhword | Non-latin hyphenated word
@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
-------+-------------------------------------- -------+--------------------------------------
22 | 345 22 | 345
12 | 12 |
4 | qwe@efd.r 1 | qwe
12 | 12 | @
12 | ' 19 | efd.r
12 | 12 | '
14 | http:// 14 | http://
6 | www.com 6 | www.com
12 | / 12 | /
12 |
14 | http:// 14 | http://
5 | aew.werc.ewr/?ad=qwe&dw 5 | aew.werc.ewr/?ad=qwe&dw
6 | aew.werc.ewr 6 | aew.werc.ewr
@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
6 | 4aew.werc.ewr 6 | 4aew.werc.ewr
12 | 12 |
14 | http:// 14 | http://
5 | 5aew.werc.ewr:8100/? 6 | 5aew.werc.ewr:8100
6 | 5aew.werc.ewr 12 | /?
18 | :8100/?
12 |
1 | ad 1 | ad
12 | = 12 | =
1 | qwe 1 | qwe
@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
1 | dw 1 | dw
12 | 12 |
5 | 6aew.werc.ewr:8100/?ad=qwe&dw 5 | 6aew.werc.ewr:8100/?ad=qwe&dw
6 | 6aew.werc.ewr 6 | 6aew.werc.ewr:8100
18 | :8100/?ad=qwe&dw 18 | /?ad=qwe&dw
12 | 12 |
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
6 | 7aew.werc.ewr 6 | 7aew.werc.ewr:8100
18 | :8100/?ad=qwe&dw=%20%32 18 | /?ad=qwe&dw=%20%32
12 | 12 |
7 | +4.0e-10 7 | +4.0e-10
12 | 12 |
@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
1 | jf 1 | jf
12 | 12 |
1 | sdjk 1 | sdjk
13 | <we hjwer <werrwe> 12 | <
1 | we
12 |
1 | hjwer
12 |
13 | <werrwe>
12 | 12 |
3 | ewr1 3 | ewr1
12 | > 12 | >
12 |
3 | ewri2 3 | ewri2
12 | 12 |
13 | <a href="qwe<qwe>"> 13 | <a href="qwe<qwe>">
@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
12 | 12 |
19 | /wqe-324/ewr 19 | /wqe-324/ewr
12 | 12 |
6 | gist.h 19 | gist.h
12 | 12 |
6 | gist.h.c 19 | gist.h.c
12 |
6 | gist.c
12 | .
12 | 12 |
19 | gist.c
12 | .
1 | readline 1 | readline
12 | 12 |
20 | 4.2 20 | 4.2
12 | 12 |
20 | 4.2 20 | 4.2
12 | . 12 | .
12 |
20 | 4.2 20 | 4.2
12 | , 12 | ,
12 | 15 | readline-4.2
15 | readline-4
11 | readline 11 | readline
12 | - 12 | -
20 | 4.2 20 | 4.2
12 | 12 |
15 | readline-4 15 | readline-4.2
11 | readline 11 | readline
12 | - 12 | -
20 | 4.2 20 | 4.2
12 | . 12 | .
12 |
22 | 234 22 | 234
12 | 12 |
13 | <i <b> 12 | <
1 | i
12 |
13 | <b>
12 | 12 |
1 | wow 1 | wow
12 | 12 |
12 | < 12 | <
12 |
1 | jqw 1 | jqw
12 | 12 |
12 | < 12 | <>
12 | >
12 |
1 | qwerty 1 | qwerty
(138 rows) (135 rows)
SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty'); <i <b> wow < jqw <> qwerty');
to_tsvector to_tsvector
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'ad':18 'dw':20 'jf':40 '234':62 '345':1 '4.2':53,54,55,58,61 '455':32 'jqw':64 'qwe':19,28,29,36 'wer':37 'wow':63 'asdf':38 'ewr1':42 'qwer':39 'sdjk':41 '5.005':33 'ewri2':43 'qwqwe':30 'wefjn':47 'gist.c':51 'gist.h':49 'qwerti':65 '234.435':31 ':8100/?':17 'qwe-wer':35 'readlin':52,57,60 'www.com':3 '+4.0e-10':27 'gist.h.c':50 'rewt/ewr':46 'qwe@efd.r':2 'readline-4':56,59 '/?ad=qwe&dw':6,9,13 '/wqe-324/ewr':48 'aew.werc.ewr':5 '1aew.werc.ewr':8 '2aew.werc.ewr':10 '3aew.werc.ewr':12 '4aew.werc.ewr':14 '5aew.werc.ewr':16 '6aew.werc.ewr':22 '7aew.werc.ewr':25 '/usr/local/fff':44 '/awdf/dwqe/4325':45 ':8100/?ad=qwe&dw':23 'teodor@stack.net':34 '5aew.werc.ewr:8100/?':15 ':8100/?ad=qwe&dw=%20%32':26 'aew.werc.ewr/?ad=qwe&dw':4 '1aew.werc.ewr/?ad=qwe&dw':7 '3aew.werc.ewr/?ad=qwe&dw':11 '6aew.werc.ewr:8100/?ad=qwe&dw':21 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':24 'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
(1 row) (1 row)
SELECT length(to_tsvector('default', '345 qw')); SELECT length(to_tsvector('default', '345 qw'));
@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae
<i <b> wow < jqw <> qwerty')); <i <b> wow < jqw <> qwerty'));
length length
-------- --------
53 51
(1 row) (1 row)
select to_tsquery('default', 'qwe & sKies '); select to_tsquery('default', 'qwe & sKies ');
@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj');
'1' & 'fghj' '1' & 'fghj'
(1 row) (1 row)
select plainto_tsquery('default', 'the and z 1))& fghj');
plainto_tsquery
--------------------
'z' & '1' & 'fghj'
(1 row)
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
?column?
-----------------------
'foo' & 'bar' & 'asd'
(1 row)
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
?column?
------------------------------
'foo' & 'bar' | 'asd' & 'fg'
(1 row)
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
?column?
-----------------------------------
'foo' & 'bar' | !( 'asd' & 'fg' )
(1 row)
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
?column?
----------------------------------
'foo' & 'bar' & ( 'asd' | 'fg' )
(1 row)
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
?column? ?column?
---------- ----------

View File

@ -51,10 +51,20 @@ Datum to_tsquery_name(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(to_tsquery_current); PG_FUNCTION_INFO_V1(to_tsquery_current);
Datum to_tsquery_current(PG_FUNCTION_ARGS); Datum to_tsquery_current(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery);
Datum plainto_tsquery(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery_name);
Datum plainto_tsquery_name(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery_current);
Datum plainto_tsquery_current(PG_FUNCTION_ARGS);
/* parser's states */ /* parser's states */
#define WAITOPERAND 1 #define WAITOPERAND 1
#define WAITOPERATOR 2 #define WAITOPERATOR 2
#define WAITFIRSTOPERAND 3 #define WAITFIRSTOPERAND 3
#define WAITSINGLEOPERAND 4
/* /*
* node of query tree, also used * node of query tree, also used
@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
else if (*(state->buf) != ' ') else if (*(state->buf) != ' ')
return ERR; return ERR;
break; break;
case WAITSINGLEOPERAND:
if ( *(state->buf) == '\0' )
return END;
*strval = state->buf;
*lenval = strlen( state->buf );
state->buf += strlen( state->buf );
state->count++;
return VAL;
default: default:
return ERR; return ERR;
break; break;
@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos)
* input * input
*/ */
static QUERYTYPE * static QUERYTYPE *
queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id) queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain)
{ {
QPRS_STATE state; QPRS_STATE state;
int4 i; int4 i;
@ -599,7 +617,7 @@ static QUERYTYPE *
/* init state */ /* init state */
state.buf = buf; state.buf = buf;
state.state = WAITFIRSTOPERAND; state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
state.count = 0; state.count = 0;
state.num = 0; state.num = 0;
state.str = NULL; state.str = NULL;
@ -679,7 +697,7 @@ Datum
tsquery_in(PG_FUNCTION_ARGS) tsquery_in(PG_FUNCTION_ARGS)
{ {
SET_FUNCOID(); SET_FUNCOID();
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0)); PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
} }
/* /*
@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS)
str = text2char(in); str = text2char(in);
PG_FREE_IF_COPY(in, 1); PG_FREE_IF_COPY(in, 1);
query = queryin(str, pushval_morph, PG_GETARG_INT32(0)); query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false);
if ( query->size == 0 ) if ( query->size == 0 )
PG_RETURN_POINTER(query); PG_RETURN_POINTER(query);
@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS)
Int32GetDatum(get_currcfg()), Int32GetDatum(get_currcfg()),
PG_GETARG_DATUM(0))); PG_GETARG_DATUM(0)));
} }
/*
 * plainto_tsquery(cfg_id, text)
 *
 * Build a tsquery from unstructured text.  The input is handed to queryin()
 * with isplain = true, so the tokenizer treats the entire string as one
 * operand instead of parsing query operators; pushval_morph() is the
 * callback that receives that operand.
 *
 * Argument 0 is the configuration id (read as int32), argument 1 the text.
 * An empty result (size == 0) is returned as-is.
 */
Datum
plainto_tsquery(PG_FUNCTION_ARGS)
{
	text	   *txt = PG_GETARG_TEXT_P(1);
	char	   *cstr;
	QUERYTYPE  *result;
	ITEM	   *cleaned;
	int4		nitems;

	SET_FUNCOID();

	/* queryin() wants a C string; the detoasted text is not needed after */
	cstr = text2char(txt);
	PG_FREE_IF_COPY(txt, 1);

	result = queryin(cstr, pushval_morph, PG_GETARG_INT32(0), true);

	if ( result->size != 0 )
	{
		/* NOTE(review): presumably strips placeholder nodes -- confirm in query_cleanup.c */
		cleaned = clean_fakeval_v2(GETQUERY(result), &nitems);

		if (cleaned == NULL)
		{
			/* nothing survived: shrink to a header-only, empty query */
			result->len = HDRSIZEQT;
			result->size = 0;
		}
		else
		{
			memcpy((void *) GETQUERY(result), (void *) cleaned, nitems * sizeof(ITEM));
			pfree(cleaned);
		}
	}

	PG_RETURN_POINTER(result);
}
/*
 * plainto_tsquery(cfg_name, text)
 *
 * Variant taking the configuration by name: the name is translated with
 * name2id_cfg() and the work is delegated to plainto_tsquery().
 */
Datum
plainto_tsquery_name(PG_FUNCTION_ARGS)
{
	text	   *cfgname = PG_GETARG_TEXT_P(0);
	Datum		cfgid;
	Datum		result;

	SET_FUNCOID();

	cfgid = Int32GetDatum(name2id_cfg(cfgname));
	result = DirectFunctionCall2(plainto_tsquery, cfgid, PG_GETARG_DATUM(1));

	/* release the detoasted name only after the lookup above has used it */
	PG_FREE_IF_COPY(cfgname, 0);
	PG_RETURN_DATUM(result);
}
/*
 * plainto_tsquery(text)
 *
 * Variant using the configuration returned by get_currcfg(); the single
 * argument is the text, which is forwarded unchanged to plainto_tsquery().
 */
Datum
plainto_tsquery_current(PG_FUNCTION_ARGS)
{
	Datum		cfgid;
	Datum		result;

	SET_FUNCOID();

	cfgid = Int32GetDatum(get_currcfg());
	result = DirectFunctionCall2(plainto_tsquery, cfgid, PG_GETARG_DATUM(0));

	PG_RETURN_DATUM(result);
}

View File

@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) {
PG_RETURN_INT32(nnode); PG_RETURN_INT32(nnode);
} }
/*
 * join_tsqueries
 *
 * Allocate an operator node (type OPR) with two children built by QT2QTN()
 * from queries b and a, in that order.  The operator character itself
 * (valnode->val) is left zeroed; the caller sets it.
 *
 * The node is flagged QTN_NEEDFREE.
 */
static QTNode*
join_tsqueries(QUERYTYPE *a, QUERYTYPE *b) {
	QTNode	   *node = (QTNode*)palloc0( sizeof(QTNode) );
	ITEM	   *op = (ITEM*)palloc0( sizeof(ITEM) );
	QTNode	  **kids = (QTNode**)palloc0( sizeof(QTNode*)*2 );

	op->type = OPR;

	/* b goes into slot 0 and a into slot 1, exactly as callers expect */
	kids[0] = QT2QTN( GETQUERY(b), GETOPERAND(b) );
	kids[1] = QT2QTN( GETQUERY(a), GETOPERAND(a) );

	node->flags |= QTN_NEEDFREE;
	node->valnode = op;
	node->child = kids;
	node->nchild = 2;

	return node;
}
PG_FUNCTION_INFO_V1(tsquery_and);
Datum tsquery_and(PG_FUNCTION_ARGS);

/*
 * tsquery_and(a, b)
 *
 * Combine two tsqueries under an '&' operator node.  If either input is
 * empty (size == 0) the other one is returned unchanged.
 *
 * Both arguments are fetched with PG_DETOAST_DATUM_COPY, so a and b are
 * always private copies.
 */
Datum
tsquery_and(PG_FUNCTION_ARGS) {
	QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
	QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
	QTNode *res;
	QUERYTYPE *query;

	if ( a->size == 0 ) {
		/* a is argument 0: compare against slot 0, not slot 1 */
		PG_FREE_IF_COPY(a,0);
		PG_RETURN_POINTER(b);
	} else if ( b->size == 0 ) {
		PG_FREE_IF_COPY(b,1);
		PG_RETURN_POINTER(a);
	}

	res = join_tsqueries(a, b);
	res->valnode->val = '&';

	/* flatten the tree into a new query before releasing the inputs */
	query = QTN2QT( res, PlainMemory );

	QTNFree(res);
	PG_FREE_IF_COPY(a,0);
	PG_FREE_IF_COPY(b,1);

	PG_RETURN_POINTER(query);
}
PG_FUNCTION_INFO_V1(tsquery_or);
Datum tsquery_or(PG_FUNCTION_ARGS);

/*
 * tsquery_or(a, b)
 *
 * Combine two tsqueries under an '|' operator node.  If either input is
 * empty (size == 0) the other one is returned unchanged.
 *
 * Both arguments are fetched with PG_DETOAST_DATUM_COPY, so a and b are
 * always private copies.
 */
Datum
tsquery_or(PG_FUNCTION_ARGS) {
	QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
	QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
	QTNode *res;
	QUERYTYPE *query;

	if ( a->size == 0 ) {
		/* a is argument 0: compare against slot 0, not slot 1 */
		PG_FREE_IF_COPY(a,0);
		PG_RETURN_POINTER(b);
	} else if ( b->size == 0 ) {
		PG_FREE_IF_COPY(b,1);
		PG_RETURN_POINTER(a);
	}

	res = join_tsqueries(a, b);
	res->valnode->val = '|';

	/* flatten the tree into a new query before releasing the inputs */
	query = QTN2QT( res, PlainMemory );

	QTNFree(res);
	PG_FREE_IF_COPY(a,0);
	PG_FREE_IF_COPY(b,1);

	PG_RETURN_POINTER(query);
}
PG_FUNCTION_INFO_V1(tsquery_not);
Datum tsquery_not(PG_FUNCTION_ARGS);
Datum
tsquery_not(PG_FUNCTION_ARGS) {
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
QTNode *res;
QUERYTYPE *query;
if ( a->size == 0 )
PG_RETURN_POINTER(a);
res=(QTNode*)palloc0( sizeof(QTNode) );
res->flags |= QTN_NEEDFREE;
res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
res->valnode->type = OPR;
res->valnode->val = '!';
res->child = (QTNode**)palloc0( sizeof(QTNode*) );
res->child[0] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
res->nchild = 1;
query = QTN2QT( res, PlainMemory );
QTNFree(res);
PG_FREE_IF_COPY(a,0);
PG_RETURN_POINTER(query);
}
static int static int
CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) { CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
if ( a->size != b->size ) { if ( a->size != b->size ) {

View File

@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)');
select to_tsquery('default', '(asd&and)|fghj'); select to_tsquery('default', '(asd&and)|fghj');
select to_tsquery('default', '(asd&!and)|fghj'); select to_tsquery('default', '(asd&!and)|fghj');
select to_tsquery('default', '(the|and&(i&1))&fghj'); select to_tsquery('default', '(the|and&(i&1))&fghj');
select plainto_tsquery('default', 'the and z 1))& fghj');
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';

View File

@ -0,0 +1,61 @@
#include "ts_locale.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
#if defined(TS_USE_WIDE) && defined(WIN32)
/*
 * wchar2char --- convert a wide-character string to a multibyte string
 *
 * With a UTF8 database the Win32 API performs the conversion; otherwise
 * the call is passed straight to wcstombs().
 *
 * In the UTF8 branch "len" is used as the number of wide characters in
 * "from"; the number of bytes written is returned.
 * NOTE(review): the caller is assumed to supply a "to" buffer large enough
 * for the converted text -- confirm at the call sites.
 *
 * The prototype in ts_locale.h declares "to" as const even though it is the
 * output buffer; the signature is kept so both declarations agree, and the
 * qualifier is cast away once here.
 */
size_t
wchar2char( const char *to, const wchar_t *from, size_t len ) {
	char *dst = (char *) to;

	if (GetDatabaseEncoding() == PG_UTF8) {
		int r;
		int nbytes;

		if (len==0)
			return 0;

		/* first pass: ask how many bytes the UTF-8 form needs */
		nbytes = WideCharToMultiByte(CP_UTF8, 0, from, (int) len, NULL, 0,
				NULL, NULL);

		if ( nbytes==0 )
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("UTF-16 to UTF-8 translation failed: %lu",
						GetLastError())));

		/* second pass: perform the conversion */
		r = WideCharToMultiByte(CP_UTF8, 0, from, (int) len, dst, nbytes,
				NULL, NULL);

		if ( r==0 )
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("UTF-16 to UTF-8 translation failed: %lu",
						GetLastError())));
		return r;
	}

	return wcstombs(dst, from, len);
}
/*
 * char2wchar --- convert a multibyte string to a wide-character string
 *
 * With a UTF8 database the Win32 API performs the conversion; otherwise
 * the call is passed straight to mbstowcs().
 *
 * In the UTF8 branch "len" is used both as the number of input bytes and as
 * the capacity of "to" in wide characters; the number of wide characters
 * written is returned.
 *
 * The prototype in ts_locale.h declares "to" as const even though it is the
 * output buffer; the signature is kept so both declarations agree, and the
 * qualifier is cast away once here.
 */
size_t
char2wchar( const wchar_t *to, const char *from, size_t len ) {
	wchar_t *dst = (wchar_t *) to;

	if (GetDatabaseEncoding() == PG_UTF8) {
		int r;

		if (len==0)
			return 0;

		r = MultiByteToWideChar(CP_UTF8, 0, from, (int) len,
				dst, (int) len);

		if (!r) {
			/* let the encoding check run on the input before the generic error below */
			pg_verifymbstr(from, len, false);
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("invalid multibyte character for locale"),
					 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
		}

		/* the output capacity passed above was len wide characters */
		Assert((size_t) r <= len);

		return r;
	}

	return mbstowcs(dst, from, len);
}
#endif

View File

@ -0,0 +1,38 @@
/*
 * ts_locale.h
 *
 * Declares the multibyte <-> wide-character conversion entry points
 * char2wchar() and wchar2char(), and defines TS_USE_WIDE when the platform
 * provides both wcstombs() and towlower().
 */
#ifndef __TSLOCALE_H__
#define __TSLOCALE_H__
#include "postgres.h"
#include <ctype.h>
#include <limits.h>
/*
 * towlower() and friends should be in <wctype.h>, but some pre-C99 systems
 * declare them in <wchar.h>.
 */
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
/* Wide-character support is enabled only when both functions are available. */
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
#define TS_USE_WIDE
#ifdef WIN32
/*
 * On Windows these are real functions (ts_locale.c) that call the Win32
 * conversion API when the database encoding is UTF8 and fall back to the
 * C library otherwise.  "to" is the output buffer in both.
 */
size_t wchar2char( const char *to, const wchar_t *from, size_t len );
size_t char2wchar( const wchar_t *to, const char *from, size_t len );
#else /* WIN32 */
/* Elsewhere the names are plain aliases for the C library functions. */
#define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */
#endif /* __TSLOCALE_H__ */

View File

@ -427,6 +427,21 @@ RETURNS tsquery
AS 'MODULE_PATHNAME','to_tsquery_current' AS 'MODULE_PATHNAME','to_tsquery_current'
LANGUAGE 'c' with (isstrict,iscachable); LANGUAGE 'c' with (isstrict,iscachable);
CREATE FUNCTION plainto_tsquery(oid, text)   -- configuration id, plain text
    RETURNS tsquery
    AS 'MODULE_PATHNAME', 'plainto_tsquery'
    LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION plainto_tsquery(text, text)  -- configuration name, plain text
    RETURNS tsquery
    AS 'MODULE_PATHNAME', 'plainto_tsquery_name'
    LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION plainto_tsquery(text)        -- plain text, current configuration
    RETURNS tsquery
    AS 'MODULE_PATHNAME', 'plainto_tsquery_current'
    LANGUAGE C STRICT IMMUTABLE;
--operations --operations
CREATE FUNCTION exectsq(tsvector, tsquery) CREATE FUNCTION exectsq(tsvector, tsquery)
RETURNS bool RETURNS bool
@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery)
language 'C' language 'C'
with (isstrict,iscachable); with (isstrict,iscachable);
CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery)
        returns tsquery
        as 'MODULE_PATHNAME', 'tsquery_and'
        language 'C'
        with (isstrict,iscachable);
CREATE OPERATOR && (
        LEFTARG = tsquery,
        RIGHTARG = tsquery,
        PROCEDURE = tsquery_and,
        -- No RESTRICT/JOIN estimators: they are meaningful only for operators
        -- that return boolean, and this one returns tsquery.
        COMMUTATOR = '&&'
);
CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery)
        returns tsquery
        as 'MODULE_PATHNAME', 'tsquery_or'
        language 'C'
        with (isstrict,iscachable);
CREATE OPERATOR || (
        LEFTARG = tsquery,
        RIGHTARG = tsquery,
        PROCEDURE = tsquery_or,
        -- No RESTRICT/JOIN estimators: they are meaningful only for operators
        -- that return boolean, and this one returns tsquery.
        COMMUTATOR = '||'
);
CREATE OR REPLACE FUNCTION tsquery_not(tsquery)   -- negates a whole query
        RETURNS tsquery
        AS 'MODULE_PATHNAME', 'tsquery_not'
        LANGUAGE C
        STRICT IMMUTABLE;
CREATE OPERATOR !! (                              -- prefix operator: no LEFTARG
        PROCEDURE = tsquery_not,
        RIGHTARG = tsquery
);
--------------rewrite subsystem --------------rewrite subsystem
CREATE OR REPLACE FUNCTION rewrite(tsquery, text) CREATE OR REPLACE FUNCTION rewrite(tsquery, text)

View File

@ -1,8 +1,8 @@
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
SUBOBJS = parser.o deflex.o SUBOBJS = parser.o deflex.o
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
PG_CPPFLAGS = -I$(srcdir)/.. PG_CPPFLAGS = -I$(srcdir)/..
@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
all: SUBSYS.o all: SUBSYS.o
parser.c: parser.l
ifdef FLEX
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
else
@$(missing) flex $< $@
endif
SUBSYS.o: $(SUBOBJS) SUBSYS.o: $(SUBOBJS)
$(LD) $(LDREL) $(LDOUT) $@ $^ $(LD) $(LDREL) $(LDOUT) $@ $^

View File

@ -15,7 +15,7 @@ const char *lex_descr[] = {
"Latin part of hyphenated word", "Latin part of hyphenated word",
"Space symbols", "Space symbols",
"HTML Tag", "HTML Tag",
"HTTP head", "Protocol head",
"Hyphenated word", "Hyphenated word",
"Latin hyphenated word", "Latin hyphenated word",
"Non-latin hyphenated word", "Non-latin hyphenated word",
@ -42,7 +42,7 @@ const char *tok_alias[] = {
"lpart_hword", "lpart_hword",
"blank", "blank",
"tag", "tag",
"http", "protocol",
"hword", "hword",
"lhword", "lhword",
"nlhword", "nlhword",

View File

@ -17,7 +17,7 @@
#define LATPARTHYPHENWORD 11 #define LATPARTHYPHENWORD 11
#define SPACE 12 #define SPACE 12
#define TAG 13 #define TAG 13
#define HTTP 14 #define PROTOCOL 14
#define HYPHENWORD 15 #define HYPHENWORD 15
#define LATHYPHENWORD 16 #define LATHYPHENWORD 16
#define CYRHYPHENWORD 17 #define CYRHYPHENWORD 17

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +1,147 @@
#ifndef __PARSER_H__ #ifndef __PARSER_H__
#define __PARSER_H__ #define __PARSER_H__
extern char *token; #include <ctype.h>
extern int tokenlen; #include <limits.h>
int tsearch2_yylex(void); #include "ts_locale.h"
void tsearch2_start_parse_str(char *, int);
void tsearch2_end_parse(void); typedef enum {
TPS_Base = 0,
TPS_InUWord,
TPS_InLatWord,
TPS_InCyrWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InSpace,
TPS_InUDecimalFirst,
TPS_InUDecimal,
TPS_InDecimalFirst,
TPS_InDecimal,
TPS_InVersionFirst,
TPS_InVersion,
TPS_InMantissaFirst,
TPS_InMantissaSign,
TPS_InMantissa,
TPS_InHTMLEntityFirst,
TPS_InHTMLEntity,
TPS_InHTMLEntityNumFirst,
TPS_InHTMLEntityNum,
TPS_InHTMLEntityEnd,
TPS_InTagFirst,
TPS_InTagCloseFirst,
TPS_InTag,
TPS_InTagEscapeK,
TPS_InTagEscapeKK,
TPS_InTagBackSleshed,
TPS_InTagEnd,
TPS_InCommentFirst,
TPS_InCommentLast,
TPS_InComment,
TPS_InCloseCommentFirst,
TPS_InCloseCommentLast,
TPS_InCommentEnd,
TPS_InHostFirstDomen,
TPS_InHostDomenSecond,
TPS_InHostDomen,
TPS_InPortFirst,
TPS_InPort,
TPS_InHostFirstAN,
TPS_InHost,
TPS_InEmail,
TPS_InFileFirst,
TPS_InFile,
TPS_InFileNext,
TPS_InURIFirst,
TPS_InURIStart,
TPS_InURI,
TPS_InFURL,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
TPS_InHyphenLatWordFirst,
TPS_InHyphenLatWord,
TPS_InHyphenCyrWordFirst,
TPS_InHyphenCyrWord,
TPS_InHyphenUWordFirst,
TPS_InHyphenUWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenCyrWordPart,
TPS_InHyphenLatWordPart,
TPS_InHyphenUWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
TPS_InHVersionPartFirst,
TPS_InHVersionPart,
TPS_Null /* last state (fake value) */
} TParserState;
/* forward declaration */
struct TParser;
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */
typedef struct {
TParserCharTest isclass;
char c;
uint16 flags;
TParserState tostate;
int type;
TParserSpecial special;
} TParserStateActionItem;
typedef struct {
TParserState state;
TParserStateActionItem *action;
} TParserStateAction;
/*
 * One saved parser position.  Positions link to each other through "prev".
 * NOTE(review): presumably a stack used for backtracking -- confirm in parser.c.
 */
typedef struct TParserPosition {
int posbyte; /* position of parser in bytes */
int poschar; /* position of parser in characters */
int charlen; /* length of current char */
int lenbytelexeme; /* presumably lexeme length in bytes -- TODO confirm */
int lencharlexeme; /* presumably lexeme length in characters -- TODO confirm */
TParserState state;
struct TParserPosition *prev; /* previous position in the chain */
int flags;
TParserStateActionItem *pushedAtAction; /* NOTE(review): looks like the action that saved this position -- confirm */
} TParserPosition;
/*
 * Parser handle: created by TParserInit(), advanced by TParserGet() and
 * released by TParserClose().  After a successful TParserGet() the "out"
 * fields describe the lexeme just found.
 */
typedef struct TParser {
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
wchar_t *wstr; /* wide character string */
int lenwstr; /* length of wide string */
/* State of parse */
int charmaxlen;
bool usewide; /* NOTE(review): presumably selects wstr over str -- confirm in parser.c */
TParserPosition *state; /* current position record */
bool ignore;
bool wanthost;
/* silly char */
char c;
/* out */
char *lexeme; /* start of the found lexeme */
int lenbytelexeme; /* its length in bytes (reported to the caller as token length) */
int lencharlexeme; /* presumably its length in characters -- TODO confirm */
int type; /* token type code returned to the caller */
} TParser;
TParser* TParserInit( char *, int );
bool TParserGet( TParser* );
void TParserClose( TParser* );
#endif #endif

View File

@ -1,346 +0,0 @@
%{
#include "postgres.h"
#include "deflex.h"
#include "parser.h"
#include "common.h"
/* Avoid exit() on fatal scanner errors */
#undef fprintf
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
char *token = NULL; /* pointer to token */
int tokenlen;
static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
typedef struct {
int tlen;
int clen;
char *str;
} TagStorage;
static TagStorage ts={0,0,NULL};
static void
addTag(void)
{
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
ts.tlen*=2;
ts.str=realloc(ts.str,ts.tlen);
if (!ts.str)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
ts.clen+=tsearch2_yyleng;
ts.str[ts.clen]='\0';
}
static void
startTag(void)
{
if ( ts.str==NULL ) {
ts.tlen=tsearch2_yyleng+1;
ts.str=malloc(ts.tlen);
if (!ts.str)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
ts.clen=0;
ts.str[0]='\0';
addTag();
}
%}
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap
/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT
/* cyrillic koi8 char */
CYRALNUM [0-9\200-\377]
CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<!--" { BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
<QINTAG>"\\\"" { addTag(); }
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
<INTAG>">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
\&\#[0-9][0-9]?[0-9]?\; {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return EMAIL;
}
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SCIENTIFIC;
}
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
[+-]?[0-9]+\.[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
[+-][0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SIGNEDINT;
}
<DELIM,INITIAL>[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UNSIGNEDINT;
}
http"://" {
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
ftp"://" {
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
BEGIN SERVER;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return FURL;
}
<SERVER,URL,INITIAL>{HOSTNAME} {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HOST;
}
<SERVER>[/:]{URI} {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return URI;
}
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return FILEPATH;
}
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return CYRHYPHENWORD;
}
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return LATHYPHENWORD;
}
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return HYPHENWORD;
}
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
<DELIM>\+?[0-9]+\.[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRPARTHYPHENWORD;
}
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATPARTHYPHENWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return PARTHYPHENWORD;
}
<DELIM>- {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
yyless( 0 );
}
{CYRALPHA}+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRWORD;
}
[[:alpha:]]+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATWORD;
}
{ALNUM}+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UWORD;
}
[ \r\n\t]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
. {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
%%
/* clearing after parsing from string */
void
tsearch2_end_parse(void)
{
if (s)
{
free(s);
s = NULL;
}
tsearch2_yy_delete_buffer( buf );
buf = NULL;
}
/* start parse from string */
void
tsearch2_start_parse_str(char* str, int limit)
{
if (buf)
tsearch2_end_parse();
buf = tsearch2_yy_scan_bytes( str, limit );
tsearch2_yy_switch_to_buffer( buf );
BEGIN INITIAL;
}

View File

@ -39,8 +39,7 @@ Datum prsd_start(PG_FUNCTION_ARGS);
Datum Datum
prsd_start(PG_FUNCTION_ARGS) prsd_start(PG_FUNCTION_ARGS)
{ {
tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)); PG_RETURN_POINTER(TParserInit( (char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
PG_RETURN_POINTER(NULL);
} }
PG_FUNCTION_INFO_V1(prsd_getlexeme); PG_FUNCTION_INFO_V1(prsd_getlexeme);
@ -48,14 +47,17 @@ Datum prsd_getlexeme(PG_FUNCTION_ARGS);
Datum Datum
prsd_getlexeme(PG_FUNCTION_ARGS) prsd_getlexeme(PG_FUNCTION_ARGS)
{ {
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */ TParser *p=(TParser*)PG_GETARG_POINTER(0);
char **t = (char **) PG_GETARG_POINTER(1); char **t = (char **) PG_GETARG_POINTER(1);
int *tlen = (int *) PG_GETARG_POINTER(2); int *tlen = (int *) PG_GETARG_POINTER(2);
int type = tsearch2_yylex();
*t = token; if ( !TParserGet(p) )
*tlen = tokenlen; PG_RETURN_INT32(0);
PG_RETURN_INT32(type);
*t = p->lexeme;
*tlen = p->lenbytelexeme;
PG_RETURN_INT32(p->type);
} }
PG_FUNCTION_INFO_V1(prsd_end); PG_FUNCTION_INFO_V1(prsd_end);
@ -63,8 +65,8 @@ Datum prsd_end(PG_FUNCTION_ARGS);
Datum Datum
prsd_end(PG_FUNCTION_ARGS) prsd_end(PG_FUNCTION_ARGS)
{ {
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */ TParser *p=(TParser*)PG_GETARG_POINTER(0);
tsearch2_end_parse(); TParserClose(p);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }