Text parser rewritten:
- supports multibyte encodings
- more strict rules for lexemes
- flex isn't used

Add:
- tsquery plainto_tsquery(text)
  Function makes tsquery from plain text.
- &&, ||, !! operations for tsquery, for combining
  a tsquery from its parts: 'foo & bar' || 'asd' => 'foo & bar | asd'
This commit is contained in:
parent
b91e6ed93e
commit
c52795d18a
@ -1,4 +1,4 @@
|
|||||||
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $
|
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
|
||||||
|
|
||||||
MODULE_big = tsearch2
|
MODULE_big = tsearch2
|
||||||
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
|
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
|
||||||
@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
|
|||||||
wparser.o wparser_def.o \
|
wparser.o wparser_def.o \
|
||||||
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
|
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
|
||||||
tsvector_op.o rank.o ts_stat.o \
|
tsvector_op.o rank.o ts_stat.o \
|
||||||
query_util.o query_support.o query_rewrite.o query_gist.o
|
query_util.o query_support.o query_rewrite.o query_gist.o \
|
||||||
|
ts_locale.o
|
||||||
|
|
||||||
SUBDIRS := snowball ispell wordparser
|
SUBDIRS := snowball ispell wordparser
|
||||||
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
|
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
|
||||||
|
@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell
|
|||||||
psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined
|
psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined
|
||||||
DETAIL: Creating a shell type definition.
|
DETAIL: Creating a shell type definition.
|
||||||
psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell
|
psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell
|
||||||
psql:tsearch2.sql:544: NOTICE: type "gtsvector" is not yet defined
|
psql:tsearch2.sql:559: NOTICE: type "gtsvector" is not yet defined
|
||||||
DETAIL: Creating a shell type definition.
|
DETAIL: Creating a shell type definition.
|
||||||
psql:tsearch2.sql:549: NOTICE: argument type gtsvector is only a shell
|
psql:tsearch2.sql:564: NOTICE: argument type gtsvector is only a shell
|
||||||
psql:tsearch2.sql:998: NOTICE: type "gtsq" is not yet defined
|
psql:tsearch2.sql:1054: NOTICE: type "gtsq" is not yet defined
|
||||||
DETAIL: Creating a shell type definition.
|
DETAIL: Creating a shell type definition.
|
||||||
psql:tsearch2.sql:1003: NOTICE: argument type gtsq is only a shell
|
psql:tsearch2.sql:1059: NOTICE: argument type gtsq is only a shell
|
||||||
--tsvector
|
--tsvector
|
||||||
SELECT '1'::tsvector;
|
SELECT '1'::tsvector;
|
||||||
tsvector
|
tsvector
|
||||||
@ -653,7 +653,7 @@ select * from token_type('default');
|
|||||||
11 | lpart_hword | Latin part of hyphenated word
|
11 | lpart_hword | Latin part of hyphenated word
|
||||||
12 | blank | Space symbols
|
12 | blank | Space symbols
|
||||||
13 | tag | HTML Tag
|
13 | tag | HTML Tag
|
||||||
14 | http | HTTP head
|
14 | protocol | Protocol head
|
||||||
15 | hword | Hyphenated word
|
15 | hword | Hyphenated word
|
||||||
16 | lhword | Latin hyphenated word
|
16 | lhword | Latin hyphenated word
|
||||||
17 | nlhword | Non-latin hyphenated word
|
17 | nlhword | Non-latin hyphenated word
|
||||||
@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
|
|||||||
-------+--------------------------------------
|
-------+--------------------------------------
|
||||||
22 | 345
|
22 | 345
|
||||||
12 |
|
12 |
|
||||||
4 | qwe@efd.r
|
1 | qwe
|
||||||
12 |
|
12 | @
|
||||||
|
19 | efd.r
|
||||||
12 | '
|
12 | '
|
||||||
12 |
|
|
||||||
14 | http://
|
14 | http://
|
||||||
6 | www.com
|
6 | www.com
|
||||||
12 | /
|
12 | /
|
||||||
12 |
|
|
||||||
14 | http://
|
14 | http://
|
||||||
5 | aew.werc.ewr/?ad=qwe&dw
|
5 | aew.werc.ewr/?ad=qwe&dw
|
||||||
6 | aew.werc.ewr
|
6 | aew.werc.ewr
|
||||||
@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
|
|||||||
6 | 4aew.werc.ewr
|
6 | 4aew.werc.ewr
|
||||||
12 |
|
12 |
|
||||||
14 | http://
|
14 | http://
|
||||||
5 | 5aew.werc.ewr:8100/?
|
6 | 5aew.werc.ewr:8100
|
||||||
6 | 5aew.werc.ewr
|
12 | /?
|
||||||
18 | :8100/?
|
|
||||||
12 |
|
|
||||||
1 | ad
|
1 | ad
|
||||||
12 | =
|
12 | =
|
||||||
1 | qwe
|
1 | qwe
|
||||||
@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
|
|||||||
1 | dw
|
1 | dw
|
||||||
12 |
|
12 |
|
||||||
5 | 6aew.werc.ewr:8100/?ad=qwe&dw
|
5 | 6aew.werc.ewr:8100/?ad=qwe&dw
|
||||||
6 | 6aew.werc.ewr
|
6 | 6aew.werc.ewr:8100
|
||||||
18 | :8100/?ad=qwe&dw
|
18 | /?ad=qwe&dw
|
||||||
12 |
|
12 |
|
||||||
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
|
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
|
||||||
6 | 7aew.werc.ewr
|
6 | 7aew.werc.ewr:8100
|
||||||
18 | :8100/?ad=qwe&dw=%20%32
|
18 | /?ad=qwe&dw=%20%32
|
||||||
12 |
|
12 |
|
||||||
7 | +4.0e-10
|
7 | +4.0e-10
|
||||||
12 |
|
12 |
|
||||||
@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
|
|||||||
1 | jf
|
1 | jf
|
||||||
12 |
|
12 |
|
||||||
1 | sdjk
|
1 | sdjk
|
||||||
13 | <we hjwer <werrwe>
|
12 | <
|
||||||
|
1 | we
|
||||||
|
12 |
|
||||||
|
1 | hjwer
|
||||||
|
12 |
|
||||||
|
13 | <werrwe>
|
||||||
12 |
|
12 |
|
||||||
3 | ewr1
|
3 | ewr1
|
||||||
12 | >
|
12 | >
|
||||||
12 |
|
|
||||||
3 | ewri2
|
3 | ewri2
|
||||||
12 |
|
12 |
|
||||||
13 | <a href="qwe<qwe>">
|
13 | <a href="qwe<qwe>">
|
||||||
@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
|
|||||||
12 |
|
12 |
|
||||||
19 | /wqe-324/ewr
|
19 | /wqe-324/ewr
|
||||||
12 |
|
12 |
|
||||||
6 | gist.h
|
19 | gist.h
|
||||||
12 |
|
12 |
|
||||||
6 | gist.h.c
|
19 | gist.h.c
|
||||||
12 |
|
12 |
|
||||||
6 | gist.c
|
19 | gist.c
|
||||||
12 | .
|
12 | .
|
||||||
12 |
|
|
||||||
1 | readline
|
1 | readline
|
||||||
12 |
|
12 |
|
||||||
20 | 4.2
|
20 | 4.2
|
||||||
12 |
|
12 |
|
||||||
20 | 4.2
|
20 | 4.2
|
||||||
12 | .
|
12 | .
|
||||||
12 |
|
|
||||||
20 | 4.2
|
20 | 4.2
|
||||||
12 | ,
|
12 | ,
|
||||||
12 |
|
15 | readline-4.2
|
||||||
15 | readline-4
|
|
||||||
11 | readline
|
11 | readline
|
||||||
12 | -
|
12 | -
|
||||||
20 | 4.2
|
20 | 4.2
|
||||||
12 |
|
12 |
|
||||||
15 | readline-4
|
15 | readline-4.2
|
||||||
11 | readline
|
11 | readline
|
||||||
12 | -
|
12 | -
|
||||||
20 | 4.2
|
20 | 4.2
|
||||||
12 | .
|
12 | .
|
||||||
12 |
|
|
||||||
22 | 234
|
22 | 234
|
||||||
12 |
|
12 |
|
||||||
|
|
||||||
13 | <i <b>
|
12 | <
|
||||||
|
1 | i
|
||||||
|
12 |
|
||||||
|
13 | <b>
|
||||||
12 |
|
12 |
|
||||||
1 | wow
|
1 | wow
|
||||||
12 |
|
12 |
|
||||||
12 | <
|
12 | <
|
||||||
12 |
|
|
||||||
1 | jqw
|
1 | jqw
|
||||||
12 |
|
12 |
|
||||||
12 | <
|
12 | <>
|
||||||
12 | >
|
|
||||||
12 |
|
|
||||||
1 | qwerty
|
1 | qwerty
|
||||||
(138 rows)
|
(135 rows)
|
||||||
|
|
||||||
SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
|
SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
|
||||||
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
|
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
|
||||||
<i <b> wow < jqw <> qwerty');
|
<i <b> wow < jqw <> qwerty');
|
||||||
to_tsvector
|
to_tsvector
|
||||||
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
'ad':18 'dw':20 'jf':40 '234':62 '345':1 '4.2':53,54,55,58,61 '455':32 'jqw':64 'qwe':19,28,29,36 'wer':37 'wow':63 'asdf':38 'ewr1':42 'qwer':39 'sdjk':41 '5.005':33 'ewri2':43 'qwqwe':30 'wefjn':47 'gist.c':51 'gist.h':49 'qwerti':65 '234.435':31 ':8100/?':17 'qwe-wer':35 'readlin':52,57,60 'www.com':3 '+4.0e-10':27 'gist.h.c':50 'rewt/ewr':46 'qwe@efd.r':2 'readline-4':56,59 '/?ad=qwe&dw':6,9,13 '/wqe-324/ewr':48 'aew.werc.ewr':5 '1aew.werc.ewr':8 '2aew.werc.ewr':10 '3aew.werc.ewr':12 '4aew.werc.ewr':14 '5aew.werc.ewr':16 '6aew.werc.ewr':22 '7aew.werc.ewr':25 '/usr/local/fff':44 '/awdf/dwqe/4325':45 ':8100/?ad=qwe&dw':23 'teodor@stack.net':34 '5aew.werc.ewr:8100/?':15 ':8100/?ad=qwe&dw=%20%32':26 'aew.werc.ewr/?ad=qwe&dw':4 '1aew.werc.ewr/?ad=qwe&dw':7 '3aew.werc.ewr/?ad=qwe&dw':11 '6aew.werc.ewr:8100/?ad=qwe&dw':21 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':24
|
'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
SELECT length(to_tsvector('default', '345 qw'));
|
SELECT length(to_tsvector('default', '345 qw'));
|
||||||
@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae
|
|||||||
<i <b> wow < jqw <> qwerty'));
|
<i <b> wow < jqw <> qwerty'));
|
||||||
length
|
length
|
||||||
--------
|
--------
|
||||||
53
|
51
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
select to_tsquery('default', 'qwe & sKies ');
|
select to_tsquery('default', 'qwe & sKies ');
|
||||||
@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj');
|
|||||||
'1' & 'fghj'
|
'1' & 'fghj'
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
select plainto_tsquery('default', 'the and z 1))& fghj');
|
||||||
|
plainto_tsquery
|
||||||
|
--------------------
|
||||||
|
'z' & '1' & 'fghj'
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
|
||||||
|
?column?
|
||||||
|
-----------------------
|
||||||
|
'foo' & 'bar' & 'asd'
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
|
||||||
|
?column?
|
||||||
|
------------------------------
|
||||||
|
'foo' & 'bar' | 'asd' & 'fg'
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
|
||||||
|
?column?
|
||||||
|
-----------------------------------
|
||||||
|
'foo' & 'bar' | !( 'asd' & 'fg' )
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
|
||||||
|
?column?
|
||||||
|
----------------------------------
|
||||||
|
'foo' & 'bar' & ( 'asd' | 'fg' )
|
||||||
|
(1 row)
|
||||||
|
|
||||||
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
|
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
|
||||||
?column?
|
?column?
|
||||||
----------
|
----------
|
||||||
|
@ -51,10 +51,20 @@ Datum to_tsquery_name(PG_FUNCTION_ARGS);
|
|||||||
PG_FUNCTION_INFO_V1(to_tsquery_current);
|
PG_FUNCTION_INFO_V1(to_tsquery_current);
|
||||||
Datum to_tsquery_current(PG_FUNCTION_ARGS);
|
Datum to_tsquery_current(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(plainto_tsquery);
|
||||||
|
Datum plainto_tsquery(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(plainto_tsquery_name);
|
||||||
|
Datum plainto_tsquery_name(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(plainto_tsquery_current);
|
||||||
|
Datum plainto_tsquery_current(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
/* parser's states */
|
/* parser's states */
|
||||||
#define WAITOPERAND 1
|
#define WAITOPERAND 1
|
||||||
#define WAITOPERATOR 2
|
#define WAITOPERATOR 2
|
||||||
#define WAITFIRSTOPERAND 3
|
#define WAITFIRSTOPERAND 3
|
||||||
|
#define WAITSINGLEOPERAND 4
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* node of query tree, also used
|
* node of query tree, also used
|
||||||
@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
|
|||||||
else if (*(state->buf) != ' ')
|
else if (*(state->buf) != ' ')
|
||||||
return ERR;
|
return ERR;
|
||||||
break;
|
break;
|
||||||
|
case WAITSINGLEOPERAND:
|
||||||
|
if ( *(state->buf) == '\0' )
|
||||||
|
return END;
|
||||||
|
*strval = state->buf;
|
||||||
|
*lenval = strlen( state->buf );
|
||||||
|
state->buf += strlen( state->buf );
|
||||||
|
state->count++;
|
||||||
|
return VAL;
|
||||||
default:
|
default:
|
||||||
return ERR;
|
return ERR;
|
||||||
break;
|
break;
|
||||||
@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos)
|
|||||||
* input
|
* input
|
||||||
*/
|
*/
|
||||||
static QUERYTYPE *
|
static QUERYTYPE *
|
||||||
queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id)
|
queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain)
|
||||||
{
|
{
|
||||||
QPRS_STATE state;
|
QPRS_STATE state;
|
||||||
int4 i;
|
int4 i;
|
||||||
@ -599,7 +617,7 @@ static QUERYTYPE *
|
|||||||
|
|
||||||
/* init state */
|
/* init state */
|
||||||
state.buf = buf;
|
state.buf = buf;
|
||||||
state.state = WAITFIRSTOPERAND;
|
state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
|
||||||
state.count = 0;
|
state.count = 0;
|
||||||
state.num = 0;
|
state.num = 0;
|
||||||
state.str = NULL;
|
state.str = NULL;
|
||||||
@ -679,7 +697,7 @@ Datum
|
|||||||
tsquery_in(PG_FUNCTION_ARGS)
|
tsquery_in(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
SET_FUNCOID();
|
SET_FUNCOID();
|
||||||
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0));
|
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS)
|
|||||||
str = text2char(in);
|
str = text2char(in);
|
||||||
PG_FREE_IF_COPY(in, 1);
|
PG_FREE_IF_COPY(in, 1);
|
||||||
|
|
||||||
query = queryin(str, pushval_morph, PG_GETARG_INT32(0));
|
query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false);
|
||||||
|
|
||||||
if ( query->size == 0 )
|
if ( query->size == 0 )
|
||||||
PG_RETURN_POINTER(query);
|
PG_RETURN_POINTER(query);
|
||||||
@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS)
|
|||||||
Int32GetDatum(get_currcfg()),
|
Int32GetDatum(get_currcfg()),
|
||||||
PG_GETARG_DATUM(0)));
|
PG_GETARG_DATUM(0)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Datum
|
||||||
|
plainto_tsquery(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
text *in = PG_GETARG_TEXT_P(1);
|
||||||
|
char *str;
|
||||||
|
QUERYTYPE *query;
|
||||||
|
ITEM *res;
|
||||||
|
int4 len;
|
||||||
|
|
||||||
|
SET_FUNCOID();
|
||||||
|
|
||||||
|
str = text2char(in);
|
||||||
|
PG_FREE_IF_COPY(in, 1);
|
||||||
|
|
||||||
|
query = queryin(str, pushval_morph, PG_GETARG_INT32(0), true);
|
||||||
|
|
||||||
|
if ( query->size == 0 )
|
||||||
|
PG_RETURN_POINTER(query);
|
||||||
|
|
||||||
|
res = clean_fakeval_v2(GETQUERY(query), &len);
|
||||||
|
if (!res)
|
||||||
|
{
|
||||||
|
query->len = HDRSIZEQT;
|
||||||
|
query->size = 0;
|
||||||
|
PG_RETURN_POINTER(query);
|
||||||
|
}
|
||||||
|
memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(ITEM));
|
||||||
|
pfree(res);
|
||||||
|
PG_RETURN_POINTER(query);
|
||||||
|
}
|
||||||
|
|
||||||
|
Datum
|
||||||
|
plainto_tsquery_name(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
text *name = PG_GETARG_TEXT_P(0);
|
||||||
|
Datum res;
|
||||||
|
|
||||||
|
SET_FUNCOID();
|
||||||
|
res = DirectFunctionCall2(plainto_tsquery,
|
||||||
|
Int32GetDatum(name2id_cfg(name)),
|
||||||
|
PG_GETARG_DATUM(1));
|
||||||
|
|
||||||
|
PG_FREE_IF_COPY(name, 0);
|
||||||
|
PG_RETURN_DATUM(res);
|
||||||
|
}
|
||||||
|
|
||||||
|
Datum
|
||||||
|
plainto_tsquery_current(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
SET_FUNCOID();
|
||||||
|
PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery,
|
||||||
|
Int32GetDatum(get_currcfg()),
|
||||||
|
PG_GETARG_DATUM(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) {
|
|||||||
PG_RETURN_INT32(nnode);
|
PG_RETURN_INT32(nnode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static QTNode*
|
||||||
|
join_tsqueries(QUERYTYPE *a, QUERYTYPE *b) {
|
||||||
|
QTNode *res=(QTNode*)palloc0( sizeof(QTNode) );
|
||||||
|
|
||||||
|
res->flags |= QTN_NEEDFREE;
|
||||||
|
|
||||||
|
res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
|
||||||
|
res->valnode->type = OPR;
|
||||||
|
|
||||||
|
res->child = (QTNode**)palloc0( sizeof(QTNode*)*2 );
|
||||||
|
res->child[0] = QT2QTN( GETQUERY(b), GETOPERAND(b) );
|
||||||
|
res->child[1] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
|
||||||
|
res->nchild = 2;
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(tsquery_and);
|
||||||
|
Datum tsquery_and(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
|
Datum
|
||||||
|
tsquery_and(PG_FUNCTION_ARGS) {
|
||||||
|
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
|
||||||
|
QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
|
||||||
|
QTNode *res;
|
||||||
|
QUERYTYPE *query;
|
||||||
|
|
||||||
|
if ( a->size == 0 ) {
|
||||||
|
PG_FREE_IF_COPY(a,1);
|
||||||
|
PG_RETURN_POINTER(b);
|
||||||
|
} else if ( b->size == 0 ) {
|
||||||
|
PG_FREE_IF_COPY(b,1);
|
||||||
|
PG_RETURN_POINTER(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
res = join_tsqueries(a, b);
|
||||||
|
|
||||||
|
res->valnode->val = '&';
|
||||||
|
|
||||||
|
query = QTN2QT( res, PlainMemory );
|
||||||
|
|
||||||
|
QTNFree(res);
|
||||||
|
PG_FREE_IF_COPY(a,0);
|
||||||
|
PG_FREE_IF_COPY(b,1);
|
||||||
|
|
||||||
|
PG_RETURN_POINTER(query);
|
||||||
|
}
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(tsquery_or);
|
||||||
|
Datum tsquery_or(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
|
Datum
|
||||||
|
tsquery_or(PG_FUNCTION_ARGS) {
|
||||||
|
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
|
||||||
|
QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
|
||||||
|
QTNode *res;
|
||||||
|
QUERYTYPE *query;
|
||||||
|
|
||||||
|
if ( a->size == 0 ) {
|
||||||
|
PG_FREE_IF_COPY(a,1);
|
||||||
|
PG_RETURN_POINTER(b);
|
||||||
|
} else if ( b->size == 0 ) {
|
||||||
|
PG_FREE_IF_COPY(b,1);
|
||||||
|
PG_RETURN_POINTER(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
res = join_tsqueries(a, b);
|
||||||
|
|
||||||
|
res->valnode->val = '|';
|
||||||
|
|
||||||
|
query = QTN2QT( res, PlainMemory );
|
||||||
|
|
||||||
|
QTNFree(res);
|
||||||
|
PG_FREE_IF_COPY(a,0);
|
||||||
|
PG_FREE_IF_COPY(b,1);
|
||||||
|
|
||||||
|
PG_RETURN_POINTER(query);
|
||||||
|
}
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(tsquery_not);
|
||||||
|
Datum tsquery_not(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
|
Datum
|
||||||
|
tsquery_not(PG_FUNCTION_ARGS) {
|
||||||
|
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
|
||||||
|
QTNode *res;
|
||||||
|
QUERYTYPE *query;
|
||||||
|
|
||||||
|
if ( a->size == 0 )
|
||||||
|
PG_RETURN_POINTER(a);
|
||||||
|
|
||||||
|
res=(QTNode*)palloc0( sizeof(QTNode) );
|
||||||
|
|
||||||
|
res->flags |= QTN_NEEDFREE;
|
||||||
|
|
||||||
|
res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
|
||||||
|
res->valnode->type = OPR;
|
||||||
|
res->valnode->val = '!';
|
||||||
|
|
||||||
|
res->child = (QTNode**)palloc0( sizeof(QTNode*) );
|
||||||
|
res->child[0] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
|
||||||
|
res->nchild = 1;
|
||||||
|
|
||||||
|
query = QTN2QT( res, PlainMemory );
|
||||||
|
|
||||||
|
QTNFree(res);
|
||||||
|
PG_FREE_IF_COPY(a,0);
|
||||||
|
|
||||||
|
PG_RETURN_POINTER(query);
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
|
CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
|
||||||
if ( a->size != b->size ) {
|
if ( a->size != b->size ) {
|
||||||
|
@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)');
|
|||||||
select to_tsquery('default', '(asd&and)|fghj');
|
select to_tsquery('default', '(asd&and)|fghj');
|
||||||
select to_tsquery('default', '(asd&!and)|fghj');
|
select to_tsquery('default', '(asd&!and)|fghj');
|
||||||
select to_tsquery('default', '(the|and&(i&1))&fghj');
|
select to_tsquery('default', '(the|and&(i&1))&fghj');
|
||||||
|
|
||||||
|
select plainto_tsquery('default', 'the and z 1))& fghj');
|
||||||
|
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
|
||||||
|
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
|
||||||
|
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
|
||||||
|
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
|
||||||
|
|
||||||
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
|
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
|
||||||
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
|
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
|
||||||
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';
|
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';
|
||||||
|
61
contrib/tsearch2/ts_locale.c
Normal file
61
contrib/tsearch2/ts_locale.c
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#include "ts_locale.h"
|
||||||
|
|
||||||
|
#include "utils/builtins.h"
|
||||||
|
#include "utils/pg_locale.h"
|
||||||
|
#include "mb/pg_wchar.h"
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(TS_USE_WIDE) && defined(WIN32)
|
||||||
|
|
||||||
|
size_t
|
||||||
|
wchar2char( const char *to, const wchar_t *from, size_t len ) {
|
||||||
|
if (GetDatabaseEncoding() == PG_UTF8) {
|
||||||
|
int r;
|
||||||
|
|
||||||
|
if (len==0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes,
|
||||||
|
NULL, NULL);
|
||||||
|
|
||||||
|
|
||||||
|
if ( r==0 )
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||||
|
errmsg("UTF-16 to UTF-8 translation failed: %lu",
|
||||||
|
GetLastError())));
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
return wcstombs(to, from, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t
|
||||||
|
char2wchar( const wchar_t *to, const char *from, size_t len ) {
|
||||||
|
if (GetDatabaseEncoding() == PG_UTF8) {
|
||||||
|
int r;
|
||||||
|
|
||||||
|
if (len==0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
r = MultiByteToWideChar(CP_UTF8, 0, from, len,
|
||||||
|
to, len);
|
||||||
|
|
||||||
|
if (!r) {
|
||||||
|
pg_verifymbstr(from, len, false);
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||||
|
errmsg("invalid multibyte character for locale"),
|
||||||
|
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert(r <= nbytes);
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
return mbstowcs(to, from, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
38
contrib/tsearch2/ts_locale.h
Normal file
38
contrib/tsearch2/ts_locale.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#ifndef __TSLOCALE_H__
|
||||||
|
#define __TSLOCALE_H__
|
||||||
|
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
|
||||||
|
* declare them in <wchar.h>.
|
||||||
|
*/
|
||||||
|
#ifdef HAVE_WCHAR_H
|
||||||
|
#include <wchar.h>
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_WCTYPE_H
|
||||||
|
#include <wctype.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
|
||||||
|
#define TS_USE_WIDE
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
|
||||||
|
size_t wchar2char( const char *to, const wchar_t *from, size_t len );
|
||||||
|
size_t char2wchar( const wchar_t *to, const char *from, size_t len );
|
||||||
|
|
||||||
|
#else /* WIN32 */
|
||||||
|
|
||||||
|
/* correct mbstowcs */
|
||||||
|
#define char2wchar mbstowcs
|
||||||
|
#define wchar2char wcstombs
|
||||||
|
|
||||||
|
#endif /* WIN32 */
|
||||||
|
|
||||||
|
#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */
|
||||||
|
|
||||||
|
#endif /* __TSLOCALE_H__ */
|
@ -427,6 +427,21 @@ RETURNS tsquery
|
|||||||
AS 'MODULE_PATHNAME','to_tsquery_current'
|
AS 'MODULE_PATHNAME','to_tsquery_current'
|
||||||
LANGUAGE 'c' with (isstrict,iscachable);
|
LANGUAGE 'c' with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE FUNCTION plainto_tsquery(oid, text)
|
||||||
|
RETURNS tsquery
|
||||||
|
AS 'MODULE_PATHNAME'
|
||||||
|
LANGUAGE 'c' with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE FUNCTION plainto_tsquery(text, text)
|
||||||
|
RETURNS tsquery
|
||||||
|
AS 'MODULE_PATHNAME','plainto_tsquery_name'
|
||||||
|
LANGUAGE 'c' with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE FUNCTION plainto_tsquery(text)
|
||||||
|
RETURNS tsquery
|
||||||
|
AS 'MODULE_PATHNAME','plainto_tsquery_current'
|
||||||
|
LANGUAGE 'c' with (isstrict,iscachable);
|
||||||
|
|
||||||
--operations
|
--operations
|
||||||
CREATE FUNCTION exectsq(tsvector, tsquery)
|
CREATE FUNCTION exectsq(tsvector, tsquery)
|
||||||
RETURNS bool
|
RETURNS bool
|
||||||
@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery)
|
|||||||
language 'C'
|
language 'C'
|
||||||
with (isstrict,iscachable);
|
with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery)
|
||||||
|
returns tsquery
|
||||||
|
as 'MODULE_PATHNAME', 'tsquery_and'
|
||||||
|
language 'C'
|
||||||
|
with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE OPERATOR && (
|
||||||
|
LEFTARG = tsquery,
|
||||||
|
RIGHTARG = tsquery,
|
||||||
|
PROCEDURE = tsquery_and,
|
||||||
|
COMMUTATOR = '&&',
|
||||||
|
RESTRICT = contsel,
|
||||||
|
JOIN = contjoinsel
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery)
|
||||||
|
returns tsquery
|
||||||
|
as 'MODULE_PATHNAME', 'tsquery_or'
|
||||||
|
language 'C'
|
||||||
|
with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE OPERATOR || (
|
||||||
|
LEFTARG = tsquery,
|
||||||
|
RIGHTARG = tsquery,
|
||||||
|
PROCEDURE = tsquery_or,
|
||||||
|
COMMUTATOR = '||',
|
||||||
|
RESTRICT = contsel,
|
||||||
|
JOIN = contjoinsel
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION tsquery_not(tsquery)
|
||||||
|
returns tsquery
|
||||||
|
as 'MODULE_PATHNAME', 'tsquery_not'
|
||||||
|
language 'C'
|
||||||
|
with (isstrict,iscachable);
|
||||||
|
|
||||||
|
CREATE OPERATOR !! (
|
||||||
|
RIGHTARG = tsquery,
|
||||||
|
PROCEDURE = tsquery_not
|
||||||
|
);
|
||||||
|
|
||||||
--------------rewrite subsystem
|
--------------rewrite subsystem
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION rewrite(tsquery, text)
|
CREATE OR REPLACE FUNCTION rewrite(tsquery, text)
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
|
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
|
||||||
|
|
||||||
SUBOBJS = parser.o deflex.o
|
SUBOBJS = parser.o deflex.o
|
||||||
|
|
||||||
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
|
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
|
||||||
|
|
||||||
PG_CPPFLAGS = -I$(srcdir)/..
|
PG_CPPFLAGS = -I$(srcdir)/..
|
||||||
|
|
||||||
@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
|
|||||||
|
|
||||||
all: SUBSYS.o
|
all: SUBSYS.o
|
||||||
|
|
||||||
parser.c: parser.l
|
|
||||||
ifdef FLEX
|
|
||||||
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
|
|
||||||
else
|
|
||||||
@$(missing) flex $< $@
|
|
||||||
endif
|
|
||||||
|
|
||||||
SUBSYS.o: $(SUBOBJS)
|
SUBSYS.o: $(SUBOBJS)
|
||||||
$(LD) $(LDREL) $(LDOUT) $@ $^
|
$(LD) $(LDREL) $(LDOUT) $@ $^
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@ const char *lex_descr[] = {
|
|||||||
"Latin part of hyphenated word",
|
"Latin part of hyphenated word",
|
||||||
"Space symbols",
|
"Space symbols",
|
||||||
"HTML Tag",
|
"HTML Tag",
|
||||||
"HTTP head",
|
"Protocol head",
|
||||||
"Hyphenated word",
|
"Hyphenated word",
|
||||||
"Latin hyphenated word",
|
"Latin hyphenated word",
|
||||||
"Non-latin hyphenated word",
|
"Non-latin hyphenated word",
|
||||||
@ -42,7 +42,7 @@ const char *tok_alias[] = {
|
|||||||
"lpart_hword",
|
"lpart_hword",
|
||||||
"blank",
|
"blank",
|
||||||
"tag",
|
"tag",
|
||||||
"http",
|
"protocol",
|
||||||
"hword",
|
"hword",
|
||||||
"lhword",
|
"lhword",
|
||||||
"nlhword",
|
"nlhword",
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
#define LATPARTHYPHENWORD 11
|
#define LATPARTHYPHENWORD 11
|
||||||
#define SPACE 12
|
#define SPACE 12
|
||||||
#define TAG 13
|
#define TAG 13
|
||||||
#define HTTP 14
|
#define PROTOCOL 14
|
||||||
#define HYPHENWORD 15
|
#define HYPHENWORD 15
|
||||||
#define LATHYPHENWORD 16
|
#define LATHYPHENWORD 16
|
||||||
#define CYRHYPHENWORD 17
|
#define CYRHYPHENWORD 17
|
||||||
|
1028
contrib/tsearch2/wordparser/parser.c
Normal file
1028
contrib/tsearch2/wordparser/parser.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,147 @@
|
|||||||
#ifndef __PARSER_H__
|
#ifndef __PARSER_H__
|
||||||
#define __PARSER_H__
|
#define __PARSER_H__
|
||||||
|
|
||||||
extern char *token;
|
#include <ctype.h>
|
||||||
extern int tokenlen;
|
#include <limits.h>
|
||||||
int tsearch2_yylex(void);
|
#include "ts_locale.h"
|
||||||
void tsearch2_start_parse_str(char *, int);
|
|
||||||
void tsearch2_end_parse(void);
|
typedef enum {
|
||||||
|
TPS_Base = 0,
|
||||||
|
TPS_InUWord,
|
||||||
|
TPS_InLatWord,
|
||||||
|
TPS_InCyrWord,
|
||||||
|
TPS_InUnsignedInt,
|
||||||
|
TPS_InSignedIntFirst,
|
||||||
|
TPS_InSignedInt,
|
||||||
|
TPS_InSpace,
|
||||||
|
TPS_InUDecimalFirst,
|
||||||
|
TPS_InUDecimal,
|
||||||
|
TPS_InDecimalFirst,
|
||||||
|
TPS_InDecimal,
|
||||||
|
TPS_InVersionFirst,
|
||||||
|
TPS_InVersion,
|
||||||
|
TPS_InMantissaFirst,
|
||||||
|
TPS_InMantissaSign,
|
||||||
|
TPS_InMantissa,
|
||||||
|
TPS_InHTMLEntityFirst,
|
||||||
|
TPS_InHTMLEntity,
|
||||||
|
TPS_InHTMLEntityNumFirst,
|
||||||
|
TPS_InHTMLEntityNum,
|
||||||
|
TPS_InHTMLEntityEnd,
|
||||||
|
TPS_InTagFirst,
|
||||||
|
TPS_InTagCloseFirst,
|
||||||
|
TPS_InTag,
|
||||||
|
TPS_InTagEscapeK,
|
||||||
|
TPS_InTagEscapeKK,
|
||||||
|
TPS_InTagBackSleshed,
|
||||||
|
TPS_InTagEnd,
|
||||||
|
TPS_InCommentFirst,
|
||||||
|
TPS_InCommentLast,
|
||||||
|
TPS_InComment,
|
||||||
|
TPS_InCloseCommentFirst,
|
||||||
|
TPS_InCloseCommentLast,
|
||||||
|
TPS_InCommentEnd,
|
||||||
|
TPS_InHostFirstDomen,
|
||||||
|
TPS_InHostDomenSecond,
|
||||||
|
TPS_InHostDomen,
|
||||||
|
TPS_InPortFirst,
|
||||||
|
TPS_InPort,
|
||||||
|
TPS_InHostFirstAN,
|
||||||
|
TPS_InHost,
|
||||||
|
TPS_InEmail,
|
||||||
|
TPS_InFileFirst,
|
||||||
|
TPS_InFile,
|
||||||
|
TPS_InFileNext,
|
||||||
|
TPS_InURIFirst,
|
||||||
|
TPS_InURIStart,
|
||||||
|
TPS_InURI,
|
||||||
|
TPS_InFURL,
|
||||||
|
TPS_InProtocolFirst,
|
||||||
|
TPS_InProtocolSecond,
|
||||||
|
TPS_InProtocolEnd,
|
||||||
|
TPS_InHyphenLatWordFirst,
|
||||||
|
TPS_InHyphenLatWord,
|
||||||
|
TPS_InHyphenCyrWordFirst,
|
||||||
|
TPS_InHyphenCyrWord,
|
||||||
|
TPS_InHyphenUWordFirst,
|
||||||
|
TPS_InHyphenUWord,
|
||||||
|
TPS_InHyphenValueFirst,
|
||||||
|
TPS_InHyphenValue,
|
||||||
|
TPS_InHyphenValueExact,
|
||||||
|
TPS_InParseHyphen,
|
||||||
|
TPS_InParseHyphenHyphen,
|
||||||
|
TPS_InHyphenCyrWordPart,
|
||||||
|
TPS_InHyphenLatWordPart,
|
||||||
|
TPS_InHyphenUWordPart,
|
||||||
|
TPS_InHyphenUnsignedInt,
|
||||||
|
TPS_InHDecimalPartFirst,
|
||||||
|
TPS_InHDecimalPart,
|
||||||
|
TPS_InHVersionPartFirst,
|
||||||
|
TPS_InHVersionPart,
|
||||||
|
TPS_Null /* last state (fake value) */
|
||||||
|
} TParserState;
|
||||||
|
|
||||||
|
/* forward declaration */
|
||||||
|
struct TParser;
|
||||||
|
|
||||||
|
|
||||||
|
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */
|
||||||
|
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
TParserCharTest isclass;
|
||||||
|
char c;
|
||||||
|
uint16 flags;
|
||||||
|
TParserState tostate;
|
||||||
|
int type;
|
||||||
|
TParserSpecial special;
|
||||||
|
} TParserStateActionItem;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
TParserState state;
|
||||||
|
TParserStateActionItem *action;
|
||||||
|
} TParserStateAction;
|
||||||
|
|
||||||
|
typedef struct TParserPosition {
|
||||||
|
int posbyte; /* position of parser in bytes */
|
||||||
|
int poschar; /* position of parser in characters */
|
||||||
|
int charlen; /* length of current char */
|
||||||
|
int lenbytelexeme;
|
||||||
|
int lencharlexeme;
|
||||||
|
TParserState state;
|
||||||
|
struct TParserPosition *prev;
|
||||||
|
int flags;
|
||||||
|
TParserStateActionItem *pushedAtAction;
|
||||||
|
} TParserPosition;
|
||||||
|
|
||||||
|
typedef struct TParser {
|
||||||
|
/* string and position information */
|
||||||
|
char *str; /* multibyte string */
|
||||||
|
int lenstr; /* length of mbstring */
|
||||||
|
wchar_t *wstr; /* wide character string */
|
||||||
|
int lenwstr; /* length of wstring */
|
||||||
|
|
||||||
|
/* State of parse */
|
||||||
|
int charmaxlen;
|
||||||
|
bool usewide;
|
||||||
|
TParserPosition *state;
|
||||||
|
bool ignore;
|
||||||
|
bool wanthost;
|
||||||
|
|
||||||
|
/* silly char */
|
||||||
|
char c;
|
||||||
|
|
||||||
|
/* out */
|
||||||
|
char *lexeme;
|
||||||
|
int lenbytelexeme;
|
||||||
|
int lencharlexeme;
|
||||||
|
int type;
|
||||||
|
|
||||||
|
} TParser;
|
||||||
|
|
||||||
|
|
||||||
|
TParser* TParserInit( char *, int );
|
||||||
|
bool TParserGet( TParser* );
|
||||||
|
void TParserClose( TParser* );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,346 +0,0 @@
|
|||||||
%{
|
|
||||||
#include "postgres.h"
|
|
||||||
|
|
||||||
#include "deflex.h"
|
|
||||||
#include "parser.h"
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
/* Avoid exit() on fatal scanner errors */
|
|
||||||
#undef fprintf
|
|
||||||
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
|
|
||||||
|
|
||||||
char *token = NULL; /* pointer to token */
|
|
||||||
int tokenlen;
|
|
||||||
static char *s = NULL; /* to return WHOLE hyphenated-word */
|
|
||||||
|
|
||||||
YY_BUFFER_STATE buf = NULL; /* buffer to parse; needed for parsing from a string */
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int tlen;
|
|
||||||
int clen;
|
|
||||||
char *str;
|
|
||||||
} TagStorage;
|
|
||||||
|
|
||||||
static TagStorage ts={0,0,NULL};
|
|
||||||
|
|
||||||
static void
|
|
||||||
addTag(void)
|
|
||||||
{
|
|
||||||
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
|
|
||||||
ts.tlen*=2;
|
|
||||||
ts.str=realloc(ts.str,ts.tlen);
|
|
||||||
if (!ts.str)
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
||||||
errmsg("out of memory")));
|
|
||||||
}
|
|
||||||
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
|
|
||||||
ts.clen+=tsearch2_yyleng;
|
|
||||||
ts.str[ts.clen]='\0';
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
startTag(void)
|
|
||||||
{
|
|
||||||
if ( ts.str==NULL ) {
|
|
||||||
ts.tlen=tsearch2_yyleng+1;
|
|
||||||
ts.str=malloc(ts.tlen);
|
|
||||||
if (!ts.str)
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
||||||
errmsg("out of memory")));
|
|
||||||
}
|
|
||||||
ts.clen=0;
|
|
||||||
ts.str[0]='\0';
|
|
||||||
addTag();
|
|
||||||
}
|
|
||||||
|
|
||||||
%}
|
|
||||||
|
|
||||||
%option 8bit
|
|
||||||
%option never-interactive
|
|
||||||
%option nodefault
|
|
||||||
%option nounput
|
|
||||||
%option noyywrap
|
|
||||||
|
|
||||||
/* parser's state for parsing hyphenated-word */
|
|
||||||
%x DELIM
|
|
||||||
/* parser's state for parsing URL*/
|
|
||||||
%x URL
|
|
||||||
%x SERVER
|
|
||||||
|
|
||||||
/* parser's state for parsing TAGS */
|
|
||||||
%x INTAG
|
|
||||||
%x QINTAG
|
|
||||||
%x INCOMMENT
|
|
||||||
%x INSCRIPT
|
|
||||||
|
|
||||||
/* cyrillic koi8 char */
|
|
||||||
CYRALNUM [0-9\200-\377]
|
|
||||||
CYRALPHA [\200-\377]
|
|
||||||
ALPHA [a-zA-Z\200-\377]
|
|
||||||
ALNUM [0-9a-zA-Z\200-\377]
|
|
||||||
|
|
||||||
|
|
||||||
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
|
|
||||||
URI [-_[:alnum:]/%,\.;=&?#]+
|
|
||||||
|
|
||||||
%%
|
|
||||||
|
|
||||||
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
|
|
||||||
|
|
||||||
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
|
|
||||||
BEGIN INITIAL;
|
|
||||||
addTag();
|
|
||||||
token = ts.str;
|
|
||||||
tokenlen = ts.clen;
|
|
||||||
return TAG;
|
|
||||||
}
|
|
||||||
|
|
||||||
"<!--" { BEGIN INCOMMENT; startTag(); }
|
|
||||||
|
|
||||||
<INCOMMENT>"-->" {
|
|
||||||
BEGIN INITIAL;
|
|
||||||
addTag();
|
|
||||||
token = ts.str;
|
|
||||||
tokenlen = ts.clen;
|
|
||||||
return TAG;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
|
|
||||||
|
|
||||||
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
|
|
||||||
|
|
||||||
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
|
|
||||||
|
|
||||||
<QINTAG>"\\\"" { addTag(); }
|
|
||||||
|
|
||||||
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
|
|
||||||
|
|
||||||
<INTAG>">" {
|
|
||||||
BEGIN INITIAL;
|
|
||||||
addTag();
|
|
||||||
token = ts.str;
|
|
||||||
tokenlen = ts.clen;
|
|
||||||
return TAG;
|
|
||||||
}
|
|
||||||
|
|
||||||
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
|
|
||||||
|
|
||||||
\&(quot|amp|nbsp|lt|gt)\; {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return HTMLENTITY;
|
|
||||||
}
|
|
||||||
|
|
||||||
\&\#[0-9][0-9]?[0-9]?\; {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return HTMLENTITY;
|
|
||||||
}
|
|
||||||
|
|
||||||
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return EMAIL;
|
|
||||||
}
|
|
||||||
|
|
||||||
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return SCIENTIFIC;
|
|
||||||
}
|
|
||||||
|
|
||||||
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return VERSIONNUMBER;
|
|
||||||
}
|
|
||||||
|
|
||||||
[+-]?[0-9]+\.[0-9]+ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return DECIMAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
[+-][0-9]+ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return SIGNEDINT;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM,INITIAL>[0-9]+ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return UNSIGNEDINT;
|
|
||||||
}
|
|
||||||
|
|
||||||
http"://" {
|
|
||||||
BEGIN URL;
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return HTTP;
|
|
||||||
}
|
|
||||||
|
|
||||||
ftp"://" {
|
|
||||||
BEGIN URL;
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return HTTP;
|
|
||||||
}
|
|
||||||
|
|
||||||
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
|
|
||||||
BEGIN SERVER;
|
|
||||||
if (s) { free(s); s=NULL; }
|
|
||||||
s = strdup( tsearch2_yytext );
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
yyless( 0 );
|
|
||||||
token = s;
|
|
||||||
return FURL;
|
|
||||||
}
|
|
||||||
|
|
||||||
<SERVER,URL,INITIAL>{HOSTNAME} {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return HOST;
|
|
||||||
}
|
|
||||||
|
|
||||||
<SERVER>[/:]{URI} {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return URI;
|
|
||||||
}
|
|
||||||
|
|
||||||
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return FILEPATH;
|
|
||||||
}
|
|
||||||
|
|
||||||
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
|
|
||||||
BEGIN DELIM;
|
|
||||||
if (s) { free(s); s=NULL; }
|
|
||||||
s = strdup( tsearch2_yytext );
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
yyless( 0 );
|
|
||||||
token = s;
|
|
||||||
return CYRHYPHENWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
|
|
||||||
BEGIN DELIM;
|
|
||||||
if (s) { free(s); s=NULL; }
|
|
||||||
s = strdup( tsearch2_yytext );
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
yyless( 0 );
|
|
||||||
token = s;
|
|
||||||
return LATHYPHENWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
|
|
||||||
BEGIN DELIM;
|
|
||||||
if (s) { free(s); s=NULL; }
|
|
||||||
s = strdup( tsearch2_yytext );
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
yyless( 0 );
|
|
||||||
token = s;
|
|
||||||
return HYPHENWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return VERSIONNUMBER;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM>\+?[0-9]+\.[0-9]+ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return DECIMAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return CYRPARTHYPHENWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return LATPARTHYPHENWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM>{ALNUM}+ /* one word in composite-word */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return PARTHYPHENWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM>- {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return SPACE;
|
|
||||||
}
|
|
||||||
|
|
||||||
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
|
|
||||||
BEGIN INITIAL;
|
|
||||||
yyless( 0 );
|
|
||||||
}
|
|
||||||
|
|
||||||
{CYRALPHA}+ /* normal word */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return CYRWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
[[:alpha:]]+ /* normal word */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return LATWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
{ALNUM}+ /* normal word */ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return UWORD;
|
|
||||||
}
|
|
||||||
|
|
||||||
[ \r\n\t]+ {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return SPACE;
|
|
||||||
}
|
|
||||||
|
|
||||||
. {
|
|
||||||
token = tsearch2_yytext;
|
|
||||||
tokenlen = tsearch2_yyleng;
|
|
||||||
return SPACE;
|
|
||||||
}
|
|
||||||
|
|
||||||
%%
|
|
||||||
|
|
||||||
/* clearing after parsing from string */
|
|
||||||
void
|
|
||||||
tsearch2_end_parse(void)
|
|
||||||
{
|
|
||||||
if (s)
|
|
||||||
{
|
|
||||||
free(s);
|
|
||||||
s = NULL;
|
|
||||||
}
|
|
||||||
tsearch2_yy_delete_buffer( buf );
|
|
||||||
buf = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* start parse from string */
|
|
||||||
void
|
|
||||||
tsearch2_start_parse_str(char* str, int limit)
|
|
||||||
{
|
|
||||||
if (buf)
|
|
||||||
tsearch2_end_parse();
|
|
||||||
buf = tsearch2_yy_scan_bytes( str, limit );
|
|
||||||
tsearch2_yy_switch_to_buffer( buf );
|
|
||||||
BEGIN INITIAL;
|
|
||||||
}
|
|
@ -39,8 +39,7 @@ Datum prsd_start(PG_FUNCTION_ARGS);
|
|||||||
Datum
|
Datum
|
||||||
prsd_start(PG_FUNCTION_ARGS)
|
prsd_start(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
|
PG_RETURN_POINTER(TParserInit( (char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
|
||||||
PG_RETURN_POINTER(NULL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(prsd_getlexeme);
|
PG_FUNCTION_INFO_V1(prsd_getlexeme);
|
||||||
@ -48,14 +47,17 @@ Datum prsd_getlexeme(PG_FUNCTION_ARGS);
|
|||||||
Datum
|
Datum
|
||||||
prsd_getlexeme(PG_FUNCTION_ARGS)
|
prsd_getlexeme(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
|
TParser *p=(TParser*)PG_GETARG_POINTER(0);
|
||||||
char **t = (char **) PG_GETARG_POINTER(1);
|
char **t = (char **) PG_GETARG_POINTER(1);
|
||||||
int *tlen = (int *) PG_GETARG_POINTER(2);
|
int *tlen = (int *) PG_GETARG_POINTER(2);
|
||||||
int type = tsearch2_yylex();
|
|
||||||
|
|
||||||
*t = token;
|
if ( !TParserGet(p) )
|
||||||
*tlen = tokenlen;
|
PG_RETURN_INT32(0);
|
||||||
PG_RETURN_INT32(type);
|
|
||||||
|
*t = p->lexeme;
|
||||||
|
*tlen = p->lenbytelexeme;
|
||||||
|
|
||||||
|
PG_RETURN_INT32(p->type);
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(prsd_end);
|
PG_FUNCTION_INFO_V1(prsd_end);
|
||||||
@ -63,8 +65,8 @@ Datum prsd_end(PG_FUNCTION_ARGS);
|
|||||||
Datum
|
Datum
|
||||||
prsd_end(PG_FUNCTION_ARGS)
|
prsd_end(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
|
TParser *p=(TParser*)PG_GETARG_POINTER(0);
|
||||||
tsearch2_end_parse();
|
TParserClose(p);
|
||||||
PG_RETURN_VOID();
|
PG_RETURN_VOID();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user