mirror of https://github.com/postgres/postgres
Stat function now can show statistics per weight of lexemes
This commit is contained in:
parent
1b9ef0025d
commit
a6ea6457fa
|
@ -782,6 +782,7 @@ select rank(' a:1 s:2 d g'::tsvector, 'a & s');
|
|||
(1 row)
|
||||
|
||||
insert into test_tsvector (t) values ('foo bar foo the over foo qq bar');
|
||||
drop trigger tsvectorupdate on test_tsvector;
|
||||
select * from stat('select a from test_tsvector') order by ndoc desc, nentry desc, word;
|
||||
word | ndoc | nentry
|
||||
-----------+------+--------
|
||||
|
@ -1933,6 +1934,55 @@ select * from stat('select a from test_tsvector') order by ndoc desc, nentry des
|
|||
qwerti | 1 | 1
|
||||
(1146 rows)
|
||||
|
||||
insert into test_tsvector values ('1', 'a:1a,2,3b b:5a,6a,7c,8');
|
||||
insert into test_tsvector values ('1', 'a:1a,2,3c b:5a,6b,7c,8b');
|
||||
select * from stat('select a from test_tsvector','a') order by ndoc desc, nentry desc, word;
|
||||
word | ndoc | nentry
|
||||
------+------+--------
|
||||
b | 2 | 3
|
||||
a | 2 | 2
|
||||
(2 rows)
|
||||
|
||||
select * from stat('select a from test_tsvector','b') order by ndoc desc, nentry desc, word;
|
||||
word | ndoc | nentry
|
||||
------+------+--------
|
||||
b | 1 | 2
|
||||
a | 1 | 1
|
||||
(2 rows)
|
||||
|
||||
select * from stat('select a from test_tsvector','c') order by ndoc desc, nentry desc, word;
|
||||
word | ndoc | nentry
|
||||
------+------+--------
|
||||
b | 2 | 2
|
||||
a | 1 | 1
|
||||
(2 rows)
|
||||
|
||||
select * from stat('select a from test_tsvector','d') order by ndoc desc, nentry desc, word;
|
||||
word | ndoc | nentry
|
||||
-----------+------+--------
|
||||
a | 2 | 2
|
||||
copyright | 2 | 2
|
||||
foo | 1 | 3
|
||||
bar | 1 | 2
|
||||
345 | 1 | 1
|
||||
b | 1 | 1
|
||||
qq | 1 | 1
|
||||
qwerti | 1 | 1
|
||||
(8 rows)
|
||||
|
||||
select * from stat('select a from test_tsvector','ad') order by ndoc desc, nentry desc, word;
|
||||
word | ndoc | nentry
|
||||
-----------+------+--------
|
||||
a | 2 | 4
|
||||
b | 2 | 4
|
||||
copyright | 2 | 2
|
||||
foo | 1 | 3
|
||||
bar | 1 | 2
|
||||
345 | 1 | 1
|
||||
qq | 1 | 1
|
||||
qwerti | 1 | 1
|
||||
(8 rows)
|
||||
|
||||
select reset_tsearch();
|
||||
NOTICE: TSearch cache cleaned
|
||||
reset_tsearch
|
||||
|
@ -2092,7 +2142,6 @@ select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
|
|||
(5 rows)
|
||||
|
||||
--check ordering
|
||||
drop trigger tsvectorupdate on test_tsvector;
|
||||
insert into test_tsvector values (null, null);
|
||||
select a is null, a from test_tsvector order by a;
|
||||
?column? | a
|
||||
|
@ -2108,6 +2157,8 @@ select a is null, a from test_tsvector order by a;
|
|||
f |
|
||||
f | '345':1 'qwerti':2 'copyright':3
|
||||
f | 'qq':7 'bar':2,8 'foo':1,3,6 'copyright':9
|
||||
f | 'a':1A,2,3C 'b':5A,6B,7C,8B
|
||||
f | 'a':1A,2,3B 'b':5A,6A,7C,8
|
||||
f | '7w' 'ch' 'd7' 'eo' 'gw' 'i4' 'lq' 'o6' 'qt' 'y0'
|
||||
f | 'ar' 'ei' 'kq' 'ma' 'qa' 'qh' 'qq' 'qz' 'rx' 'st'
|
||||
f | 'gs' 'i6' 'i9' 'j2' 'l0' 'oq' 'qx' 'sc' 'xe' 'yu'
|
||||
|
@ -2609,5 +2660,5 @@ select a is null, a from test_tsvector order by a;
|
|||
f | '1b' '42' 'a7' 'ab' 'ak' 'ap' 'at' 'av' 'ay' 'b0' 'b9' 'bb' 'bp' 'bu' 'bz' 'cq' 'da' 'de' 'dn' 'e0' 'eb' 'ef' 'eg' 'ek' 'eq' 'er' 'eu' 'ey' 'fn' 'ft' 'gg' 'h4' 'hk' 'hl' 'i7' 'ig' 'ik' 'ip' 'ir' 'iu' 'iw' 'jr' 'jw' 'jx' 'kg' 'lc' 'lg' 'm0' 'na' 'np' 'om' 'on' 'oz' 'pg' 'pn' 'ps' 'pt' 'pz' 'q3' 'q6' 'qa' 'qb' 'ql' 'qq' 'qt' 'qv' 'qw' 'qy' 'r8' 'rf' 'ri' 'rk' 'rl' 'rw' 'sg' 'si' 'sp' 'sw' 'ta' 'th' 'ua' 'uj' 'uu' 'uv' 'uz' 'vj' 'vk' 'vm' 'wc' 'wf' 'wh' 'wn' 'wo' 'ww' 'xb' 'xk' 'xt' 'xw' 'y7' 'ye' 'yl' 'yt' 'yw' 'z4' 'z7' 'zc' 'zw'
|
||||
f | '1h' '3s' 'ab' 'ae' 'ax' 'b1' 'bz' 'cy' 'dk' 'dq' 'ds' 'du' 'e8' 'ef' 'ej' 'ek' 'ex' 'f1' 'fe' 'ff' 'fn' 'fo' 'ft' 'fx' 'ge' 'go' 'gz' 'h6' 'hz' 'i2' 'iv' 'iy' 'j5' 'j6' 'ke' 'kf' 'lh' 'lr' 'mc' 'mj' 'na' 'ng' 'oh' 'om' 'oy' 'p2' 'pi' 'pk' 'py' 'q3' 'qb' 'qc' 'qg' 'qn' 'qo' 'qq' 'qu' 'qw' 'qx' 'qy' 'qz' 'r1' 'rk' 'rl' 'rq' 'rs' 'rt' 'ry' 'rz' 'sk' 'sl' 'so' 't9' 'td' 'te' 'tn' 'tw' 'tz' 'ud' 'uk' 'uo' 'uq' 'uw' 'ux' 'uy' 'v1' 'vg' 'vq' 'w4' 'w9' 'wa' 'wg' 'wj' 'wm' 'wo' 'wr' 'ww' 'wy' 'xf' 'xg' 'y9' 'yh' 'yi' 'yk' 'ym' 'yq' 'yv' 'zm'
|
||||
t |
|
||||
(512 rows)
|
||||
(514 rows)
|
||||
|
||||
|
|
|
@ -150,7 +150,15 @@ select rank(' a:1 s:2B d g'::tsvector, 'a & s');
|
|||
select rank(' a:1 s:2 d g'::tsvector, 'a & s');
|
||||
|
||||
insert into test_tsvector (t) values ('foo bar foo the over foo qq bar');
|
||||
drop trigger tsvectorupdate on test_tsvector;
|
||||
select * from stat('select a from test_tsvector') order by ndoc desc, nentry desc, word;
|
||||
insert into test_tsvector values ('1', 'a:1a,2,3b b:5a,6a,7c,8');
|
||||
insert into test_tsvector values ('1', 'a:1a,2,3c b:5a,6b,7c,8b');
|
||||
select * from stat('select a from test_tsvector','a') order by ndoc desc, nentry desc, word;
|
||||
select * from stat('select a from test_tsvector','b') order by ndoc desc, nentry desc, word;
|
||||
select * from stat('select a from test_tsvector','c') order by ndoc desc, nentry desc, word;
|
||||
select * from stat('select a from test_tsvector','d') order by ndoc desc, nentry desc, word;
|
||||
select * from stat('select a from test_tsvector','ad') order by ndoc desc, nentry desc, word;
|
||||
|
||||
select reset_tsearch();
|
||||
select to_tsquery('default', 'skies & books');
|
||||
|
@ -249,7 +257,6 @@ Upon a woman s face. E. J. Pratt (1882 1964)
|
|||
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
|
||||
|
||||
--check ordering
|
||||
drop trigger tsvectorupdate on test_tsvector;
|
||||
insert into test_tsvector values (null, null);
|
||||
select a is null, a from test_tsvector order by a;
|
||||
|
||||
|
|
|
@ -15,9 +15,10 @@ Datum
|
|||
tsstat_in(PG_FUNCTION_ARGS)
|
||||
{
|
||||
tsstat *stat = palloc(STATHDRSIZE);
|
||||
|
||||
|
||||
stat->len = STATHDRSIZE;
|
||||
stat->size = 0;
|
||||
stat->weight = 0;
|
||||
PG_RETURN_POINTER(stat);
|
||||
}
|
||||
|
||||
|
@ -32,6 +33,20 @@ tsstat_out(PG_FUNCTION_ARGS)
|
|||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
static int
|
||||
check_weight(tsvector *txt, WordEntry *wptr, int8 weight) {
|
||||
int len = POSDATALEN(txt, wptr);
|
||||
int num=0;
|
||||
WordEntryPos *ptr = POSDATAPTR(txt, wptr);
|
||||
|
||||
while (len--) {
|
||||
if (weight & (1 << ptr->weight))
|
||||
num++;
|
||||
ptr++;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
static WordEntry **
|
||||
SEI_realloc(WordEntry ** in, uint32 *len)
|
||||
{
|
||||
|
@ -83,6 +98,7 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
|
|||
totallen = CALCSTATSIZE(nentry, slen);
|
||||
newstat = palloc(totallen);
|
||||
newstat->len = totallen;
|
||||
newstat->weight = stat->weight;
|
||||
newstat->size = nentry;
|
||||
|
||||
memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat));
|
||||
|
@ -107,8 +123,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
|
|||
}
|
||||
nptr = STATPTR(newstat) + (StopLow - STATPTR(stat));
|
||||
memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat)));
|
||||
nptr->nentry = POSDATALEN(txt, *ptr);
|
||||
if (nptr->nentry == 0)
|
||||
if ( (*ptr)->haspos ) {
|
||||
nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
|
||||
} else
|
||||
nptr->nentry = 1;
|
||||
nptr->ndoc = 1;
|
||||
nptr->len = (*ptr)->len;
|
||||
|
@ -127,8 +144,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
|
|||
}
|
||||
else
|
||||
{
|
||||
nptr->nentry = POSDATALEN(txt, *ptr);
|
||||
if (nptr->nentry == 0)
|
||||
if ( (*ptr)->haspos ) {
|
||||
nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
|
||||
} else
|
||||
nptr->nentry = 1;
|
||||
nptr->ndoc = 1;
|
||||
nptr->len = (*ptr)->len;
|
||||
|
@ -144,8 +162,9 @@ formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
|
|||
|
||||
while (ptr - entry < len)
|
||||
{
|
||||
nptr->nentry = POSDATALEN(txt, *ptr);
|
||||
if (nptr->nentry == 0)
|
||||
if ( (*ptr)->haspos ) {
|
||||
nptr->nentry = ( stat->weight ) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
|
||||
} else
|
||||
nptr->nentry = 1;
|
||||
nptr->ndoc = 1;
|
||||
nptr->len = (*ptr)->len;
|
||||
|
@ -173,12 +192,14 @@ ts_accum(PG_FUNCTION_ARGS)
|
|||
cur = 0;
|
||||
StatEntry *sptr;
|
||||
WordEntry *wptr;
|
||||
int n=0;
|
||||
|
||||
if (stat == NULL || PG_ARGISNULL(0))
|
||||
{ /* Init in first */
|
||||
stat = palloc(STATHDRSIZE);
|
||||
stat->len = STATHDRSIZE;
|
||||
stat->size = 0;
|
||||
stat->weight = 0;
|
||||
}
|
||||
|
||||
/* simple check of correctness */
|
||||
|
@ -201,32 +222,37 @@ ts_accum(PG_FUNCTION_ARGS)
|
|||
sptr++;
|
||||
else if (cmp == 0)
|
||||
{
|
||||
int n = POSDATALEN(txt, wptr);
|
||||
|
||||
if (n == 0)
|
||||
n = 1;
|
||||
sptr->ndoc++;
|
||||
sptr->nentry += n;
|
||||
if ( stat->weight == 0 ) {
|
||||
sptr->ndoc++;
|
||||
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
|
||||
} else if ( wptr->haspos && (n=check_weight(txt, wptr, stat->weight))!=0 ) {
|
||||
sptr->ndoc++;
|
||||
sptr->nentry += n;
|
||||
}
|
||||
sptr++;
|
||||
wptr++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cur == len)
|
||||
newentry = SEI_realloc(newentry, &len);
|
||||
newentry[cur] = wptr;
|
||||
if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) {
|
||||
if (cur == len)
|
||||
newentry = SEI_realloc(newentry, &len);
|
||||
newentry[cur] = wptr;
|
||||
cur++;
|
||||
}
|
||||
wptr++;
|
||||
cur++;
|
||||
}
|
||||
}
|
||||
|
||||
while (wptr - ARRPTR(txt) < txt->size)
|
||||
{
|
||||
if (cur == len)
|
||||
newentry = SEI_realloc(newentry, &len);
|
||||
newentry[cur] = wptr;
|
||||
if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) {
|
||||
if (cur == len)
|
||||
newentry = SEI_realloc(newentry, &len);
|
||||
newentry[cur] = wptr;
|
||||
cur++;
|
||||
}
|
||||
wptr++;
|
||||
cur++;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -243,12 +269,13 @@ ts_accum(PG_FUNCTION_ARGS)
|
|||
cmp = compareStatWord(sptr, wptr, stat, txt);
|
||||
if (cmp == 0)
|
||||
{
|
||||
int n = POSDATALEN(txt, wptr);
|
||||
|
||||
if (n == 0)
|
||||
n = 1;
|
||||
sptr->ndoc++;
|
||||
sptr->nentry += n;
|
||||
if ( stat->weight == 0 ) {
|
||||
sptr->ndoc++;
|
||||
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
|
||||
} else if ( wptr->haspos && (n=check_weight(txt, wptr, stat->weight))!=0 ) {
|
||||
sptr->ndoc++;
|
||||
sptr->nentry += n;
|
||||
}
|
||||
break;
|
||||
}
|
||||
else if (cmp < 0)
|
||||
|
@ -259,10 +286,12 @@ ts_accum(PG_FUNCTION_ARGS)
|
|||
|
||||
if (StopLow >= StopHigh)
|
||||
{ /* not found */
|
||||
if (cur == len)
|
||||
newentry = SEI_realloc(newentry, &len);
|
||||
newentry[cur] = wptr;
|
||||
cur++;
|
||||
if ( stat->weight == 0 || check_weight(txt, wptr, stat->weight)!=0 ) {
|
||||
if (cur == len)
|
||||
newentry = SEI_realloc(newentry, &len);
|
||||
newentry[cur] = wptr;
|
||||
cur++;
|
||||
}
|
||||
}
|
||||
wptr++;
|
||||
}
|
||||
|
@ -389,7 +418,7 @@ get_ti_Oid(void)
|
|||
}
|
||||
|
||||
static tsstat *
|
||||
ts_stat_sql(text *txt)
|
||||
ts_stat_sql(text *txt, text *ws)
|
||||
{
|
||||
char *query = text2char(txt);
|
||||
int i;
|
||||
|
@ -423,6 +452,31 @@ ts_stat_sql(text *txt)
|
|||
stat = palloc(STATHDRSIZE);
|
||||
stat->len = STATHDRSIZE;
|
||||
stat->size = 0;
|
||||
stat->weight = 0;
|
||||
|
||||
/*
 * Translate the optional weight string into the bitmask kept in
 * stat->weight: 'a' -> bit 3, 'b' -> bit 2, 'c' -> bit 1, 'd' -> bit 0
 * (letters are matched case-insensitively, anything else is ignored).
 * A mask of 0 means "no weight filtering".
 */
if ( ws ) {
	char	   *buf = VARDATA(ws);
	/*
	 * The payload length must come from the varlena header of ws itself;
	 * buf points into the data area and has no header to read.
	 */
	char	   *end = buf + (VARSIZE(ws) - VARHDRSZ);

	for (; buf < end; buf++) {
		/* cast: passing a negative char to tolower() is undefined */
		switch (tolower((unsigned char) *buf)) {
			case 'a':
				stat->weight |= 1 << 3;
				break;
			case 'b':
				stat->weight |= 1 << 2;
				break;
			case 'c':
				stat->weight |= 1 << 1;
				break;
			case 'd':
				stat->weight |= 1;
				break;
			default:
				/* unknown weight letter: leave the mask unchanged */
				break;
		}
	}
}
|
||||
|
||||
while (SPI_processed > 0)
|
||||
{
|
||||
|
@ -467,11 +521,13 @@ ts_stat(PG_FUNCTION_ARGS)
|
|||
{
|
||||
tsstat *stat;
|
||||
text *txt = PG_GETARG_TEXT_P(0);
|
||||
text *ws = (PG_NARGS() > 1) ? PG_GETARG_TEXT_P(1) : NULL;
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
SPI_connect();
|
||||
stat = ts_stat_sql(txt);
|
||||
stat = ts_stat_sql(txt,ws);
|
||||
PG_FREE_IF_COPY(txt, 0);
|
||||
if (PG_NARGS() > 1 ) PG_FREE_IF_COPY(ws, 1);
|
||||
ts_setup_firstcall(funcctx, stat);
|
||||
SPI_finish();
|
||||
}
|
||||
|
|
|
@ -20,10 +20,11 @@ typedef struct
|
|||
{
|
||||
int4 len;
|
||||
int4 size;
|
||||
int4 weight;
|
||||
char data[1];
|
||||
} tsstat;
|
||||
|
||||
#define STATHDRSIZE (sizeof(int4)*2)
|
||||
#define STATHDRSIZE (sizeof(int4)*4)
|
||||
#define CALCSTATSIZE(x, lenstr) ( x * sizeof(StatEntry) + STATHDRSIZE + lenstr )
|
||||
#define STATPTR(x) ( (StatEntry*) ( (char*)x + STATHDRSIZE ) )
|
||||
#define STATSTRPTR(x) ( (char*)x + STATHDRSIZE + ( sizeof(StatEntry) * ((tsvector*)x)->size ) )
|
||||
|
|
|
@ -652,6 +652,12 @@ CREATE FUNCTION stat(text)
|
|||
language 'C'
|
||||
with (isstrict);
|
||||
|
||||
-- stat(query, weights): two-argument form of stat(). The second argument is a
-- string of weight letters (a, b, c, d in either case) selecting which lexeme
-- positions are counted. Implemented by the C function ts_stat.
CREATE FUNCTION stat(text,text)
|
||||
returns setof statinfo
|
||||
as 'MODULE_PATHNAME', 'ts_stat'
|
||||
language 'C'
|
||||
with (isstrict);
|
||||
|
||||
--reset - just for debugging
|
||||
CREATE FUNCTION reset_tsearch()
|
||||
returns void
|
||||
|
|
|
@ -59,6 +59,8 @@ DROP FUNCTION gtsvector_penalty(internal,internal,internal);
|
|||
DROP FUNCTION gtsvector_picksplit(internal, internal);
|
||||
DROP FUNCTION gtsvector_union(internal, internal);
|
||||
DROP FUNCTION reset_tsearch();
|
||||
DROP FUNCTION stat(text);
|
||||
-- Argument types must match the signature used in CREATE FUNCTION stat(text,text).
DROP FUNCTION stat(text,text);
|
||||
DROP FUNCTION tsearch2() CASCADE;
|
||||
DROP FUNCTION _get_parser_from_curcfg();
|
||||
|
||||
|
|
Loading…
Reference in New Issue