
The strategy of GIN index scan is driven by opclass-specific extract_query method. This method that needed search mode is GIN_SEARCH_MODE_ALL. This mode means that matching tuple may contain none of extracted entries. Simple example is '!term' tsquery, which doesn't need any term to exist in matching tsvector. In order to handle such scan key GIN calculates virtual entry, which contains all TIDs of all entries of attribute. In fact this is full scan of index attribute. And typically this is very slow, but allows to handle some queries correctly in GIN. However, current algorithm calculate such virtual entry for each GIN_SEARCH_MODE_ALL scan key even if they are multiple for the same attribute. This is clearly not optimal. This commit improves the situation by introduction of "exclude only" scan keys. Such scan keys are not capable to return set of matching TIDs. Instead, they are capable only to filter TIDs produced by normal scan keys. Therefore, each attribute should contain at least one normal scan key, while rest of them may be "exclude only" if search mode is GIN_SEARCH_MODE_ALL. The same optimization might be applied to the whole scan, not per-attribute. But that leads to NULL values elimination problem. There is trade-off between multiple possible ways to do this. We probably want to do this later using some cost-based decision algorithm. Discussion: https://postgr.es/m/CAOBaU_YGP5-BEt5Cc0%3DzMve92vocPzD%2BXiZgiZs1kjY0cj%3DXBg%40mail.gmail.com Author: Nikita Glukhov, Alexander Korotkov, Tom Lane, Julien Rouhaud Reviewed-by: Julien Rouhaud, Tomas Vondra, Tom Lane
185 lines
7.6 KiB
SQL
185 lines
7.6 KiB
SQL
CREATE EXTENSION pg_trgm;
|
|
|
|
-- Check whether any of our opclasses fail amvalidate
|
|
SELECT amname, opcname
|
|
FROM pg_opclass opc LEFT JOIN pg_am am ON am.oid = opcmethod
|
|
WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);
|
|
|
|
--backslash is used in tests below, installcheck will fail if
|
|
--standard_conforming_string is off
|
|
set standard_conforming_strings=on;
|
|
|
|
-- reduce noise
|
|
set extra_float_digits = 0;
|
|
|
|
select show_trgm('');
|
|
select show_trgm('(*&^$@%@');
|
|
select show_trgm('a b c');
|
|
select show_trgm(' a b c ');
|
|
select show_trgm('aA bB cC');
|
|
select show_trgm(' aA bB cC ');
|
|
select show_trgm('a b C0*%^');
|
|
|
|
select similarity('wow','WOWa ');
|
|
select similarity('wow',' WOW ');
|
|
|
|
select similarity('---', '####---');
|
|
|
|
CREATE TABLE test_trgm(t text COLLATE "C");
|
|
|
|
\copy test_trgm from 'data/trgm.data'
|
|
|
|
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
|
|
|
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
|
|
set enable_seqscan=off;
|
|
|
|
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
|
explain (costs off)
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
|
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
|
|
|
drop index trgm_idx;
|
|
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
|
|
set enable_seqscan=off;
|
|
|
|
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
|
|
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
|
|
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
|
|
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
|
|
|
|
-- check handling of indexquals that generate no searchable conditions
|
|
explain (costs off)
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qwerty%';
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qwerty%';
|
|
explain (costs off)
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qw%';
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qw%';
|
|
-- ensure that pending-list items are handled correctly, too
|
|
create temp table t_test_trgm(t text COLLATE "C");
|
|
create index t_trgm_idx on t_test_trgm using gin (t gin_trgm_ops);
|
|
insert into t_test_trgm values ('qwerty99'), ('qwerty01');
|
|
explain (costs off)
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%';
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%';
|
|
explain (costs off)
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qw%';
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qw%';
|
|
|
|
-- run the same queries with sequential scan to check the results
|
|
set enable_bitmapscan=off;
|
|
set enable_seqscan=on;
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qwerty%';
|
|
select count(*) from test_trgm where t like '%99%' and t like '%qw%';
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qwerty%';
|
|
select count(*) from t_test_trgm where t like '%99%' and t like '%qw%';
|
|
reset enable_bitmapscan;
|
|
|
|
create table test2(t text COLLATE "C");
|
|
insert into test2 values ('abcdef');
|
|
insert into test2 values ('quark');
|
|
insert into test2 values (' z foo bar');
|
|
insert into test2 values ('/123/-45/');
|
|
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
|
|
set enable_seqscan=off;
|
|
explain (costs off)
|
|
select * from test2 where t like '%BCD%';
|
|
explain (costs off)
|
|
select * from test2 where t ilike '%BCD%';
|
|
select * from test2 where t like '%BCD%';
|
|
select * from test2 where t like '%bcd%';
|
|
select * from test2 where t like E'%\\bcd%';
|
|
select * from test2 where t ilike '%BCD%';
|
|
select * from test2 where t ilike 'qua%';
|
|
select * from test2 where t like '%z foo bar%';
|
|
select * from test2 where t like ' z foo%';
|
|
explain (costs off)
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
explain (costs off)
|
|
select * from test2 where t ~* 'DEF';
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
select * from test2 where t ~ 'a[bc]+d';
|
|
select * from test2 where t ~ '(abc)*$';
|
|
select * from test2 where t ~* 'DEF';
|
|
select * from test2 where t ~ 'dEf';
|
|
select * from test2 where t ~* '^q';
|
|
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
|
select * from test2 where t ~* 'ab[a-z]{3}';
|
|
select * from test2 where t ~* '(^| )qua';
|
|
select * from test2 where t ~ 'q.*rk$';
|
|
select * from test2 where t ~ 'q';
|
|
select * from test2 where t ~ '[a-z]{3}';
|
|
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
|
select * from test2 where t ~ 'z foo bar';
|
|
select * from test2 where t ~ ' z foo bar';
|
|
select * from test2 where t ~ ' z foo bar';
|
|
select * from test2 where t ~ ' z foo';
|
|
select * from test2 where t ~ 'qua(?!foo)';
|
|
select * from test2 where t ~ '/\d+/-\d';
|
|
drop index test2_idx_gin;
|
|
|
|
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
|
|
set enable_seqscan=off;
|
|
explain (costs off)
|
|
select * from test2 where t like '%BCD%';
|
|
explain (costs off)
|
|
select * from test2 where t ilike '%BCD%';
|
|
select * from test2 where t like '%BCD%';
|
|
select * from test2 where t like '%bcd%';
|
|
select * from test2 where t like E'%\\bcd%';
|
|
select * from test2 where t ilike '%BCD%';
|
|
select * from test2 where t ilike 'qua%';
|
|
select * from test2 where t like '%z foo bar%';
|
|
select * from test2 where t like ' z foo%';
|
|
explain (costs off)
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
explain (costs off)
|
|
select * from test2 where t ~* 'DEF';
|
|
select * from test2 where t ~ '[abc]{3}';
|
|
select * from test2 where t ~ 'a[bc]+d';
|
|
select * from test2 where t ~ '(abc)*$';
|
|
select * from test2 where t ~* 'DEF';
|
|
select * from test2 where t ~ 'dEf';
|
|
select * from test2 where t ~* '^q';
|
|
select * from test2 where t ~* '[abc]{3}[def]{3}';
|
|
select * from test2 where t ~* 'ab[a-z]{3}';
|
|
select * from test2 where t ~* '(^| )qua';
|
|
select * from test2 where t ~ 'q.*rk$';
|
|
select * from test2 where t ~ 'q';
|
|
select * from test2 where t ~ '[a-z]{3}';
|
|
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
|
|
select * from test2 where t ~ 'z foo bar';
|
|
select * from test2 where t ~ ' z foo bar';
|
|
select * from test2 where t ~ ' z foo bar';
|
|
select * from test2 where t ~ ' z foo';
|
|
select * from test2 where t ~ 'qua(?!foo)';
|
|
select * from test2 where t ~ '/\d+/-\d';
|
|
|
|
-- Check similarity threshold (bug #14202)
|
|
|
|
CREATE TEMP TABLE restaurants (city text);
|
|
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);
|
|
INSERT INTO restaurants SELECT 'Szczecin' FROM generate_series(1, 10000);
|
|
CREATE INDEX ON restaurants USING gist(city gist_trgm_ops);
|
|
|
|
-- Similarity of the two names (for reference).
|
|
SELECT similarity('Szczecin', 'Warsaw');
|
|
|
|
-- Should get only 'Warsaw' for either setting of set_limit.
|
|
EXPLAIN (COSTS OFF)
|
|
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
|
|
FROM restaurants WHERE city % 'Warsaw';
|
|
SELECT set_limit(0.3);
|
|
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
|
|
FROM restaurants WHERE city % 'Warsaw';
|
|
SELECT set_limit(0.5);
|
|
SELECT DISTINCT city, similarity(city, 'Warsaw'), show_limit()
|
|
FROM restaurants WHERE city % 'Warsaw';
|