7f7d0b19f8
tries to prevent a very slow query plan in cases where the output row count estimate is imprecise. FossilOrigin-Name: 8018417b0143ea11535f2457bf3e4b3755717c554a17df1076425b4251b5f2c6
104 lines
4.2 KiB
Plaintext
104 lines
4.2 KiB
Plaintext
# 2024-04-02
|
|
#
|
|
# The author disclaims copyright to this source code. In place of
|
|
# a legal notice, here is a blessing:
|
|
#
|
|
# May you do good and not evil.
|
|
# May you find forgiveness for yourself and forgive others.
|
|
# May you share freely, never taking more than you give.
|
|
#
|
|
#***********************************************************************
|
|
# Tests for the whereInterstageHeuristic() routine in the query planner.
|
|
#
|
|
|
|
set testdir [file dirname $argv0]
|
|
source $testdir/tester.tcl
|
|
set testprefix whereN
|
|
|
|
# The following is a simplified and "sanitized" version of the original
|
|
# real-world query that brought the problem to light.
|
|
#
|
|
# The issue is a slow query. The answer is correct, but it was taking too
|
|
# much time, because it was doing a full table scan rather than an indexed
|
|
# lookup.
|
|
#
|
|
# The problem was that the query planner was overestimating the number of
|
|
# output rows. The estimated number of output rows is accurate if the
|
|
# DSNAME parameter is "ds-one". In that case, a large fraction of the rows
|
|
# in "violation" end up being output. The query planner correctly deduces
|
|
# that it is faster to do a full table scan of the large "violation" table
|
|
# to avoid the after-query sort that implements the ORDER BY clause. However,
|
|
# if the DSNAME is "ds-two", then only a few rows (about 6) are generated,
|
|
# and it is much much faster to do an indexed lookup of "violation" followed
|
|
# by a sort operation to implement ORDER BY
|
|
#
|
|
# The problem, of course, is that the query planner has no way of knowing
|
|
# in advance how many rows will be generated. The query planner tries to
|
|
# estimate a worst case, which is a large number of output rows, and it picks
|
|
# the best plan for that case. However, the plan choosen is very inefficient
|
|
# when the number of output rows is small.
|
|
#
|
|
# The whereInterstageHeuristic() routine in the query planner attempts to
|
|
# correct this by adjusting the query plan such that it avoids the very bad
|
|
# query plan for a small number of rows, at the expense of a slightly less
|
|
# efficient plan for a large number of rows. The large number of rows case
|
|
# is perhaps 5% slower with the revised plan, but the small number of
|
|
# rows case is around 100 times faster. That seems like a good tradeoff.
|
|
#
|
|
do_execsql_test 1.0 {
|
|
CREATE TABLE datasource(dsid INT, name TEXT);
|
|
INSERT INTO datasource VALUES(1,'ds-one'),(2,'ds-two'),(3,'ds-three');
|
|
CREATE INDEX ds1 ON datasource(name, dsid);
|
|
|
|
CREATE TABLE rule(rid INT, team_id INT, dsid INT);
|
|
WITH RECURSIVE c(n) AS (VALUES(1) UNION ALL SELECT n+1 FROM c WHERE n<9)
|
|
INSERT INTO rule(rid,team_id,dsid) SELECT n, 1, 1 FROM c;
|
|
WITH RECURSIVE c(n) AS (VALUES(10) UNION ALL SELECT n+1 FROM c WHERE n<24)
|
|
INSERT INTO rule(rid,team_id,dsid) SELECT n, 2, 2 FROM c;
|
|
CREATE INDEX rule2 ON rule(dsid, rid);
|
|
|
|
CREATE TABLE violation(vid INT, rid INT, vx BLOB);
|
|
/*** Uncomment to insert actual data
|
|
WITH src(rid, cnt) AS (VALUES(1,3586),(2,1343),(3,6505),(5,76230),
|
|
(6,740),(7,287794),(8,457),(12,1),
|
|
(14,1),(16,1),(17,1),(18,1),(19,1))
|
|
INSERT INTO violation(vid, rid, vx)
|
|
SELECT rid*1000000+value, rid, randomblob(15)
|
|
FROM src, generate_series(1,cnt);
|
|
***/
|
|
CREATE INDEX v1 ON violation(rid, vid);
|
|
CREATE INDEX v2 ON violation(vid);
|
|
ANALYZE;
|
|
DELETE FROM sqlite_stat1;
|
|
DROP TABLE IF EXISTS sqlite_stat4;
|
|
INSERT INTO sqlite_stat1 VALUES
|
|
('violation','v2','376661 1'),
|
|
('violation','v1','376661 28974 1'),
|
|
('rule','rule2','24 12 1'),
|
|
('datasource','ds1','3 1 1');
|
|
ANALYZE sqlite_schema;
|
|
}
|
|
set DSNAME ds-two ;# Only a few rows. Change to "ds-one" for many rows.
|
|
do_eqp_test 1.1 {
|
|
SELECT count(*), length(group_concat(vx)) FROM (
|
|
SELECT V.*
|
|
FROM datasource DS, rule R, violation V
|
|
WHERE V.rid=R.rid
|
|
AND R.dsid=DS.dsid
|
|
AND DS.name=$DSNAME
|
|
ORDER BY V.vid desc
|
|
);
|
|
} {
|
|
QUERY PLAN
|
|
|--CO-ROUTINE (subquery-xxxxxx)
|
|
| |--SEARCH DS USING COVERING INDEX ds1 (name=?)
|
|
| |--SEARCH R USING COVERING INDEX rule2 (dsid=?)
|
|
| |--SEARCH V USING INDEX v1 (rid=?)
|
|
| `--USE TEMP B-TREE FOR ORDER BY
|
|
`--SCAN (subquery-xxxxxx)
|
|
}
|
|
# ^^^^---- We want to see three SEARCH terms. No SCAN terms.
|
|
# The ORDER BY is implemented by a separate sorter pass.
|
|
|
|
finish_test
|