From f6d208d6e51810c73f0e02c477984a6b44627f11 Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Fri, 15 May 2015 14:37:10 -0400 Subject: [PATCH] TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs --- contrib/file_fdw/file_fdw.c | 2 +- contrib/postgres_fdw/postgres_fdw.c | 2 +- doc/src/sgml/catalogs.sgml | 120 ++++++ doc/src/sgml/ref/select.sgml | 61 ++- src/backend/access/Makefile | 3 +- src/backend/access/heap/heapam.c | 41 ++- src/backend/access/tablesample/Makefile | 17 + src/backend/access/tablesample/bernoulli.c | 235 ++++++++++++ src/backend/access/tablesample/system.c | 186 ++++++++++ src/backend/access/tablesample/tablesample.c | 368 +++++++++++++++++++ src/backend/catalog/Makefile | 5 +- src/backend/commands/analyze.c | 2 +- src/backend/commands/explain.c | 21 ++ src/backend/executor/Makefile | 2 +- src/backend/executor/execAmi.c | 8 + src/backend/executor/execCurrent.c | 1 + src/backend/executor/execProcnode.c | 14 + src/backend/executor/nodeSamplescan.c | 256 +++++++++++++ src/backend/nodes/copyfuncs.c | 60 +++ src/backend/nodes/equalfuncs.c | 37 ++ src/backend/nodes/nodeFuncs.c | 28 ++ src/backend/nodes/outfuncs.c | 48 +++ src/backend/nodes/readfuncs.c | 45 +++ src/backend/optimizer/path/allpaths.c | 49 +++ src/backend/optimizer/path/costsize.c | 67 ++++ src/backend/optimizer/plan/createplan.c | 69 ++++ src/backend/optimizer/plan/planner.c | 15 +- src/backend/optimizer/plan/setrefs.c | 11 + src/backend/optimizer/plan/subselect.c | 1 + src/backend/optimizer/util/pathnode.c | 22 ++ src/backend/parser/gram.y | 36 +- src/backend/parser/parse_clause.c | 56 +++ 
src/backend/parser/parse_func.c | 143 +++++++ src/backend/rewrite/rewriteHandler.c | 3 + src/backend/utils/adt/ruleutils.c | 50 +++ src/backend/utils/cache/lsyscache.c | 27 ++ src/backend/utils/cache/syscache.c | 23 ++ src/backend/utils/misc/sampling.c | 33 +- src/include/access/heapam.h | 4 + src/include/access/relscan.h | 1 + src/include/access/tablesample.h | 60 +++ src/include/catalog/indexing.h | 5 + src/include/catalog/pg_proc.h | 27 ++ src/include/catalog/pg_tablesample_method.h | 78 ++++ src/include/executor/nodeSamplescan.h | 24 ++ src/include/nodes/execnodes.h | 9 + src/include/nodes/nodes.h | 4 + src/include/nodes/parsenodes.h | 37 ++ src/include/nodes/plannodes.h | 6 + src/include/optimizer/cost.h | 1 + src/include/optimizer/pathnode.h | 2 + src/include/parser/kwlist.h | 1 + src/include/parser/parse_func.h | 5 + src/include/port.h | 4 + src/include/utils/lsyscache.h | 1 + src/include/utils/rel.h | 1 - src/include/utils/sampling.h | 15 +- src/include/utils/syscache.h | 2 + src/port/erand48.c | 3 - src/test/regress/expected/rowsecurity.out | 26 ++ src/test/regress/expected/sanity_check.out | 1 + src/test/regress/expected/tablesample.out | 231 ++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/rowsecurity.sql | 4 + src/test/regress/sql/tablesample.sql | 74 ++++ 66 files changed, 2756 insertions(+), 40 deletions(-) create mode 100644 src/backend/access/tablesample/Makefile create mode 100644 src/backend/access/tablesample/bernoulli.c create mode 100644 src/backend/access/tablesample/system.c create mode 100644 src/backend/access/tablesample/tablesample.c create mode 100644 src/backend/executor/nodeSamplescan.c create mode 100644 src/include/access/tablesample.h create mode 100644 src/include/catalog/pg_tablesample_method.h create mode 100644 src/include/executor/nodeSamplescan.h create mode 100644 src/test/regress/expected/tablesample.out create mode 100644 src/test/regress/sql/tablesample.sql 
diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index ea4ed40082..499f24ff28 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -1097,7 +1097,7 @@ file_acquire_sample_rows(Relation onerel, int elevel, * Found a suitable tuple, so save it, replacing one old tuple * at random */ - int k = (int) (targrows * sampler_random_fract()); + int k = (int) (targrows * sampler_random_fract(rstate.randstate)); Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index 43288c27b0..d420cb2d0c 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -2557,7 +2557,7 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate) if (astate->rowstoskip <= 0) { /* Choose a random reservoir element to replace. */ - pos = (int) (targrows * sampler_random_fract()); + pos = (int) (targrows * sampler_random_fract(astate->rstate.randstate)); Assert(pos >= 0 && pos < targrows); heap_freetuple(astate->rows[pos]); } diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 6c51974961..5b36487609 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -278,6 +278,11 @@ planner statistics + + pg_tablesample_method + table sampling methods + + pg_tablespace tablespaces within this database cluster @@ -6127,6 +6132,121 @@ + + <structname>pg_tablesample_method</structname> + + + pg_tablesample_method + + + + The catalog pg_tablesample_method stores + information about table sampling methods which can be used in + TABLESAMPLE clause of a SELECT + statement. + + + + <structname>pg_tablesample_method</> Columns + + + + + Name + Type + References + Description + + + + + + oid + oid + + Row identifier (hidden attribute; must be explicitly selected) + + + + tsmname + name + + Name of the sampling method + + + + tsmseqscan + bool + + If true, the sampling method scans the whole table sequentially. 
+ + + + + tsmpagemode + bool + + If true, the sampling method always reads the pages completely. + + + + + tsminit + regproc + pg_proc.oid + Initialize the sampling scan function + + + + tsmnextblock + regproc + pg_proc.oid + Get next block number function + + + + tsmnexttuple + regproc + pg_proc.oid + Get next tuple offset function + + + + tsmexaminetuple + regproc + pg_proc.oid + Function which examines the tuple contents and decides if to + return it, or zero if none + + + + tsmend + regproc + pg_proc.oid + End the sampling scan function + + + + tsmreset + regproc + pg_proc.oid + Restart the state of sampling scan function + + + + tsmcost + regproc + pg_proc.oid + Costing function + + + + +
+ +
+ + <structname>pg_tablespace</structname> diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index 2295f63c13..42e04660a1 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -49,7 +49,7 @@ SELECT [ ALL | DISTINCT [ ON ( expressionwhere from_item can be one of: - [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] + [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] [ LATERAL ] function_name ( [ argument [, ...] ] ) @@ -316,6 +316,50 @@ TABLE [ ONLY ] table_name [ * ] + + TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] + + + Table sample clause after + table_name indicates that + a sampling_method should + be used to retrieve subset of rows in the table. + The sampling_method can be + any sampling method installed in the database. There are currently two + sampling methods available in the standard + PostgreSQL distribution: + + + SYSTEM + + + BERNOULLI + + + Both of these sampling methods currently accept only single argument + which is the percent (floating point from 0 to 100) of the rows to + be returned. + The SYSTEM sampling method does block level + sampling with each block having the same chance of being selected and + returns all rows from each selected block. + The BERNOULLI scans whole table and returns + individual rows with equal probability. Additional sampling methods + may be installed in the database via extensions. + + + The optional parameter REPEATABLE uses the seed + parameter, which can be a number or expression producing a number, as + a random seed for sampling. Note that subsequent commands may return + different results even if same REPEATABLE clause was + specified. 
This happens because DML statements and + maintenance operations such as VACUUM may affect physical + distribution of data. The setseed() function will not + affect the sampling result when the REPEATABLE + parameter is used. + + + + alias @@ -1927,5 +1971,20 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; ROWS FROM( ... ) is an extension of the SQL standard. + + + <literal>TABLESAMPLE</literal> clause + + + The TABLESAMPLE clause is currently accepted only on physical + relations and materialized views. + + + + Additional modules allow you to install custom sampling methods and use + them instead of the SQL standard methods. + + + diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 21721b48f0..bd93a6a8d1 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,7 @@ subdir = src/backend/access top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam +SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ + tablesample transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1a8d2f2d0b..f0c2394e60 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -80,8 +80,9 @@ bool synchronize_seqscans = true; static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, - bool is_bitmapscan, bool temp_snap); + bool allow_strat, bool allow_sync, bool allow_pagemode, + bool is_bitmapscan, bool is_samplescan, + bool temp_snap); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, @@ -294,9 +295,10 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) /* * Currently, we don't have 
a stats counter for bitmap heap scans (but the - * underlying bitmap index scans will be counted). + * underlying bitmap index scans will be counted) or sample scans (we only + * update stats for tuple fetches there) */ - if (!scan->rs_bitmapscan) + if (!scan->rs_bitmapscan && !scan->rs_samplescan) pgstat_count_heap_scan(scan->rs_rd); } @@ -315,7 +317,7 @@ heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks) * In page-at-a-time mode it performs additional work, namely determining * which tuples on the page are visible. */ -static void +void heapgetpage(HeapScanDesc scan, BlockNumber page) { Buffer buffer; @@ -1310,6 +1312,9 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, * HeapScanDesc for a bitmap heap scan. Although that scan technology is * really quite unlike a standard seqscan, there is just enough commonality * to make it worth using the same data structure. + * + * heap_beginscan_sampling is an alternate entry point for setting up a + * HeapScanDesc for a TABLESAMPLE scan. 
* ---------------- */ HeapScanDesc @@ -1317,7 +1322,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - true, true, false, false); + true, true, true, false, false, false); } HeapScanDesc @@ -1327,7 +1332,7 @@ heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key) Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); return heap_beginscan_internal(relation, snapshot, nkeys, key, - true, true, false, true); + true, true, true, false, false, true); } HeapScanDesc @@ -1336,7 +1341,8 @@ heap_beginscan_strat(Relation relation, Snapshot snapshot, bool allow_strat, bool allow_sync) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - allow_strat, allow_sync, false, false); + allow_strat, allow_sync, true, + false, false, false); } HeapScanDesc @@ -1344,14 +1350,24 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - false, false, true, false); + false, false, true, true, false, false); +} + +HeapScanDesc +heap_beginscan_sampling(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + bool allow_strat, bool allow_pagemode) +{ + return heap_beginscan_internal(relation, snapshot, nkeys, key, + allow_strat, false, allow_pagemode, + false, true, false); } static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, - bool is_bitmapscan, bool temp_snap) + bool allow_strat, bool allow_sync, bool allow_pagemode, + bool is_bitmapscan, bool is_samplescan, bool temp_snap) { HeapScanDesc scan; @@ -1373,6 +1389,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, scan->rs_snapshot = snapshot; scan->rs_nkeys = nkeys; scan->rs_bitmapscan = is_bitmapscan; + scan->rs_samplescan = is_samplescan; scan->rs_strategy = NULL; /* set in initscan */ 
scan->rs_allow_strat = allow_strat; scan->rs_allow_sync = allow_sync; @@ -1381,7 +1398,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot */ - scan->rs_pageatatime = IsMVCCSnapshot(snapshot); + scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot); /* * For a seqscan in a serializable transaction, acquire a predicate lock diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile new file mode 100644 index 0000000000..46eeb59f9c --- /dev/null +++ b/src/backend/access/tablesample/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/tablesample +# +# IDENTIFICATION +# src/backend/access/tablesample/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/tablesample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = tablesample.o system.o bernoulli.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c new file mode 100644 index 0000000000..c91f3f593e --- /dev/null +++ b/src/backend/access/tablesample/bernoulli.c @@ -0,0 +1,235 @@ +/*------------------------------------------------------------------------- + * + * bernoulli.c + * interface routines for BERNOULLI tablesample method + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/tablesample/bernoulli.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + 
+/* Private state of the BERNOULLI sampling method, kept in tsdesc->tsmdata */ +typedef struct +{ + uint32 seed; /* random seed */ + BlockNumber startblock; /* starting block, we use this for syncscan support */ + BlockNumber nblocks; /* number of blocks */ + BlockNumber blockno; /* current block */ + float4 probability; /* probability that tuple will be returned (0.0-1.0) */ + OffsetNumber lt; /* last tuple returned from current block */ + SamplerRandomState randstate; /* random generator state */ +} BernoulliSamplerData; + +/* + * Initialize the state; a NULL or out-of-range percent raises an error. + */ +Datum +tsm_bernoulli_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + BernoulliSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(BernoulliSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->startblock = scan->rs_startblock; + sampler->nblocks = scan->rs_nblocks; + sampler->blockno = InvalidBlockNumber; + sampler->probability = percent / 100; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number to read or InvalidBlockNumber if we are at the + * end of the relation. + */ +Datum +tsm_bernoulli_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + /* + * Bernoulli sampling scans all blocks on the table and supports + * syncscan so loop from startblock to startblock instead of + * from 0 to nblocks. 
+ */ + if (sampler->blockno == InvalidBlockNumber) + sampler->blockno = sampler->startblock; + else + { + sampler->blockno++; + + if (sampler->blockno >= sampler->nblocks) + sampler->blockno = 0; + + if (sampler->blockno == sampler->startblock) + PG_RETURN_UINT32(InvalidBlockNumber); + } + + PG_RETURN_UINT32(sampler->blockno); +} + +/* + * Get next tuple from current block. + * + * This method implements the main logic in bernoulli sampling. + * The algorithm simply generates new random number (in 0.0-1.0 range) and if + * it falls within user specified probability (in the same range) return the + * tuple offset. + * + * It is ok here to return tuple offset without knowing if tuple is visible + * and not check it via examinetuple. The reason for that is that we do the + * coinflip (random number generation) for every tuple in the table. Since all + * tuples have same probability of being returned the visible and invisible + * tuples will be returned in same ratio as they have in the actual table. + * This means that there is no skew towards either visible or invisible tuples + * and the number returned visible tuples to from the executor node is the + * fraction of visible tuples which was specified in input. + * + * This is faster than doing the coinflip in the examinetuple because we don't + * have to do visibility checks on uninteresting tuples. + * + * If we reach end of the block return InvalidOffsetNumber which tells + * SampleScan to go to next block. 
+ */ +Datum +tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + float4 probability = sampler->probability; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* + * Loop over tuple offsets until the random generator returns value that + * is within the probability of returning the tuple or until we reach + * end of the block. + * + * (This is our implementation of bernoulli trial) + */ + while (sampler_random_fract(sampler->randstate) > probability) + { + tupoffset++; + + if (tupoffset > maxoffset) + break; + } + + if (tupoffset > maxoffset) + /* Tell SampleScan that we want next block. */ + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_bernoulli_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset tsdesc (called by ReScan). + */ +Datum +tsm_bernoulli_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + sampler->blockno = InvalidBlockNumber; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + PG_RETURN_VOID(); +} + +/* + * Costing function. 
+ */ +Datum +tsm_bernoulli_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + *pages = baserel->pages; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. */ + samplesize = 0.1f; + } + + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c new file mode 100644 index 0000000000..1412e511fa --- /dev/null +++ b/src/backend/access/tablesample/system.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * system.c + * interface routines for system tablesample method + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/tablesample/system.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* + * State + */ +typedef struct +{ + BlockSamplerData bs; + uint32 seed; /* random seed */ + BlockNumber nblocks; /* number of blocks in relation */ + int samplesize; /* number of blocks to return 
*/ + OffsetNumber lt; /* last tuple returned from current block */ +} SystemSamplerData; + + +/* + * Initializes the state. + */ +Datum +tsm_system_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + SystemSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(SystemSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->nblocks = scan->rs_nblocks; + sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0)); + sampler->lt = InvalidOffsetNumber; + + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number or InvalidBlockNumber when we're done. + * + * Uses the same logic as ANALYZE for picking the random blocks. + */ +Datum +tsm_system_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + BlockNumber blockno; + + if (!BlockSampler_HasMore(&sampler->bs)) + PG_RETURN_UINT32(InvalidBlockNumber); + + blockno = BlockSampler_Next(&sampler->bs); + + PG_RETURN_UINT32(blockno); +} + +/* + * Get next tuple offset in current block or InvalidOffsetNumber if we are done + * with this block. 
+ */ +Datum +tsm_system_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_system_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset state (called by ReScan). + */ +Datum +tsm_system_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + + sampler->lt = InvalidOffsetNumber; + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_system_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. 
*/ + samplesize = 0.1f; + } + + *pages = baserel->pages * samplesize; + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c new file mode 100644 index 0000000000..ef55d062e7 --- /dev/null +++ b/src/backend/access/tablesample/tablesample.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * tablesample.c + * TABLESAMPLE internal API + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/tablesample/tablesample.c + * + * TABLESAMPLE is the SQL standard clause for sampling the relations. + * + * The API is interface between the Executor and the TABLESAMPLE Methods. + * + * TABLESAMPLE Methods are implementations of actual sampling algorithms which + * can be used for returning a sample of the source relation. + * Methods don't read the table directly but are asked for block number and + * tuple offset which they want to examine (or return) and the tablesample + * interface implemented here does the reading for them. + * + * We currently only support sampling of the physical relations, but in the + * future we might extend the API to support subqueries as well. + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tablesample.h" + +#include "catalog/pg_tablesample_method.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" +#include "utils/tqual.h" + + +static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); + + +/* + * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. 
+ */ +TableSampleDesc * +tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) +{ + FunctionCallInfoData fcinfo; + int i; + List *args = tablesample->args; + ListCell *arg; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); + + /* Load functions */ + fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); + fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); + fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); + if (OidIsValid(tablesample->tsmexaminetuple)) + fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); + else + tsdesc->tsmexaminetuple.fn_oid = InvalidOid; + fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); + fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); + + InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, + list_length(args) + 2, + InvalidOid, NULL, NULL); + + tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; + + /* First argument for init function is always TableSampleDesc */ + fcinfo.arg[0] = PointerGetDatum(tsdesc); + fcinfo.argnull[0] = false; + + /* + * Second arg for init function is always REPEATABLE + * When tablesample->repeatable is NULL then REPEATABLE clause was not + * specified. + * When specified, the expression cannot evaluate to NULL. + */ + if (tablesample->repeatable) + { + ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, + (PlanState *) scanstate); + fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[1], NULL); + if (fcinfo.argnull[1]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("REPEATABLE clause must be NOT NULL numeric value"))); + } + else + { + fcinfo.arg[1] = UInt32GetDatum(random()); + fcinfo.argnull[1] = false; + } + + /* Rest of the arguments come from user. 
*/ + i = 2; + foreach(arg, args) + { + Expr *argexpr = (Expr *) lfirst(arg); + ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); + + if (argstate == NULL) + { + fcinfo.argnull[i] = true; + fcinfo.arg[i] = (Datum) 0; + } + else /* evaluate only when an expression state exists; never dereference NULL */ + fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[i], NULL); + i++; + } + Assert(i == fcinfo.nargs); + + (void) FunctionCallInvoke(&fcinfo); + + return tsdesc; +} + +/* + * Get next tuple from TABLESAMPLE Method. + */ +HeapTuple +tablesample_getnext(TableSampleDesc *desc) +{ + HeapScanDesc scan = desc->heapScan; + HeapTuple tuple = &(scan->rs_ctup); + bool pagemode = scan->rs_pageatatime; + BlockNumber blockno; + Page page; + bool page_all_visible; + ItemId itemid; + OffsetNumber tupoffset, + maxoffset; + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return NULL; + } + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + if (!BlockNumberIsValid(blockno)) + { + tuple->t_data = NULL; + return NULL; + } + + heapgetpage(scan, blockno); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + blockno = scan->rs_cblock; /* current page */ + } + + /* + * When pagemode is disabled, the scan will do visibility checks for each + * tuple it finds so the buffer needs to be locked. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + UInt16GetDatum(maxoffset))); + + if (OffsetNumberIsValid(tupoffset)) + { + bool visible; + bool found; + + /* Skip invalid tuple pointers. 
*/ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + if (page_all_visible) + visible = true; + else + visible = SampleTupleVisible(tuple, tupoffset, scan); + + /* + * Let the sampling method examine the actual tuple and decide if we + * should return it. + * + * Note that we let it examine even invisible tuples for + * statistical purposes, but not return them since user should + * never see invisible tuples. + */ + if (OidIsValid(desc->tsmexaminetuple.fn_oid)) + { + found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + PointerGetDatum(tuple), + BoolGetDatum(visible))); + /* Should not happen if sampling method is well written. */ + if (found && !visible) + elog(ERROR, "Sampling method wanted to return invisible tuple"); + } + else + found = visible; + + /* Found visible tuple, return it. */ + if (found) + { + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + break; + } + else + { + /* Try next tuple from same page. */ + continue; + } + } + + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. 
+ */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? + blockno : scan->rs_startblock); + + /* + * Reached end of scan. + */ + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return NULL; + } + + heapgetpage(scan, blockno); + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + } + + pgstat_count_heap_getnext(scan->rs_rd); + + return &(scan->rs_ctup); +} + +/* + * Reset the sampling to starting state + */ +void +tablesample_reset(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); +} + +/* + * Signal the sampling method that the scan has finished. + */ +void +tablesample_end(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +{ + /* + * If this scan is reading whole pages at a time, there is already + * visibility info present in rs_vistuples so we can just search it + * for the tupoffset. + */ + if (scan->rs_pageatatime) + { + int start = 0, + end = scan->rs_ntuples - 1; + + /* + * Do the binary search over rs_vistuples, it's already sorted by + * OffsetNumber so we don't need to do any sorting ourselves here. + * + * We could use bsearch() here but it's slower for integers because + * of the function call overhead and because it needs boiler plate code + * it would not save us anything code-wise anyway. 
+ */ + while (start <= end) + { + int mid = start + (end - start) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (curoffset == tupoffset) + return true; + else if (curoffset > tupoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* No pagemode, we have to check the tuple itself. */ + Snapshot snapshot = scan->rs_snapshot; + Buffer buffer = scan->rs_cbuf; + + bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, + snapshot); + + return visible; + } +} diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 37d05d1acc..3d1139b5ba 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -40,9 +40,8 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_ts_parser.h pg_ts_template.h pg_extension.h \ pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \ pg_foreign_table.h pg_policy.h pg_replication_origin.h \ - pg_default_acl.h pg_seclabel.h pg_shseclabel.h pg_collation.h pg_range.h \ - pg_transform.h \ - toasting.h indexing.h \ + pg_tablesample_method.h pg_default_acl.h pg_seclabel.h pg_shseclabel.h \ + pg_collation.h pg_range.h pg_transform.h toasting.h indexing.h \ ) # location of Catalog.pm diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 952cf204a0..65e329eab0 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1150,7 +1150,7 @@ acquire_sample_rows(Relation onerel, int elevel, * Found a suitable tuple, so save it, replacing one * old tuple at random */ - int k = (int) (targrows * sampler_random_fract()); + int k = (int) (targrows * sampler_random_fract(rstate.randstate)); Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index eeb8f19017..478771c6ba 100644 --- a/src/backend/commands/explain.c +++ 
b/src/backend/commands/explain.c @@ -731,6 +731,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) case T_ValuesScan: case T_CteScan: case T_WorkTableScan: + case T_SampleScan: *rels_used = bms_add_member(*rels_used, ((Scan *) plan)->scanrelid); break; @@ -967,6 +968,21 @@ ExplainNode(PlanState *planstate, List *ancestors, else pname = sname; break; + case T_SampleScan: + { + /* + * Fetch the tablesample method name from RTE. + * + * It would be nice to also show parameters, but since we + * support arbitrary expressions as parameter it might get + * quite messy. + */ + RangeTblEntry *rte; + rte = rt_fetch(((SampleScan *) plan)->scanrelid, es->rtable); + custom_name = get_tablesample_method_name(rte->tablesample->tsmid); + pname = psprintf("Sample Scan (%s)", custom_name); + } + break; case T_Material: pname = sname = "Materialize"; break; @@ -1089,6 +1105,9 @@ ExplainNode(PlanState *planstate, List *ancestors, if (((Scan *) plan)->scanrelid > 0) ExplainScanTarget((Scan *) plan, es); break; + case T_SampleScan: + ExplainScanTarget((Scan *) plan, es); + break; case T_IndexScan: { IndexScan *indexscan = (IndexScan *) plan; @@ -1339,6 +1358,7 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_CteScan: case T_WorkTableScan: case T_SubqueryScan: + case T_SampleScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, @@ -2238,6 +2258,7 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) case T_TidScan: case T_ForeignScan: case T_CustomScan: + case T_SampleScan: case T_ModifyTable: /* Assert it's on a real relation */ Assert(rte->rtekind == RTE_RELATION); diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index bc5d373d68..08cba6fa2b 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -21,7 +21,7 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \ nodeLimit.o nodeLockRows.o 
\ nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \ nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \ - nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ + nodeSamplescan.o nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \ nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \ nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o spi.o diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 6ebad2f03f..4948a265cb 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -39,6 +39,7 @@ #include "executor/nodeNestloop.h" #include "executor/nodeRecursiveunion.h" #include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSetOp.h" #include "executor/nodeSort.h" @@ -155,6 +156,10 @@ ExecReScan(PlanState *node) ExecReScanSeqScan((SeqScanState *) node); break; + case T_SampleScanState: + ExecReScanSampleScan((SampleScanState *) node); + break; + case T_IndexScanState: ExecReScanIndexScan((IndexScanState *) node); break; @@ -480,6 +485,9 @@ ExecSupportsBackwardScan(Plan *node) } return false; + case T_SampleScan: + return false; + case T_Material: case T_Sort: /* these don't evaluate tlist */ diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c index d87be963a9..bcd287f874 100644 --- a/src/backend/executor/execCurrent.c +++ b/src/backend/executor/execCurrent.c @@ -261,6 +261,7 @@ search_plan_tree(PlanState *node, Oid table_oid) * Relation scan nodes can all be treated alike */ case T_SeqScanState: + case T_SampleScanState: case T_IndexScanState: case T_IndexOnlyScanState: case T_BitmapHeapScanState: diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 9892499fb7..03c2febc3e 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -102,6 +102,7 @@ 
#include "executor/nodeNestloop.h" #include "executor/nodeRecursiveunion.h" #include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSetOp.h" #include "executor/nodeSort.h" @@ -190,6 +191,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; + case T_SampleScan: + result = (PlanState *) ExecInitSampleScan((SampleScan *) node, + estate, eflags); + break; + case T_IndexScan: result = (PlanState *) ExecInitIndexScan((IndexScan *) node, estate, eflags); @@ -406,6 +412,10 @@ ExecProcNode(PlanState *node) result = ExecSeqScan((SeqScanState *) node); break; + case T_SampleScanState: + result = ExecSampleScan((SampleScanState *) node); + break; + case T_IndexScanState: result = ExecIndexScan((IndexScanState *) node); break; @@ -644,6 +654,10 @@ ExecEndNode(PlanState *node) ExecEndSeqScan((SeqScanState *) node); break; + case T_SampleScanState: + ExecEndSampleScan((SampleScanState *) node); + break; + case T_IndexScanState: ExecEndIndexScan((IndexScanState *) node); break; diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c new file mode 100644 index 0000000000..fc89d1dca0 --- /dev/null +++ b/src/backend/executor/nodeSamplescan.c @@ -0,0 +1,256 @@ +/*------------------------------------------------------------------------- + * + * nodeSamplescan.c + * Support routines for sample scans of relations (table sampling). 
+ * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSamplescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tablesample.h" +#include "executor/executor.h" +#include "executor/nodeSamplescan.h" +#include "miscadmin.h" +#include "parser/parsetree.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/tqual.h" + +static void InitScanRelation(SampleScanState *node, EState *estate, + int eflags, TableSampleClause *tablesample); +static TupleTableSlot *SampleNext(SampleScanState *node); + + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * SampleNext + * + * This is a workhorse for ExecSampleScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SampleNext(SampleScanState *node) +{ + TupleTableSlot *slot; + TableSampleDesc *tsdesc; + HeapTuple tuple; + + /* + * get information from the scan state + */ + slot = node->ss.ss_ScanTupleSlot; + tsdesc = node->tsdesc; + + tuple = tablesample_getnext(tsdesc); + + if (tuple) + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + tsdesc->heapScan->rs_cbuf, /* buffer associated with this tuple */ + false); /* don't pfree this pointer */ + else + ExecClearTuple(slot); + + return slot; +} + +/* + * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SampleRecheck(SampleScanState *node, TupleTableSlot *slot) +{ + /* No need to recheck for SampleScan */ + return true; +} + +/* 
---------------------------------------------------------------- + * ExecSampleScan(node) + * + * Scans the relation using the sampling method and returns + * the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +TupleTableSlot * +ExecSampleScan(SampleScanState *node) +{ + return ExecScan((ScanState *) node, + (ExecScanAccessMtd) SampleNext, + (ExecScanRecheckMtd) SampleRecheck); +} + +/* ---------------------------------------------------------------- + * InitScanRelation + * + * Set up to access the scan relation. + * ---------------------------------------------------------------- + */ +static void +InitScanRelation(SampleScanState *node, EState *estate, int eflags, + TableSampleClause *tablesample) +{ + Relation currentRelation; + + /* + * get the relation object id from the relid'th entry in the range table, + * open that relation and acquire appropriate lock on it. + */ + currentRelation = ExecOpenScanRelation(estate, + ((SampleScan *) node->ss.ps.plan)->scanrelid, + eflags); + + node->ss.ss_currentRelation = currentRelation; + + /* + * Even though we aren't going to do a conventional seqscan, it is useful + * to create a HeapScanDesc --- many of the fields in it are usable. 
+ */ + node->ss.ss_currentScanDesc = + heap_beginscan_sampling(currentRelation, estate->es_snapshot, 0, NULL, + tablesample->tsmseqscan, + tablesample->tsmpagemode); + + /* and report the scan tuple slot's rowtype */ + ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); +} + + +/* ---------------------------------------------------------------- + * ExecInitSampleScan + * ---------------------------------------------------------------- + */ +SampleScanState * +ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) +{ + SampleScanState *scanstate; + RangeTblEntry *rte = rt_fetch(node->scanrelid, + estate->es_range_table); + + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + Assert(rte->tablesample != NULL); + + /* + * create state structure + */ + scanstate = makeNode(SampleScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * initialize child expressions + */ + scanstate->ss.ps.targetlist = (List *) + ExecInitExpr((Expr *) node->plan.targetlist, + (PlanState *) scanstate); + scanstate->ss.ps.qual = (List *) + ExecInitExpr((Expr *) node->plan.qual, + (PlanState *) scanstate); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &scanstate->ss.ps); + ExecInitScanTupleSlot(estate, &scanstate->ss); + + /* + * initialize scan relation + */ + InitScanRelation(scanstate, estate, eflags, rte->tablesample); + + scanstate->ss.ps.ps_TupFromTlist = false; + + /* + * Initialize result tuple type and projection info. 
+ */ + ExecAssignResultTypeFromTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + scanstate->tsdesc = tablesample_init(scanstate, rte->tablesample); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSampleScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndSampleScan(SampleScanState *node) +{ + /* + * Tell sampling function that we finished the scan. + */ + tablesample_end(node->tsdesc); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + heap_endscan(node->ss.ss_currentScanDesc); + + /* + * close the heap relation. + */ + ExecCloseScanRelation(node->ss.ss_currentRelation); +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecReScanSampleScan + * + * Rescans the relation. + * + * ---------------------------------------------------------------- + */ +void +ExecReScanSampleScan(SampleScanState *node) +{ + heap_rescan(node->ss.ss_currentScanDesc, NULL); + + /* + * Tell sampling function to reset its state for rescan. 
+ */ + tablesample_reset(node->tsdesc); + + ExecScanReScan(&node->ss); +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 25839eed94..bdc7e61935 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -641,6 +641,22 @@ _copyCustomScan(const CustomScan *from) return newnode; } +/* + * _copySampleScan + */ +static SampleScan * +_copySampleScan(const SampleScan *from) +{ + SampleScan *newnode = makeNode(SampleScan); + + /* + * copy node superclass fields + */ + CopyScanFields((const Scan *) from, (Scan *) newnode); + + return newnode; +} + /* * CopyJoinFields * @@ -2063,6 +2079,7 @@ _copyRangeTblEntry(const RangeTblEntry *from) COPY_SCALAR_FIELD(rtekind); COPY_SCALAR_FIELD(relid); COPY_SCALAR_FIELD(relkind); + COPY_NODE_FIELD(tablesample); COPY_NODE_FIELD(subquery); COPY_SCALAR_FIELD(security_barrier); COPY_SCALAR_FIELD(jointype); @@ -2224,6 +2241,40 @@ _copyCommonTableExpr(const CommonTableExpr *from) return newnode; } +static RangeTableSample * +_copyRangeTableSample(const RangeTableSample *from) +{ + RangeTableSample *newnode = makeNode(RangeTableSample); + + COPY_NODE_FIELD(relation); + COPY_STRING_FIELD(method); + COPY_NODE_FIELD(repeatable); + COPY_NODE_FIELD(args); + + return newnode; +} + +static TableSampleClause * +_copyTableSampleClause(const TableSampleClause *from) +{ + TableSampleClause *newnode = makeNode(TableSampleClause); + + COPY_SCALAR_FIELD(tsmid); + COPY_SCALAR_FIELD(tsmseqscan); + COPY_SCALAR_FIELD(tsmpagemode); + COPY_SCALAR_FIELD(tsminit); + COPY_SCALAR_FIELD(tsmnextblock); + COPY_SCALAR_FIELD(tsmnexttuple); + COPY_SCALAR_FIELD(tsmexaminetuple); + COPY_SCALAR_FIELD(tsmend); + COPY_SCALAR_FIELD(tsmreset); + COPY_SCALAR_FIELD(tsmcost); + COPY_NODE_FIELD(repeatable); + COPY_NODE_FIELD(args); + + return newnode; +} + static A_Expr * _copyAExpr(const A_Expr *from) { @@ -4179,6 +4230,9 @@ copyObject(const void *from) case T_CustomScan: retval = _copyCustomScan(from); break; + case 
T_SampleScan: + retval = _copySampleScan(from); + break; case T_Join: retval = _copyJoin(from); break; @@ -4842,6 +4896,12 @@ copyObject(const void *from) case T_CommonTableExpr: retval = _copyCommonTableExpr(from); break; + case T_RangeTableSample: + retval = _copyRangeTableSample(from); + break; + case T_TableSampleClause: + retval = _copyTableSampleClause(from); + break; case T_FuncWithArgs: retval = _copyFuncWithArgs(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c4b3615caf..d483221fb7 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2359,6 +2359,7 @@ _equalRangeTblEntry(const RangeTblEntry *a, const RangeTblEntry *b) COMPARE_SCALAR_FIELD(rtekind); COMPARE_SCALAR_FIELD(relid); COMPARE_SCALAR_FIELD(relkind); + COMPARE_NODE_FIELD(tablesample); COMPARE_NODE_FIELD(subquery); COMPARE_SCALAR_FIELD(security_barrier); COMPARE_SCALAR_FIELD(jointype); @@ -2502,6 +2503,36 @@ _equalCommonTableExpr(const CommonTableExpr *a, const CommonTableExpr *b) return true; } +static bool +_equalRangeTableSample(const RangeTableSample *a, const RangeTableSample *b) +{ + COMPARE_NODE_FIELD(relation); + COMPARE_STRING_FIELD(method); + COMPARE_NODE_FIELD(repeatable); + COMPARE_NODE_FIELD(args); + + return true; +} + +static bool +_equalTableSampleClause(const TableSampleClause *a, const TableSampleClause *b) +{ + COMPARE_SCALAR_FIELD(tsmid); + COMPARE_SCALAR_FIELD(tsmseqscan); + COMPARE_SCALAR_FIELD(tsmpagemode); + COMPARE_SCALAR_FIELD(tsminit); + COMPARE_SCALAR_FIELD(tsmnextblock); + COMPARE_SCALAR_FIELD(tsmnexttuple); + COMPARE_SCALAR_FIELD(tsmexaminetuple); + COMPARE_SCALAR_FIELD(tsmend); + COMPARE_SCALAR_FIELD(tsmreset); + COMPARE_SCALAR_FIELD(tsmcost); + COMPARE_NODE_FIELD(repeatable); + COMPARE_NODE_FIELD(args); + + return true; +} + static bool _equalXmlSerialize(const XmlSerialize *a, const XmlSerialize *b) { @@ -3236,6 +3267,12 @@ equal(const void *a, const void *b) case T_CommonTableExpr: 
retval = _equalCommonTableExpr(a, b); break; + case T_RangeTableSample: + retval = _equalRangeTableSample(a, b); + break; + case T_TableSampleClause: + retval = _equalTableSampleClause(a, b); + break; case T_FuncWithArgs: retval = _equalFuncWithArgs(a, b); break; diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index eac0215923..42d62d32d9 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -2058,6 +2058,14 @@ range_table_walker(List *rtable, switch (rte->rtekind) { case RTE_RELATION: + if (rte->tablesample) + { + if (walker(rte->tablesample->args, context)) + return true; + if (walker(rte->tablesample->repeatable, context)) + return true; + } + break; case RTE_CTE: /* nothing to do */ break; @@ -2813,6 +2821,14 @@ range_table_mutator(List *rtable, switch (rte->rtekind) { case RTE_RELATION: + if (rte->tablesample) + { + MUTATE(rte->tablesample->args, rte->tablesample->args, + List *); + MUTATE(rte->tablesample->repeatable, + rte->tablesample->repeatable, Node *); + } + break; case RTE_CTE: /* we don't bother to copy eref, aliases, etc; OK? 
*/ break; @@ -3309,6 +3325,18 @@ raw_expression_tree_walker(Node *node, break; case T_CommonTableExpr: return walker(((CommonTableExpr *) node)->ctequery, context); + case T_RangeTableSample: + { + RangeTableSample *rts = (RangeTableSample *) node; + + if (walker(rts->relation, context)) + return true; + if (walker(rts->repeatable, context)) + return true; + if (walker(rts->args, context)) + return true; + } + break; default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index fe868b889d..7918553da0 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -591,6 +591,14 @@ _outCustomScan(StringInfo str, const CustomScan *node) node->methods->TextOutCustomScan(str, node); } +static void +_outSampleScan(StringInfo str, const SampleScan *node) +{ + WRITE_NODE_TYPE("SAMPLESCAN"); + + _outScanInfo(str, (const Scan *) node); +} + static void _outJoin(StringInfo str, const Join *node) { @@ -2444,6 +2452,36 @@ _outCommonTableExpr(StringInfo str, const CommonTableExpr *node) WRITE_NODE_FIELD(ctecolcollations); } +static void +_outRangeTableSample(StringInfo str, const RangeTableSample *node) +{ + WRITE_NODE_TYPE("RANGETABLESAMPLE"); + + WRITE_NODE_FIELD(relation); + WRITE_STRING_FIELD(method); + WRITE_NODE_FIELD(repeatable); + WRITE_NODE_FIELD(args); +} + +static void +_outTableSampleClause(StringInfo str, const TableSampleClause *node) +{ + WRITE_NODE_TYPE("TABLESAMPLECLAUSE"); + + WRITE_OID_FIELD(tsmid); + WRITE_BOOL_FIELD(tsmseqscan); + WRITE_BOOL_FIELD(tsmpagemode); + WRITE_OID_FIELD(tsminit); + WRITE_OID_FIELD(tsmnextblock); + WRITE_OID_FIELD(tsmnexttuple); + WRITE_OID_FIELD(tsmexaminetuple); + WRITE_OID_FIELD(tsmend); + WRITE_OID_FIELD(tsmreset); + WRITE_OID_FIELD(tsmcost); + WRITE_NODE_FIELD(repeatable); + WRITE_NODE_FIELD(args); +} + static void _outSetOperationStmt(StringInfo str, const SetOperationStmt *node) { @@ -2474,6 +2512,7 @@ 
_outRangeTblEntry(StringInfo str, const RangeTblEntry *node) case RTE_RELATION: WRITE_OID_FIELD(relid); WRITE_CHAR_FIELD(relkind); + WRITE_NODE_FIELD(tablesample); break; case RTE_SUBQUERY: WRITE_NODE_FIELD(subquery); @@ -2973,6 +3012,9 @@ _outNode(StringInfo str, const void *obj) case T_CustomScan: _outCustomScan(str, obj); break; + case T_SampleScan: + _outSampleScan(str, obj); + break; case T_Join: _outJoin(str, obj); break; @@ -3319,6 +3361,12 @@ _outNode(StringInfo str, const void *obj) case T_CommonTableExpr: _outCommonTableExpr(str, obj); break; + case T_RangeTableSample: + _outRangeTableSample(str, obj); + break; + case T_TableSampleClause: + _outTableSampleClause(str, obj); + break; case T_SetOperationStmt: _outSetOperationStmt(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 8136306e1e..c8fb894a75 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -351,6 +351,46 @@ _readCommonTableExpr(void) READ_DONE(); } +/* + * _readRangeTableSample + */ +static RangeTableSample * +_readRangeTableSample(void) +{ + READ_LOCALS(RangeTableSample); + + READ_NODE_FIELD(relation); + READ_STRING_FIELD(method); + READ_NODE_FIELD(repeatable); + READ_NODE_FIELD(args); + + READ_DONE(); +} + +/* + * _readTableSampleClause + */ +static TableSampleClause * +_readTableSampleClause(void) +{ + READ_LOCALS(TableSampleClause); + + READ_OID_FIELD(tsmid); + READ_BOOL_FIELD(tsmseqscan); + READ_BOOL_FIELD(tsmpagemode); + READ_OID_FIELD(tsminit); + READ_OID_FIELD(tsmnextblock); + READ_OID_FIELD(tsmnexttuple); + READ_OID_FIELD(tsmexaminetuple); + READ_OID_FIELD(tsmend); + READ_OID_FIELD(tsmreset); + READ_OID_FIELD(tsmcost); + READ_NODE_FIELD(repeatable); + READ_NODE_FIELD(args); + + READ_DONE(); +} + /* * _readSetOperationStmt */ @@ -1255,6 +1295,7 @@ _readRangeTblEntry(void) case RTE_RELATION: READ_OID_FIELD(relid); READ_CHAR_FIELD(relkind); + READ_NODE_FIELD(tablesample); break; case RTE_SUBQUERY: 
READ_NODE_FIELD(subquery); @@ -1351,6 +1392,10 @@ parseNodeString(void) return_value = _readRowMarkClause(); else if (MATCH("COMMONTABLEEXPR", 15)) return_value = _readCommonTableExpr(); + else if (MATCH("RANGETABLESAMPLE", 16)) + return_value = _readRangeTableSample(); + else if (MATCH("TABLESAMPLECLAUSE", 17)) + return_value = _readTableSampleClause(); else if (MATCH("SETOPERATIONSTMT", 16)) return_value = _readSetOperationStmt(); else if (MATCH("ALIAS", 5)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 9caca94f64..4cd1bf65e7 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -71,6 +71,10 @@ static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); +static void set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, + RangeTblEntry *rte); +static void set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, + RangeTblEntry *rte); static void set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static void set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, @@ -265,6 +269,11 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel, /* Foreign table */ set_foreign_size(root, rel, rte); } + else if (rte->tablesample != NULL) + { + /* Sampled relation */ + set_tablesample_rel_size(root, rel, rte); + } else { /* Plain relation */ @@ -332,6 +341,11 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, /* Foreign table */ set_foreign_pathlist(root, rel, rte); } + else if (rte->tablesample != NULL) + { + /* Build sample scan on relation */ + set_tablesample_rel_pathlist(root, rel, rte); + } else { /* Plain relation */ @@ -417,6 +431,41 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) create_tidscan_paths(root, rel); } +/* + * set_tablesample_rel_size + * Set size estimates for a sampled 
relation. + */ +static void +set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) +{ + /* Mark rel with estimated output rows, width, etc */ + set_baserel_size_estimates(root, rel); +} + +/* + * set_tablesample_rel_pathlist + * Build access paths for a sampled relation + * + * There is only one possible path - sampling scan + */ +static void +set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) +{ + Relids required_outer; + Path *path; + + /* + * We don't support pushing join clauses into the quals of a sample scan, but + * it could still have required parameterization due to LATERAL refs in + * its tlist. + */ + required_outer = rel->lateral_relids; + + /* The sample scan path is the only entry placed in rel->pathlist */ + path = create_samplescan_path(root, rel, required_outer); + rel->pathlist = list_make1(path); +} + /* * set_foreign_size * Set size estimates for a foreign table RTE diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1a0d358c5f..c2b2b7622a 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -219,6 +219,73 @@ cost_seqscan(Path *path, PlannerInfo *root, path->total_cost = startup_cost + run_cost; } +/* + * cost_samplescan + * Determines and returns the cost of scanning a relation using sampling. + * + * From planner/optimizer perspective, we don't care all that much about cost + * itself since there is always only one scan path to consider when sampling + * scan is present, but number of rows estimation is still important.
+ * + * 'baserel' is the relation to be scanned + * 'path' is the Path to fill in; its param_info, if set, gives the row estimate + */ +void +cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel) +{ + Cost startup_cost = 0; + Cost run_cost = 0; + double spc_seq_page_cost, + spc_random_page_cost, + spc_page_cost; + QualCost qpqual_cost; + Cost cpu_per_tuple; + BlockNumber pages; + double tuples; + RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root); + TableSampleClause *tablesample = rte->tablesample; + + /* Should only be applied to base relations */ + Assert(baserel->relid > 0); + Assert(baserel->rtekind == RTE_RELATION); + + /* Mark the path with the correct row estimate */ + if (path->param_info) + path->rows = path->param_info->ppi_rows; + else + path->rows = baserel->rows; + + /* Call the sampling method's costing function; &pages and &tuples are passed to it and read below. */ + OidFunctionCall6(tablesample->tsmcost, PointerGetDatum(root), + PointerGetDatum(path), PointerGetDatum(baserel), + PointerGetDatum(tablesample->args), + PointerGetDatum(&pages), PointerGetDatum(&tuples)); + + /* fetch estimated page cost for tablespace containing table */ + get_tablespace_page_costs(baserel->reltablespace, + &spc_random_page_cost, + &spc_seq_page_cost); + + + spc_page_cost = tablesample->tsmseqscan ? spc_seq_page_cost : + spc_random_page_cost; + + /* + * disk costs: sequential or random page cost (per tsmseqscan) times pages + */ + run_cost += spc_page_cost * pages; + + /* CPU costs: per-tuple charge plus qual evaluation, times tuples */ + get_restriction_qual_cost(root, baserel, path->param_info, &qpqual_cost); + + startup_cost += qpqual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; + run_cost += cpu_per_tuple * tuples; + + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} + /* * cost_index * Determines and returns the cost of scanning a relation using an index.
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 783e34b4fb..c6095167e8 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -59,6 +59,8 @@ static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path); static SeqScan *create_seqscan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); +static SampleScan *create_samplescan_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses); static Scan *create_indexscan_plan(PlannerInfo *root, IndexPath *best_path, List *tlist, List *scan_clauses, bool indexonly); static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root, @@ -101,6 +103,7 @@ static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_path_costsize(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid); +static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid); static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, Oid *indexorderbyops, @@ -229,6 +232,7 @@ create_plan_recurse(PlannerInfo *root, Path *best_path) switch (best_path->pathtype) { case T_SeqScan: + case T_SampleScan: case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: @@ -344,6 +348,13 @@ create_scan_plan(PlannerInfo *root, Path *best_path) scan_clauses); break; + case T_SampleScan: + plan = (Plan *) create_samplescan_plan(root, + best_path, + tlist, + scan_clauses); + break; + case T_IndexScan: plan = (Plan *) create_indexscan_plan(root, (IndexPath *) best_path, @@ -547,6 +558,7 @@ disuse_physical_tlist(PlannerInfo *root, Plan *plan, Path *path) switch (path->pathtype) { case T_SeqScan: + 
case T_SampleScan: case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: @@ -1133,6 +1145,45 @@ create_seqscan_plan(PlannerInfo *root, Path *best_path, return scan_plan; } +/* + * create_samplescan_plan + * Returns a samplescan plan for the base relation scanned by 'best_path' + * with restriction clauses 'scan_clauses' and targetlist 'tlist'. + */ +static SampleScan * +create_samplescan_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses) +{ + SampleScan *scan_plan; + Index scan_relid = best_path->parent->relid; + + /* it should be a base rel with tablesample clause... */ + Assert(scan_relid > 0); + Assert(best_path->parent->rtekind == RTE_RELATION); + Assert(best_path->pathtype == T_SampleScan); + + /* Sort clauses into best execution order */ + scan_clauses = order_qual_clauses(root, scan_clauses); + + /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */ + scan_clauses = extract_actual_clauses(scan_clauses, false); + + /* Replace any outer-relation variables with nestloop params */ + if (best_path->param_info) + { + scan_clauses = (List *) + replace_nestloop_params(root, (Node *) scan_clauses); + } + + scan_plan = make_samplescan(tlist, + scan_clauses, + scan_relid); + + copy_path_costsize(&scan_plan->plan, best_path); + + return scan_plan; +} + /* * create_indexscan_plan * Returns an indexscan plan for the base relation scanned by 'best_path' @@ -3378,6 +3429,24 @@ make_seqscan(List *qptlist, return node; } +static SampleScan * +make_samplescan(List *qptlist, + List *qpqual, + Index scanrelid) +{ + SampleScan *node = makeNode(SampleScan); + Plan *plan = &node->plan; + + /* cost should be inserted by caller */ + plan->targetlist = qptlist; + plan->qual = qpqual; + plan->lefttree = NULL; + plan->righttree = NULL; + node->scanrelid = scanrelid; + + return node; +} + static IndexScan * make_indexscan(List *qptlist, List *qpqual, diff --git a/src/backend/optimizer/plan/planner.c
b/src/backend/optimizer/plan/planner.c index 8de57c8e6b..9ba10516bb 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -60,6 +60,7 @@ planner_hook_type planner_hook = NULL; #define EXPRKIND_LIMIT 6 #define EXPRKIND_APPINFO 7 #define EXPRKIND_PHV 8 +#define EXPRKIND_TABLESAMPLE 9 /* Passthrough data for standard_qp_callback */ typedef struct @@ -486,7 +487,19 @@ subquery_planner(PlannerGlobal *glob, Query *parse, RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); int kind; - if (rte->rtekind == RTE_SUBQUERY) + if (rte->rtekind == RTE_RELATION) + { + if (rte->tablesample) + { + rte->tablesample->args = (List *) + preprocess_expression(root, (Node *) rte->tablesample->args, + EXPRKIND_TABLESAMPLE); + rte->tablesample->repeatable = (Node *) + preprocess_expression(root, rte->tablesample->repeatable, + EXPRKIND_TABLESAMPLE); + } + } + else if (rte->rtekind == RTE_SUBQUERY) { /* * We don't want to do all preprocessing yet on the subquery's diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 517409d28a..73b1988836 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -451,6 +451,17 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) fix_scan_list(root, splan->plan.qual, rtoffset); } break; + case T_SampleScan: + { + SampleScan *splan = (SampleScan *) plan; + + splan->scanrelid += rtoffset; + splan->plan.targetlist = + fix_scan_list(root, splan->plan.targetlist, rtoffset); + splan->plan.qual = + fix_scan_list(root, splan->plan.qual, rtoffset); + } + break; case T_IndexScan: { IndexScan *splan = (IndexScan *) plan; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index afccee53ac..2f7f5c0df0 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2167,6 +2167,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, break; case 
T_SeqScan: + case T_SampleScan: context.paramids = bms_add_members(context.paramids, scan_params); break; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index faca30b322..ea7a47bdf4 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -705,6 +705,26 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) return pathnode; } +/* + * create_samplescan_path + * Like seqscan but uses sampling function while scanning. + */ +Path * +create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) +{ + Path *pathnode = makeNode(Path); + + pathnode->pathtype = T_SampleScan; + pathnode->parent = rel; + pathnode->param_info = get_baserel_parampathinfo(root, rel, + required_outer); + pathnode->pathkeys = NIL; /* samplescan has unordered result */ + + cost_samplescan(pathnode, root, rel); + + return pathnode; +} + /* * create_index_path * Creates a path node for an index scan. 
@@ -1778,6 +1798,8 @@ reparameterize_path(PlannerInfo *root, Path *path, case T_SubqueryScan: return create_subqueryscan_path(root, rel, path->pathkeys, required_outer); + case T_SampleScan: + return (Path *) create_samplescan_path(root, rel, required_outer); default: break; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 2dce87879e..1439783068 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -454,6 +454,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type relation_expr %type relation_expr_opt_alias %type target_el single_set_clause set_target insert_column_item +%type relation_expr_tablesample tablesample_clause opt_repeatable_clause %type generic_option_name %type generic_option_arg @@ -625,8 +626,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); STATEMENT STATISTICS STDIN STDOUT STORAGE STRICT_P STRIP_P SUBSTRING SYMMETRIC SYSID SYSTEM_P - TABLE TABLES TABLESPACE TEMP TEMPLATE TEMPORARY TEXT_P THEN TIME TIMESTAMP - TO TRAILING TRANSACTION TRANSFORM TREAT TRIGGER TRIM TRUE_P + TABLE TABLES TABLESAMPLE TABLESPACE TEMP TEMPLATE TEMPORARY TEXT_P THEN + TIME TIMESTAMP TO TRAILING TRANSACTION TRANSFORM TREAT TRIGGER TRIM TRUE_P TRUNCATE TRUSTED TYPE_P TYPES_P UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED @@ -10386,6 +10387,10 @@ table_ref: relation_expr opt_alias_clause $1->alias = $2; $$ = (Node *) $1; } + | relation_expr_tablesample + { + $$ = (Node *) $1; + } | func_table func_alias_clause { RangeFunction *n = (RangeFunction *) $1; @@ -10711,6 +10716,32 @@ relation_expr_opt_alias: relation_expr %prec UMINUS } ; + +relation_expr_tablesample: relation_expr opt_alias_clause tablesample_clause + { + RangeTableSample *n = (RangeTableSample *) $3; + n->relation = $1; + n->relation->alias = $2; + $$ = (Node *) n; + } + ; + +tablesample_clause: + TABLESAMPLE ColId '(' expr_list ')' opt_repeatable_clause + { + 
RangeTableSample *n = makeNode(RangeTableSample); + n->method = $2; + n->args = $4; + n->repeatable = $6; + $$ = (Node *) n; + } + ; + +opt_repeatable_clause: + REPEATABLE '(' a_expr ')' { $$ = (Node *) $3; } + | /*EMPTY*/ { $$ = NULL; } + ; + /* * func_table represents a function invocation in a FROM list. It can be * a plain function call, like "foo(...)", or a ROWS FROM expression with @@ -13804,6 +13835,7 @@ type_func_name_keyword: | OVERLAPS | RIGHT | SIMILAR + | TABLESAMPLE | VERBOSE ; diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index 73c505ed85..6b1bbe57d0 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -17,6 +17,7 @@ #include "access/heapam.h" #include "catalog/catalog.h" +#include "access/htup_details.h" #include "catalog/heap.h" #include "catalog/pg_constraint.h" #include "catalog/pg_type.h" @@ -31,6 +32,7 @@ #include "parser/parse_coerce.h" #include "parser/parse_collate.h" #include "parser/parse_expr.h" +#include "parser/parse_func.h" #include "parser/parse_oper.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" @@ -39,6 +41,7 @@ #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/syscache.h" /* Convenience macro for the most common makeNamespaceItem() case */ @@ -419,6 +422,39 @@ transformJoinOnClause(ParseState *pstate, JoinExpr *j, List *namespace) return result; } +static RangeTblEntry * +transformTableSampleEntry(ParseState *pstate, RangeTableSample *rv) +{ + RangeTblEntry *rte = NULL; + CommonTableExpr *cte = NULL; + TableSampleClause *tablesample = NULL; + + /* if relation has an unqualified name, it might be a CTE reference */ + if (!rv->relation->schemaname) + { + Index levelsup; + cte = scanNameSpaceForCTE(pstate, rv->relation->relname, &levelsup); + } + + /* We first need to build a range table entry */ + if (!cte) + rte = transformTableEntry(pstate, rv->relation); + + if (!rte || + (rte->relkind != 
RELKIND_RELATION && + rte->relkind != RELKIND_MATVIEW)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("TABLESAMPLE clause can only be used on tables and materialized views"), + parser_errposition(pstate, rv->relation->location))); + + tablesample = ParseTableSample(pstate, rv->method, rv->repeatable, + rv->args, rv->relation->location); + rte->tablesample = tablesample; + + return rte; +} + /* * transformTableEntry --- transform a RangeVar (simple relation reference) */ @@ -1127,6 +1163,26 @@ transformFromClauseItem(ParseState *pstate, Node *n, return (Node *) j; } + else if (IsA(n, RangeTableSample)) + { + /* Tablesample reference */ + RangeTableSample *rv = (RangeTableSample *) n; + RangeTblRef *rtr; + RangeTblEntry *rte = NULL; + int rtindex; + + rte = transformTableSampleEntry(pstate, rv); + + /* assume new rte is at end */ + rtindex = list_length(pstate->p_rtable); + Assert(rte == rt_fetch(rtindex, pstate->p_rtable)); + *top_rte = rte; + *top_rti = rtindex; + *namespace = list_make1(makeDefaultNSItem(rte)); + rtr = makeNode(RangeTblRef); + rtr->rtindex = rtindex; + return (Node *) rtr; + } else elog(ERROR, "unrecognized node type: %d", (int) nodeTag(n)); return NULL; /* can't get here, keep compiler quiet */ diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index f7affebf84..fa50f92d8d 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -18,6 +18,7 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/pg_tablesample_method.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "nodes/makefuncs.h" @@ -26,6 +27,7 @@ #include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parse_func.h" +#include "parser/parse_expr.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" #include "parser/parse_type.h" @@ -767,6 +769,147 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List 
*fargs, } +/* + * ParseTableSample + * + * Parse TABLESAMPLE clause and process the arguments + */ +TableSampleClause * +ParseTableSample(ParseState *pstate, char *samplemethod, Node *repeatable, + List *sampleargs, int location) +{ + HeapTuple tuple; + Form_pg_tablesample_method tsm; + Form_pg_proc procform; + TableSampleClause *tablesample; + List *fargs; + ListCell *larg; + int nargs, initnargs; + Oid init_arg_types[FUNC_MAX_ARGS]; + + /* Load the tablesample method */ + tuple = SearchSysCache1(TABLESAMPLEMETHODNAME, PointerGetDatum(samplemethod)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablesample method \"%s\" does not exist", + samplemethod), + parser_errposition(pstate, location))); + + tablesample = makeNode(TableSampleClause); + tablesample->tsmid = HeapTupleGetOid(tuple); + + tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); + + tablesample->tsmseqscan = tsm->tsmseqscan; + tablesample->tsmpagemode = tsm->tsmpagemode; + tablesample->tsminit = tsm->tsminit; + tablesample->tsmnextblock = tsm->tsmnextblock; + tablesample->tsmnexttuple = tsm->tsmnexttuple; + tablesample->tsmexaminetuple = tsm->tsmexaminetuple; + tablesample->tsmend = tsm->tsmend; + tablesample->tsmreset = tsm->tsmreset; + tablesample->tsmcost = tsm->tsmcost; + + ReleaseSysCache(tuple); + + /* Validate the parameters against init function definition. */ + tuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(tablesample->tsminit)); + + if (!HeapTupleIsValid(tuple)) /* should not happen */ + elog(ERROR, "cache lookup failed for function %u", + tablesample->tsminit); + + procform = (Form_pg_proc) GETSTRUCT(tuple); + initnargs = procform->pronargs; + Assert(initnargs >= 3); + + /* + * First parameter is used to pass the SampleScanState, second is + * seed (REPEATABLE), skip the processing for them here, just assert + * that the types are correct. 
+ */ + Assert(procform->proargtypes.values[0] == INTERNALOID); + Assert(procform->proargtypes.values[1] == INT4OID); + initnargs -= 2; + memcpy(init_arg_types, procform->proargtypes.values + 2, + initnargs * sizeof(Oid)); + + /* Now we are done with the catalog */ + ReleaseSysCache(tuple); + + /* Process repeatable (seed) */ + if (repeatable != NULL) + { + Node *arg = repeatable; + + if (arg && IsA(arg, A_Const)) + { + A_Const *con = (A_Const *) arg; + + if (con->val.type == T_Null) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("REPEATABLE clause must be NOT NULL numeric value"), + parser_errposition(pstate, con->location))); + + } + + arg = transformExpr(pstate, arg, EXPR_KIND_FROM_FUNCTION); + arg = coerce_to_specific_type(pstate, arg, INT4OID, "REPEATABLE"); + tablesample->repeatable = arg; + } + else + tablesample->repeatable = NULL; + + /* Check user provided expected number of arguments. */ + if (list_length(sampleargs) != initnargs) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("tablesample method \"%s\" expects %d argument got %d", + "tablesample method \"%s\" expects %d arguments got %d", + initnargs, + samplemethod, + initnargs, list_length(sampleargs)), + parser_errposition(pstate, location))); + + /* Transform the arguments, typecasting them as needed. 
*/ + fargs = NIL; + nargs = 0; + foreach(larg, sampleargs) + { + Node *inarg = (Node *) lfirst(larg); + Node *arg = transformExpr(pstate, inarg, EXPR_KIND_FROM_FUNCTION); + Oid argtype = exprType(arg); + + if (argtype != init_arg_types[nargs]) + { + if (!can_coerce_type(1, &argtype, &init_arg_types[nargs], + COERCION_IMPLICIT)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("wrong parameter %d for tablesample method \"%s\"", + nargs + 1, samplemethod), + errdetail("Expected type %s got %s.", + format_type_be(init_arg_types[nargs]), + format_type_be(argtype)), + parser_errposition(pstate, exprLocation(inarg)))); + + arg = coerce_type(pstate, arg, argtype, init_arg_types[nargs], -1, + COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, -1); + } + + fargs = lappend(fargs, arg); + nargs++; + } + + /* Pass the arguments down */ + tablesample->args = fargs; + + return tablesample; +} + /* func_match_argtypes() * * Given a list of candidate functions (having the right name and number diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 39302a410b..e27afd1a3e 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -2209,6 +2209,9 @@ view_query_is_auto_updatable(Query *viewquery, bool check_cols) base_rte->relkind != RELKIND_VIEW)) return gettext_noop("Views that do not select from a single table or view are not automatically updatable."); + if (base_rte->tablesample) + return gettext_noop("Views containing TABLESAMPLE are not automatically updatable."); + /* * Check that the view has at least one updatable column. This is required * for INSERT/UPDATE but not for DELETE. 
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 903e80aea3..298eebf5e6 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -32,6 +32,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" +#include "catalog/pg_tablesample_method.h" #include "catalog/pg_trigger.h" #include "catalog/pg_type.h" #include "commands/defrem.h" @@ -345,6 +346,8 @@ static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags); static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags, int wrapColumn); +static void get_tablesample_def(TableSampleClause *tablesample, + deparse_context *context); static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, TupleDesc resultDesc, int prettyFlags, int wrapColumn, int startIndent); @@ -4220,6 +4223,50 @@ make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, heap_close(ev_relation, AccessShareLock); } +/* ---------- + * get_tablesample_def - Convert TableSampleClause back to SQL + * ---------- + */ +static void +get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) +{ + StringInfo buf = context->buf; + HeapTuple tuple; + Form_pg_tablesample_method tsm; + char *tsmname; + int nargs; + ListCell *l; + + /* Load the tablesample method */ + tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tablesample->tsmid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cache lookup failed for tablesample method %u", + tablesample->tsmid))); + + tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); + tsmname = NameStr(tsm->tsmname); + appendStringInfo(buf, " TABLESAMPLE %s (", quote_identifier(tsmname)); + + ReleaseSysCache(tuple); + + nargs = 0; + foreach(l, tablesample->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) 
lfirst(l), context, true); + } + appendStringInfoChar(buf, ')'); + + if (tablesample->repeatable != NULL) + { + appendStringInfoString(buf, " REPEATABLE ("); + get_rule_expr(tablesample->repeatable, context, true); + appendStringInfoChar(buf, ')'); + } +} /* ---------- * get_query_def - Parse back one query parsetree @@ -8529,6 +8576,9 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) only_marker(rte), generate_relation_name(rte->relid, context->namespaces)); + + if (rte->tablesample) + get_tablesample_def(rte->tablesample, context); break; case RTE_SUBQUERY: /* Subquery RTE */ diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 1dc293297d..f259751e15 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -32,6 +32,7 @@ #include "catalog/pg_range.h" #include "catalog/pg_statistic.h" #include "catalog/pg_transform.h" +#include "catalog/pg_tablesample_method.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -2996,3 +2997,29 @@ get_range_subtype(Oid rangeOid) else return InvalidOid; } + +/* ---------- PG_TABLESAMPLE_METHOD CACHE ---------- */ + +/* + * get_tablesample_method_name - given a tablesample method OID, + * look up the name or NULL if not found + */ +char * +get_tablesample_method_name(Oid tsmid) +{ + HeapTuple tuple; + + tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tsmid)); + if (HeapTupleIsValid(tuple)) + { + Form_pg_tablesample_method tup = + (Form_pg_tablesample_method) GETSTRUCT(tuple); + char *result; + + result = pstrdup(NameStr(tup->tsmname)); + ReleaseSysCache(tuple); + return result; + } + else + return NULL; +} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index f58e1cebf2..7def1be32a 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -56,6 +56,7 @@ #include "catalog/pg_shseclabel.h" #include 
"catalog/pg_replication_origin.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_tablesample_method.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_ts_config.h" @@ -666,6 +667,28 @@ static const struct cachedesc cacheinfo[] = { }, 128 }, + {TableSampleMethodRelationId, /* TABLESAMPLEMETHODNAME */ + TableSampleMethodNameIndexId, + 1, + { + Anum_pg_tablesample_method_tsmname, + 0, + 0, + 0, + }, + 2 + }, + {TableSampleMethodRelationId, /* TABLESAMPLEMETHODOID */ + TableSampleMethodOidIndexId, + 1, + { + ObjectIdAttributeNumber, + 0, + 0, + 0, + }, + 2 + }, {TableSpaceRelationId, /* TABLESPACEOID */ TablespaceOidIndexId, 1, diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c index 1eeabaf158..9becc63bf8 100644 --- a/src/backend/utils/misc/sampling.c +++ b/src/backend/utils/misc/sampling.c @@ -46,6 +46,8 @@ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, bs->n = samplesize; bs->t = 0; /* blocks scanned so far */ bs->m = 0; /* blocks selected so far */ + + sampler_random_init_state(randseed, bs->randstate); } bool @@ -92,7 +94,7 @@ BlockSampler_Next(BlockSampler bs) * less than k, which means that we cannot fail to select enough blocks. *---------- */ - V = sampler_random_fract(); + V = sampler_random_fract(bs->randstate); p = 1.0 - (double) k / (double) K; while (V < p) { @@ -126,8 +128,14 @@ BlockSampler_Next(BlockSampler bs) void reservoir_init_selection_state(ReservoirState rs, int n) { + /* + * Reservoir sampling is not used anywhere where it would need to return + * repeatable results so we can initialize it randomly. 
+ */ + sampler_random_init_state(random(), rs->randstate); + /* Initial value of W (for use when Algorithm Z is first applied) */ - *rs = exp(-log(sampler_random_fract()) / n); + rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n); } double @@ -142,7 +150,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) double V, quot; - V = sampler_random_fract(); /* Generate V */ + V = sampler_random_fract(rs->randstate); /* Generate V */ S = 0; t += 1; /* Note: "num" in Vitter's code is always equal to t - n */ @@ -158,7 +166,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) else { /* Now apply Algorithm Z */ - double W = *rs; + double W = rs->W; double term = t - (double) n + 1; for (;;) @@ -174,7 +182,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) tmp; /* Generate U and X */ - U = sampler_random_fract(); + U = sampler_random_fract(rs->randstate); X = t * (W - 1.0); S = floor(X); /* S is tentatively set to floor(X) */ /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ @@ -203,11 +211,11 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) y *= numer / denom; denom -= 1; } - W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */ + W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */ if (exp(log(y) / n) <= (t + X) / t) break; } - *rs = W; + rs->W = W; } return S; } @@ -217,10 +225,17 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) * Random number generator used by sampling *---------- */ +void +sampler_random_init_state(long seed, SamplerRandomState randstate) +{ + randstate[0] = RAND48_SEED_0; + randstate[1] = (unsigned short) seed; + randstate[2] = (unsigned short) (seed >> 16); +} /* Select a random value R uniformly distributed in (0 - 1) */ double -sampler_random_fract() +sampler_random_fract(SamplerRandomState randstate) { - return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2); + return pg_erand48(randstate); } diff --git 
a/src/include/access/heapam.h b/src/include/access/heapam.h index 49c8ca4d66..eec7c95b21 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -114,8 +114,12 @@ extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot, bool allow_strat, bool allow_sync); extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key); +extern HeapScanDesc heap_beginscan_sampling(Relation relation, + Snapshot snapshot, int nkeys, ScanKey key, + bool allow_strat, bool allow_pagemode); extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber endBlk); +extern void heapgetpage(HeapScanDesc scan, BlockNumber page); extern void heap_rescan(HeapScanDesc scan, ScanKey key); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 5a0d724aca..1b9b299395 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -29,6 +29,7 @@ typedef struct HeapScanDescData int rs_nkeys; /* number of scan keys */ ScanKey rs_key; /* array of scan key descriptors */ bool rs_bitmapscan; /* true if this is really a bitmap scan */ + bool rs_samplescan; /* true if this is really a sample scan */ bool rs_pageatatime; /* verify visibility page-at-a-time? 
*/ bool rs_allow_strat; /* allow or disallow use of access strategy */ bool rs_allow_sync; /* allow or disallow use of syncscan */ diff --git a/src/include/access/tablesample.h b/src/include/access/tablesample.h new file mode 100644 index 0000000000..222fa8d556 --- /dev/null +++ b/src/include/access/tablesample.h @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------- + * + * tablesample.h + * Public header file for TABLESAMPLE clause interface + * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tablesample.h + * + *------------------------------------------------------------------------- + */ +#ifndef TABLESAMPLE_H +#define TABLESAMPLE_H + +#include "access/relscan.h" +#include "executor/executor.h" + +typedef struct TableSampleDesc { + HeapScanDesc heapScan; + TupleDesc tupDesc; /* Mostly useful for tsmexaminetuple */ + + void *tsmdata; /* private method data */ + + /* These point to the functions of the TABLESAMPLE method.
*/ + FmgrInfo tsminit; + FmgrInfo tsmnextblock; + FmgrInfo tsmnexttuple; + FmgrInfo tsmexaminetuple; + FmgrInfo tsmreset; + FmgrInfo tsmend; +} TableSampleDesc; + + +extern TableSampleDesc *tablesample_init(SampleScanState *scanstate, + TableSampleClause *tablesample); +extern HeapTuple tablesample_getnext(TableSampleDesc *desc); +extern void tablesample_reset(TableSampleDesc *desc); +extern void tablesample_end(TableSampleDesc *desc); +extern HeapTuple tablesample_source_getnext(TableSampleDesc *desc); +extern HeapTuple tablesample_source_gettup(TableSampleDesc *desc, ItemPointer tid, + bool *visible); + +extern Datum tsm_system_init(PG_FUNCTION_ARGS); +extern Datum tsm_system_nextblock(PG_FUNCTION_ARGS); +extern Datum tsm_system_nexttuple(PG_FUNCTION_ARGS); +extern Datum tsm_system_end(PG_FUNCTION_ARGS); +extern Datum tsm_system_reset(PG_FUNCTION_ARGS); +extern Datum tsm_system_cost(PG_FUNCTION_ARGS); + +extern Datum tsm_bernoulli_init(PG_FUNCTION_ARGS); +extern Datum tsm_bernoulli_nextblock(PG_FUNCTION_ARGS); +extern Datum tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS); +extern Datum tsm_bernoulli_end(PG_FUNCTION_ARGS); +extern Datum tsm_bernoulli_reset(PG_FUNCTION_ARGS); +extern Datum tsm_bernoulli_cost(PG_FUNCTION_ARGS); + + +#endif diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h index 71e0010a6f..f20567ed5f 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -316,6 +316,11 @@ DECLARE_UNIQUE_INDEX(pg_replication_origin_roiident_index, 6001, on pg_replicati DECLARE_UNIQUE_INDEX(pg_replication_origin_roname_index, 6002, on pg_replication_origin using btree(roname varchar_pattern_ops)); #define ReplicationOriginNameIndex 6002 +DECLARE_UNIQUE_INDEX(pg_tablesample_method_name_index, 3331, on pg_tablesample_method using btree(tsmname name_ops)); +#define TableSampleMethodNameIndexId 3331 +DECLARE_UNIQUE_INDEX(pg_tablesample_method_oid_index, 3332, on pg_tablesample_method using btree(oid oid_ops)); +#define 
TableSampleMethodOidIndexId 3332 + /* last step of initialization script: build the indexes declared above */ BUILD_INDICES diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1c9edbc3b3..c2185bd9ad 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5297,6 +5297,33 @@ DESCR("get an individual replication origin's replication progress"); DATA(insert OID = 6014 ( pg_show_replication_origin_status PGNSP PGUID 12 1 100 0 0 f f f f f t v 0 0 2249 "" "{26,25,3220,3220}" "{o,o,o,o}" "{local_id, external_id, remote_lsn, local_lsn}" _null_ _null_ pg_show_replication_origin_status _null_ _null_ _null_ )); DESCR("get progress for all replication origins"); +/* tablesample */ +DATA(insert OID = 3335 ( tsm_system_init PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2278 "2281 23 700" _null_ _null_ _null_ _null_ _null_ tsm_system_init _null_ _null_ _null_ )); +DESCR("tsm_system_init(internal)"); +DATA(insert OID = 3336 ( tsm_system_nextblock PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 23 "2281 16" _null_ _null_ _null_ _null_ _null_ tsm_system_nextblock _null_ _null_ _null_ )); +DESCR("tsm_system_nextblock(internal)"); +DATA(insert OID = 3337 ( tsm_system_nexttuple PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 21 "2281 23 21 16" _null_ _null_ _null_ _null_ _null_ tsm_system_nexttuple _null_ _null_ _null_ )); +DESCR("tsm_system_nexttuple(internal)"); +DATA(insert OID = 3338 ( tsm_system_end PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_system_end _null_ _null_ _null_ )); +DESCR("tsm_system_end(internal)"); +DATA(insert OID = 3339 ( tsm_system_reset PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_system_reset _null_ _null_ _null_ )); +DESCR("tsm_system_reset(internal)"); +DATA(insert OID = 3340 ( tsm_system_cost PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ _null_ tsm_system_cost 
_null_ _null_ _null_ )); +DESCR("tsm_system_cost(internal)"); + +DATA(insert OID = 3341 ( tsm_bernoulli_init PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2278 "2281 23 700" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_init _null_ _null_ _null_ )); +DESCR("tsm_bernoulli_init(internal)"); +DATA(insert OID = 3342 ( tsm_bernoulli_nextblock PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 23 "2281 16" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_nextblock _null_ _null_ _null_ )); +DESCR("tsm_bernoulli_nextblock(internal)"); +DATA(insert OID = 3343 ( tsm_bernoulli_nexttuple PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 21 "2281 23 21 16" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_nexttuple _null_ _null_ _null_ )); +DESCR("tsm_bernoulli_nexttuple(internal)"); +DATA(insert OID = 3344 ( tsm_bernoulli_end PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_end _null_ _null_ _null_ )); +DESCR("tsm_bernoulli_end(internal)"); +DATA(insert OID = 3345 ( tsm_bernoulli_reset PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_reset _null_ _null_ _null_ )); +DESCR("tsm_bernoulli_reset(internal)"); +DATA(insert OID = 3346 ( tsm_bernoulli_cost PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_cost _null_ _null_ _null_ )); +DESCR("tsm_bernoulli_cost(internal)"); + /* * Symbolic values for provolatile column: these indicate whether the result * of a function is dependent *only* on the values of its explicit arguments, diff --git a/src/include/catalog/pg_tablesample_method.h b/src/include/catalog/pg_tablesample_method.h new file mode 100644 index 0000000000..968d1e696a --- /dev/null +++ b/src/include/catalog/pg_tablesample_method.h @@ -0,0 +1,78 @@ +/*------------------------------------------------------------------------- + * + * pg_tablesample_method.h + * definition of the table scan methods. 
+ * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/pg_tablesample_method.h + * + * + *------------------------------------------------------------------------- + */ +#ifndef PG_TABLESAMPLE_METHOD_H +#define PG_TABLESAMPLE_METHOD_H + +#include "catalog/genbki.h" +#include "catalog/objectaddress.h" + +/* ---------------- + * pg_tablesample_method definition. cpp turns this into + * typedef struct FormData_pg_tablesample_method + * ---------------- + */ +#define TableSampleMethodRelationId 3330 + +CATALOG(pg_tablesample_method,3330) +{ + NameData tsmname; /* tablesample method name */ + bool tsmseqscan; /* does this method scan whole table sequentially? */ + bool tsmpagemode; /* does this method scan page at a time? */ + regproc tsminit; /* init scan function */ + regproc tsmnextblock; /* function returning next block to sample + or InvalidBlockNumber if finished */ + regproc tsmnexttuple; /* function returning next tuple offset from current block + or InvalidOffsetNumber if end of the block was reached */ + regproc tsmexaminetuple; /* optional function which can examine tuple contents and + decide if tuple should be returned or not */ + regproc tsmend; /* end scan function */ + regproc tsmreset; /* reset state - used by rescan */ + regproc tsmcost; /* costing function */ +} FormData_pg_tablesample_method; + +/* ---------------- + * Form_pg_tablesample_method corresponds to a pointer to a tuple with + * the format of pg_tablesample_method relation.
+ * ---------------- + */ +typedef FormData_pg_tablesample_method *Form_pg_tablesample_method; + +/* ---------------- + * compiler constants for pg_tablesample_method + * ---------------- + */ +#define Natts_pg_tablesample_method 10 +#define Anum_pg_tablesample_method_tsmname 1 +#define Anum_pg_tablesample_method_tsmseqscan 2 +#define Anum_pg_tablesample_method_tsmpagemode 3 +#define Anum_pg_tablesample_method_tsminit 4 +#define Anum_pg_tablesample_method_tsmnextblock 5 +#define Anum_pg_tablesample_method_tsmnexttuple 6 +#define Anum_pg_tablesample_method_tsmexaminetuple 7 +#define Anum_pg_tablesample_method_tsmend 8 +#define Anum_pg_tablesample_method_tsmreset 9 +#define Anum_pg_tablesample_method_tsmcost 10 + +/* ---------------- + * initial contents of pg_tablesample_method + * ---------------- + */ + +DATA(insert OID = 3333 ( system false true tsm_system_init tsm_system_nextblock tsm_system_nexttuple - tsm_system_end tsm_system_reset tsm_system_cost )); +DESCR("SYSTEM table sampling method"); +DATA(insert OID = 3334 ( bernoulli true false tsm_bernoulli_init tsm_bernoulli_nextblock tsm_bernoulli_nexttuple - tsm_bernoulli_end tsm_bernoulli_reset tsm_bernoulli_cost )); +DESCR("BERNOULLI table sampling method"); + +#endif /* PG_TABLESAMPLE_METHOD_H */ diff --git a/src/include/executor/nodeSamplescan.h b/src/include/executor/nodeSamplescan.h new file mode 100644 index 0000000000..4b769daec8 --- /dev/null +++ b/src/include/executor/nodeSamplescan.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * + * nodeSamplescan.h + * + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeSamplescan.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODESAMPLESCAN_H +#define NODESAMPLESCAN_H + +#include "nodes/execnodes.h" + +extern SampleScanState 
*ExecInitSampleScan(SampleScan *node, EState *estate, int eflags); +extern TupleTableSlot *ExecSampleScan(SampleScanState *node); +extern void ExecEndSampleScan(SampleScanState *node); +extern void ExecReScanSampleScan(SampleScanState *node); + +#endif /* NODESAMPLESCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index fcfe1107f9..972368019a 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1237,6 +1237,15 @@ typedef struct ScanState */ typedef ScanState SeqScanState; +/* + * SampleScan + */ +typedef struct SampleScanState +{ + ScanState ss; + struct TableSampleDesc *tsdesc; +} SampleScanState; + /* * These structs store information about index quals that don't have simple * constant right-hand sides. See comments for ExecIndexBuildScanKeys() diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 768f413a45..8b275f6e26 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -61,6 +61,7 @@ typedef enum NodeTag T_ValuesScan, T_CteScan, T_WorkTableScan, + T_SampleScan, T_ForeignScan, T_CustomScan, T_Join, @@ -97,6 +98,7 @@ typedef enum NodeTag T_BitmapOrState, T_ScanState, T_SeqScanState, + T_SampleScanState, T_IndexScanState, T_IndexOnlyScanState, T_BitmapIndexScanState, @@ -419,6 +421,8 @@ typedef enum NodeTag T_OnConflictClause, T_CommonTableExpr, T_RoleSpec, + T_RangeTableSample, + T_TableSampleClause, /* * TAGS FOR REPLICATION GRAMMAR PARSE NODES (replnodes.h) diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 053f1b0121..6723f46f3f 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -335,6 +335,26 @@ typedef struct FuncCall int location; /* token location, or -1 if unknown */ } FuncCall; +/* + * TableSampleClause - a sampling method information + */ +typedef struct TableSampleClause +{ + NodeTag type; + Oid tsmid; + bool tsmseqscan; + bool tsmpagemode; + Oid tsminit; + Oid tsmnextblock; + Oid 
tsmnexttuple; + Oid tsmexaminetuple; + Oid tsmend; + Oid tsmreset; + Oid tsmcost; + Node *repeatable; + List *args; +} TableSampleClause; + /* * A_Star - '*' representing all columns of a table or compound field * @@ -535,6 +555,22 @@ typedef struct RangeFunction * of function returning RECORD */ } RangeFunction; +/* + * RangeTableSample - represents <table> TABLESAMPLE <method> (<params>) REPEATABLE (<num>) + * + * SQL Standard specifies only one parameter which is percentage. But we allow + * custom tablesample methods which may need different input arguments so we + * accept list of arguments. + */ +typedef struct RangeTableSample +{ + NodeTag type; + RangeVar *relation; + char *method; /* sampling method */ + Node *repeatable; + List *args; /* arguments for sampling method */ +} RangeTableSample; + /* * ColumnDef - column definition (used in various creates) * @@ -772,6 +808,7 @@ typedef struct RangeTblEntry */ Oid relid; /* OID of the relation */ char relkind; /* relation kind (see pg_class.relkind) */ + TableSampleClause *tablesample; /* sampling method and parameters */ /* * Fields valid for a subquery RTE (else NULL): diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 65f71d8170..4e655b0e6c 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -286,6 +286,12 @@ typedef struct Scan */ typedef Scan SeqScan; +/* ---------------- + * table sample scan node + * ---------------- + */ +typedef Scan SampleScan; + /* ---------------- * index scan node * diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 9c2000b15a..24003ae359 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -68,6 +68,7 @@ extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, double index_pages, PlannerInfo *root); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, ParamPathInfo *param_info); +extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo
*baserel); extern void cost_index(IndexPath *path, PlannerInfo *root, double loop_count); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 9923f0eb3e..89c8deda95 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -32,6 +32,8 @@ extern bool add_path_precheck(RelOptInfo *parent_rel, extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer); +extern Path *create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, + Relids required_outer); extern IndexPath *create_index_path(PlannerInfo *root, IndexOptInfo *index, List *indexclauses, diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index faea99108c..7d5f857ae5 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -368,6 +368,7 @@ PG_KEYWORD("sysid", SYSID, UNRESERVED_KEYWORD) PG_KEYWORD("system", SYSTEM_P, UNRESERVED_KEYWORD) PG_KEYWORD("table", TABLE, RESERVED_KEYWORD) PG_KEYWORD("tables", TABLES, UNRESERVED_KEYWORD) +PG_KEYWORD("tablesample", TABLESAMPLE, TYPE_FUNC_NAME_KEYWORD) PG_KEYWORD("tablespace", TABLESPACE, UNRESERVED_KEYWORD) PG_KEYWORD("temp", TEMP, UNRESERVED_KEYWORD) PG_KEYWORD("template", TEMPLATE, UNRESERVED_KEYWORD) diff --git a/src/include/parser/parse_func.h b/src/include/parser/parse_func.h index 32646918e2..40c007c35f 100644 --- a/src/include/parser/parse_func.h +++ b/src/include/parser/parse_func.h @@ -33,6 +33,11 @@ typedef enum extern Node *ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, FuncCall *fn, int location); +extern TableSampleClause *ParseTableSample(ParseState *pstate, + char *samplemethod, + Node *repeatable, List *args, + int location); + extern FuncDetailCode func_get_detail(List *funcname, List *fargs, List *fargnames, int nargs, Oid *argtypes, diff --git a/src/include/port.h b/src/include/port.h index 3787cbfb76..71113c0394 
100644 --- a/src/include/port.h +++ b/src/include/port.h @@ -357,6 +357,10 @@ extern off_t ftello(FILE *stream); #endif #endif +#define RAND48_SEED_0 (0x330e) +#define RAND48_SEED_1 (0xabcd) +#define RAND48_SEED_2 (0x1234) + extern double pg_erand48(unsigned short xseed[3]); extern long pg_lrand48(void); extern void pg_srand48(long seed); diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index ffbaa61e5e..e2e5734ea7 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -156,6 +156,7 @@ extern void free_attstatsslot(Oid atttype, extern char *get_namespace_name(Oid nspid); extern char *get_namespace_name_or_temp(Oid nspid); extern Oid get_range_subtype(Oid rangeOid); +extern char *get_tablesample_method_name(Oid tsmid); #define type_is_array(typid) (get_element_type(typid) != InvalidOid) /* type_is_array_domain accepts both plain arrays and domains over arrays */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 9e17d87413..fd40366bcd 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -63,7 +63,6 @@ typedef struct RelationAmInfo FmgrInfo amcanreturn; } RelationAmInfo; - /* * Here are the contents of a relation cache entry. 
*/ diff --git a/src/include/utils/sampling.h b/src/include/utils/sampling.h index e3e7f9cf6a..4ac208dc36 100644 --- a/src/include/utils/sampling.h +++ b/src/include/utils/sampling.h @@ -15,7 +15,12 @@ #include "storage/bufmgr.h" -extern double sampler_random_fract(void); +/* Random generator for sampling code */ +typedef unsigned short SamplerRandomState[3]; + +extern void sampler_random_init_state(long seed, + SamplerRandomState randstate); +extern double sampler_random_fract(SamplerRandomState randstate); /* Block sampling methods */ /* Data structure for Algorithm S from Knuth 3.4.2 */ @@ -25,6 +30,7 @@ typedef struct int n; /* desired sample size */ BlockNumber t; /* current block number */ int m; /* blocks selected so far */ + SamplerRandomState randstate; /* random generator state */ } BlockSamplerData; typedef BlockSamplerData *BlockSampler; @@ -35,7 +41,12 @@ extern bool BlockSampler_HasMore(BlockSampler bs); extern BlockNumber BlockSampler_Next(BlockSampler bs); /* Reservoid sampling methods */ -typedef double ReservoirStateData; +typedef struct +{ + double W; + SamplerRandomState randstate; /* random generator state */ +} ReservoirStateData; + typedef ReservoirStateData *ReservoirState; extern void reservoir_init_selection_state(ReservoirState rs, int n); diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 6634099cbe..2dbd38488b 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -81,6 +81,8 @@ enum SysCacheIdentifier REPLORIGNAME, RULERELNAME, STATRELATTINH, + TABLESAMPLEMETHODNAME, + TABLESAMPLEMETHODOID, TABLESPACEOID, TRFOID, TRFTYPELANG, diff --git a/src/port/erand48.c b/src/port/erand48.c index 9d471197c3..12efd8193c 100644 --- a/src/port/erand48.c +++ b/src/port/erand48.c @@ -33,9 +33,6 @@ #include <math.h> -#define RAND48_SEED_0 (0x330e) -#define RAND48_SEED_1 (0xabcd) -#define RAND48_SEED_2 (0x1234) #define RAND48_MULT_0 (0xe66d) #define RAND48_MULT_1 (0xdeec) #define RAND48_MULT_2 (0x0005) diff --git
a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 82bc47895a..0ae555783b 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -101,6 +101,17 @@ NOTICE: f_leak => great manga 44 | 8 | 1 | rls_regress_user2 | great manga | manga (4 rows) +SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; +NOTICE: f_leak => my first novel +NOTICE: f_leak => my first manga +NOTICE: f_leak => great science fiction + did | cid | dlevel | dauthor | dtitle +-----+-----+--------+-------------------+----------------------- + 1 | 11 | 1 | rls_regress_user1 | my first novel + 4 | 44 | 1 | rls_regress_user1 | my first manga + 6 | 22 | 1 | rls_regress_user2 | great science fiction +(3 rows) + -- viewpoint from rls_regress_user2 SET SESSION AUTHORIZATION rls_regress_user2; SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did; @@ -145,6 +156,21 @@ NOTICE: f_leak => great manga 44 | 8 | 1 | rls_regress_user2 | great manga | manga (8 rows) +SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; +NOTICE: f_leak => my first novel +NOTICE: f_leak => my second novel +NOTICE: f_leak => my first manga +NOTICE: f_leak => great science fiction +NOTICE: f_leak => great technology book + did | cid | dlevel | dauthor | dtitle +-----+-----+--------+-------------------+----------------------- + 1 | 11 | 1 | rls_regress_user1 | my first novel + 2 | 11 | 2 | rls_regress_user1 | my second novel + 4 | 44 | 1 | rls_regress_user1 | my first manga + 6 | 22 | 1 | rls_regress_user2 | great science fiction + 7 | 33 | 2 | rls_regress_user2 | great technology book +(5 rows) + EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle); QUERY PLAN ---------------------------------------------------------- diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 
eb0bc88ef1..14acd16da3 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -128,6 +128,7 @@ pg_shdepend|t pg_shdescription|t pg_shseclabel|t pg_statistic|t +pg_tablesample_method|t pg_tablespace|t pg_transform|t pg_trigger|t diff --git a/src/test/regress/expected/tablesample.out b/src/test/regress/expected/tablesample.out new file mode 100644 index 0000000000..04e5eb8b80 --- /dev/null +++ b/src/test/regress/expected/tablesample.out @@ -0,0 +1,231 @@ +CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages +INSERT INTO test_tablesample SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i) ORDER BY i; +SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (10); + id +---- + 0 + 1 + 2 + 3 + 4 + 5 + 9 +(7 rows) + +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE (9999); + id +---- + 6 + 7 + 8 +(3 rows) + +SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100); + count +------- + 10 +(1 row) + +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); + id +---- + 0 + 1 + 2 + 6 + 7 + 8 + 9 +(7 rows) + +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (100); + id +---- + 0 + 1 + 3 + 4 + 5 +(5 rows) + +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); + id +---- + 0 + 5 +(2 rows) + +CREATE VIEW test_tablesample_v1 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); +CREATE VIEW test_tablesample_v2 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); +SELECT pg_get_viewdef('test_tablesample_v1'::regclass); + pg_get_viewdef +-------------------------------------------------------------------------------- + SELECT test_tablesample.id + + FROM test_tablesample TABLESAMPLE system (((10 * 2))::real) REPEATABLE (2); +(1 row) + +SELECT 
pg_get_viewdef('test_tablesample_v2'::regclass); + pg_get_viewdef +----------------------------------------------------------- + SELECT test_tablesample.id + + FROM test_tablesample TABLESAMPLE system ((99)::real); +(1 row) + +BEGIN; +DECLARE tablesample_cur CURSOR FOR SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); +FETCH FIRST FROM tablesample_cur; + id +---- + 0 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 1 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 2 +(1 row) + +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); + id +---- + 0 + 1 + 2 + 3 + 4 + 5 + 9 +(7 rows) + +FETCH NEXT FROM tablesample_cur; + id +---- + 6 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 7 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 8 +(1 row) + +FETCH FIRST FROM tablesample_cur; + id +---- + 0 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 1 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 2 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 6 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 7 +(1 row) + +FETCH NEXT FROM tablesample_cur; + id +---- + 8 +(1 row) + +CLOSE tablesample_cur; +END; +EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); + QUERY PLAN +------------------------------------------------------------------------------- + Sample Scan (system) on test_tablesample (cost=0.00..26.35 rows=635 width=4) +(1 row) + +EXPLAIN SELECT * FROM test_tablesample_v1; + QUERY PLAN +------------------------------------------------------------------------------- + Sample Scan (system) on test_tablesample (cost=0.00..10.54 rows=254 width=4) +(1 row) + +-- errors +SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); +ERROR: tablesample method "foobar" does not exist +LINE 1: SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); + ^ +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (NULL); +ERROR: REPEATABLE 
clause must be NOT NULL numeric value +LINE 1: ... test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (NULL); + ^ +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (-1); +ERROR: invalid sample size +HINT: Sample size must be numeric value between 0 and 100 (inclusive). +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (200); +ERROR: invalid sample size +HINT: Sample size must be numeric value between 0 and 100 (inclusive). +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (-1); +ERROR: invalid sample size +HINT: Sample size must be numeric value between 0 and 100 (inclusive). +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (200); +ERROR: invalid sample size +HINT: Sample size must be numeric value between 0 and 100 (inclusive). +SELECT id FROM test_tablesample_v1 TABLESAMPLE BERNOULLI (1); +ERROR: TABLESAMPLE clause can only be used on tables and materialized views +LINE 1: SELECT id FROM test_tablesample_v1 TABLESAMPLE BERNOULLI (1)... + ^ +INSERT INTO test_tablesample_v1 VALUES(1); +ERROR: cannot insert into view "test_tablesample_v1" +DETAIL: Views containing TABLESAMPLE are not automatically updatable. +HINT: To enable inserting into the view, provide an INSTEAD OF INSERT trigger or an unconditional ON INSERT DO INSTEAD rule. +WITH query_select AS (SELECT * FROM test_tablesample) +SELECT * FROM query_select TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); +ERROR: TABLESAMPLE clause can only be used on tables and materialized views +LINE 2: SELECT * FROM query_select TABLESAMPLE BERNOULLI (5.5) REPEA... + ^ +SELECT q.* FROM (SELECT * FROM test_tablesample) as q TABLESAMPLE BERNOULLI (5); +ERROR: syntax error at or near "TABLESAMPLE" +LINE 1: ...CT q.* FROM (SELECT * FROM test_tablesample) as q TABLESAMPL... 
+ ^ +-- catalog sanity +SELECT * +FROM pg_tablesample_method +WHERE tsminit IS NULL + OR tsmseqscan IS NULL + OR tsmpagemode IS NULL + OR tsmnextblock IS NULL + OR tsmnexttuple IS NULL + OR tsmend IS NULL + OR tsmreset IS NULL + OR tsmcost IS NULL; + tsmname | tsmseqscan | tsmpagemode | tsminit | tsmnextblock | tsmnexttuple | tsmexaminetuple | tsmend | tsmreset | tsmcost +---------+------------+-------------+---------+--------------+--------------+-----------------+--------+----------+--------- +(0 rows) + +-- done +DROP TABLE test_tablesample CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to view test_tablesample_v1 +drop cascades to view test_tablesample_v2 diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index b0ebb6b3f4..f39b73abc2 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -84,7 +84,7 @@ test: select_into select_distinct select_distinct_on select_implicit select_havi # ---------- # Another group of parallel tests # ---------- -test: brin gin gist spgist privileges security_label collate matview lock replica_identity rowsecurity object_address +test: brin gin gist spgist privileges security_label collate matview lock replica_identity rowsecurity object_address tablesample # ---------- # Another group of parallel tests diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 8409c0f3ef..9441b97e3a 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -154,3 +154,4 @@ test: with test: xml test: event_trigger test: stats +test: tablesample diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index e8c5932b20..fdadf99fd6 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -94,11 +94,15 @@ SET row_security TO ON; SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did; SELECT * FROM document NATURAL JOIN category 
WHERE f_leak(dtitle) ORDER BY did; +SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; + -- viewpoint from rls_regress_user2 SET SESSION AUTHORIZATION rls_regress_user2; SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did; SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did; +SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; + EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle); EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle); diff --git a/src/test/regress/sql/tablesample.sql b/src/test/regress/sql/tablesample.sql new file mode 100644 index 0000000000..7b3eb9bedf --- /dev/null +++ b/src/test/regress/sql/tablesample.sql @@ -0,0 +1,74 @@ +CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages + +INSERT INTO test_tablesample SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i) ORDER BY i; + +SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (10); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE (9999); +SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (100); +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); + +CREATE VIEW test_tablesample_v1 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); +CREATE VIEW test_tablesample_v2 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); +SELECT pg_get_viewdef('test_tablesample_v1'::regclass); +SELECT pg_get_viewdef('test_tablesample_v2'::regclass); + +BEGIN; +DECLARE tablesample_cur CURSOR FOR SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); +FETCH FIRST 
FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; + +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); + +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; + +FETCH FIRST FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; +FETCH NEXT FROM tablesample_cur; + +CLOSE tablesample_cur; +END; + +EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); +EXPLAIN SELECT * FROM test_tablesample_v1; + +-- errors +SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); + +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (NULL); + +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (-1); +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (200); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (-1); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (200); + +SELECT id FROM test_tablesample_v1 TABLESAMPLE BERNOULLI (1); +INSERT INTO test_tablesample_v1 VALUES(1); + +WITH query_select AS (SELECT * FROM test_tablesample) +SELECT * FROM query_select TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); + +SELECT q.* FROM (SELECT * FROM test_tablesample) as q TABLESAMPLE BERNOULLI (5); + +-- catalog sanity + +SELECT * +FROM pg_tablesample_method +WHERE tsminit IS NULL + OR tsmseqscan IS NULL + OR tsmpagemode IS NULL + OR tsmnextblock IS NULL + OR tsmnexttuple IS NULL + OR tsmend IS NULL + OR tsmreset IS NULL + OR tsmcost IS NULL; + +-- done +DROP TABLE test_tablesample CASCADE;