diff --git a/contrib/tsm_system_rows/tsm_system_rows.c b/contrib/tsm_system_rows/tsm_system_rows.c index 1d35ea3c53..3611d05833 100644 --- a/contrib/tsm_system_rows/tsm_system_rows.c +++ b/contrib/tsm_system_rows/tsm_system_rows.c @@ -28,7 +28,6 @@ #include "postgres.h" -#include "access/heapam.h" #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" @@ -46,7 +45,6 @@ typedef struct { uint32 seed; /* random seed */ int64 ntuples; /* number of tuples to return */ - int64 donetuples; /* number of tuples already returned */ OffsetNumber lt; /* last tuple returned from current block */ BlockNumber doneblocks; /* number of already-scanned blocks */ BlockNumber lb; /* last block visited */ @@ -67,11 +65,10 @@ static void system_rows_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); -static BlockNumber system_rows_nextsampleblock(SampleScanState *node); +static BlockNumber system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_rows_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); -static bool SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan); static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate); @@ -187,7 +184,6 @@ system_rows_beginsamplescan(SampleScanState *node, sampler->seed = seed; sampler->ntuples = ntuples; - sampler->donetuples = 0; sampler->lt = InvalidOffsetNumber; sampler->doneblocks = 0; /* lb will be initialized during first NextSampleBlock call */ @@ -206,11 +202,9 @@ system_rows_beginsamplescan(SampleScanState *node, * Uses linear probing algorithm for picking next block. */ static BlockNumber -system_rows_nextsampleblock(SampleScanState *node) +system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; /* First call within scan? */ if (sampler->doneblocks == 0) @@ -222,14 +216,14 @@ system_rows_nextsampleblock(SampleScanState *node) SamplerRandomState randstate; /* If relation is empty, there's nothing to scan */ - if (hscan->rs_nblocks == 0) + if (nblocks == 0) return InvalidBlockNumber; /* We only need an RNG during this setup step */ sampler_random_init_state(sampler->seed, randstate); /* Compute nblocks/firstblock/step only once per query */ - sampler->nblocks = hscan->rs_nblocks; + sampler->nblocks = nblocks; /* Choose random starting block within the relation */ /* (Actually this is the predecessor of the first block visited) */ @@ -246,7 +240,7 @@ system_rows_nextsampleblock(SampleScanState *node) /* If we've read all blocks or returned all needed tuples, we're done */ if (++sampler->doneblocks > sampler->nblocks || - sampler->donetuples >= sampler->ntuples) + node->donetuples >= sampler->ntuples) return InvalidBlockNumber; /* @@ -259,7 +253,7 @@ system_rows_nextsampleblock(SampleScanState *node) { /* Advance lb, using uint64 arithmetic to forestall overflow */ sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks; - } while (sampler->lb >= hscan->rs_nblocks); + } while (sampler->lb >= nblocks); return sampler->lb; } @@ -279,77 +273,27 @@ system_rows_nextsampletuple(SampleScanState *node, OffsetNumber maxoffset) { SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; OffsetNumber tupoffset = sampler->lt; /* Quit if we've returned all needed tuples */ - if (sampler->donetuples >= sampler->ntuples) + if (node->donetuples >= sampler->ntuples) return InvalidOffsetNumber; - /* - * Because we should only count visible tuples as being returned, we need - * to search for a visible tuple rather than just let the core code do it. - */ + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; - /* We rely on the data accumulated in pagemode access */ - Assert(scan->rs_pageatatime); - for (;;) - { - /* Advance to next possible offset on page */ - if (tupoffset == InvalidOffsetNumber) - tupoffset = FirstOffsetNumber; - else - tupoffset++; - - /* Done? */ - if (tupoffset > maxoffset) - { - tupoffset = InvalidOffsetNumber; - break; - } - - /* Found a candidate? */ - if (SampleOffsetVisible(tupoffset, hscan)) - { - sampler->donetuples++; - break; - } - } + /* Done? */ + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; sampler->lt = tupoffset; return tupoffset; } -/* - * Check if tuple offset is visible - * - * In pageatatime mode, heapgetpage() already did visibility checks, - * so just look at the info it left in rs_vistuples[]. - */ -static bool -SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan) -{ - int start = 0, - end = scan->rs_ntuples - 1; - - while (start <= end) - { - int mid = (start + end) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (tupoffset == curoffset) - return true; - else if (tupoffset < curoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; -} - /* * Compute greatest common divisor of two uint32's. */ diff --git a/contrib/tsm_system_time/tsm_system_time.c b/contrib/tsm_system_time/tsm_system_time.c index 1cc7264e08..ab9f685f0a 100644 --- a/contrib/tsm_system_time/tsm_system_time.c +++ b/contrib/tsm_system_time/tsm_system_time.c @@ -26,7 +26,6 @@ #include -#include "access/heapam.h" #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" @@ -66,7 +65,7 @@ static void system_time_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); -static BlockNumber system_time_nextsampleblock(SampleScanState *node); +static BlockNumber system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_time_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); @@ -213,11 +212,9 @@ system_time_beginsamplescan(SampleScanState *node, * Uses linear probing algorithm for picking next block. */ static BlockNumber -system_time_nextsampleblock(SampleScanState *node) +system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; instr_time cur_time; /* First call within scan? */ @@ -230,14 +227,14 @@ system_time_nextsampleblock(SampleScanState *node) SamplerRandomState randstate; /* If relation is empty, there's nothing to scan */ - if (hscan->rs_nblocks == 0) + if (nblocks == 0) return InvalidBlockNumber; /* We only need an RNG during this setup step */ sampler_random_init_state(sampler->seed, randstate); /* Compute nblocks/firstblock/step only once per query */ - sampler->nblocks = hscan->rs_nblocks; + sampler->nblocks = nblocks; /* Choose random starting block within the relation */ /* (Actually this is the predecessor of the first block visited) */ @@ -273,7 +270,7 @@ system_time_nextsampleblock(SampleScanState *node) { /* Advance lb, using uint64 arithmetic to forestall overflow */ sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks; - } while (sampler->lb >= hscan->rs_nblocks); + } while (sampler->lb >= nblocks); return sampler->lb; } diff --git a/doc/src/sgml/tablesample-method.sgml b/doc/src/sgml/tablesample-method.sgml index b84b7ba885..880e42c950 100644 --- a/doc/src/sgml/tablesample-method.sgml +++ b/doc/src/sgml/tablesample-method.sgml @@ -227,7 +227,7 @@ BeginSampleScan (SampleScanState *node, BlockNumber -NextSampleBlock (SampleScanState *node); +NextSampleBlock (SampleScanState *node, BlockNumber nblocks); Returns the block number of the next page to be scanned, or @@ -262,10 +262,9 @@ NextSampleTuple (SampleScanState *node, numbers in the range 1 .. maxoffset actually contain valid tuples. This is not normally a problem since the core code ignores requests to sample missing or invisible tuples; that should not result in - any bias in the sample. However, if necessary, the function can - examine node->ss.ss_currentScanDesc->rs_vistuples[] - to identify which tuples are valid and visible. (This - requires node->use_pagemode to be true.) + any bias in the sample. However, if necessary, the function can use + node->donetuples to examine how many of the tuples + it returned were vlaid and visible. diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 314806bdcd..5e8bf0aa5c 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -28,6 +28,7 @@ #include "access/multixact.h" #include "access/rewriteheap.h" #include "access/tableam.h" +#include "access/tsmapi.h" #include "access/xact.h" #include "catalog/catalog.h" #include "catalog/index.h" @@ -52,6 +53,9 @@ static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); +static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset); static const TableAmRoutine heapam_methods; @@ -1943,6 +1947,181 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths, } +/* ------------------------------------------------------------------------ + * Executor related callbacks for the heap AM + * ------------------------------------------------------------------------ + */ + +static bool +heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno; + + /* return false immediately if relation is empty */ + if (hscan->rs_nblocks == 0) + return false; + + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks); + hscan->rs_cblock = blockno; + } + else + { + /* scanning table sequentially */ + + if (hscan->rs_cblock == InvalidBlockNumber) + { + Assert(!hscan->rs_inited); + blockno = hscan->rs_startblock; + } + else + { + Assert(hscan->rs_inited); + + blockno = hscan->rs_cblock + 1; + + if (blockno >= hscan->rs_nblocks) + { + /* wrap to begining of rel, might not have started at 0 */ + blockno = 0; + } + + /* + * Report our new scan position for synchronization purposes. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, blockno); + + if (blockno == hscan->rs_startblock) + { + blockno = InvalidBlockNumber; + } + } + } + + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(hscan->rs_cbuf)) + ReleaseBuffer(hscan->rs_cbuf); + hscan->rs_cbuf = InvalidBuffer; + hscan->rs_cblock = InvalidBlockNumber; + hscan->rs_inited = false; + + return false; + } + + heapgetpage(scan, blockno); + hscan->rs_inited = true; + + return true; +} + +static bool +heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, + TupleTableSlot *slot) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + TsmRoutine *tsm = scanstate->tsmroutine; + BlockNumber blockno = hscan->rs_cblock; + bool pagemode = scan->rs_pageatatime; + + Page page; + bool all_visible; + OffsetNumber maxoffset; + + /* + * When not using pagemode, we must lock the buffer during tuple + * visibility checks. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(hscan->rs_cbuf); + all_visible = PageIsAllVisible(page) && + !scan->rs_snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + OffsetNumber tupoffset; + + CHECK_FOR_INTERRUPTS(); + + /* Ask the tablesample method which tuples to check on this page. */ + tupoffset = tsm->NextSampleTuple(scanstate, + blockno, + maxoffset); + + if (OffsetNumberIsValid(tupoffset)) + { + ItemId itemid; + bool visible; + HeapTuple tuple = &(hscan->rs_ctup); + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + + if (all_visible) + visible = true; + else + visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf, + tuple, tupoffset); + + /* in pagemode, heapgetpage did this for us */ + if (!pagemode) + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + hscan->rs_cbuf, scan->rs_snapshot); + + /* Try next tuple from same page. */ + if (!visible) + continue; + + /* Found visible tuple, return it. */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf); + + /* Count successfully-fetched tuples as heap fetches */ + pgstat_count_heap_getnext(scan->rs_rd); + + return true; + } + else + { + /* + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. + */ + if (!pagemode) + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + ExecClearTuple(slot); + return false; + } + } + + Assert(0); +} + + /* ---------------------------------------------------------------------------- * Helper functions for the above. * ---------------------------------------------------------------------------- @@ -1991,6 +2170,53 @@ reform_and_rewrite_tuple(HeapTuple tuple, heap_freetuple(copiedTuple); } +/* + * Check visibility of the tuple. + */ +static bool +SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, + HeapTuple tuple, + OffsetNumber tupoffset) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + if (scan->rs_pageatatime) + { + /* + * In pageatatime mode, heapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. + * + * We use a binary search over the known-sorted array. Note: we could + * save some effort if we insisted that NextSampleTuple select tuples + * in increasing order, but it's not clear that there would be enough + * gain to justify the restriction. + */ + int start = 0, + end = hscan->rs_ntuples - 1; + + while (start <= end) + { + int mid = (start + end) / 2; + OffsetNumber curoffset = hscan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* Otherwise, we have to check the tuple individually. */ + return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot, + buffer); + } +} + /* ------------------------------------------------------------------------ * Definition of the heap table access method. @@ -2039,6 +2265,9 @@ static const TableAmRoutine heapam_methods = { .index_validate_scan = heapam_index_validate_scan, .relation_estimate_size = heapam_estimate_rel_size, + + .scan_sample_next_block = heapam_scan_sample_next_block, + .scan_sample_next_tuple = heapam_scan_sample_next_tuple }; diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 96660bf907..db937829d4 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -89,6 +89,9 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->index_validate_scan != NULL); Assert(routine->relation_estimate_size != NULL); + Assert(routine->scan_sample_next_block != NULL); + Assert(routine->scan_sample_next_tuple != NULL); + return routine; } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index 26f7de3e45..b1fb9b3201 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -26,7 +26,6 @@ #include -#include "access/heapam.h" #include "access/relscan.h" #include "access/tsmapi.h" #include "catalog/pg_type.h" @@ -56,7 +55,7 @@ static void system_beginsamplescan(SampleScanState *node, Datum *params, int nparams, uint32 seed); -static BlockNumber system_nextsampleblock(SampleScanState *node); +static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); static OffsetNumber system_nextsampletuple(SampleScanState *node, BlockNumber blockno, OffsetNumber maxoffset); @@ -177,11 +176,9 @@ system_beginsamplescan(SampleScanState *node, * Select next block to sample. */ static BlockNumber -system_nextsampleblock(SampleScanState *node) +system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) { SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; - TableScanDesc scan = node->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; BlockNumber nextblock = sampler->nextblock; uint32 hashinput[2]; @@ -200,7 +197,7 @@ system_nextsampleblock(SampleScanState *node) * Loop over block numbers until finding suitable block or reaching end of * relation. */ - for (; nextblock < hscan->rs_nblocks; nextblock++) + for (; nextblock < nblocks; nextblock++) { uint32 hash; @@ -212,7 +209,7 @@ system_nextsampleblock(SampleScanState *node) break; } - if (nextblock < hscan->rs_nblocks) + if (nextblock < nblocks) { /* Found a suitable block; remember where we should start next time */ sampler->nextblock = nextblock + 1; diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 817e4ca41f..14a0a6357d 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -14,7 +14,6 @@ */ #include "postgres.h" -#include "access/heapam.h" #include "access/relscan.h" #include "access/tableam.h" #include "access/tsmapi.h" @@ -29,9 +28,7 @@ static TupleTableSlot *SampleNext(SampleScanState *node); static void tablesample_init(SampleScanState *scanstate); -static HeapTuple tablesample_getnext(SampleScanState *scanstate); -static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, - HeapScanDesc scan); +static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate); /* ---------------------------------------------------------------- * Scan Support @@ -47,10 +44,6 @@ static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, static TupleTableSlot * SampleNext(SampleScanState *node) { - HeapTuple tuple; - TupleTableSlot *slot; - HeapScanDesc hscan; - /* * if this is first call within a scan, initialize */ @@ -60,19 +53,7 @@ SampleNext(SampleScanState *node) /* * get the next tuple, and store it in our result slot */ - tuple = tablesample_getnext(node); - - slot = node->ss.ss_ScanTupleSlot; - hscan = (HeapScanDesc) node->ss.ss_currentScanDesc; - - if (tuple) - ExecStoreBufferHeapTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - hscan->rs_cbuf); /* tuple's buffer */ - else - ExecClearTuple(slot); - - return slot; + return tablesample_getnext(node); } /* @@ -237,6 +218,9 @@ ExecReScanSampleScan(SampleScanState *node) { /* Remember we need to do BeginSampleScan again (if we did it at all) */ node->begun = false; + node->done = false; + node->haveblock = false; + node->donetuples = 0; ExecScanReScan(&node->ss); } @@ -258,6 +242,7 @@ tablesample_init(SampleScanState *scanstate) int i; ListCell *arg; + scanstate->donetuples = 0; params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); i = 0; @@ -345,225 +330,49 @@ tablesample_init(SampleScanState *scanstate) /* * Get next tuple from TABLESAMPLE method. - * - * Note: an awful lot of this is copied-and-pasted from heapam.c. It would - * perhaps be better to refactor to share more code. */ -static HeapTuple +static TupleTableSlot * tablesample_getnext(SampleScanState *scanstate) { - TsmRoutine *tsm = scanstate->tsmroutine; TableScanDesc scan = scanstate->ss.ss_currentScanDesc; - HeapScanDesc hscan = (HeapScanDesc) scan; - HeapTuple tuple = &(hscan->rs_ctup); - Snapshot snapshot = scan->rs_snapshot; - bool pagemode = scan->rs_pageatatime; - BlockNumber blockno; - Page page; - bool all_visible; - OffsetNumber maxoffset; + TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot; - if (!hscan->rs_inited) - { - /* - * return null immediately if relation is empty - */ - if (hscan->rs_nblocks == 0) - { - Assert(!BufferIsValid(hscan->rs_cbuf)); - tuple->t_data = NULL; - return NULL; - } - if (tsm->NextSampleBlock) - { - blockno = tsm->NextSampleBlock(scanstate); - if (!BlockNumberIsValid(blockno)) - { - tuple->t_data = NULL; - return NULL; - } - } - else - blockno = hscan->rs_startblock; - Assert(blockno < hscan->rs_nblocks); - heapgetpage(scan, blockno); - hscan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - blockno = hscan->rs_cblock; /* current page */ - } + ExecClearTuple(slot); - /* - * When not using pagemode, we must lock the buffer during tuple - * visibility checks. - */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(hscan->rs_cbuf); - all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - maxoffset = PageGetMaxOffsetNumber(page); + if (scanstate->done) + return NULL; for (;;) { - OffsetNumber tupoffset; - bool finished; - - CHECK_FOR_INTERRUPTS(); - - /* Ask the tablesample method which tuples to check on this page. */ - tupoffset = tsm->NextSampleTuple(scanstate, - blockno, - maxoffset); - - if (OffsetNumberIsValid(tupoffset)) + if (!scanstate->haveblock) { - ItemId itemid; - bool visible; - - /* Skip invalid tuple pointers. */ - itemid = PageGetItemId(page, tupoffset); - if (!ItemIdIsNormal(itemid)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); - - if (all_visible) - visible = true; - else - visible = SampleTupleVisible(tuple, tupoffset, hscan); - - /* in pagemode, heapgetpage did this for us */ - if (!pagemode) - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, - hscan->rs_cbuf, snapshot); - - if (visible) + if (!table_scan_sample_next_block(scan, scanstate)) { - /* Found visible tuple, return it. */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - break; - } - else - { - /* Try next tuple from same page. */ - continue; + scanstate->haveblock = false; + scanstate->done = true; + + /* exhausted relation */ + return NULL; } + + scanstate->haveblock = true; } - /* - * if we get here, it means we've exhausted the items on this page and - * it's time to move to the next. - */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - if (tsm->NextSampleBlock) + if (!table_scan_sample_next_tuple(scan, scanstate, slot)) { - blockno = tsm->NextSampleBlock(scanstate); - Assert(!scan->rs_syncscan); - finished = !BlockNumberIsValid(blockno); - } - else - { - /* Without NextSampleBlock, just do a plain forward seqscan. */ - blockno++; - if (blockno >= hscan->rs_nblocks) - blockno = 0; - /* - * Report our new scan position for synchronization purposes. - * - * Note: we do this before checking for end of scan so that the - * final state of the position hint is back at the start of the - * rel. That's not strictly necessary, but otherwise when you run - * the same query multiple times the starting position would shift - * a little bit backwards on every invocation, which is confusing. - * We don't guarantee any specific ordering in general, though. + * If we get here, it means we've exhausted the items on this page + * and it's time to move to the next. */ - if (scan->rs_syncscan) - ss_report_location(scan->rs_rd, blockno); - - finished = (blockno == hscan->rs_startblock); + scanstate->haveblock = false; + continue; } - /* - * Reached end of scan? - */ - if (finished) - { - if (BufferIsValid(hscan->rs_cbuf)) - ReleaseBuffer(hscan->rs_cbuf); - hscan->rs_cbuf = InvalidBuffer; - hscan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - hscan->rs_inited = false; - return NULL; - } - - Assert(blockno < hscan->rs_nblocks); - heapgetpage(scan, blockno); - - /* Re-establish state for new page */ - if (!pagemode) - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(hscan->rs_cbuf); - all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - maxoffset = PageGetMaxOffsetNumber(page); + /* Found visible tuple, return it. */ + break; } - /* Count successfully-fetched tuples as heap fetches */ - pgstat_count_heap_getnext(scan->rs_rd); + scanstate->donetuples++; - return &(hscan->rs_ctup); -} - -/* - * Check visibility of the tuple. - */ -static bool -SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) -{ - if (scan->rs_base.rs_pageatatime) - { - /* - * In pageatatime mode, heapgetpage() already did visibility checks, - * so just look at the info it left in rs_vistuples[]. - * - * We use a binary search over the known-sorted array. Note: we could - * save some effort if we insisted that NextSampleTuple select tuples - * in increasing order, but it's not clear that there would be enough - * gain to justify the restriction. - */ - int start = 0, - end = scan->rs_ntuples - 1; - - while (start <= end) - { - int mid = (start + end) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (tupoffset == curoffset) - return true; - else if (tupoffset < curoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; - } - else - { - /* Otherwise, we have to check the tuple individually. */ - return HeapTupleSatisfiesVisibility(tuple, - scan->rs_base.rs_snapshot, - scan->rs_cbuf); - } + return slot; } diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 0151f17af4..34813e4099 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -30,6 +30,7 @@ extern bool synchronize_seqscans; struct BulkInsertStateData; struct IndexInfo; struct IndexBuildCallback; +struct SampleScanState; struct VacuumParams; struct ValidateIndexState; @@ -519,6 +520,56 @@ typedef struct TableAmRoutine BlockNumber *pages, double *tuples, double *allvisfrac); + + /* ------------------------------------------------------------------------ + * Executor related functions. + * ------------------------------------------------------------------------ + */ + + /* + * Acquire the next block in a sample scan. Return false if the sample + * scan is finished, true otherwise. + * + * Typically this will first determine the target block by call the + * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively + * perform a sequential scan over all blocks. The determined block is + * then typically read and pinned. + * + * As the TsmRoutine interface is block based, a block needs to be passed + * to NextSampleBlock(). If that's not appropriate for an AM, it + * internally needs to perform mapping between the internal and a block + * based representation. + * + * Note that it's not acceptable to hold deadlock prone resources such as + * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the + * block - the tuple is likely to be returned to an upper query node, and + * the next call could be off a long while. Holding buffer pins etc is + * obviously OK. + * + * Currently it is required to implement this interface, as there's no + * alternative way (contrary e.g. to bitmap scans) to implement sample + * scans. If infeasible to implement the AM may raise an error. + */ + bool (*scan_sample_next_block) (TableScanDesc scan, + struct SampleScanState *scanstate); + + /* + * This callback, only called after scan_sample_next_block has returned + * true, should determine the next tuple to be returned from the selected + * block using the TsmRoutine's NextSampleTuple() callback. + * + * The callback needs to perform visibility checks, and only return + * visible tuples. That obviously can mean calling NextSampletuple() + * multiple times. + * + * The TsmRoutine interface assumes that there's a maximum offset on a + * given page, so if that doesn't apply to an AM, it needs to emulate that + * assumption somehow. + */ + bool (*scan_sample_next_tuple) (TableScanDesc scan, + struct SampleScanState *scanstate, + TupleTableSlot *slot); + } TableAmRoutine; @@ -1339,6 +1390,43 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths, } +/* ---------------------------------------------------------------------------- + * Executor related functionality + * ---------------------------------------------------------------------------- + */ + +/* + * Acquire the next block in a sample scan. Returns false if the sample scan + * is finished, true otherwise. + * + * This will call the TsmRoutine's NextSampleBlock() callback if necessary + * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the + * underlying relation. + */ +static inline bool +table_scan_sample_next_block(TableScanDesc scan, + struct SampleScanState *scanstate) +{ + return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate); +} + +/* + * Fetch the next sample tuple into `slot` and return true if a visible tuple + * was found, false otherwise. table_scan_sample_next_block() needs to + * previously have selected a block (i.e. returned true). + * + * This will call the TsmRoutine's NextSampleTuple() callback. + */ +static inline bool +table_scan_sample_next_tuple(TableScanDesc scan, + struct SampleScanState *scanstate, + TupleTableSlot *slot) +{ + return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, + slot); +} + + /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. * ---------------------------------------------------------------------------- diff --git a/src/include/access/tsmapi.h b/src/include/access/tsmapi.h index a5c0b4cafe..d3bdb754b5 100644 --- a/src/include/access/tsmapi.h +++ b/src/include/access/tsmapi.h @@ -34,7 +34,8 @@ typedef void (*BeginSampleScan_function) (SampleScanState *node, int nparams, uint32 seed); -typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node); +typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node, + BlockNumber nblocks); typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node, BlockNumber blockno, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index dbd7ed0363..437a191b45 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1305,6 +1305,9 @@ typedef struct SampleScanState bool use_pagemode; /* use page-at-a-time visibility checking? */ bool begun; /* false means need to call BeginSampleScan */ uint32 seed; /* random seed */ + int64 donetuples; /* number of tuples already returned */ + bool haveblock; /* has a block for sampling been determined */ + bool done; /* exhausted all tuples? */ } SampleScanState; /*