diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 7ebd1e5181..95a0bbcf5f 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -56,11 +56,30 @@ typedef struct OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; - /* marked[i] is true if item i is entered in one of the above arrays */ + + /* + * marked[i] is true if item i is entered in one of the above arrays. + * + * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is + * 1. Otherwise every access would need to subtract 1. + */ bool marked[MaxHeapTuplesPerPage + 1]; + + /* + * Tuple visibility is only computed once for each tuple, for correctness + * and efficiency reasons; see comment in heap_page_prune() for + * details. This is of type int8[,] intead of HTSV_Result[], so we can use + * -1 to indicate no visibility has been computed, e.g. for LP_DEAD items. + * + * Same indexing as ->marked. + */ + int8 htsv[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ +static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, + HeapTuple tup, + Buffer buffer); static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate); @@ -252,6 +271,7 @@ heap_page_prune(Relation relation, Buffer buffer, OffsetNumber offnum, maxoff; PruneState prstate; + HeapTupleData tup; /* * Our strategy is to scan the page and make lists of items to change, @@ -274,8 +294,60 @@ heap_page_prune(Relation relation, Buffer buffer, prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); - /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); + tup.t_tableOid = RelationGetRelid(prstate.rel); + + /* + * Determine HTSV for all tuples. + * + * This is required for correctness to deal with cases where running HTSV + * twice could result in different results (e.g. RECENTLY_DEAD can turn to + * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid() + * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the + * inserting transaction aborts, ...). That in turn could cause + * heap_prune_chain() to behave incorrectly if a tuple is reached twice, + * once directly via a heap_prune_chain() and once following a HOT chain. + * + * It's also good for performance. Most commonly tuples within a page are + * stored at decreasing offsets (while the items are stored at increasing + * offsets). When processing all tuples on a page this leads to reading + * memory at decreasing offsets within a page, with a variable stride. + * That's hard for CPU prefetchers to deal with. Processing the items in + * reverse order (and thus the tuples in increasing order) increases + * prefetching efficiency significantly / decreases the number of cache + * misses. + */ + for (offnum = maxoff; + offnum >= FirstOffsetNumber; + offnum = OffsetNumberPrev(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + HeapTupleHeader htup; + + /* Nothing to do if slot doesn't contain a tuple */ + if (!ItemIdIsNormal(itemid)) + { + prstate.htsv[offnum] = -1; + continue; + } + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + tup.t_data = htup; + tup.t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum); + + /* + * Set the offset number so that we can display it along with any + * error that occurred while processing this tuple. + */ + if (off_loc) + *off_loc = offnum; + + prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, + buffer); + } + + /* Scan the page */ for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) @@ -286,10 +358,7 @@ heap_page_prune(Relation relation, Buffer buffer, if (prstate.marked[offnum]) continue; - /* - * Set the offset number so that we can display it along with any - * error that occurred while processing this tuple. - */ + /* see preceding loop */ if (off_loc) *off_loc = offnum; @@ -491,7 +560,7 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really - * DEAD, the heap_prune_satisfies_vacuum test is just too coarse to detect it. + * DEAD, our visibility test is just too coarse to detect it. * * In general, pruning must never leave behind a DEAD tuple that still has * tuple storage. VACUUM isn't prepared to deal with that case. That's why @@ -505,11 +574,7 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) * pointer is marked LP_DEAD. (This includes the case of a DEAD simple * tuple, which we treat as a chain of length 1.) * - * prstate->vistest is used to distinguish whether tuples are DEAD or - * RECENTLY_DEAD. - * - * We don't actually change the page here, except perhaps for hint-bit updates - * caused by heap_prune_satisfies_vacuum. We just add entries to the arrays in + * We don't actually change the page here. We just add entries to the arrays in * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED @@ -531,9 +596,6 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) OffsetNumber chainitems[MaxHeapTuplesPerPage]; int nchain = 0, i; - HeapTupleData tup; - - tup.t_tableOid = RelationGetRelid(prstate->rel); rootlp = PageGetItemId(dp, rootoffnum); @@ -542,12 +604,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) */ if (ItemIdIsNormal(rootlp)) { + Assert(prstate->htsv[rootoffnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, rootlp); - tup.t_data = htup; - tup.t_len = ItemIdGetLength(rootlp); - ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum); - if (HeapTupleHeaderIsHeapOnly(htup)) { /* @@ -568,8 +627,8 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ - if (heap_prune_satisfies_vacuum(prstate, &tup, buffer) - == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) + if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && + !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); HeapTupleHeaderAdvanceLatestRemovedXid(htup, @@ -636,12 +695,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; Assert(ItemIdIsNormal(lp)); + Assert(prstate->htsv[offnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, lp); - tup.t_data = htup; - tup.t_len = ItemIdGetLength(lp); - ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum); - /* * Check the tuple XMIN against prior XMAX, if any */ @@ -659,7 +715,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) */ tupdead = recent_dead = false; - switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer)) + switch ((HTSV_Result) prstate->htsv[offnum]) { case HEAPTUPLE_DEAD: tupdead = true;