Fix SPGiST vacuum algorithm to handle concurrent tuple motion properly.
A leaf tuple that we need to delete could get moved as a consequence of an insertion happening concurrently with the VACUUM scan. If it moves from a page past the current scan point to a page before, we'll miss it, which is not acceptable. Hence, when we see a leaf-page REDIRECT that could have been made since our scan started, chase down the redirection pointer much as if we were doing a normal index search, and be sure to vacuum every page it leads to. This fixes the issue because, if the tuple was on page N at the instant we start our scan, we will surely find it as a consequence of chasing the redirect from page N, no matter how much it moves around in between. Problem noted by Takashi Yamamoto.
This commit is contained in:
parent
bad250f4f3
commit
b4af1c25bb
@@ -314,6 +314,27 @@ the reverse map of the nextOffset links (ie, when we see tuple x links to
|
|||||||
tuple y, we set predecessor[y] = x). Then head tuples are the ones with
|
tuple y, we set predecessor[y] = x). Then head tuples are the ones with
|
||||||
no predecessor.
|
no predecessor.
|
||||||
|
|
||||||
|
Because insertions can occur while VACUUM runs, a pure sequential scan
|
||||||
|
could miss deleting some target leaf tuples, because they could get moved
|
||||||
|
from a not-yet-visited leaf page to an already-visited leaf page as a
|
||||||
|
consequence of a PickSplit or MoveLeafs operation. Failing to delete any
|
||||||
|
target TID is not acceptable, so we have to extend the algorithm to cope
|
||||||
|
with such cases. We recognize that such a move might have occurred when
|
||||||
|
we see a leaf-page REDIRECT tuple whose XID indicates it might have been
|
||||||
|
created after the VACUUM scan started. We add the redirection target TID
|
||||||
|
to a "pending list" of places we need to recheck. Between pages of the
|
||||||
|
main sequential scan, we empty the pending list by visiting each listed
|
||||||
|
TID. If it points to an inner tuple (from a PickSplit), add each downlink
|
||||||
|
TID to the pending list. If it points to a leaf page, vacuum that page.
|
||||||
|
(We could just vacuum the single pointed-to chain, but vacuuming the
|
||||||
|
whole page simplifies the code and reduces the odds of VACUUM having to
|
||||||
|
modify the same page multiple times.) To ensure that pending-list
|
||||||
|
processing can never get into an endless loop, even in the face of
|
||||||
|
concurrent index changes, we don't remove list entries immediately but
|
||||||
|
only after we've completed all pending-list processing; instead we just
|
||||||
|
mark items as done after processing them. Adding a TID that's already in
|
||||||
|
the list is a no-op, whether or not that item is marked done yet.
|
||||||
|
|
||||||
spgbulkdelete also updates the index's free space map.
|
spgbulkdelete also updates the index's free space map.
|
||||||
|
|
||||||
Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was
|
Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was
|
||||||
|
@@ -25,9 +25,18 @@
|
|||||||
#include "storage/indexfsm.h"
|
#include "storage/indexfsm.h"
|
||||||
#include "storage/lmgr.h"
|
#include "storage/lmgr.h"
|
||||||
#include "storage/procarray.h"
|
#include "storage/procarray.h"
|
||||||
|
#include "utils/snapmgr.h"
|
||||||
|
|
||||||
|
|
||||||
/* local state for vacuum operations */
|
/* Entry in pending-list of TIDs we need to revisit */
|
||||||
|
typedef struct spgVacPendingItem
|
||||||
|
{
|
||||||
|
ItemPointerData tid; /* redirection target to visit */
|
||||||
|
bool done; /* have we dealt with this? */
|
||||||
|
struct spgVacPendingItem *next; /* list link */
|
||||||
|
} spgVacPendingItem;
|
||||||
|
|
||||||
|
/* Local state for vacuum operations */
|
||||||
typedef struct spgBulkDeleteState
|
typedef struct spgBulkDeleteState
|
||||||
{
|
{
|
||||||
/* Parameters passed in to spgvacuumscan */
|
/* Parameters passed in to spgvacuumscan */
|
||||||
@@ -35,22 +44,87 @@ typedef struct spgBulkDeleteState
|
|||||||
IndexBulkDeleteResult *stats;
|
IndexBulkDeleteResult *stats;
|
||||||
IndexBulkDeleteCallback callback;
|
IndexBulkDeleteCallback callback;
|
||||||
void *callback_state;
|
void *callback_state;
|
||||||
|
|
||||||
/* Additional working state */
|
/* Additional working state */
|
||||||
SpGistState spgstate;
|
SpGistState spgstate; /* for SPGiST operations that need one */
|
||||||
TransactionId OldestXmin;
|
spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
|
||||||
BlockNumber lastFilledBlock;
|
TransactionId myXmin; /* for detecting newly-added redirects */
|
||||||
|
TransactionId OldestXmin; /* for deciding a redirect is obsolete */
|
||||||
|
BlockNumber lastFilledBlock; /* last non-deletable block */
|
||||||
} spgBulkDeleteState;
|
} spgBulkDeleteState;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add TID to pendingList, but only if not already present.
|
||||||
|
*
|
||||||
|
* Note that new items are always appended at the end of the list; this
|
||||||
|
* ensures that scans of the list don't miss items added during the scan.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
|
||||||
|
{
|
||||||
|
spgVacPendingItem *pitem;
|
||||||
|
spgVacPendingItem **listLink;
|
||||||
|
|
||||||
|
/* search the list for pre-existing entry */
|
||||||
|
listLink = &bds->pendingList;
|
||||||
|
while (*listLink != NULL)
|
||||||
|
{
|
||||||
|
pitem = *listLink;
|
||||||
|
if (ItemPointerEquals(tid, &pitem->tid))
|
||||||
|
return; /* already in list, do nothing */
|
||||||
|
listLink = &pitem->next;
|
||||||
|
}
|
||||||
|
/* not there, so append new entry */
|
||||||
|
pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
|
||||||
|
pitem->tid = *tid;
|
||||||
|
pitem->done = false;
|
||||||
|
pitem->next = NULL;
|
||||||
|
*listLink = pitem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Clear pendingList
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
spgClearPendingList(spgBulkDeleteState *bds)
|
||||||
|
{
|
||||||
|
spgVacPendingItem *pitem;
|
||||||
|
spgVacPendingItem *nitem;
|
||||||
|
|
||||||
|
for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
|
||||||
|
{
|
||||||
|
nitem = pitem->next;
|
||||||
|
/* All items in list should have been dealt with */
|
||||||
|
Assert(pitem->done);
|
||||||
|
pfree(pitem);
|
||||||
|
}
|
||||||
|
bds->pendingList = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Vacuum a regular (non-root) leaf page
|
* Vacuum a regular (non-root) leaf page
|
||||||
*
|
*
|
||||||
* We must delete tuples that are targeted for deletion by the VACUUM,
|
* We must delete tuples that are targeted for deletion by the VACUUM,
|
||||||
* but not move any tuples that are referenced by outside links; we assume
|
* but not move any tuples that are referenced by outside links; we assume
|
||||||
* those are the ones that are heads of chains.
|
* those are the ones that are heads of chains.
|
||||||
|
*
|
||||||
|
* If we find a REDIRECT that was made by a concurrently-running transaction,
|
||||||
|
* we must add its target TID to pendingList. (We don't try to visit the
|
||||||
|
* target immediately, first because we don't want VACUUM locking more than
|
||||||
|
* one buffer at a time, and second because the duplicate-filtering logic
|
||||||
|
* in spgAddPendingTID is useful to ensure we can't get caught in an infinite
|
||||||
|
* loop in the face of continuous concurrent insertions.)
|
||||||
|
*
|
||||||
|
* If forPending is true, we are examining the page as a consequence of
|
||||||
|
* chasing a redirect link, not as part of the normal sequential scan.
|
||||||
|
* We still vacuum the page normally, but we don't increment the stats
|
||||||
|
* about live tuples; else we'd double-count those tuples, since the page
|
||||||
|
* has been or will be visited in the sequential scan as well.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
|
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
|
||||||
|
bool forPending)
|
||||||
{
|
{
|
||||||
Page page = BufferGetPage(buffer);
|
Page page = BufferGetPage(buffer);
|
||||||
spgxlogVacuumLeaf xlrec;
|
spgxlogVacuumLeaf xlrec;
|
||||||
@@ -90,7 +164,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
bds->stats->num_index_tuples += 1;
|
if (!forPending)
|
||||||
|
bds->stats->num_index_tuples += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Form predecessor map, too */
|
/* Form predecessor map, too */
|
||||||
@@ -106,6 +181,25 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
|
|||||||
predecessor[lt->nextOffset] = i;
|
predecessor[lt->nextOffset] = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (lt->tupstate == SPGIST_REDIRECT)
|
||||||
|
{
|
||||||
|
SpGistDeadTuple dt = (SpGistDeadTuple) lt;
|
||||||
|
|
||||||
|
Assert(dt->nextOffset == InvalidOffsetNumber);
|
||||||
|
Assert(ItemPointerIsValid(&dt->pointer));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add target TID to pending list if the redirection could have
|
||||||
|
* happened since VACUUM started.
|
||||||
|
*
|
||||||
|
* Note: we could make a tighter test by seeing if the xid is
|
||||||
|
* "running" according to the active snapshot; but tqual.c doesn't
|
||||||
|
* currently export a suitable API, and it's not entirely clear
|
||||||
|
* that a tighter test is worth the cycles anyway.
|
||||||
|
*/
|
||||||
|
if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
|
||||||
|
spgAddPendingTID(bds, &dt->pointer);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Assert(lt->nextOffset == InvalidOffsetNumber);
|
Assert(lt->nextOffset == InvalidOffsetNumber);
|
||||||
@@ -545,7 +639,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
vacuumLeafPage(bds, index, buffer);
|
vacuumLeafPage(bds, index, buffer, false);
|
||||||
vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
|
vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -556,8 +650,8 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The root page must never be deleted, nor marked as available in FSM,
|
* The root pages must never be deleted, nor marked as available in FSM,
|
||||||
* because we don't want it ever returned by a search for a place to
|
* because we don't want them ever returned by a search for a place to
|
||||||
* put a new tuple. Otherwise, check for empty/deletable page, and
|
* put a new tuple. Otherwise, check for empty/deletable page, and
|
||||||
* make sure FSM knows about it.
|
* make sure FSM knows about it.
|
||||||
*/
|
*/
|
||||||
@@ -585,6 +679,118 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
|
|||||||
UnlockReleaseBuffer(buffer);
|
UnlockReleaseBuffer(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process the pending-TID list between pages of the main scan
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
spgprocesspending(spgBulkDeleteState *bds)
|
||||||
|
{
|
||||||
|
Relation index = bds->info->index;
|
||||||
|
spgVacPendingItem *pitem;
|
||||||
|
spgVacPendingItem *nitem;
|
||||||
|
BlockNumber blkno;
|
||||||
|
Buffer buffer;
|
||||||
|
Page page;
|
||||||
|
|
||||||
|
for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
|
||||||
|
{
|
||||||
|
if (pitem->done)
|
||||||
|
continue; /* ignore already-done items */
|
||||||
|
|
||||||
|
/* call vacuum_delay_point while not holding any buffer lock */
|
||||||
|
vacuum_delay_point();
|
||||||
|
|
||||||
|
/* examine the referenced page */
|
||||||
|
blkno = ItemPointerGetBlockNumber(&pitem->tid);
|
||||||
|
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
|
||||||
|
RBM_NORMAL, bds->info->strategy);
|
||||||
|
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
||||||
|
page = (Page) BufferGetPage(buffer);
|
||||||
|
|
||||||
|
if (PageIsNew(page) || SpGistPageIsDeleted(page))
|
||||||
|
{
|
||||||
|
/* Probably shouldn't happen, but ignore it */
|
||||||
|
}
|
||||||
|
else if (SpGistPageIsLeaf(page))
|
||||||
|
{
|
||||||
|
if (SpGistBlockIsRoot(blkno))
|
||||||
|
{
|
||||||
|
/* this should definitely not happen */
|
||||||
|
elog(ERROR, "redirection leads to root page of index \"%s\"",
|
||||||
|
RelationGetRelationName(index));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* deal with any deletable tuples */
|
||||||
|
vacuumLeafPage(bds, index, buffer, true);
|
||||||
|
/* might as well do this while we are here */
|
||||||
|
vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
|
||||||
|
|
||||||
|
SpGistSetLastUsedPage(index, buffer);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We can mark as done not only this item, but any later ones
|
||||||
|
* pointing at the same page, since we vacuumed the whole page.
|
||||||
|
*/
|
||||||
|
pitem->done = true;
|
||||||
|
for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
|
||||||
|
{
|
||||||
|
if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
|
||||||
|
nitem->done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* On an inner page, visit the referenced inner tuple and add
|
||||||
|
* all its downlinks to the pending list. We might have pending
|
||||||
|
* items for more than one inner tuple on the same page (in fact
|
||||||
|
* this is pretty likely given the way space allocation works),
|
||||||
|
* so get them all while we are here.
|
||||||
|
*/
|
||||||
|
for (nitem = pitem; nitem != NULL; nitem = nitem->next)
|
||||||
|
{
|
||||||
|
if (nitem->done)
|
||||||
|
continue;
|
||||||
|
if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
|
||||||
|
{
|
||||||
|
OffsetNumber offset;
|
||||||
|
SpGistInnerTuple innerTuple;
|
||||||
|
|
||||||
|
offset = ItemPointerGetOffsetNumber(&nitem->tid);
|
||||||
|
innerTuple = (SpGistInnerTuple) PageGetItem(page,
|
||||||
|
PageGetItemId(page, offset));
|
||||||
|
if (innerTuple->tupstate == SPGIST_LIVE)
|
||||||
|
{
|
||||||
|
SpGistNodeTuple node;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
SGITITERATE(innerTuple, i, node)
|
||||||
|
{
|
||||||
|
if (ItemPointerIsValid(&node->t_tid))
|
||||||
|
spgAddPendingTID(bds, &node->t_tid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (innerTuple->tupstate == SPGIST_REDIRECT)
|
||||||
|
{
|
||||||
|
/* transfer attention to redirect point */
|
||||||
|
spgAddPendingTID(bds,
|
||||||
|
&((SpGistDeadTuple) innerTuple)->pointer);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
elog(ERROR, "unexpected SPGiST tuple state: %d",
|
||||||
|
innerTuple->tupstate);
|
||||||
|
|
||||||
|
nitem->done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
UnlockReleaseBuffer(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
spgClearPendingList(bds);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Perform a bulkdelete scan
|
* Perform a bulkdelete scan
|
||||||
*/
|
*/
|
||||||
@@ -598,6 +804,8 @@ spgvacuumscan(spgBulkDeleteState *bds)
|
|||||||
|
|
||||||
/* Finish setting up spgBulkDeleteState */
|
/* Finish setting up spgBulkDeleteState */
|
||||||
initSpGistState(&bds->spgstate, index);
|
initSpGistState(&bds->spgstate, index);
|
||||||
|
bds->pendingList = NULL;
|
||||||
|
bds->myXmin = GetActiveSnapshot()->xmin;
|
||||||
bds->OldestXmin = GetOldestXmin(true, false);
|
bds->OldestXmin = GetOldestXmin(true, false);
|
||||||
bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
|
bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
|
||||||
|
|
||||||
@@ -637,6 +845,9 @@ spgvacuumscan(spgBulkDeleteState *bds)
|
|||||||
for (; blkno < num_pages; blkno++)
|
for (; blkno < num_pages; blkno++)
|
||||||
{
|
{
|
||||||
spgvacuumpage(bds, blkno);
|
spgvacuumpage(bds, blkno);
|
||||||
|
/* empty the pending-list after each page */
|
||||||
|
if (bds->pendingList != NULL)
|
||||||
|
spgprocesspending(bds);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -747,7 +958,7 @@ spgvacuumcleanup(PG_FUNCTION_ARGS)
|
|||||||
IndexFreeSpaceMapVacuum(index);
|
IndexFreeSpaceMapVacuum(index);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* It's quite possible for us to be fooled by concurrent page splits into
|
* It's quite possible for us to be fooled by concurrent tuple moves into
|
||||||
* double-counting some index tuples, so disbelieve any total that exceeds
|
* double-counting some index tuples, so disbelieve any total that exceeds
|
||||||
* the underlying heap's count ... if we know that accurately. Otherwise
|
* the underlying heap's count ... if we know that accurately. Otherwise
|
||||||
* this might just make matters worse.
|
* this might just make matters worse.
|
||||||
|
Loading…
Reference in New Issue
Block a user