Adjust btbulkdelete logic so that only one WAL record is issued while
deleting multiple index entries on a single index page. This makes for a very substantial reduction in the amount of WAL traffic during a large delete operation.
This commit is contained in:
parent
13dadef8b5
commit
3bbd6af37c
@ -9,7 +9,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $
|
||||||
*
|
*
|
||||||
* NOTES
|
* NOTES
|
||||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||||
@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Delete an item from a btree page.
|
* Delete item(s) from a btree page.
|
||||||
*
|
*
|
||||||
* This must only be used for deleting leaf items. Deleting an item on a
|
* This must only be used for deleting leaf items. Deleting an item on a
|
||||||
* non-leaf page has to be done as part of an atomic action that includes
|
* non-leaf page has to be done as part of an atomic action that includes
|
||||||
* deleting the page it points to.
|
* deleting the page it points to.
|
||||||
*
|
*
|
||||||
* This routine assumes that the caller has pinned and locked the buffer,
|
* This routine assumes that the caller has pinned and locked the buffer,
|
||||||
* and will write the buffer afterwards.
|
* and will write the buffer afterwards. Also, the given itemnos *must*
|
||||||
|
* appear in increasing order in the array.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
|
_bt_delitems(Relation rel, Buffer buf,
|
||||||
|
OffsetNumber *itemnos, int nitems)
|
||||||
{
|
{
|
||||||
Page page = BufferGetPage(buf);
|
Page page = BufferGetPage(buf);
|
||||||
OffsetNumber offno;
|
int i;
|
||||||
|
|
||||||
offno = ItemPointerGetOffsetNumber(tid);
|
|
||||||
|
|
||||||
|
/* No elog(ERROR) until changes are logged */
|
||||||
START_CRIT_SECTION();
|
START_CRIT_SECTION();
|
||||||
|
|
||||||
PageIndexTupleDelete(page, offno);
|
/*
|
||||||
|
* Delete the items in reverse order so we don't have to think about
|
||||||
|
* adjusting item numbers for previous deletions.
|
||||||
|
*/
|
||||||
|
for (i = nitems - 1; i >= 0; i--)
|
||||||
|
{
|
||||||
|
PageIndexTupleDelete(page, itemnos[i]);
|
||||||
|
}
|
||||||
|
|
||||||
/* XLOG stuff */
|
/* XLOG stuff */
|
||||||
if (!rel->rd_istemp)
|
if (!rel->rd_istemp)
|
||||||
@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
|
|||||||
XLogRecPtr recptr;
|
XLogRecPtr recptr;
|
||||||
XLogRecData rdata[2];
|
XLogRecData rdata[2];
|
||||||
|
|
||||||
xlrec.target.node = rel->rd_node;
|
xlrec.node = rel->rd_node;
|
||||||
xlrec.target.tid = *tid;
|
xlrec.block = BufferGetBlockNumber(buf);
|
||||||
|
|
||||||
rdata[0].buffer = InvalidBuffer;
|
rdata[0].buffer = InvalidBuffer;
|
||||||
rdata[0].data = (char *) &xlrec;
|
rdata[0].data = (char *) &xlrec;
|
||||||
rdata[0].len = SizeOfBtreeDelete;
|
rdata[0].len = SizeOfBtreeDelete;
|
||||||
rdata[0].next = &(rdata[1]);
|
rdata[0].next = &(rdata[1]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The target-offsets array is not in the buffer, but pretend
|
||||||
|
* that it is. When XLogInsert stores the whole buffer, the offsets
|
||||||
|
* array need not be stored too.
|
||||||
|
*/
|
||||||
rdata[1].buffer = buf;
|
rdata[1].buffer = buf;
|
||||||
|
if (nitems > 0)
|
||||||
|
{
|
||||||
|
rdata[1].data = (char *) itemnos;
|
||||||
|
rdata[1].len = nitems * sizeof(OffsetNumber);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
rdata[1].data = NULL;
|
rdata[1].data = NULL;
|
||||||
rdata[1].len = 0;
|
rdata[1].len = 0;
|
||||||
|
}
|
||||||
rdata[1].next = NULL;
|
rdata[1].next = NULL;
|
||||||
|
|
||||||
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -572,120 +572,109 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
|||||||
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
|
IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
|
||||||
void *callback_state = (void *) PG_GETARG_POINTER(2);
|
void *callback_state = (void *) PG_GETARG_POINTER(2);
|
||||||
IndexBulkDeleteResult *result;
|
IndexBulkDeleteResult *result;
|
||||||
BlockNumber num_pages;
|
|
||||||
double tuples_removed;
|
double tuples_removed;
|
||||||
double num_index_tuples;
|
double num_index_tuples;
|
||||||
IndexScanDesc scan;
|
OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)];
|
||||||
BTScanOpaque so;
|
int ndeletable;
|
||||||
ItemPointer current;
|
Buffer buf;
|
||||||
|
BlockNumber num_pages;
|
||||||
|
|
||||||
tuples_removed = 0;
|
tuples_removed = 0;
|
||||||
num_index_tuples = 0;
|
num_index_tuples = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We use a standard IndexScanDesc scan object, but to speed up the
|
* The outer loop iterates over index leaf pages, the inner over items
|
||||||
* loop, we skip most of the wrapper layers of index_getnext and
|
* on a leaf page. We issue just one _bt_delitems() call per page,
|
||||||
* instead call _bt_step directly. This implies holding buffer lock
|
* so as to minimize WAL traffic.
|
||||||
* on a target page throughout the loop over the page's tuples.
|
|
||||||
*
|
*
|
||||||
* Whenever we step onto a new page, we have to trade in the read
|
* Note that we exclusive-lock every leaf page containing data items,
|
||||||
* lock acquired by _bt_first or _bt_step for an exclusive write lock
|
* in sequence left to right. It sounds attractive to only exclusive-lock
|
||||||
* (fortunately, _bt_relbuf doesn't care which kind of lock it's
|
|
||||||
* releasing when it comes time for _bt_step to release our lock).
|
|
||||||
*
|
|
||||||
* Note that we exclusive-lock every leaf page, or at least every one
|
|
||||||
* containing data items. It sounds attractive to only exclusive-lock
|
|
||||||
* those containing items we need to delete, but unfortunately that
|
* those containing items we need to delete, but unfortunately that
|
||||||
* is not safe: we could then pass a stopped indexscan, which could
|
* is not safe: we could then pass a stopped indexscan, which could
|
||||||
* in rare cases lead to deleting the item it needs to find when it
|
* in rare cases lead to deleting the item it needs to find when it
|
||||||
* resumes. (See _bt_restscan --- this could only happen if an indexscan
|
* resumes. (See _bt_restscan --- this could only happen if an indexscan
|
||||||
* stops on a deletable item and then a page split moves that item
|
* stops on a deletable item and then a page split moves that item
|
||||||
* into a page further to its right, which the indexscan will have no
|
* into a page further to its right, which the indexscan will have no
|
||||||
* pin on.)
|
* pin on.) We can skip obtaining exclusive lock on empty pages
|
||||||
|
* though, since no indexscan could be stopped on those.
|
||||||
*/
|
*/
|
||||||
scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
|
buf = _bt_get_endpoint(rel, 0, false);
|
||||||
so = (BTScanOpaque) scan->opaque;
|
if (BufferIsValid(buf)) /* check for empty index */
|
||||||
current = &(scan->currentItemData);
|
|
||||||
|
|
||||||
/* Use _bt_first to get started, then _bt_step to remaining tuples */
|
|
||||||
if (_bt_first(scan, ForwardScanDirection))
|
|
||||||
{
|
{
|
||||||
Buffer buf;
|
for (;;)
|
||||||
BlockNumber lockedBlock = InvalidBlockNumber;
|
|
||||||
|
|
||||||
/* we have the buffer pinned and read-locked */
|
|
||||||
buf = so->btso_curbuf;
|
|
||||||
Assert(BufferIsValid(buf));
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
{
|
||||||
Page page;
|
Page page;
|
||||||
BlockNumber blkno;
|
|
||||||
OffsetNumber offnum;
|
|
||||||
BTItem btitem;
|
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
IndexTuple itup;
|
OffsetNumber offnum,
|
||||||
ItemPointer htup;
|
minoff,
|
||||||
|
maxoff;
|
||||||
|
BlockNumber nextpage;
|
||||||
|
|
||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
/* current is the next index tuple */
|
ndeletable = 0;
|
||||||
page = BufferGetPage(buf);
|
page = BufferGetPage(buf);
|
||||||
blkno = ItemPointerGetBlockNumber(current);
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
minoff = P_FIRSTDATAKEY(opaque);
|
||||||
/*
|
maxoff = PageGetMaxOffsetNumber(page);
|
||||||
* Make sure we have a super-exclusive write lock on this page.
|
/* We probably cannot see deleted pages, but skip 'em if so */
|
||||||
*
|
if (minoff <= maxoff && !P_ISDELETED(opaque))
|
||||||
* We assume that only concurrent insertions, not deletions,
|
|
||||||
* can occur while we're not holding the page lock (the
|
|
||||||
* caller should hold a suitable relation lock to ensure
|
|
||||||
* this). Therefore, no items can escape being scanned because
|
|
||||||
* of this temporary lock release.
|
|
||||||
*/
|
|
||||||
if (blkno != lockedBlock)
|
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Trade in the initial read lock for a super-exclusive
|
||||||
|
* write lock on this page.
|
||||||
|
*/
|
||||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||||
LockBufferForCleanup(buf);
|
LockBufferForCleanup(buf);
|
||||||
lockedBlock = blkno;
|
|
||||||
/*
|
/*
|
||||||
* If the page was formerly rightmost but was split while we
|
* Recompute minoff/maxoff, both of which could have changed
|
||||||
* didn't hold the lock, and ip_posid is pointing to item
|
* while we weren't holding the lock.
|
||||||
* 1, then ip_posid now points at the high key not a valid
|
|
||||||
* data item. In this case we need to step forward.
|
|
||||||
*/
|
*/
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
minoff = P_FIRSTDATAKEY(opaque);
|
||||||
if (current->ip_posid < P_FIRSTDATAKEY(opaque))
|
maxoff = PageGetMaxOffsetNumber(page);
|
||||||
current->ip_posid = P_FIRSTDATAKEY(opaque);
|
/*
|
||||||
}
|
* Scan over all items to see which ones need deleted
|
||||||
|
* according to the callback function.
|
||||||
offnum = ItemPointerGetOffsetNumber(current);
|
*/
|
||||||
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
for (offnum = minoff;
|
||||||
itup = &btitem->bti_itup;
|
offnum <= maxoff;
|
||||||
htup = &(itup->t_tid);
|
offnum = OffsetNumberNext(offnum))
|
||||||
|
{
|
||||||
|
BTItem btitem;
|
||||||
|
ItemPointer htup;
|
||||||
|
|
||||||
|
btitem = (BTItem) PageGetItem(page,
|
||||||
|
PageGetItemId(page, offnum));
|
||||||
|
htup = &(btitem->bti_itup.t_tid);
|
||||||
if (callback(htup, callback_state))
|
if (callback(htup, callback_state))
|
||||||
{
|
{
|
||||||
/* Okay to delete the item from the page */
|
deletable[ndeletable++] = offnum;
|
||||||
_bt_itemdel(rel, buf, current);
|
|
||||||
|
|
||||||
/* Mark buffer dirty, but keep the lock and pin */
|
|
||||||
WriteNoReleaseBuffer(buf);
|
|
||||||
|
|
||||||
tuples_removed += 1;
|
tuples_removed += 1;
|
||||||
|
|
||||||
/*
|
|
||||||
* We now need to back up the scan one item, so that the next
|
|
||||||
* cycle will re-examine the same offnum on this page (which
|
|
||||||
* now holds the next item).
|
|
||||||
*/
|
|
||||||
current->ip_posid--;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
num_index_tuples += 1;
|
num_index_tuples += 1;
|
||||||
} while (_bt_step(scan, &buf, ForwardScanDirection));
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
index_endscan(scan);
|
/*
|
||||||
|
* If we need to delete anything, do it and write the buffer;
|
||||||
|
* else just release the buffer.
|
||||||
|
*/
|
||||||
|
nextpage = opaque->btpo_next;
|
||||||
|
if (ndeletable > 0)
|
||||||
|
{
|
||||||
|
_bt_delitems(rel, buf, deletable, ndeletable);
|
||||||
|
_bt_wrtbuf(rel, buf);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_bt_relbuf(rel, buf);
|
||||||
|
}
|
||||||
|
/* And advance to next page, if any */
|
||||||
|
if (nextpage == P_NONE)
|
||||||
|
break;
|
||||||
|
buf = _bt_getbuf(rel, nextpage, BT_READ);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* return statistics */
|
/* return statistics */
|
||||||
num_pages = RelationGetNumberOfBlocks(rel);
|
num_pages = RelationGetNumberOfBlocks(rel);
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
xlrec = (xl_btree_delete *) XLogRecGetData(record);
|
xlrec = (xl_btree_delete *) XLogRecGetData(record);
|
||||||
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
|
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
|
||||||
if (!RelationIsValid(reln))
|
if (!RelationIsValid(reln))
|
||||||
return;
|
return;
|
||||||
buffer = XLogReadBuffer(false, reln,
|
buffer = XLogReadBuffer(false, reln, xlrec->block);
|
||||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
|
|
||||||
if (!BufferIsValid(buffer))
|
if (!BufferIsValid(buffer))
|
||||||
elog(PANIC, "btree_delete_redo: block unfound");
|
elog(PANIC, "btree_delete_redo: block unfound");
|
||||||
page = (Page) BufferGetPage(buffer);
|
page = (Page) BufferGetPage(buffer);
|
||||||
@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
|
if (record->xl_len > SizeOfBtreeDelete)
|
||||||
|
{
|
||||||
|
OffsetNumber *unused;
|
||||||
|
OffsetNumber *unend;
|
||||||
|
|
||||||
|
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
|
||||||
|
unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
|
||||||
|
|
||||||
|
/* be careful to delete from back to front */
|
||||||
|
while (unused < unend)
|
||||||
|
{
|
||||||
|
unend--;
|
||||||
|
PageIndexTupleDelete(page, *unend);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
PageSetLSN(page, lsn);
|
||||||
PageSetSUI(page, ThisStartUpID);
|
PageSetSUI(page, ThisStartUpID);
|
||||||
@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec)
|
|||||||
{
|
{
|
||||||
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
|
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
|
||||||
|
|
||||||
strcat(buf, "delete: ");
|
sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u",
|
||||||
out_target(buf, &(xlrec->target));
|
xlrec->node.tblNode, xlrec->node.relNode, xlrec->block);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case XLOG_BTREE_DELETE_PAGE:
|
case XLOG_BTREE_DELETE_PAGE:
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $
|
* $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -263,14 +263,18 @@ typedef struct xl_btree_split
|
|||||||
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is what we need to know about delete of an individual leaf btitem
|
* This is what we need to know about delete of individual leaf btitems.
|
||||||
|
* The WAL record can represent deletion of any number of btitems on a
|
||||||
|
* single index page.
|
||||||
*/
|
*/
|
||||||
typedef struct xl_btree_delete
|
typedef struct xl_btree_delete
|
||||||
{
|
{
|
||||||
xl_btreetid target; /* deleted tuple id */
|
RelFileNode node;
|
||||||
|
BlockNumber block;
|
||||||
|
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
|
||||||
} xl_btree_delete;
|
} xl_btree_delete;
|
||||||
|
|
||||||
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is what we need to know about deletion of a btree page. The target
|
* This is what we need to know about deletion of a btree page. The target
|
||||||
@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
|
|||||||
extern void _bt_pageinit(Page page, Size size);
|
extern void _bt_pageinit(Page page, Size size);
|
||||||
extern bool _bt_page_recyclable(Page page);
|
extern bool _bt_page_recyclable(Page page);
|
||||||
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
|
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
|
||||||
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
|
extern void _bt_delitems(Relation rel, Buffer buf,
|
||||||
|
OffsetNumber *itemnos, int nitems);
|
||||||
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
|
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
Loading…
x
Reference in New Issue
Block a user