diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
index f6be865b17..9e535124c4 100644
--- a/src/backend/access/nbtree/nbtdedup.c
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -28,9 +28,7 @@ static bool _bt_posting_valid(IndexTuple posting);
 #endif
 
 /*
- * Deduplicate items on a leaf page. The page will have to be split by caller
- * if we cannot successfully free at least newitemsz (we also need space for
- * newitem's line pointer, which isn't included in caller's newitemsz).
+ * Perform a deduplication pass.
  *
 * The general approach taken here is to perform as much deduplication as
 * possible to free as much space as possible. Note, however, that "single
@@ -43,76 +41,32 @@ static bool _bt_posting_valid(IndexTuple posting);
 * handle those if and when the anticipated right half page gets its own
 * deduplication pass, following further inserts of duplicates.)
 *
- * This function should be called during insertion, when the page doesn't have
- * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag
- * was set, caller should have removed any LP_DEAD items by calling
- * _bt_vacuum_one_page() before calling here. We may still have to kill
- * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
- * but that should be rare. Also, _bt_vacuum_one_page() won't unset the
- * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
- * deduplication pass will always clear it, just to keep things tidy.
+ * The page will have to be split if we cannot successfully free at least
+ * newitemsz (we also need space for newitem's line pointer, which isn't
+ * included in caller's newitemsz).
+ *
+ * Note: Caller should have already deleted all existing items with their
+ * LP_DEAD bits set.
 */
 void
-_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
-				   IndexTuple newitem, Size newitemsz, bool checkingunique)
+_bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
+			   Size newitemsz, bool checkingunique)
 {
 	OffsetNumber offnum,
 				minoff,
 				maxoff;
 	Page		page = BufferGetPage(buf);
-	BTPageOpaque opaque;
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	Page		newpage;
-	OffsetNumber deletable[MaxIndexTuplesPerPage];
 	BTDedupState state;
-	int			ndeletable = 0;
 	Size		pagesaving = 0;
 	bool		singlevalstrat = false;
 	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 
-	/*
-	 * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
-	 * will clear the BTP_HAS_GARBAGE hint without reliably removing items
-	 * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
-	 * bits when deduplicating items. Allowing it would be correct, though
-	 * wasteful.
-	 */
-	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-	minoff = P_FIRSTDATAKEY(opaque);
-	maxoff = PageGetMaxOffsetNumber(page);
-	for (offnum = minoff;
-		 offnum <= maxoff;
-		 offnum = OffsetNumberNext(offnum))
-	{
-		ItemId		itemid = PageGetItemId(page, offnum);
-
-		if (ItemIdIsDead(itemid))
-			deletable[ndeletable++] = offnum;
-	}
-
-	if (ndeletable > 0)
-	{
-		_bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
-
-		/*
-		 * Return when a split will be avoided. This is equivalent to
-		 * avoiding a split using the usual _bt_vacuum_one_page() path.
-		 */
-		if (PageGetFreeSpace(page) >= newitemsz)
-			return;
-
-		/*
-		 * Reconsider number of items on page, in case _bt_delitems_delete()
-		 * managed to delete an item or two
-		 */
-		minoff = P_FIRSTDATAKEY(opaque);
-		maxoff = PageGetMaxOffsetNumber(page);
-	}
-
 	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
 	newitemsz += sizeof(ItemIdData);
 
 	/*
-	 * By here, it's clear that deduplication will definitely be attempted.
 	 * Initialize deduplication state.
 	 *
 	 * It would be possible for maxpostingsize (limit on posting list tuple
@@ -138,6 +92,9 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 	/* nintervals should be initialized to zero */
 	state->nintervals = 0;
 
+	minoff = P_FIRSTDATAKEY(opaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+
 	/* Determine if "single value" strategy should be used */
 	if (!checkingunique)
 		singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
@@ -259,10 +216,9 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 	/*
 	 * By here, it's clear that deduplication will definitely go ahead.
 	 *
-	 * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
-	 * still falsely set, just to keep things tidy. (We can't rely on
-	 * _bt_vacuum_one_page() having done this already, and we can't rely on a
-	 * page split or VACUUM getting to it in the near future.)
+	 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
+	 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
+	 * But keep things tidy.
 	 */
 	if (P_HAS_GARBAGE(opaque))
 	{
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index b6ddf0ec4c..dde43b1415 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -58,7 +58,10 @@ static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
 static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
 								OffsetNumber itup_off, bool newfirstdataitem);
-static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
+static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+										 BTInsertState insertstate,
+										 bool lpdeadonly, bool checkingunique,
+										 bool uniquedup);
 
 /*
 * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
@@ -871,38 +874,14 @@ _bt_findinsertloc(Relation rel,
 		}
 
 		/*
-		 * If the target page is full, see if we can obtain enough space by
-		 * erasing LP_DEAD items. If that fails to free enough space, see if
-		 * we can avoid a page split by performing a deduplication pass over
-		 * the page.
-		 *
-		 * We only perform a deduplication pass for a checkingunique caller
-		 * when the incoming item is a duplicate of an existing item on the
-		 * leaf page. This heuristic avoids wasting cycles -- we only expect
-		 * to benefit from deduplicating a unique index page when most or all
-		 * recently added items are duplicates. See nbtree/README.
+		 * If the target page is full, see if we can obtain enough space using
+		 * one or more strategies (e.g. erasing LP_DEAD items, deduplication).
+		 * Page splits are expensive, and should only go ahead when truly
+		 * necessary.
 		 */
 		if (PageGetFreeSpace(page) < insertstate->itemsz)
-		{
-			if (P_HAS_GARBAGE(opaque))
-			{
-				_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
-				insertstate->bounds_valid = false;
-
-				/* Might as well assume duplicates (if checkingunique) */
-				uniquedup = true;
-			}
-
-			if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
-				(!checkingunique || uniquedup) &&
-				PageGetFreeSpace(page) < insertstate->itemsz)
-			{
-				_bt_dedup_one_page(rel, insertstate->buf, heapRel,
-								   insertstate->itup, insertstate->itemsz,
-								   checkingunique);
-				insertstate->bounds_valid = false;
-			}
-		}
+			_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
+										 checkingunique, uniquedup);
 	}
 	else
 	{
@@ -942,8 +921,9 @@ _bt_findinsertloc(Relation rel,
 			 */
 			if (P_HAS_GARBAGE(opaque))
 			{
-				_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
-				insertstate->bounds_valid = false;
+				/* Erase LP_DEAD items (won't deduplicate) */
+				_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+											 checkingunique, false);
 
 				if (PageGetFreeSpace(page) >= insertstate->itemsz)
 					break;		/* OK, now we have enough space */
@@ -993,14 +973,17 @@ _bt_findinsertloc(Relation rel,
 		 * performing a posting list split, so delete all LP_DEAD items early.
 		 * This is the only case where LP_DEAD deletes happen even though
 		 * there is space for newitem on the page.
+		 *
+		 * This can only erase LP_DEAD items (it won't deduplicate).
 		 */
-		_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+		_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
+									 checkingunique, false);
 
 		/*
 		 * Do new binary search. New insert location cannot overlap with any
 		 * posting list now.
 		 */
-		insertstate->bounds_valid = false;
+		Assert(!insertstate->bounds_valid);
 		insertstate->postingoff = 0;
 		newitemoff = _bt_binsrch_insert(rel, insertstate);
 		Assert(insertstate->postingoff == 0);
@@ -2623,32 +2606,59 @@ _bt_pgaddtup(Page page,
 }
 
 /*
- * _bt_vacuum_one_page - vacuum just one index page.
+ * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split by attempting
+ * a variety of operations.
 *
- * Try to remove LP_DEAD items from the given page. The passed buffer
- * must be exclusive-locked, but unlike a real VACUUM, we don't need a
- * super-exclusive "cleanup" lock (see nbtree/README).
+ * There are two operations performed here: deleting items already marked
+ * LP_DEAD, and deduplication. If both operations fail to free enough space
+ * for the incoming item then caller will go on to split the page. We always
+ * attempt our preferred strategy (which is to delete items whose LP_DEAD bits
+ * are set) first. If that doesn't work out we move on to deduplication.
+ *
+ * Caller's checkingunique and uniquedup arguments help us decide if we should
+ * perform deduplication, which is primarily useful with low cardinality data,
+ * but can sometimes absorb version churn.
+ *
+ * Callers that only want us to look for/delete LP_DEAD items can ask for that
+ * directly by passing a true 'lpdeadonly' argument.
+ *
+ * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
+ * level flag was found set. The flag was useful back when there wasn't
+ * necessarily one single page for a duplicate tuple to go on (before heap TID
+ * became a part of the key space in version 4 indexes). But we don't
+ * actually look at the flag anymore (it's not a gating condition for our
+ * caller). That would cause us to miss tuples that are safe to delete,
+ * without getting any benefit in return. We know that the alternative is to
+ * split the page; scanning the line pointer array in passing won't have
+ * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
+ * all this because !heapkeyspace indexes must still do a "getting tired"
+ * linear search, and so are likely to get some benefit from using it as a
+ * gating condition.)
 */
 static void
-_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
+_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
+							 BTInsertState insertstate,
+							 bool lpdeadonly, bool checkingunique,
+							 bool uniquedup)
 {
 	OffsetNumber deletable[MaxIndexTuplesPerPage];
 	int			ndeletable = 0;
 	OffsetNumber offnum,
-				minoff,
 				maxoff;
+	Buffer		buffer = insertstate->buf;
+	BTScanInsert itup_key = insertstate->itup_key;
 	Page		page = BufferGetPage(buffer);
 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
 	Assert(P_ISLEAF(opaque));
+	Assert(lpdeadonly || itup_key->heapkeyspace);
 
 	/*
 	 * Scan over all items to see which ones need to be deleted according to
 	 * LP_DEAD flags.
 	 */
-	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
-	for (offnum = minoff;
+	for (offnum = P_FIRSTDATAKEY(opaque);
 		 offnum <= maxoff;
 		 offnum = OffsetNumberNext(offnum))
 	{
@@ -2659,12 +2669,50 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
 	}
 
 	if (ndeletable > 0)
+	{
 		_bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+		insertstate->bounds_valid = false;
+
+		/* Return when a page split has already been avoided */
+		if (PageGetFreeSpace(page) >= insertstate->itemsz)
+			return;
+
+		/* Might as well assume duplicates (if checkingunique) */
+		uniquedup = true;
+	}
 
 	/*
-	 * Note: if we didn't find any LP_DEAD items, then the page's
-	 * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
-	 * separate write to clear it, however. We will clear it when we split
-	 * the page, or when deduplication runs.
+	 * Some callers only want to delete LP_DEAD items. Return early for these
+	 * callers.
+	 *
+	 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
+	 * return at this point (or when we go on to try either or both of our
+	 * other strategies and they also fail). We do not bother expending a
+	 * separate write to clear it, however. Caller will definitely clear it
+	 * when it goes on to split the page (plus deduplication knows to clear
+	 * the flag when it actually modifies the page).
 	 */
+	if (lpdeadonly)
+		return;
+
+	/*
+	 * We can get called in the checkingunique case when there is no reason to
+	 * believe that there are any duplicates on the page; we should at least
+	 * still check for LP_DEAD items. If that didn't work out, give up and
+	 * let caller split the page. Deduplication cannot be justified given
+	 * there is no reason to think that there are duplicates.
+	 */
+	if (checkingunique && !uniquedup)
+		return;
+
+	/* Assume bounds about to be invalidated (this is almost certain now) */
+	insertstate->bounds_valid = false;
+
+	/*
+	 * Perform deduplication pass, though only when it is enabled for the
+	 * index and known to be safe (it must be an allequalimage index).
+	 */
+	if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+		_bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
+					   insertstate->itemsz, checkingunique);
 }
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 7f392480ac..848123d921 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1187,8 +1187,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
 	 * array of offset numbers.
 	 *
 	 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
-	 * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
-	 * level flag, unsetting individual LP_DEAD bits should still be avoided.
+	 * happens to already be set. It's important that we not interfere with
+	 * _bt_delitems_delete().
 	 */
 	for (int i = 0; i < nupdatable; i++)
 	{
@@ -1215,20 +1215,12 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
 	opaque->btpo_cycleid = 0;
 
 	/*
-	 * Mark the page as not containing any LP_DEAD items. This is not
-	 * certainly true (there might be some that have recently been marked, but
-	 * weren't targeted by VACUUM's heap scan), but it will be true often
-	 * enough. VACUUM does not delete items purely because they have their
-	 * LP_DEAD bit set, since doing so would necessitate explicitly logging a
-	 * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
+	 * Clear the BTP_HAS_GARBAGE page flag.
 	 *
-	 * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
-	 * limited, since we never falsely unset an LP_DEAD bit. Workloads that
-	 * are particularly dependent on LP_DEAD bits being set quickly will
-	 * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
-	 * again anyway. Furthermore, attempting a deduplication pass will remove
-	 * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
-	 * is set or not.
+	 * This flag indicates the presence of LP_DEAD items on the page (though
+	 * not reliably). Note that we only trust it with pg_upgrade'd
+	 * !heapkeyspace indexes. That's why clearing it here won't usually
+	 * interfere with _bt_delitems_delete().
 	 */
 	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
 
@@ -1310,10 +1302,17 @@ _bt_delitems_delete(Relation rel, Buffer buf,
 
 	/*
 	 * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
-	 * because this is not called by VACUUM. Just clear the BTP_HAS_GARBAGE
-	 * page flag, since we deleted all items with their LP_DEAD bit set.
+	 * because this is not called by VACUUM
 	 */
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	/*
+	 * Clear the BTP_HAS_GARBAGE page flag.
+	 *
+	 * This flag indicates the presence of LP_DEAD items on the page (though
+	 * not reliably). Note that we only trust it with pg_upgrade'd
+	 * !heapkeyspace indexes.
+	 */
 	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
 
 	MarkBufferDirty(buf);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 81589b9056..2f5f14e527 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1877,7 +1877,8 @@ _bt_killitems(IndexScanDesc scan)
 	 * Since this can be redone later if needed, mark as dirty hint.
 	 *
 	 * Whenever we mark anything LP_DEAD, we also set the page's
-	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
+	 * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
+	 * only rely on the page-level flag in !heapkeyspace indexes.)
 	 */
 	if (killedsomething)
 	{
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index e913226760..5135b800af 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -1065,7 +1065,8 @@ btree_mask(char *pagedata, BlockNumber blkno)
 
 	/*
 	 * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
-	 * _bt_killitems(), _bt_check_unique() for details.
+	 * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique()
+	 * for details.
 	 */
 	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 65d9698b89..e8fecc6026 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -75,7 +75,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_META		(1 << 3)	/* meta-page */
 #define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
 #define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
-#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */
+#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples (deprecated) */
 #define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
 
 /*
@@ -1027,9 +1027,9 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
 /*
 * prototypes for functions in nbtdedup.c
 */
-extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
-							   IndexTuple newitem, Size newitemsz,
-							   bool checkingunique);
+extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
+						   IndexTuple newitem, Size newitemsz,
+						   bool checkingunique);
 extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
 									OffsetNumber baseoff);
 extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);