Fix "single value strategy" index deletion issue.
It is not appropriate for deduplication to apply single value strategy
when triggered by a bottom-up index deletion pass.  This wastes cycles
because later bottom-up deletion passes will overinterpret older
duplicate tuples that deduplication actually just skipped over "by
design".  It also makes bottom-up deletion much less effective for low
cardinality indexes that happen to cross a meaningless "index has
single key value per leaf page" threshold.

To fix, slightly narrow the conditions under which deduplication's
single value strategy is considered.  We already avoided the strategy
for a unique index, since our high level goal must just be to buy time
for VACUUM to run (not to buy space).  We'll now also avoid it when we
just had a bottom-up pass that reported failure.  The two cases share
the same high level goal, and already overlapped significantly, so this
approach is quite natural.

Oversight in commit d168b666, which added bottom-up index deletion.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-WznaOvM+Gyj-JQ0X=JxoMDxctDTYjiEuETdAGbF5EUc3MA@mail.gmail.com
Backpatch: 14-, where bottom-up deletion was introduced.
commit dd94c2852e
parent 1a9d802828
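The heart of the fix is the flag that the nbtinsert.c caller now passes
down to _bt_dedup_pass() (visible in the diff below).  Here is a minimal
stand-alone sketch of just that gate; it is not PostgreSQL code, but the
flag names follow the patch:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Sketch of the narrowed "single value" strategy gate.  The caller
     * passes (indexUnchanged || uniquedup) as the new bottomupdedup
     * argument: in both cases the high level goal is to buy time for
     * VACUUM, not to buy space, so single value strategy is skipped.
     */
    static bool
    consider_single_value_strategy(bool indexUnchanged, bool uniquedup)
    {
        bool bottomupdedup = (indexUnchanged || uniquedup);

        /* replaces the old "!checkingunique" test from commit d168b666 */
        return !bottomupdedup;
    }

    int
    main(void)
    {
        /* ordinary low cardinality insert: strategy may still apply */
        printf("%d\n", consider_single_value_strategy(false, false));
        /* after a failed bottom-up deletion pass: strategy skipped */
        printf("%d\n", consider_single_value_strategy(true, false));
        return 0;
    }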
src/backend/access/nbtree/nbtdedup.c
@@ -34,14 +34,17 @@ static bool _bt_posting_valid(IndexTuple posting);
  *
  * The general approach taken here is to perform as much deduplication as
  * possible to free as much space as possible.  Note, however, that "single
- * value" strategy is sometimes used for !checkingunique callers, in which
- * case deduplication will leave a few tuples untouched at the end of the
- * page.  The general idea is to prepare the page for an anticipated page
- * split that uses nbtsplitloc.c's "single value" strategy to determine a
- * split point.  (There is no reason to deduplicate items that will end up on
- * the right half of the page after the anticipated page split; better to
- * handle those if and when the anticipated right half page gets its own
- * deduplication pass, following further inserts of duplicates.)
+ * value" strategy is used for !bottomupdedup callers when the page is full of
+ * tuples of a single value.  Deduplication passes that apply the strategy
+ * will leave behind a few untouched tuples at the end of the page, preparing
+ * the page for an anticipated page split that uses nbtsplitloc.c's own single
+ * value strategy.  Our high level goal is to delay merging the untouched
+ * tuples until after the page splits.
+ *
+ * When a call to _bt_bottomupdel_pass() just took place (and failed), our
+ * high level goal is to prevent a page split entirely by buying more time.
+ * We still hope that a page split can be avoided altogether.  That's why
+ * single value strategy is not even considered for bottomupdedup callers.
  *
  * The page will have to be split if we cannot successfully free at least
  * newitemsz (we also need space for newitem's line pointer, which isn't
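The rewritten comment above preserves the strategy's pre-existing
behavior for ordinary callers: merging deliberately stops short of the
end of the page, leaving a few duplicates untouched for the anticipated
nbtsplitloc.c split.  A toy model of that idea follows;
BTREE_SINGLEVAL_FILLFACTOR (96) is the real nbtree.h constant, while
the fixed item size and the rest of the arithmetic are invented purely
for illustration:

    #include <stddef.h>
    #include <stdio.h>

    /*
     * Toy model of single value strategy's end-of-page behavior:
     * merging stops once the merged prefix approaches a high fill
     * target, leaving trailing duplicates untouched for the split.
     */
    #define TOY_PAGE_SIZE              8192
    #define BTREE_SINGLEVAL_FILLFACTOR 96   /* real nbtree.h value */

    int
    main(void)
    {
        size_t itemsz = 32;                     /* pretend item size */
        size_t nitems = TOY_PAGE_SIZE / itemsz; /* page of one value */
        size_t cap = (TOY_PAGE_SIZE * BTREE_SINGLEVAL_FILLFACTOR) / 100;
        size_t merged = 0;

        /* merge duplicates into the prefix until the fill target */
        while ((merged + 1) * itemsz <= cap && merged < nitems)
            merged++;

        printf("merged %zu of %zu duplicates; %zu left untouched\n",
               merged, nitems, nitems - merged);
        return 0;
    }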
@@ -52,7 +55,7 @@ static bool _bt_posting_valid(IndexTuple posting);
  */
 void
 _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
-               Size newitemsz, bool checkingunique)
+               Size newitemsz, bool bottomupdedup)
 {
     OffsetNumber offnum,
                 minoff,
@@ -97,8 +100,11 @@ _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
     minoff = P_FIRSTDATAKEY(opaque);
     maxoff = PageGetMaxOffsetNumber(page);

-    /* Determine if "single value" strategy should be used */
-    if (!checkingunique)
+    /*
+     * Consider applying "single value" strategy, though only if the page
+     * seems likely to be split in the near future
+     */
+    if (!bottomupdedup)
         singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);

     /*
@@ -764,14 +770,6 @@ _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
  * the first pass) won't spend many cycles on the large posting list tuples
  * left by previous passes.  Each pass will find a large contiguous group of
  * smaller duplicate tuples to merge together at the end of the page.
- *
- * Note: We deliberately don't bother checking if the high key is a distinct
- * value (prior to the TID tiebreaker column) before proceeding, unlike
- * nbtsplitloc.c.  Its single value strategy only gets applied on the
- * rightmost page of duplicates of the same value (other leaf pages full of
- * duplicates will get a simple 50:50 page split instead of splitting towards
- * the end of the page).  There is little point in making the same distinction
- * here.
  */
 static bool
 _bt_do_singleval(Relation rel, Page page, BTDedupState state,
src/backend/access/nbtree/nbtinsert.c
@@ -2748,7 +2748,7 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
     /* Perform deduplication pass (when enabled and index-is-allequalimage) */
     if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
         _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
-                       insertstate->itemsz, checkingunique);
+                       insertstate->itemsz, (indexUnchanged || uniquedup));
 }

 /*
src/include/access/nbtree.h
@@ -1155,7 +1155,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
  */
 extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
                            IndexTuple newitem, Size newitemsz,
-                           bool checkingunique);
+                           bool bottomupdedup);
 extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
                                  Size newitemsz);
 extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,