diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index bbb21d235c..9aab9cf64a 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ - nbtutils.o nbtsort.o nbtvalidate.o nbtxlog.o + nbtsplitloc.o nbtutils.o nbtsort.o nbtvalidate.o nbtxlog.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index b93b546d22..266c1cd4cd 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -143,9 +143,9 @@ Lehman and Yao assume fixed-size keys, but we must deal with variable-size keys. Therefore there is not a fixed maximum number of keys per page; we just stuff in as many as will fit. When we split a page, we try to equalize the number of bytes, not items, assigned to -each of the resulting pages. Note we must include the incoming item in -this calculation, otherwise it is possible to find that the incoming -item doesn't fit on the split page where it needs to go! +pages (though suffix truncation is also considered). Note we must include +the incoming item in this calculation, otherwise it is possible to find +that the incoming item doesn't fit on the split page where it needs to go! The Deletion Algorithm ---------------------- @@ -649,6 +649,47 @@ variable-length types, such as text. An opclass support function could manufacture the shortest possible key value that still correctly separates each half of a leaf page split. +There is sophisticated criteria for choosing a leaf page split point. The +general idea is to make suffix truncation effective without unduly +influencing the balance of space for each half of the page split. The +choice of leaf split point can be thought of as a choice among points +*between* items on the page to be split, at least if you pretend that the +incoming tuple was placed on the page already (you have to pretend because +there won't actually be enough space for it on the page). Choosing the +split point between two index tuples where the first non-equal attribute +appears as early as possible results in truncating away as many suffix +attributes as possible. Evenly balancing space among each half of the +split is usually the first concern, but even small adjustments in the +precise split point can allow truncation to be far more effective. + +Suffix truncation is primarily valuable because it makes pivot tuples +smaller, which delays splits of internal pages, but that isn't the only +reason why it's effective. Even truncation that doesn't make pivot tuples +smaller due to alignment still prevents pivot tuples from being more +restrictive than truly necessary in how they describe which values belong +on which pages. + +While it's not possible to correctly perform suffix truncation during +internal page splits, it's still useful to be discriminating when splitting +an internal page. The split point that implies a downlink be inserted in +the parent that's the smallest one available within an acceptable range of +the fillfactor-wise optimal split point is chosen. This idea also comes +from the Prefix B-Tree paper. This process has much in common with what +happens at the leaf level to make suffix truncation effective. The overall +effect is that suffix truncation tends to produce smaller, more +discriminating pivot tuples, especially early in the lifetime of the index, +while biasing internal page splits makes the earlier, smaller pivot tuples +end up in the root page, delaying root page splits. + +Logical duplicates are given special consideration. The logic for +selecting a split point goes to great lengths to avoid having duplicates +span more than one page, and almost always manages to pick a split point +between two user-key-distinct tuples, accepting a completely lopsided split +if it must. When a page that's already full of duplicates must be split, +the fallback strategy assumes that duplicates are mostly inserted in +ascending heap TID order. The page is split in a way that leaves the left +half of the page mostly full, and the right half of the page mostly empty. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index b2ee0adb50..2c98405aac 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -28,26 +28,6 @@ /* Minimum tree height for application of fastpath optimization */ #define BTREE_FASTPATH_MIN_LEVEL 2 -typedef struct -{ - /* context data for _bt_checksplitloc */ - Size newitemsz; /* size of new item to be inserted */ - int fillfactor; /* needed when splitting rightmost page */ - bool is_leaf; /* T if splitting a leaf page */ - bool is_rightmost; /* T if splitting a rightmost page */ - OffsetNumber newitemoff; /* where the new item is to be inserted */ - int leftspace; /* space available for items on left page */ - int rightspace; /* space available for items on right page */ - int olddataitemstotal; /* space taken by old items */ - - bool have_split; /* found a valid split? */ - - /* these fields valid only if have_split is true */ - bool newitemonleft; /* new item on left or right of best split */ - OffsetNumber firstright; /* best split point */ - int best_delta; /* best size delta so far */ -} FindSplitData; - static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); @@ -73,13 +53,6 @@ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Size newitemsz, IndexTuple newitem, bool newitemonleft); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); -static OffsetNumber _bt_findsplitloc(Relation rel, Page page, - OffsetNumber newitemoff, - Size newitemsz, - bool *newitemonleft); -static void _bt_checksplitloc(FindSplitData *state, - OffsetNumber firstoldonright, bool newitemonleft, - int dataitemstoleft, Size firstoldonrightsz); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, @@ -1003,7 +976,7 @@ _bt_insertonpg(Relation rel, /* Choose the split point */ firstright = _bt_findsplitloc(rel, page, - newitemoff, itemsz, + newitemoff, itemsz, itup, &newitemonleft); /* split the buffer into left and right halves */ @@ -1687,264 +1660,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, return rbuf; } -/* - * _bt_findsplitloc() -- find an appropriate place to split a page. - * - * The idea here is to equalize the free space that will be on each split - * page, *after accounting for the inserted tuple*. (If we fail to account - * for it, we might find ourselves with too little room on the page that - * it needs to go into!) - * - * If the page is the rightmost page on its level, we instead try to arrange - * to leave the left split page fillfactor% full. In this way, when we are - * inserting successively increasing keys (consider sequences, timestamps, - * etc) we will end up with a tree whose pages are about fillfactor% full, - * instead of the 50% full result that we'd get without this special case. - * This is the same as nbtsort.c produces for a newly-created tree. Note - * that leaf and nonleaf pages use different fillfactors. - * - * We are passed the intended insert position of the new tuple, expressed as - * the offsetnumber of the tuple it must go in front of. (This could be - * maxoff+1 if the tuple is to go at the end.) - * - * We return the index of the first existing tuple that should go on the - * righthand page, plus a boolean indicating whether the new tuple goes on - * the left or right page. The bool is necessary to disambiguate the case - * where firstright == newitemoff. - */ -static OffsetNumber -_bt_findsplitloc(Relation rel, - Page page, - OffsetNumber newitemoff, - Size newitemsz, - bool *newitemonleft) -{ - BTPageOpaque opaque; - OffsetNumber offnum; - OffsetNumber maxoff; - ItemId itemid; - FindSplitData state; - int leftspace, - rightspace, - goodenough, - olddataitemstotal, - olddataitemstoleft; - bool goodenoughfound; - - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ - newitemsz += sizeof(ItemIdData); - - /* Total free space available on a btree page, after fixed overhead */ - leftspace = rightspace = - PageGetPageSize(page) - SizeOfPageHeaderData - - MAXALIGN(sizeof(BTPageOpaqueData)); - - /* The right page will have the same high key as the old page */ - if (!P_RIGHTMOST(opaque)) - { - itemid = PageGetItemId(page, P_HIKEY); - rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + - sizeof(ItemIdData)); - } - - /* Count up total space in data items without actually scanning 'em */ - olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); - - state.newitemsz = newitemsz; - state.is_leaf = P_ISLEAF(opaque); - state.is_rightmost = P_RIGHTMOST(opaque); - state.have_split = false; - if (state.is_leaf) - state.fillfactor = RelationGetFillFactor(rel, - BTREE_DEFAULT_FILLFACTOR); - else - state.fillfactor = BTREE_NONLEAF_FILLFACTOR; - state.newitemonleft = false; /* these just to keep compiler quiet */ - state.firstright = 0; - state.best_delta = 0; - state.leftspace = leftspace; - state.rightspace = rightspace; - state.olddataitemstotal = olddataitemstotal; - state.newitemoff = newitemoff; - - /* - * Finding the best possible split would require checking all the possible - * split points, because of the high-key and left-key special cases. - * That's probably more work than it's worth; instead, stop as soon as we - * find a "good-enough" split, where good-enough is defined as an - * imbalance in free space of no more than pagesize/16 (arbitrary...) This - * should let us stop near the middle on most pages, instead of plowing to - * the end. - */ - goodenough = leftspace / 16; - - /* - * Scan through the data items and calculate space usage for a split at - * each possible position. - */ - olddataitemstoleft = 0; - goodenoughfound = false; - maxoff = PageGetMaxOffsetNumber(page); - - for (offnum = P_FIRSTDATAKEY(opaque); - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - Size itemsz; - - itemid = PageGetItemId(page, offnum); - itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); - - /* - * Will the new item go to left or right of split? - */ - if (offnum > newitemoff) - _bt_checksplitloc(&state, offnum, true, - olddataitemstoleft, itemsz); - - else if (offnum < newitemoff) - _bt_checksplitloc(&state, offnum, false, - olddataitemstoleft, itemsz); - else - { - /* need to try it both ways! */ - _bt_checksplitloc(&state, offnum, true, - olddataitemstoleft, itemsz); - - _bt_checksplitloc(&state, offnum, false, - olddataitemstoleft, itemsz); - } - - /* Abort scan once we find a good-enough choice */ - if (state.have_split && state.best_delta <= goodenough) - { - goodenoughfound = true; - break; - } - - olddataitemstoleft += itemsz; - } - - /* - * If the new item goes as the last item, check for splitting so that all - * the old items go to the left page and the new item goes to the right - * page. - */ - if (newitemoff > maxoff && !goodenoughfound) - _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0); - - /* - * I believe it is not possible to fail to find a feasible split, but just - * in case ... - */ - if (!state.have_split) - elog(ERROR, "could not find a feasible split point for index \"%s\"", - RelationGetRelationName(rel)); - - *newitemonleft = state.newitemonleft; - return state.firstright; -} - -/* - * Subroutine to analyze a particular possible split choice (ie, firstright - * and newitemonleft settings), and record the best split so far in *state. - * - * firstoldonright is the offset of the first item on the original page - * that goes to the right page, and firstoldonrightsz is the size of that - * tuple. firstoldonright can be > max offset, which means that all the old - * items go to the left page and only the new item goes to the right page. - * In that case, firstoldonrightsz is not used. - * - * olddataitemstoleft is the total size of all old items to the left of - * firstoldonright. - */ -static void -_bt_checksplitloc(FindSplitData *state, - OffsetNumber firstoldonright, - bool newitemonleft, - int olddataitemstoleft, - Size firstoldonrightsz) -{ - int leftfree, - rightfree; - Size firstrightitemsz; - bool newitemisfirstonright; - - /* Is the new item going to be the first item on the right page? */ - newitemisfirstonright = (firstoldonright == state->newitemoff - && !newitemonleft); - - if (newitemisfirstonright) - firstrightitemsz = state->newitemsz; - else - firstrightitemsz = firstoldonrightsz; - - /* Account for all the old tuples */ - leftfree = state->leftspace - olddataitemstoleft; - rightfree = state->rightspace - - (state->olddataitemstotal - olddataitemstoleft); - - /* - * The first item on the right page becomes the high key of the left page; - * therefore it counts against left space as well as right space. When - * index has included attributes, then those attributes of left page high - * key will be truncated leaving that page with slightly more free space. - * However, that shouldn't affect our ability to find valid split - * location, because anyway split location should exists even without high - * key truncation. - */ - leftfree -= firstrightitemsz; - - /* account for the new item */ - if (newitemonleft) - leftfree -= (int) state->newitemsz; - else - rightfree -= (int) state->newitemsz; - - /* - * If we are not on the leaf level, we will be able to discard the key - * data from the first item that winds up on the right page. - */ - if (!state->is_leaf) - rightfree += (int) firstrightitemsz - - (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); - - /* - * If feasible split point, remember best delta. - */ - if (leftfree >= 0 && rightfree >= 0) - { - int delta; - - if (state->is_rightmost) - { - /* - * If splitting a rightmost page, try to put (100-fillfactor)% of - * free space on left page. See comments for _bt_findsplitloc. - */ - delta = (state->fillfactor * leftfree) - - ((100 - state->fillfactor) * rightfree); - } - else - { - /* Otherwise, aim for equal free space on both sides */ - delta = leftfree - rightfree; - } - - if (delta < 0) - delta = -delta; - if (!state->have_split || delta < state->best_delta) - { - state->have_split = true; - state->newitemonleft = newitemonleft; - state->firstright = firstoldonright; - state->best_delta = delta; - } - } -} - /* * _bt_insert_parent() -- Insert downlink into parent after a page split. * diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c new file mode 100644 index 0000000000..34228b133e --- /dev/null +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -0,0 +1,846 @@ +/*------------------------------------------------------------------------- + * + * nbtsplitloc.c + * Choose split point code for Postgres btree implementation. + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsplitloc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "storage/lmgr.h" + +/* limits on split interval (default strategy only) */ +#define MAX_LEAF_INTERVAL 9 +#define MAX_INTERNAL_INTERVAL 18 + +typedef enum +{ + /* strategy for searching through materialized list of split points */ + SPLIT_DEFAULT, /* give some weight to truncation */ + SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */ + SPLIT_SINGLE_VALUE /* leave left page almost full */ +} FindSplitStrat; + +typedef struct +{ + /* details of free space left by split */ + int16 curdelta; /* current leftfree/rightfree delta */ + int16 leftfree; /* space left on left page post-split */ + int16 rightfree; /* space left on right page post-split */ + + /* split point identifying fields (returned by _bt_findsplitloc) */ + OffsetNumber firstoldonright; /* first item on new right page */ + bool newitemonleft; /* new item goes on left, or right? */ + +} SplitPoint; + +typedef struct +{ + /* context data for _bt_recsplitloc */ + Relation rel; /* index relation */ + Page page; /* page undergoing split */ + IndexTuple newitem; /* new item (cause of page split) */ + Size newitemsz; /* size of newitem (includes line pointer) */ + bool is_leaf; /* T if splitting a leaf page */ + bool is_rightmost; /* T if splitting rightmost page on level */ + OffsetNumber newitemoff; /* where the new item is to be inserted */ + int leftspace; /* space available for items on left page */ + int rightspace; /* space available for items on right page */ + int olddataitemstotal; /* space taken by old items */ + Size minfirstrightsz; /* smallest firstoldonright tuple size */ + + /* candidate split point data */ + int maxsplits; /* maximum number of splits */ + int nsplits; /* current number of splits */ + SplitPoint *splits; /* all candidate split points for page */ + int interval; /* current range of acceptable split points */ +} FindSplitData; + +static void _bt_recsplitloc(FindSplitData *state, + OffsetNumber firstoldonright, bool newitemonleft, + int olddataitemstoleft, Size firstoldonrightsz); +static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, + bool usemult); +static int _bt_splitcmp(const void *arg1, const void *arg2); +static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, + bool *newitemonleft); +static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, + SplitPoint *rightpage, FindSplitStrat *strategy); +static void _bt_interval_edges(FindSplitData *state, + SplitPoint **leftinterval, SplitPoint **rightinterval); +static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split); +static inline IndexTuple _bt_split_lastleft(FindSplitData *state, + SplitPoint *split); +static inline IndexTuple _bt_split_firstright(FindSplitData *state, + SplitPoint *split); + + +/* + * _bt_findsplitloc() -- find an appropriate place to split a page. + * + * The main goal here is to equalize the free space that will be on each + * split page, *after accounting for the inserted tuple*. (If we fail to + * account for it, we might find ourselves with too little room on the page + * that it needs to go into!) + * + * If the page is the rightmost page on its level, we instead try to arrange + * to leave the left split page fillfactor% full. In this way, when we are + * inserting successively increasing keys (consider sequences, timestamps, + * etc) we will end up with a tree whose pages are about fillfactor% full, + * instead of the 50% full result that we'd get without this special case. + * This is the same as nbtsort.c produces for a newly-created tree. Note + * that leaf and nonleaf pages use different fillfactors. Note also that + * there are a number of further special cases where fillfactor is not + * applied in the standard way. + * + * We are passed the intended insert position of the new tuple, expressed as + * the offsetnumber of the tuple it must go in front of (this could be + * maxoff+1 if the tuple is to go at the end). The new tuple itself is also + * passed, since it's needed to give some weight to how effective suffix + * truncation will be. The implementation picks the split point that + * maximizes the effectiveness of suffix truncation from a small list of + * alternative candidate split points that leave each side of the split with + * about the same share of free space. Suffix truncation is secondary to + * equalizing free space, except in cases with large numbers of duplicates. + * Note that it is always assumed that caller goes on to perform truncation, + * even with pg_upgrade'd indexes where that isn't actually the case + * (!heapkeyspace indexes). See nbtree/README for more information about + * suffix truncation. + * + * We return the index of the first existing tuple that should go on the + * righthand page, plus a boolean indicating whether the new tuple goes on + * the left or right page. The bool is necessary to disambiguate the case + * where firstright == newitemoff. + */ +OffsetNumber +_bt_findsplitloc(Relation rel, + Page page, + OffsetNumber newitemoff, + Size newitemsz, + IndexTuple newitem, + bool *newitemonleft) +{ + BTPageOpaque opaque; + int leftspace, + rightspace, + olddataitemstotal, + olddataitemstoleft, + perfectpenalty, + leaffillfactor; + FindSplitData state; + FindSplitStrat strategy; + ItemId itemid; + OffsetNumber offnum, + maxoff, + foundfirstright; + double fillfactormult; + bool usemult; + SplitPoint leftpage, + rightpage; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* Total free space available on a btree page, after fixed overhead */ + leftspace = rightspace = + PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + + /* The right page will have the same high key as the old page */ + if (!P_RIGHTMOST(opaque)) + { + itemid = PageGetItemId(page, P_HIKEY); + rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + + sizeof(ItemIdData)); + } + + /* Count up total space in data items before actually scanning 'em */ + olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); + leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + state.rel = rel; + state.page = page; + state.newitem = newitem; + state.newitemsz = newitemsz; + state.is_leaf = P_ISLEAF(opaque); + state.is_rightmost = P_RIGHTMOST(opaque); + state.leftspace = leftspace; + state.rightspace = rightspace; + state.olddataitemstotal = olddataitemstotal; + state.minfirstrightsz = SIZE_MAX; + state.newitemoff = newitemoff; + + /* + * maxsplits should never exceed maxoff because there will be at most as + * many candidate split points as there are points _between_ tuples, once + * you imagine that the new item is already on the original page (the + * final number of splits may be slightly lower because not all points + * between tuples will be legal). + */ + state.maxsplits = maxoff; + state.splits = palloc(sizeof(SplitPoint) * state.maxsplits); + state.nsplits = 0; + + /* + * Scan through the data items and calculate space usage for a split at + * each possible position. We start at the first data offset rather than + * the second data offset to handle the "newitemoff == first data offset" + * case (any other split whose firstoldonright is the first data offset + * can't be legal, though, and so won't actually end up being recorded in + * first loop iteration). + */ + olddataitemstoleft = 0; + + for (offnum = P_FIRSTDATAKEY(opaque); + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + Size itemsz; + + itemid = PageGetItemId(page, offnum); + itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + + /* + * Will the new item go to left or right of split? + */ + if (offnum > newitemoff) + _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); + else if (offnum < newitemoff) + _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); + else + { + /* may need to record a split on one or both sides of new item */ + _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); + _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); + } + + olddataitemstoleft += itemsz; + } + + /* + * If the new item goes as the last item, record the split point that + * leaves all the old items on the left page, and the new item on the + * right page. This is required because a split that leaves the new item + * as the firstoldonright won't have been reached within the loop. + */ + Assert(olddataitemstoleft == olddataitemstotal); + if (newitemoff > maxoff) + _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0); + + /* + * I believe it is not possible to fail to find a feasible split, but just + * in case ... + */ + if (state.nsplits == 0) + elog(ERROR, "could not find a feasible split point for index \"%s\"", + RelationGetRelationName(rel)); + + /* + * Start search for a split point among list of legal split points. Give + * primary consideration to equalizing available free space in each half + * of the split initially (start with default strategy), while applying + * rightmost optimization where appropriate. Either of the two other + * fallback strategies may be required for cases with a large number of + * duplicates around the original/space-optimal split point. + * + * Default strategy gives some weight to suffix truncation in deciding a + * split point on leaf pages. It attempts to select a split point where a + * distinguishing attribute appears earlier in the new high key for the + * left side of the split, in order to maximize the number of trailing + * attributes that can be truncated away. Only candidate split points + * that imply an acceptable balance of free space on each side are + * considered. + */ + if (!state.is_leaf) + { + /* fillfactormult only used on rightmost page */ + usemult = state.is_rightmost; + fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0; + } + else if (state.is_rightmost) + { + /* Rightmost leaf page -- fillfactormult always used */ + usemult = true; + fillfactormult = leaffillfactor / 100.0; + } + else + { + /* Other leaf page. 50:50 page split. */ + usemult = false; + /* fillfactormult not used, but be tidy */ + fillfactormult = 0.50; + } + + /* + * Set an initial limit on the split interval/number of candidate split + * points as appropriate. The "Prefix B-Trees" paper refers to this as + * sigma l for leaf splits and sigma b for internal ("branch") splits. + * It's hard to provide a theoretical justification for the initial size + * of the split interval, though it's clear that a small split interval + * makes suffix truncation much more effective without noticeably + * affecting space utilization over time. + */ + state.interval = Min(Max(1, state.nsplits * 0.05), + state.is_leaf ? MAX_LEAF_INTERVAL : + MAX_INTERNAL_INTERVAL); + + /* + * Save leftmost and rightmost splits for page before original ordinal + * sort order is lost by delta/fillfactormult sort + */ + leftpage = state.splits[0]; + rightpage = state.splits[state.nsplits - 1]; + + /* Give split points a fillfactormult-wise delta, and sort on deltas */ + _bt_deltasortsplits(&state, fillfactormult, usemult); + + /* + * Determine if default strategy/split interval will produce a + * sufficiently distinguishing split, or if we should change strategies. + * Alternative strategies change the range of split points that are + * considered acceptable (split interval), and possibly change + * fillfactormult, in order to deal with pages with a large number of + * duplicates gracefully. + * + * Pass low and high splits for the entire page (including even newitem). + * These are used when the initial split interval encloses split points + * that are full of duplicates, and we need to consider if it's even + * possible to avoid appending a heap TID. + */ + perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy); + + if (strategy == SPLIT_DEFAULT) + { + /* + * Default strategy worked out (always works out with internal page). + * Original split interval still stands. + */ + } + + /* + * Many duplicates strategy is used when a heap TID would otherwise be + * appended, but the page isn't completely full of logical duplicates. + * + * The split interval is widened to include all legal candidate split + * points. There may be a few as two distinct values in the whole-page + * split interval. Many duplicates strategy has no hard requirements for + * space utilization, though it still keeps the use of space balanced as a + * non-binding secondary goal (perfect penalty is set so that the + * first/lowest delta split points that avoids appending a heap TID is + * used). + * + * Single value strategy is used when it is impossible to avoid appending + * a heap TID. It arranges to leave the left page very full. This + * maximizes space utilization in cases where tuples with the same + * attribute values span many pages. Newly inserted duplicates will tend + * to have higher heap TID values, so we'll end up splitting to the right + * consistently. (Single value strategy is harmless though not + * particularly useful with !heapkeyspace indexes.) + */ + else if (strategy == SPLIT_MANY_DUPLICATES) + { + Assert(state.is_leaf); + /* No need to resort splits -- no change in fillfactormult/deltas */ + state.interval = state.nsplits; + } + else if (strategy == SPLIT_SINGLE_VALUE) + { + Assert(state.is_leaf); + /* Split near the end of the page */ + usemult = true; + fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0; + /* Resort split points with new delta */ + _bt_deltasortsplits(&state, fillfactormult, usemult); + /* Appending a heap TID is unavoidable, so interval of 1 is fine */ + state.interval = 1; + } + + /* + * Search among acceptable split points (using final split interval) for + * the entry that has the lowest penalty, and is therefore expected to + * maximize fan-out. Sets *newitemonleft for us. + */ + foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft); + pfree(state.splits); + + return foundfirstright; +} + +/* + * Subroutine to record a particular point between two tuples (possibly the + * new item) on page (ie, combination of firstright and newitemonleft + * settings) in *state for later analysis. This is also a convenient point + * to check if the split is legal (if it isn't, it won't be recorded). + * + * firstoldonright is the offset of the first item on the original page that + * goes to the right page, and firstoldonrightsz is the size of that tuple. + * firstoldonright can be > max offset, which means that all the old items go + * to the left page and only the new item goes to the right page. In that + * case, firstoldonrightsz is not used. + * + * olddataitemstoleft is the total size of all old items to the left of the + * split point that is recorded here when legal. Should not include + * newitemsz, since that is handled here. + */ +static void +_bt_recsplitloc(FindSplitData *state, + OffsetNumber firstoldonright, + bool newitemonleft, + int olddataitemstoleft, + Size firstoldonrightsz) +{ + int16 leftfree, + rightfree; + Size firstrightitemsz; + bool newitemisfirstonright; + + /* Is the new item going to be the first item on the right page? */ + newitemisfirstonright = (firstoldonright == state->newitemoff + && !newitemonleft); + + if (newitemisfirstonright) + firstrightitemsz = state->newitemsz; + else + firstrightitemsz = firstoldonrightsz; + + /* Account for all the old tuples */ + leftfree = state->leftspace - olddataitemstoleft; + rightfree = state->rightspace - + (state->olddataitemstotal - olddataitemstoleft); + + /* + * The first item on the right page becomes the high key of the left page; + * therefore it counts against left space as well as right space (we + * cannot assume that suffix truncation will make it any smaller). When + * index has included attributes, then those attributes of left page high + * key will be truncated leaving that page with slightly more free space. + * However, that shouldn't affect our ability to find valid split + * location, since we err in the direction of being pessimistic about free + * space on the left half. Besides, even when suffix truncation of + * non-TID attributes occurs, the new high key often won't even be a + * single MAXALIGN() quantum smaller than the firstright tuple it's based + * on. + * + * If we are on the leaf level, assume that suffix truncation cannot avoid + * adding a heap TID to the left half's new high key when splitting at the + * leaf level. In practice the new high key will often be smaller and + * will rarely be larger, but conservatively assume the worst case. + */ + if (state->is_leaf) + leftfree -= (int16) (firstrightitemsz + + MAXALIGN(sizeof(ItemPointerData))); + else + leftfree -= (int16) firstrightitemsz; + + /* account for the new item */ + if (newitemonleft) + leftfree -= (int16) state->newitemsz; + else + rightfree -= (int16) state->newitemsz; + + /* + * If we are not on the leaf level, we will be able to discard the key + * data from the first item that winds up on the right page. + */ + if (!state->is_leaf) + rightfree += (int16) firstrightitemsz - + (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); + + /* Record split if legal */ + if (leftfree >= 0 && rightfree >= 0) + { + Assert(state->nsplits < state->maxsplits); + + /* Determine smallest firstright item size on page */ + state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz); + + state->splits[state->nsplits].curdelta = 0; + state->splits[state->nsplits].leftfree = leftfree; + state->splits[state->nsplits].rightfree = rightfree; + state->splits[state->nsplits].firstoldonright = firstoldonright; + state->splits[state->nsplits].newitemonleft = newitemonleft; + state->nsplits++; + } +} + +/* + * Subroutine to assign space deltas to materialized array of candidate split + * points based on current fillfactor, and to sort array using that fillfactor + */ +static void +_bt_deltasortsplits(FindSplitData *state, double fillfactormult, + bool usemult) +{ + for (int i = 0; i < state->nsplits; i++) + { + SplitPoint *split = state->splits + i; + int16 delta; + + if (usemult) + delta = fillfactormult * split->leftfree - + (1.0 - fillfactormult) * split->rightfree; + else + delta = split->leftfree - split->rightfree; + + if (delta < 0) + delta = -delta; + + /* Save delta */ + split->curdelta = delta; + } + + qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp); +} + +/* + * qsort-style comparator used by _bt_deltasortsplits() + */ +static int +_bt_splitcmp(const void *arg1, const void *arg2) +{ + SplitPoint *split1 = (SplitPoint *) arg1; + SplitPoint *split2 = (SplitPoint *) arg2; + + if (split1->curdelta > split2->curdelta) + return 1; + if (split1->curdelta < split2->curdelta) + return -1; + + return 0; +} + +/* + * Subroutine to find the "best" split point among an array of acceptable + * candidate split points that split without there being an excessively high + * delta between the space left free on the left and right halves. The "best" + * split point is the split point with the lowest penalty among split points + * that fall within current/final split interval. Penalty is an abstract + * score, with a definition that varies depending on whether we're splitting a + * leaf page or an internal page. See _bt_split_penalty() for details. + * + * "perfectpenalty" is assumed to be the lowest possible penalty among + * candidate split points. This allows us to return early without wasting + * cycles on calculating the first differing attribute for all candidate + * splits when that clearly cannot improve our choice (or when we only want a + * minimally distinguishing split point, and don't want to make the split any + * more unbalanced than is necessary). + * + * We return the index of the first existing tuple that should go on the right + * page, plus a boolean indicating if new item is on left of split point. + */ +static OffsetNumber +_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft) +{ + int bestpenalty, + lowsplit; + int highsplit = Min(state->interval, state->nsplits); + + /* No point in calculating penalty when there's only one choice */ + if (state->nsplits == 1) + { + *newitemonleft = state->splits[0].newitemonleft; + return state->splits[0].firstoldonright; + } + + bestpenalty = INT_MAX; + lowsplit = 0; + for (int i = lowsplit; i < highsplit; i++) + { + int penalty; + + penalty = _bt_split_penalty(state, state->splits + i); + + if (penalty <= perfectpenalty) + { + bestpenalty = penalty; + lowsplit = i; + break; + } + + if (penalty < bestpenalty) + { + bestpenalty = penalty; + lowsplit = i; + } + } + + *newitemonleft = state->splits[lowsplit].newitemonleft; + return state->splits[lowsplit].firstoldonright; +} + +/* + * Subroutine to decide whether split should use default strategy/initial + * split interval, or whether it should finish splitting the page using + * alternative strategies (this is only possible with leaf pages). + * + * Caller uses alternative strategy (or sticks with default strategy) based + * on how *strategy is set here. Return value is "perfect penalty", which is + * passed to _bt_bestsplitloc() as a final constraint on how far caller is + * willing to go to avoid appending a heap TID when using the many duplicates + * strategy (it also saves _bt_bestsplitloc() useless cycles). + */ +static int +_bt_strategy(FindSplitData *state, SplitPoint *leftpage, + SplitPoint *rightpage, FindSplitStrat *strategy) +{ + IndexTuple leftmost, + rightmost; + SplitPoint *leftinterval, + *rightinterval; + int perfectpenalty; + int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); + + /* Assume that alternative strategy won't be used for now */ + *strategy = SPLIT_DEFAULT; + + /* + * Use smallest observed first right item size for entire page as perfect + * penalty on internal pages. This can save cycles in the common case + * where most or all splits (not just splits within interval) have first + * right tuples that are the same size. + */ + if (!state->is_leaf) + return state->minfirstrightsz; + + /* + * Use leftmost and rightmost tuples from leftmost and rightmost splits in + * current split interval + */ + _bt_interval_edges(state, &leftinterval, &rightinterval); + leftmost = _bt_split_lastleft(state, leftinterval); + rightmost = _bt_split_firstright(state, rightinterval); + + /* + * If initial split interval can produce a split point that will at least + * avoid appending a heap TID in new high key, we're done. Finish split + * with default strategy and initial split interval. + */ + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + if (perfectpenalty <= indnkeyatts) + return perfectpenalty; + + /* + * Work out how caller should finish split when even their "perfect" + * penalty for initial/default split interval indicates that the interval + * does not contain even a single split that avoids appending a heap TID. + * + * Use the leftmost split's lastleft tuple and the rightmost split's + * firstright tuple to assess every possible split. + */ + leftmost = _bt_split_lastleft(state, leftpage); + rightmost = _bt_split_firstright(state, rightpage); + + /* + * If page (including new item) has many duplicates but is not entirely + * full of duplicates, a many duplicates strategy split will be performed. + * If page is entirely full of duplicates, a single value strategy split + * will be performed. + */ + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + if (perfectpenalty <= indnkeyatts) + { + *strategy = SPLIT_MANY_DUPLICATES; + + /* + * Caller should choose the lowest delta split that avoids appending a + * heap TID. Maximizing the number of attributes that can be + * truncated away (returning perfectpenalty when it happens to be less + * than the number of key attributes in index) can result in continual + * unbalanced page splits. + * + * Just avoiding appending a heap TID can still make splits very + * unbalanced, but this is self-limiting. When final split has a very + * high delta, one side of the split will likely consist of a single + * value. If that page is split once again, then that split will + * likely use the single value strategy. + */ + return indnkeyatts; + } + + /* + * Single value strategy is only appropriate with ever-increasing heap + * TIDs; otherwise, original default strategy split should proceed to + * avoid pathological performance. Use page high key to infer if this is + * the rightmost page among pages that store the same duplicate value. + * This should not prevent insertions of heap TIDs that are slightly out + * of order from using single value strategy, since that's expected with + * concurrent inserters of the same duplicate value. + */ + else if (state->is_rightmost) + *strategy = SPLIT_SINGLE_VALUE; + else + { + ItemId itemid; + IndexTuple hikey; + + itemid = PageGetItemId(state->page, P_HIKEY); + hikey = (IndexTuple) PageGetItem(state->page, itemid); + perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, + state->newitem); + if (perfectpenalty <= indnkeyatts) + *strategy = SPLIT_SINGLE_VALUE; + else + { + /* + * Have caller finish split using default strategy, since page + * does not appear to be the rightmost page for duplicates of the + * value the page is filled with + */ + } + } + + return perfectpenalty; +} + +/* + * Subroutine to locate leftmost and rightmost splits for current/default + * split interval. Note that it will be the same split iff there is only one + * split in interval. + */ +static void +_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, + SplitPoint **rightinterval) +{ + int highsplit = Min(state->interval, state->nsplits); + SplitPoint *deltaoptimal; + + deltaoptimal = state->splits; + *leftinterval = NULL; + *rightinterval = NULL; + + /* + * Delta is an absolute distance to optimal split point, so both the + * leftmost and rightmost split point will usually be at the end of the + * array + */ + for (int i = highsplit - 1; i >= 0; i--) + { + SplitPoint *distant = state->splits + i; + + if (distant->firstoldonright < deltaoptimal->firstoldonright) + { + if (*leftinterval == NULL) + *leftinterval = distant; + } + else if (distant->firstoldonright > deltaoptimal->firstoldonright) + { + if (*rightinterval == NULL) + *rightinterval = distant; + } + else if (!distant->newitemonleft && deltaoptimal->newitemonleft) + { + /* + * "incoming tuple will become first on right page" (distant) is + * to the left of "incoming tuple will become last on left page" + * (delta-optimal) + */ + Assert(distant->firstoldonright == state->newitemoff); + if (*leftinterval == NULL) + *leftinterval = distant; + } + else if (distant->newitemonleft && !deltaoptimal->newitemonleft) + { + /* + * "incoming tuple will become last on left page" (distant) is to + * the right of "incoming tuple will become first on right page" + * (delta-optimal) + */ + Assert(distant->firstoldonright == state->newitemoff); + if (*rightinterval == NULL) + *rightinterval = distant; + } + else + { + /* There was only one or two splits in initial split interval */ + Assert(distant == deltaoptimal); + if (*leftinterval == NULL) + *leftinterval = distant; + if (*rightinterval == NULL) + *rightinterval = distant; + } + + if (*leftinterval && *rightinterval) + return; + } + + Assert(false); +} + +/* + * Subroutine to find penalty for caller's candidate split point. + * + * On leaf pages, penalty is the attribute number that distinguishes each side + * of a split. It's the last attribute that needs to be included in new high + * key for left page. It can be greater than the number of key attributes in + * cases where a heap TID will need to be appended during truncation. + * + * On internal pages, penalty is simply the size of the first item on the + * right half of the split (including line pointer overhead). This tuple will + * become the new high key for the left page. + */ +static inline int +_bt_split_penalty(FindSplitData *state, SplitPoint *split) +{ + IndexTuple lastleftuple; + IndexTuple firstrighttuple; + + if (!state->is_leaf) + { + ItemId itemid; + + if (!split->newitemonleft && + split->firstoldonright == state->newitemoff) + return state->newitemsz; + + itemid = PageGetItemId(state->page, split->firstoldonright); + + return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); + } + + lastleftuple = _bt_split_lastleft(state, split); + firstrighttuple = _bt_split_firstright(state, split); + + Assert(lastleftuple != firstrighttuple); + return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple); +} + +/* + * Subroutine to get a lastleft IndexTuple for a spit point from page + */ +static inline IndexTuple +_bt_split_lastleft(FindSplitData *state, SplitPoint *split) +{ + ItemId itemid; + + if (split->newitemonleft && split->firstoldonright == state->newitemoff) + return state->newitem; + + itemid = PageGetItemId(state->page, + OffsetNumberPrev(split->firstoldonright)); + return (IndexTuple) PageGetItem(state->page, itemid); +} + +/* + * Subroutine to get a firstright IndexTuple for a spit point from page + */ +static inline IndexTuple +_bt_split_firstright(FindSplitData *state, SplitPoint *split) +{ + ItemId itemid; + + if (!split->newitemonleft && split->firstoldonright == state->newitemoff) + return state->newitem; + + itemid = PageGetItemId(state->page, split->firstoldonright); + return (IndexTuple) PageGetItem(state->page, itemid); +} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 2f9f6e7601..6b59e16c4d 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -22,6 +22,7 @@ #include "access/relscan.h" #include "miscadmin.h" #include "utils/array.h" +#include "utils/datum.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -2295,6 +2296,60 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, return keepnatts; } +/* + * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts. + * + * This is exported so that a candidate split point can have its effect on + * suffix truncation inexpensively evaluated ahead of time when finding a + * split location. A naive bitwise approach to datum comparisons is used to + * save cycles. + * + * The approach taken here usually provides the same answer as _bt_keep_natts + * will (for the same pair of tuples from a heapkeyspace index), since the + * majority of btree opclasses can never indicate that two datums are equal + * unless they're bitwise equal (once detoasted). Similarly, result may + * differ from the _bt_keep_natts result when either tuple has TOASTed datums, + * though this is barely possible in practice. + * + * These issues must be acceptable to callers, typically because they're only + * concerned about making suffix truncation as effective as possible without + * leaving excessive amounts of free space on either side of page split. + * Callers can rely on the fact that attributes considered equal here are + * definitely also equal according to _bt_keep_natts. + */ +int +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int keysz = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + + keepnatts = 1; + for (int attnum = 1; attnum <= keysz; attnum++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + Form_pg_attribute att; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + att = TupleDescAttr(itupdesc, attnum - 1); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + !datumIsEqual(datum1, datum2, att->attbyval, att->attlen)) + break; + + keepnatts++; + } + + return keepnatts; +} + /* * _bt_check_natts() -- Verify tuple has expected number of attributes. * diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index f985a05666..e5876982a2 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -160,11 +160,15 @@ typedef struct BTMetaPageData * For pages above the leaf level, we use a fixed 70% fillfactor. * The fillfactor is applied during index build and when splitting * a rightmost page; when splitting non-rightmost pages we try to - * divide the data equally. + * divide the data equally. When splitting a page that's entirely + * filled with a single value (duplicates), the effective leaf-page + * fillfactor is 96%, regardless of whether the page is a rightmost + * page. */ #define BTREE_MIN_FILLFACTOR 10 #define BTREE_DEFAULT_FILLFACTOR 90 #define BTREE_NONLEAF_FILLFACTOR 70 +#define BTREE_SINGLEVAL_FILLFACTOR 96 /* * In general, the btree code tries to localize its knowledge about @@ -711,6 +715,13 @@ extern bool _bt_doinsert(Relation rel, IndexTuple itup, extern Buffer _bt_getstackbuf(Relation rel, BTStack stack); extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); +/* + * prototypes for functions in nbtsplitloc.c + */ +extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + bool *newitemonleft); + /* * prototypes for functions in nbtpage.c */ @@ -777,6 +788,8 @@ extern bool btproperty(Oid index_oid, int attno, bool *res, bool *isnull); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, + IndexTuple firstright); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap,