diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 7792b08b5f..857759bdcb 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -1121,7 +1121,7 @@ bt_target_page_check(BtreeCheckState *state) * designated purpose. Enforce the lower limit for pivot tuples when * an explicit heap TID isn't actually present. (In all other cases * suffix truncation is guaranteed to generate a pivot tuple that's no - * larger than the first right tuple provided to it by its caller.) + * larger than the firstright tuple provided to it by its caller.) */ lowersizelimit = skey->heapkeyspace && (P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL); diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index cbd4ec3b39..fd518d3ea5 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -56,8 +56,8 @@ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); -static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, - OffsetNumber itup_off); +static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, + OffsetNumber itup_off, bool newfirstdataitem); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); /* @@ -1452,18 +1452,18 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTPageOpaque sopaque = NULL; Size itemsz; ItemId itemid; - IndexTuple item; - OffsetNumber leftoff, - rightoff; - OffsetNumber firstright; + IndexTuple firstright, + lefthighkey; + OffsetNumber firstrightoff; + OffsetNumber afterleftoff, + afterrightoff, + minusinfoff; OffsetNumber origpagepostingoff; OffsetNumber maxoff; OffsetNumber i; bool newitemonleft, - isleaf; - IndexTuple lefthikey; - int indnatts = IndexRelationGetNumberOfAttributes(rel); - int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + isleaf, + isrightmost; /* * origpage is the original page to be split. leftpage is a temporary @@ -1480,25 +1480,37 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, */ origpage = BufferGetPage(buf); oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + isleaf = P_ISLEAF(oopaque); + isrightmost = P_RIGHTMOST(oopaque); + maxoff = PageGetMaxOffsetNumber(origpage); origpagenumber = BufferGetBlockNumber(buf); /* * Choose a point to split origpage at. * - * A split point can be thought of as a point _between_ two existing - * tuples on origpage (lastleft and firstright tuples), provided you + * A split point can be thought of as a point _between_ two existing data + * items on origpage (the lastleft and firstright tuples), provided you * pretend that the new item that didn't fit is already on origpage. * * Since origpage does not actually contain newitem, the representation of * split points needs to work with two boundary cases: splits where * newitem is lastleft, and splits where newitem is firstright. * newitemonleft resolves the ambiguity that would otherwise exist when - * newitemoff == firstright. In all other cases it's clear which side of - * the split every tuple goes on from context. newitemonleft is usually - * (but not always) redundant information. + * newitemoff == firstrightoff. In all other cases it's clear which side + * of the split every tuple goes on from context. newitemonleft is + * usually (but not always) redundant information. + * + * firstrightoff is supposed to be an origpage offset number, but it's + * possible that its value will be maxoff+1, which is "past the end" of + * origpage. This happens in the rare case where newitem goes after all + * existing items (i.e. newitemoff is maxoff+1) and we end up splitting + * origpage at the point that leaves newitem alone on new right page. Any + * "!newitemonleft && newitemoff == firstrightoff" split point makes + * newitem the firstright tuple, though, so this case isn't a special + * case. */ - firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, - newitem, &newitemonleft); + firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, + newitem, &newitemonleft); /* Allocate temp buffer for leftpage */ leftpage = PageGetTempPage(origpage); @@ -1524,7 +1536,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * examine the LSN and possibly dump it in a page image. */ PageSetLSN(leftpage, PageGetLSN(origpage)); - isleaf = P_ISLEAF(oopaque); /* * Determine page offset number of existing overlapped-with-orignewitem @@ -1555,74 +1566,57 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, } /* - * The "high key" for the new left page will be the first key that's going - * to go into the new right page, or a truncated version if this is a leaf - * page split. + * The high key for the new left page is a possibly-truncated copy of + * firstright on the leaf level (it's "firstright itself" on internal + * pages; see !isleaf comments below). This may seem to be contrary to + * Lehman & Yao's approach of using a copy of lastleft as the new high key + * when splitting on the leaf level. It isn't, though. * - * The high key for the left page is formed using the first item on the - * right page, which may seem to be contrary to Lehman & Yao's approach of - * using the left page's last item as its new high key when splitting on - * the leaf level. It isn't, though: suffix truncation will leave the - * left page's high key fully equal to the last item on the left page when - * two tuples with equal key values (excluding heap TID) enclose the split - * point. It isn't actually necessary for a new leaf high key to be equal - * to the last item on the left for the L&Y "subtree" invariant to hold. - * It's sufficient to make sure that the new leaf high key is strictly - * less than the first item on the right leaf page, and greater than or - * equal to (not necessarily equal to) the last item on the left leaf - * page. - * - * In other words, when suffix truncation isn't possible, L&Y's exact - * approach to leaf splits is taken. (Actually, even that is slightly - * inaccurate. A tuple with all the keys from firstright but the heap TID - * from lastleft will be used as the new high key, since the last left - * tuple could be physically larger despite being opclass-equal in respect - * of all attributes prior to the heap TID attribute.) + * Suffix truncation will leave the left page's high key fully equal to + * lastleft when lastleft and firstright are equal prior to heap TID (that + * is, the tiebreaker TID value comes from lastleft). It isn't actually + * necessary for a new leaf high key to be a copy of lastleft for the L&Y + * "subtree" invariant to hold. It's sufficient to make sure that the new + * leaf high key is strictly less than firstright, and greater than or + * equal to (not necessarily equal to) lastleft. In other words, when + * suffix truncation isn't possible during a leaf page split, we take + * L&Y's exact approach to generating a new high key for the left page. + * (Actually, that is slightly inaccurate. We don't just use a copy of + * lastleft. A tuple with all the keys from firstright but the max heap + * TID from lastleft is used, to avoid introducing a special case.) */ - if (!newitemonleft && newitemoff == firstright) + if (!newitemonleft && newitemoff == firstrightoff) { - /* incoming tuple will become first on right page */ + /* incoming tuple becomes firstright */ itemsz = newitemsz; - item = newitem; + firstright = newitem; } else { - /* existing item at firstright will become first on right page */ - itemid = PageGetItemId(origpage, firstright); + /* existing item at firstrightoff becomes firstright */ + itemid = PageGetItemId(origpage, firstrightoff); itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(origpage, itemid); - if (firstright == origpagepostingoff) - item = nposting; + firstright = (IndexTuple) PageGetItem(origpage, itemid); + if (firstrightoff == origpagepostingoff) + firstright = nposting; } - /* - * Truncate unneeded key and non-key attributes of the high key item - * before inserting it on the left page. This can only happen at the leaf - * level, since in general all pivot tuple values originate from leaf - * level high keys. A pivot tuple in a grandparent page must guide a - * search not only to the correct parent page, but also to the correct - * leaf page. - */ - if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts)) + if (isleaf) { IndexTuple lastleft; - /* - * Determine which tuple will become the last on the left page. This - * is needed to decide how many attributes from the first item on the - * right page must remain in new high key for left page. - */ - if (newitemonleft && newitemoff == firstright) + /* Attempt suffix truncation for leaf page splits */ + if (newitemonleft && newitemoff == firstrightoff) { - /* incoming tuple will become last on left page */ + /* incoming tuple becomes lastleft */ lastleft = newitem; } else { OffsetNumber lastleftoff; - /* item just before firstright will become last on left page */ - lastleftoff = OffsetNumberPrev(firstright); + /* existing item before firstrightoff becomes lastleft */ + lastleftoff = OffsetNumberPrev(firstrightoff); Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); @@ -1630,30 +1624,55 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, lastleft = nposting; } - Assert(lastleft != item); - lefthikey = _bt_truncate(rel, lastleft, item, itup_key); - itemsz = IndexTupleSize(lefthikey); - itemsz = MAXALIGN(itemsz); + lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key); + itemsz = IndexTupleSize(lefthighkey); } else - lefthikey = item; + { + /* + * Don't perform suffix truncation on a copy of firstright to make + * left page high key for internal page splits. Must use firstright + * as new high key directly. + * + * Each distinct separator key value originates as a leaf level high + * key; all other separator keys/pivot tuples are copied from one + * level down. A separator key in a grandparent page must be + * identical to high key in rightmost parent page of the subtree to + * its left, which must itself be identical to high key in rightmost + * child page of that same subtree (this even applies to separator + * from grandparent's high key). There must always be an unbroken + * "seam" of identical separator keys that guide index scans at every + * level, starting from the grandparent. That's why suffix truncation + * is unsafe here. + * + * Internal page splits will truncate firstright into a "negative + * infinity" data item when it gets inserted on the new right page + * below, though. This happens during the call to _bt_pgaddtup() for + * the new first data item for right page. Do not confuse this + * mechanism with suffix truncation. It is just a convenient way of + * implementing page splits that split the internal page "inside" + * firstright. The lefthighkey separator key cannot appear a second + * time in the right page (only firstright's downlink goes in right + * page). + */ + lefthighkey = firstright; + } /* * Add new high key to leftpage */ - leftoff = P_HIKEY; + afterleftoff = P_HIKEY; - Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); - Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts); - if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, - false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add hikey to the left sibling" + Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0); + Assert(BTreeTupleGetNAtts(lefthighkey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey))); + if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "failed to add high key to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); - leftoff = OffsetNumberNext(leftoff); - /* be tidy */ - if (lefthikey != item) - pfree(lefthikey); + afterleftoff = OffsetNumberNext(afterleftoff); /* * Acquire a new right page to split into, now that left page has a new @@ -1700,26 +1719,37 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * the tree, then the first entry on the page is the high key from * origpage. */ - rightoff = P_HIKEY; + afterrightoff = P_HIKEY; - if (!P_RIGHTMOST(oopaque)) + if (!isrightmost) { + IndexTuple righthighkey; + itemid = PageGetItemId(origpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(origpage, itemid); - Assert(BTreeTupleGetNAtts(item, rel) > 0); - Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts); - if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, + righthighkey = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0); + Assert(BTreeTupleGetNAtts(righthighkey, rel) <= + IndexRelationGetNumberOfKeyAttributes(rel)); + if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber) { memset(rightpage, 0, BufferGetPageSize(rbuf)); - elog(ERROR, "failed to add hikey to the right sibling" + elog(ERROR, "failed to add high key to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } - rightoff = OffsetNumberNext(rightoff); + afterrightoff = OffsetNumberNext(afterrightoff); } + /* + * Internal page splits truncate first data item on right page -- it + * becomes "minus infinity" item for the page. Set this up here. + */ + minusinfoff = InvalidOffsetNumber; + if (!isleaf) + minusinfoff = afterrightoff; + /* * Now transfer all the data items (non-pivot tuples in isleaf case, or * additional pivot tuples in !isleaf case) to the appropriate page. @@ -1727,20 +1757,20 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * Note: we *must* insert at least the right page's items in item-number * order, for the benefit of _bt_restore_page(). */ - maxoff = PageGetMaxOffsetNumber(origpage); - for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) { + IndexTuple dataitem; + itemid = PageGetItemId(origpage, i); itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(origpage, itemid); + dataitem = (IndexTuple) PageGetItem(origpage, itemid); /* replace original item with nposting due to posting split? */ if (i == origpagepostingoff) { - Assert(BTreeTupleIsPosting(item)); + Assert(BTreeTupleIsPosting(dataitem)); Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); - item = nposting; + dataitem = nposting; } /* does new item belong before this one? */ @@ -1748,56 +1778,59 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, { if (newitemonleft) { - Assert(newitemoff <= firstright); - if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) + Assert(newitemoff <= firstrightoff); + if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, + false)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } - leftoff = OffsetNumberNext(leftoff); + afterleftoff = OffsetNumberNext(afterleftoff); } else { - Assert(newitemoff >= firstright); - if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + Assert(newitemoff >= firstrightoff); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, + afterrightoff == minusinfoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } - rightoff = OffsetNumberNext(rightoff); + afterrightoff = OffsetNumberNext(afterrightoff); } } /* decide which page to put it on */ - if (i < firstright) + if (i < firstrightoff) { - if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff)) + if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } - leftoff = OffsetNumberNext(leftoff); + afterleftoff = OffsetNumberNext(afterleftoff); } else { - if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff)) + if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, + afterrightoff == minusinfoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } - rightoff = OffsetNumberNext(rightoff); + afterrightoff = OffsetNumberNext(afterrightoff); } } - /* cope with possibility that newitem goes at the end */ + /* Handle case where newitem goes at the end of rightpage */ if (i <= newitemoff) { /* @@ -1805,15 +1838,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * *everything* on the left page, which cannot fit (if it could, we'd * not be splitting the page). */ - Assert(!newitemonleft); - if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) + Assert(!newitemonleft && newitemoff == maxoff + 1); + if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, + afterrightoff == minusinfoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } - rightoff = OffsetNumberNext(rightoff); + afterrightoff = OffsetNumberNext(afterrightoff); } /* @@ -1823,7 +1857,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * all readers release locks on a page before trying to fetch its * neighbors. */ - if (!P_RIGHTMOST(oopaque)) + if (!isrightmost) { sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); spage = BufferGetPage(sbuf); @@ -1886,7 +1920,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, MarkBufferDirty(buf); MarkBufferDirty(rbuf); - if (!P_RIGHTMOST(ropaque)) + if (!isrightmost) { sopaque->btpo_prev = rightpagenumber; MarkBufferDirty(sbuf); @@ -1914,10 +1948,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, xlrec.level = ropaque->btpo.level; /* See comments below on newitem, orignewitem, and posting lists */ - xlrec.firstright = firstright; + xlrec.firstrightoff = firstrightoff; xlrec.newitemoff = newitemoff; xlrec.postingoff = 0; - if (postingoff != 0 && origpagepostingoff < firstright) + if (postingoff != 0 && origpagepostingoff < firstrightoff) xlrec.postingoff = postingoff; XLogBeginInsert(); @@ -1925,10 +1959,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); - /* Log the right sibling, because we've changed its prev-pointer. */ - if (!P_RIGHTMOST(ropaque)) + /* Log original right sibling, since we've changed its prev-pointer */ + if (!isrightmost) XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); - if (BufferIsValid(cbuf)) + if (!isleaf) XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); /* @@ -1959,18 +1993,24 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * newitem-logged case). */ if (newitemonleft && xlrec.postingoff == 0) - XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + XLogRegisterBufData(0, (char *) newitem, newitemsz); else if (xlrec.postingoff != 0) { - Assert(newitemonleft || firstright == newitemoff); - Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem)); - XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz)); + Assert(isleaf); + Assert(newitemonleft || firstrightoff == newitemoff); + Assert(newitemsz == IndexTupleSize(orignewitem)); + XLogRegisterBufData(0, (char *) orignewitem, newitemsz); } /* Log the left page's new high key */ - itemid = PageGetItemId(origpage, P_HIKEY); - item = (IndexTuple) PageGetItem(origpage, itemid); - XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); + if (!isleaf) + { + /* lefthighkey isn't local copy, get current pointer */ + itemid = PageGetItemId(origpage, P_HIKEY); + lefthighkey = (IndexTuple) PageGetItem(origpage, itemid); + } + XLogRegisterBufData(0, (char *) lefthighkey, + MAXALIGN(IndexTupleSize(lefthighkey))); /* * Log the contents of the right page in the format understood by @@ -1991,26 +2031,26 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, PageSetLSN(origpage, recptr); PageSetLSN(rightpage, recptr); - if (!P_RIGHTMOST(ropaque)) - { + if (!isrightmost) PageSetLSN(spage, recptr); - } if (!isleaf) - { PageSetLSN(BufferGetPage(cbuf), recptr); - } } END_CRIT_SECTION(); /* release the old right sibling */ - if (!P_RIGHTMOST(ropaque)) + if (!isrightmost) _bt_relbuf(rel, sbuf); /* release the child */ if (!isleaf) _bt_relbuf(rel, cbuf); + /* be tidy */ + if (isleaf) + pfree(lefthighkey); + /* split's done */ return rbuf; } @@ -2405,9 +2445,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) metad = BTPageGetMeta(metapg); /* - * Create downlink item for left page (old root). Since this will be the - * first item in a non-leaf page, it implicitly has minus-infinity key - * value, so we need not store any actual key in it. + * Create downlink item for left page (old root). The key value used is + * "minus infinity", a sentinel value that's reliably less than any real + * key value that could appear in the left page. */ left_item_sz = sizeof(IndexTupleData); left_item = (IndexTuple) palloc(left_item_sz); @@ -2541,33 +2581,30 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * _bt_pgaddtup() -- add a data item to a particular page during split. * * The difference between this routine and a bare PageAddItem call is - * that this code knows that the leftmost data item on an internal - * btree page has a key that must be treated as minus infinity. - * Therefore, it truncates away all attributes. This extra step is - * only needed during internal page splits. + * that this code can deal with the first data item on an internal btree + * page in passing. This data item (which is called "firstright" within + * _bt_split()) has a key that must be treated as minus infinity after + * the split. Therefore, we truncate away all attributes when caller + * specifies it's the first data item on page (downlink is not changed, + * though). This extra step is only needed for the right page of an + * internal page split. There is no need to do this for the first data + * item on the existing/left page, since that will already have been + * truncated during an earlier page split. * - * Truncation of an internal page data item can be thought of as one - * of the steps used to "move" a boundary separator key during an - * internal page split. Conceptually, _bt_split() caller splits - * internal pages "inside" the firstright data item: firstright's - * separator key is used as the high key for the left page, while its - * downlink is used within the first data item (also the negative - * infinity item) for the right page. Each distinct separator key - * should appear no more than once per level of the tree. - * - * CAUTION: this works ONLY if we insert the tuples in order, so that - * the given itup_off does represent the final position of the tuple! + * See _bt_split() for a high level explanation of why we truncate here. + * Note that this routine has nothing to do with suffix truncation, + * despite using some of the same infrastructure. */ -static bool +static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, - OffsetNumber itup_off) + OffsetNumber itup_off, + bool newfirstdataitem) { - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTupleData trunctuple; - if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque)) + if (newfirstdataitem) { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); @@ -2576,8 +2613,8 @@ _bt_pgaddtup(Page page, itemsize = sizeof(IndexTupleData); } - if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false, false) == InvalidOffsetNumber) + if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false, + false) == InvalidOffsetNumber)) return false; return true; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ba230bbcd2..15f10a29d3 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -269,7 +269,8 @@ static Page _bt_blnewpage(uint32 level); static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); static void _bt_slideleft(Page page); static void _bt_sortaddtup(Page page, Size itemsize, - IndexTuple itup, OffsetNumber itup_off); + IndexTuple itup, OffsetNumber itup_off, + bool newfirstdataitem); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra); static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, @@ -750,26 +751,24 @@ _bt_slideleft(Page page) /* * Add an item to a page being built. * - * The main difference between this routine and a bare PageAddItem call - * is that this code knows that the leftmost data item on a non-leaf btree - * page has a key that must be treated as minus infinity. Therefore, it - * truncates away all attributes. + * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant + * raises an error directly. * - * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use - * that because it assumes that P_RIGHTMOST() will return the correct - * answer for the page. Here, we don't know yet if the page will be - * rightmost. Offset P_FIRSTKEY is always the first data key. + * Note that our nbtsort.c caller does not know yet if the page will be + * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by + * caller. Page that turns out to be the rightmost on its level is fixed by + * calling _bt_slideleft(). */ static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, - OffsetNumber itup_off) + OffsetNumber itup_off, + bool newfirstdataitem) { - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTupleData trunctuple; - if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY) + if (newfirstdataitem) { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); @@ -867,12 +866,13 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, * Every newly built index will treat heap TID as part of the keyspace, * which imposes the requirement that new high keys must occasionally have * a heap TID appended within _bt_truncate(). That may leave a new pivot - * tuple one or two MAXALIGN() quantums larger than the original first - * right tuple it's derived from. v4 deals with the problem by decreasing - * the limit on the size of tuples inserted on the leaf level by the same - * small amount. Enforce the new v4+ limit on the leaf level, and the old - * limit on internal levels, since pivot tuples may need to make use of - * the reserved space. This should never fail on internal pages. + * tuple one or two MAXALIGN() quantums larger than the original + * firstright tuple it's derived from. v4 deals with the problem by + * decreasing the limit on the size of tuples inserted on the leaf level + * by the same small amount. Enforce the new v4+ limit on the leaf level, + * and the old limit on internal levels, since pivot tuples may need to + * make use of the reserved space. This should never fail on internal + * pages. */ if (unlikely(itupsz > BTMaxItemSize(npage))) _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, @@ -925,7 +925,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); oitup = (IndexTuple) PageGetItem(opage, ii); - _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); + _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY, + !isleaf); /* * Move 'last' into the high key position on opage. _bt_blnewpage() @@ -1054,7 +1055,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, * Add the new item into the current page. */ last_off = OffsetNumberNext(last_off); - _bt_sortaddtup(npage, itupsz, itup, last_off); + _bt_sortaddtup(npage, itupsz, itup, last_off, + !isleaf && last_off == P_FIRSTKEY); state->btps_page = npage; state->btps_blkno = nblkno; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 8ba055be9e..5f0d0be3c2 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -37,7 +37,7 @@ typedef struct int16 rightfree; /* space left on right page post-split */ /* split point identifying fields (returned by _bt_findsplitloc) */ - OffsetNumber firstoldonright; /* first item on new right page */ + OffsetNumber firstrightoff; /* first origpage item on rightpage */ bool newitemonleft; /* new item goes on left, or right? */ } SplitPoint; @@ -46,7 +46,7 @@ typedef struct { /* context data for _bt_recsplitloc */ Relation rel; /* index relation */ - Page page; /* page undergoing split */ + Page origpage; /* page undergoing split */ IndexTuple newitem; /* new item (cause of page split) */ Size newitemsz; /* size of newitem (includes line pointer) */ bool is_leaf; /* T if splitting a leaf page */ @@ -55,7 +55,7 @@ typedef struct int leftspace; /* space available for items on left page */ int rightspace; /* space available for items on right page */ int olddataitemstotal; /* space taken by old items */ - Size minfirstrightsz; /* smallest firstoldonright tuple size */ + Size minfirstrightsz; /* smallest firstright size */ /* candidate split point data */ int maxsplits; /* maximum number of splits */ @@ -65,8 +65,9 @@ typedef struct } FindSplitData; static void _bt_recsplitloc(FindSplitData *state, - OffsetNumber firstoldonright, bool newitemonleft, - int olddataitemstoleft, Size firstoldonrightsz); + OffsetNumber firstrightoff, bool newitemonleft, + int olddataitemstoleft, + Size firstrightofforigpagetuplesz); static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult); static int _bt_splitcmp(const void *arg1, const void *arg2); @@ -119,13 +120,18 @@ static inline IndexTuple _bt_split_firstright(FindSplitData *state, * suffix truncation. * * We return the index of the first existing tuple that should go on the - * righthand page, plus a boolean indicating whether the new tuple goes on - * the left or right page. The bool is necessary to disambiguate the case - * where firstright == newitemoff. + * righthand page (which is called firstrightoff), plus a boolean + * indicating whether the new tuple goes on the left or right page. You + * can think of the returned state as a point _between_ two adjacent data + * items (laftleft and firstright data items) on an imaginary version of + * origpage that already includes newitem. The bool is necessary to + * disambiguate the case where firstrightoff == newitemoff (i.e. it is + * sometimes needed to determine if the firstright tuple for the split is + * newitem rather than the tuple from origpage at offset firstrightoff). */ OffsetNumber _bt_findsplitloc(Relation rel, - Page page, + Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, @@ -143,36 +149,36 @@ _bt_findsplitloc(Relation rel, ItemId itemid; OffsetNumber offnum, maxoff, - foundfirstright; + firstrightoff; double fillfactormult; bool usemult; SplitPoint leftpage, rightpage; - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + maxoff = PageGetMaxOffsetNumber(origpage); /* Total free space available on a btree page, after fixed overhead */ leftspace = rightspace = - PageGetPageSize(page) - SizeOfPageHeaderData - + PageGetPageSize(origpage) - SizeOfPageHeaderData - MAXALIGN(sizeof(BTPageOpaqueData)); /* The right page will have the same high key as the old page */ if (!P_RIGHTMOST(opaque)) { - itemid = PageGetItemId(page, P_HIKEY); + itemid = PageGetItemId(origpage, P_HIKEY); rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData)); } /* Count up total space in data items before actually scanning 'em */ - olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); + olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage); leaffillfactor = BTGetFillFactor(rel); /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); state.rel = rel; - state.page = page; + state.origpage = origpage; state.newitem = newitem; state.newitemsz = newitemsz; state.is_leaf = P_ISLEAF(opaque); @@ -209,7 +215,7 @@ _bt_findsplitloc(Relation rel, { Size itemsz; - itemid = PageGetItemId(page, offnum); + itemid = PageGetItemId(origpage, offnum); itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); /* @@ -293,8 +299,7 @@ _bt_findsplitloc(Relation rel, * New item inserted at rightmost point among a localized grouping on * a leaf page -- apply "split after new item" optimization, either by * applying leaf fillfactor multiplier, or by choosing the exact split - * point that leaves the new item as last on the left. (usemult is set - * for us.) + * point that leaves newitem as lastleft. (usemult is set for us.) */ if (usemult) { @@ -309,7 +314,7 @@ _bt_findsplitloc(Relation rel, SplitPoint *split = state.splits + i; if (split->newitemonleft && - newitemoff == split->firstoldonright) + newitemoff == split->firstrightoff) { pfree(state.splits); *newitemonleft = true; @@ -429,24 +434,26 @@ _bt_findsplitloc(Relation rel, * the entry that has the lowest penalty, and is therefore expected to * maximize fan-out. Sets *newitemonleft for us. */ - foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, - strategy); + firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, + strategy); pfree(state.splits); - return foundfirstright; + return firstrightoff; } /* * Subroutine to record a particular point between two tuples (possibly the - * new item) on page (ie, combination of firstright and newitemonleft - * settings) in *state for later analysis. This is also a convenient point - * to check if the split is legal (if it isn't, it won't be recorded). + * new item) on page (ie, combination of firstrightoff and newitemonleft + * settings) in *state for later analysis. This is also a convenient point to + * check if the split is legal (if it isn't, it won't be recorded). * - * firstoldonright is the offset of the first item on the original page that - * goes to the right page, and firstoldonrightsz is the size of that tuple. - * firstoldonright can be > max offset, which means that all the old items go - * to the left page and only the new item goes to the right page. In that - * case, firstoldonrightsz is not used. + * firstrightoff is the offset of the first item on the original page that + * goes to the right page, and firstrightofforigpagetuplesz is the size of + * that tuple. firstrightoff can be > max offset, which means that all the + * old items go to the left page and only the new item goes to the right page. + * We don't actually use firstrightofforigpagetuplesz in that case (actually, + * we don't use it for _any_ split where the firstright tuple happens to be + * newitem). * * olddataitemstoleft is the total size of all old items to the left of the * split point that is recorded here when legal. Should not include @@ -454,41 +461,44 @@ _bt_findsplitloc(Relation rel, */ static void _bt_recsplitloc(FindSplitData *state, - OffsetNumber firstoldonright, + OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, - Size firstoldonrightsz) + Size firstrightofforigpagetuplesz) { int16 leftfree, rightfree; - Size firstrightitemsz; + Size firstrightsz; Size postingsz = 0; - bool newitemisfirstonright; + bool newitemisfirstright; - /* Is the new item going to be the first item on the right page? */ - newitemisfirstonright = (firstoldonright == state->newitemoff - && !newitemonleft); + /* Is the new item going to be split point's firstright tuple? */ + newitemisfirstright = (firstrightoff == state->newitemoff && + !newitemonleft); - if (newitemisfirstonright) - firstrightitemsz = state->newitemsz; + if (newitemisfirstright) + firstrightsz = state->newitemsz; else { - firstrightitemsz = firstoldonrightsz; + firstrightsz = firstrightofforigpagetuplesz; /* - * Calculate suffix truncation space saving when firstright is a - * posting list tuple, though only when the firstright is over 64 - * bytes including line pointer overhead (arbitrary). This avoids - * accessing the tuple in cases where its posting list must be very - * small (if firstright has one at all). + * Calculate suffix truncation space saving when firstright tuple is a + * posting list tuple, though only when the tuple is over 64 bytes + * including line pointer overhead (arbitrary). This avoids accessing + * the tuple in cases where its posting list must be very small (if + * tuple has one at all). + * + * Note: We don't do this in the case where firstright tuple is + * newitem, since newitem cannot have a posting list. */ - if (state->is_leaf && firstrightitemsz > 64) + if (state->is_leaf && firstrightsz > 64) { ItemId itemid; IndexTuple newhighkey; - itemid = PageGetItemId(state->page, firstoldonright); - newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + itemid = PageGetItemId(state->origpage, firstrightoff); + newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid); if (BTreeTupleIsPosting(newhighkey)) postingsz = IndexTupleSize(newhighkey) - @@ -525,11 +535,11 @@ _bt_recsplitloc(FindSplitData *state, * precise there noticeably improves the balance of free space.) */ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) (firstrightsz + MAXALIGN(sizeof(ItemPointerData)) - postingsz); else - leftfree -= (int16) firstrightitemsz; + leftfree -= (int16) firstrightsz; /* account for the new item */ if (newitemonleft) @@ -542,7 +552,7 @@ _bt_recsplitloc(FindSplitData *state, * data from the first item that winds up on the right page. */ if (!state->is_leaf) - rightfree += (int16) firstrightitemsz - + rightfree += (int16) firstrightsz - (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); /* Record split if legal */ @@ -550,13 +560,13 @@ _bt_recsplitloc(FindSplitData *state, { Assert(state->nsplits < state->maxsplits); - /* Determine smallest firstright item size on page */ - state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz); + /* Determine smallest firstright tuple size among legal splits */ + state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz); state->splits[state->nsplits].curdelta = 0; state->splits[state->nsplits].leftfree = leftfree; state->splits[state->nsplits].rightfree = rightfree; - state->splits[state->nsplits].firstoldonright = firstoldonright; + state->splits[state->nsplits].firstrightoff = firstrightoff; state->splits[state->nsplits].newitemonleft = newitemonleft; state->nsplits++; } @@ -632,8 +642,8 @@ _bt_splitcmp(const void *arg1, const void *arg2) * taken by caller varies. Caller uses original leaf page fillfactor in * standard way rather than using the new item offset directly when *usemult * was also set to true here. Otherwise, caller applies optimization by - * locating the legal split point that makes the new tuple the very last tuple - * on the left side of the split. + * locating the legal split point that makes the new tuple the lastleft tuple + * for the split. */ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, @@ -694,8 +704,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, */ if (state->newitemoff > maxoff) { - itemid = PageGetItemId(state->page, maxoff); - tup = (IndexTuple) PageGetItem(state->page, itemid); + itemid = PageGetItemId(state->origpage, maxoff); + tup = (IndexTuple) PageGetItem(state->origpage, itemid); keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); if (keepnatts > 1 && keepnatts <= nkeyatts) @@ -720,8 +730,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, * optimization. Besides, all inappropriate cases triggered here will * still split in the middle of the page on average. */ - itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); - tup = (IndexTuple) PageGetItem(state->page, itemid); + itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff)); + tup = (IndexTuple) PageGetItem(state->origpage, itemid); /* Do cheaper test first */ if (BTreeTupleIsPosting(tup) || !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) @@ -843,8 +853,8 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, * leftmost page. */ if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && - !final->newitemonleft && final->firstoldonright >= state->newitemoff && - final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL) + !final->newitemonleft && final->firstrightoff >= state->newitemoff && + final->firstrightoff < state->newitemoff + MAX_LEAF_INTERVAL) { /* * Avoid the problem by performing a 50:50 split when the new item is @@ -854,7 +864,7 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, } *newitemonleft = final->newitemonleft; - return final->firstoldonright; + return final->firstrightoff; } /* @@ -883,10 +893,11 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, *strategy = SPLIT_DEFAULT; /* - * Use smallest observed first right item size for entire page as perfect + * Use smallest observed firstright item size for entire page (actually, + * entire imaginary version of page that includes newitem) as perfect * penalty on internal pages. This can save cycles in the common case - * where most or all splits (not just splits within interval) have first - * right tuples that are the same size. + * where most or all splits (not just splits within interval) have + * firstright tuples that are the same size. */ if (!state->is_leaf) return state->minfirstrightsz; @@ -961,8 +972,8 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, ItemId itemid; IndexTuple hikey; - itemid = PageGetItemId(state->page, P_HIKEY); - hikey = (IndexTuple) PageGetItem(state->page, itemid); + itemid = PageGetItemId(state->origpage, P_HIKEY); + hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, state->newitem); if (perfectpenalty <= indnkeyatts) @@ -1005,12 +1016,12 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, { SplitPoint *distant = state->splits + i; - if (distant->firstoldonright < deltaoptimal->firstoldonright) + if (distant->firstrightoff < deltaoptimal->firstrightoff) { if (*leftinterval == NULL) *leftinterval = distant; } - else if (distant->firstoldonright > deltaoptimal->firstoldonright) + else if (distant->firstrightoff > deltaoptimal->firstrightoff) { if (*rightinterval == NULL) *rightinterval = distant; @@ -1018,22 +1029,20 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, else if (!distant->newitemonleft && deltaoptimal->newitemonleft) { /* - * "incoming tuple will become first on right page" (distant) is - * to the left of "incoming tuple will become last on left page" - * (delta-optimal) + * "incoming tuple will become firstright" (distant) is to the + * left of "incoming tuple will become lastleft" (delta-optimal) */ - Assert(distant->firstoldonright == state->newitemoff); + Assert(distant->firstrightoff == state->newitemoff); if (*leftinterval == NULL) *leftinterval = distant; } else if (distant->newitemonleft && !deltaoptimal->newitemonleft) { /* - * "incoming tuple will become last on left page" (distant) is to - * the right of "incoming tuple will become first on right page" - * (delta-optimal) + * "incoming tuple will become lastleft" (distant) is to the right + * of "incoming tuple will become firstright" (delta-optimal) */ - Assert(distant->firstoldonright == state->newitemoff); + Assert(distant->firstrightoff == state->newitemoff); if (*rightinterval == NULL) *rightinterval = distant; } @@ -1062,34 +1071,33 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, * key for left page. It can be greater than the number of key attributes in * cases where a heap TID will need to be appended during truncation. * - * On internal pages, penalty is simply the size of the first item on the - * right half of the split (including line pointer overhead). This tuple will - * become the new high key for the left page. + * On internal pages, penalty is simply the size of the firstright tuple for + * the split (including line pointer overhead). This tuple will become the + * new high key for the left page. */ static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split) { - IndexTuple lastleftuple; - IndexTuple firstrighttuple; + IndexTuple lastleft; + IndexTuple firstright; if (!state->is_leaf) { ItemId itemid; if (!split->newitemonleft && - split->firstoldonright == state->newitemoff) + split->firstrightoff == state->newitemoff) return state->newitemsz; - itemid = PageGetItemId(state->page, split->firstoldonright); + itemid = PageGetItemId(state->origpage, split->firstrightoff); return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); } - lastleftuple = _bt_split_lastleft(state, split); - firstrighttuple = _bt_split_firstright(state, split); + lastleft = _bt_split_lastleft(state, split); + firstright = _bt_split_firstright(state, split); - Assert(lastleftuple != firstrighttuple); - return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple); + return _bt_keep_natts_fast(state->rel, lastleft, firstright); } /* @@ -1100,12 +1108,12 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split) { ItemId itemid; - if (split->newitemonleft && split->firstoldonright == state->newitemoff) + if (split->newitemonleft && split->firstrightoff == state->newitemoff) return state->newitem; - itemid = PageGetItemId(state->page, - OffsetNumberPrev(split->firstoldonright)); - return (IndexTuple) PageGetItem(state->page, itemid); + itemid = PageGetItemId(state->origpage, + OffsetNumberPrev(split->firstrightoff)); + return (IndexTuple) PageGetItem(state->origpage, itemid); } /* @@ -1116,9 +1124,9 @@ _bt_split_firstright(FindSplitData *state, SplitPoint *split) { ItemId itemid; - if (!split->newitemonleft && split->firstoldonright == state->newitemoff) + if (!split->newitemonleft && split->firstrightoff == state->newitemoff) return state->newitem; - itemid = PageGetItemId(state->page, split->firstoldonright); - return (IndexTuple) PageGetItem(state->page, itemid); + itemid = PageGetItemId(state->origpage, split->firstrightoff); + return (IndexTuple) PageGetItem(state->origpage, itemid); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 08d7201f19..ce48a51640 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2346,17 +2346,12 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, ScanKey scankey; /* - * Be consistent about the representation of BTREE_VERSION 2/3 tuples - * across Postgres versions; don't allow new pivot tuples to have - * truncated key attributes there. _bt_compare() treats truncated key - * attributes as having the value minus infinity, which would break - * searches within !heapkeyspace indexes. + * _bt_compare() treats truncated key attributes as having the value minus + * infinity, which would break searches within !heapkeyspace indexes. We + * must still truncate away non-key attribute values, though. */ if (!itup_key->heapkeyspace) - { - Assert(nkeyatts != IndexRelationGetNumberOfAttributes(rel)); return nkeyatts; - } scankey = itup_key->scankeys; keepnatts = 1; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 99d0914e72..493931e1b8 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -251,7 +251,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting, } static void -btree_xlog_split(bool onleft, XLogReaderState *record) +btree_xlog_split(bool newitemonleft, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); @@ -323,7 +323,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record) datapos = XLogRecGetBlockData(record, 0, &datalen); - if (onleft || xlrec->postingoff != 0) + if (newitemonleft || xlrec->postingoff != 0) { newitem = (IndexTuple) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); @@ -368,7 +368,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record) elog(PANIC, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); - for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++) + for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstrightoff; off++) { ItemId itemid; Size itemsz; @@ -377,7 +377,8 @@ btree_xlog_split(bool onleft, XLogReaderState *record) /* Add replacement posting list when required */ if (off == replacepostingoff) { - Assert(onleft || xlrec->firstright == xlrec->newitemoff); + Assert(newitemonleft || + xlrec->firstrightoff == xlrec->newitemoff); if (PageAddItem(newlpage, (Item) nposting, MAXALIGN(IndexTupleSize(nposting)), leftoff, false, false) == InvalidOffsetNumber) @@ -387,7 +388,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record) } /* add the new item if it was inserted on left page */ - else if (onleft && off == xlrec->newitemoff) + else if (newitemonleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -405,7 +406,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record) } /* cope with possibility that newitem goes at the end */ - if (onleft && off == xlrec->newitemoff) + if (newitemonleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 7a1616f371..e099107f91 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -39,8 +39,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", - xlrec->level, xlrec->firstright, + appendStringInfo(buf, "level %u, firstrightoff %d, newitemoff %d, postingoff %d", + xlrec->level, xlrec->firstrightoff, xlrec->newitemoff, xlrec->postingoff); break; } diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 347976c532..ec7b2a6c33 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -99,9 +99,9 @@ typedef struct xl_btree_insert * left or right split page (and thus, whether the new item is stored or not). * We always log the left page high key because suffix truncation can generate * a new leaf high key using user-defined code. This is also necessary on - * internal pages, since the first right item that the left page's high key - * was based on will have been truncated to zero attributes in the right page - * (the original is unavailable from the right page). + * internal pages, since the firstright item that the left page's high key was + * based on will have been truncated to zero attributes in the right page (the + * separator key is unavailable from the right page). * * Backup Blk 0: original page / new left page * @@ -153,7 +153,7 @@ typedef struct xl_btree_insert typedef struct xl_btree_split { uint32 level; /* tree level of page being split */ - OffsetNumber firstright; /* first item moved to right page */ + OffsetNumber firstrightoff; /* first origpage item on rightpage */ OffsetNumber newitemoff; /* new item's offset */ uint16 postingoff; /* offset inside orig posting tuple */ } xl_btree_split;