diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 7d894b1132..e5a664b146 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -71,8 +71,7 @@ static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, - OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem, bool newitemonleft); + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static OffsetNumber _bt_findsplitloc(Relation rel, Page page, @@ -842,7 +841,6 @@ _bt_insertonpg(Relation rel, { Page page; BTPageOpaque lpageop; - OffsetNumber firstright = InvalidOffsetNumber; Size itemsz; page = BufferGetPage(buf); @@ -878,7 +876,6 @@ _bt_insertonpg(Relation rel, { bool is_root = P_ISROOT(lpageop); bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); - bool newitemonleft; Buffer rbuf; /* @@ -899,14 +896,8 @@ _bt_insertonpg(Relation rel, Assert(!(P_ISLEAF(lpageop) && BlockNumberIsValid(RelationGetTargetBlock(rel)))); - /* Choose the split point */ - firstright = _bt_findsplitloc(rel, page, - newitemoff, itemsz, - &newitemonleft); - /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, buf, cbuf, firstright, - newitemoff, itemsz, itup, newitemonleft); + rbuf = _bt_split(rel, buf, cbuf, newitemoff, itemsz, itup); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1106,9 +1097,8 @@ _bt_insertonpg(Relation rel, * _bt_split() -- split a page in the btree. * * On entry, buf is the page to split, and is pinned and write-locked. - * firstright is the item index of the first item to be moved to the - * new right page. newitemoff etc. tell us about the new item that - * must be inserted along with the data from the old page. + * newitemoff etc. tell us about the new item that must be inserted + * along with the data from the original page. * * When splitting a non-leaf page, 'cbuf' is the left-sibling of the * page we're inserting the downlink for. This function will clear the @@ -1118,9 +1108,8 @@ _bt_insertonpg(Relation rel, * The pin and lock on buf are maintained. */ static Buffer -_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, - bool newitemonleft) +_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, + Size newitemsz, IndexTuple newitem) { Buffer rbuf; Page origpage; @@ -1139,36 +1128,67 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, IndexTuple item; OffsetNumber leftoff, rightoff; + OffsetNumber firstright; OffsetNumber maxoff; OffsetNumber i; - bool isleaf; + bool newitemonleft, + isleaf; IndexTuple lefthikey; int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - /* Acquire a new page to split into */ - rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); - /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back - * into origpage on success. rightpage is the new page that receives the - * right-sibling data. If we fail before reaching the critical section, - * origpage hasn't been modified and leftpage is only workspace. In - * principle we shouldn't need to worry about rightpage either, because it - * hasn't been linked into the btree page structure; but to avoid leaving - * possibly-confusing junk behind, we are careful to rewrite rightpage as - * zeroes before throwing any error. + * into origpage on success. rightpage is the new page that will receive + * the right-sibling data. + * + * leftpage is allocated after choosing a split point. rightpage's new + * buffer isn't acquired until after leftpage is initialized and has new + * high key, the last point where splitting the page may fail (barring + * corruption). Failing before acquiring new buffer won't have lasting + * consequences, since origpage won't have been modified and leftpage is + * only workspace. */ origpage = BufferGetPage(buf); - leftpage = PageGetTempPage(origpage); - rightpage = BufferGetPage(rbuf); - + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); origpagenumber = BufferGetBlockNumber(buf); - rightpagenumber = BufferGetBlockNumber(rbuf); + /* + * Choose a point to split origpage at. + * + * A split point can be thought of as a point _between_ two existing + * tuples on origpage (lastleft and firstright tuples), provided you + * pretend that the new item that didn't fit is already on origpage. + * + * Since origpage does not actually contain newitem, the representation of + * split points needs to work with two boundary cases: splits where + * newitem is lastleft, and splits where newitem is firstright. + * newitemonleft resolves the ambiguity that would otherwise exist when + * newitemoff == firstright. In all other cases it's clear which side of + * the split every tuple goes on from context. newitemonleft is usually + * (but not always) redundant information. + */ + firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, + &newitemonleft); + + /* Allocate temp buffer for leftpage */ + leftpage = PageGetTempPage(origpage); _bt_pageinit(leftpage, BufferGetPageSize(buf)); - /* rightpage was already initialized by _bt_getbuf */ + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + + /* + * leftpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + lopaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + /* set flag in leftpage indicating that rightpage has no downlink yet */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; + lopaque->btpo_prev = oopaque->btpo_prev; + /* handle btpo_next after rightpage buffer acquired */ + lopaque->btpo.level = oopaque->btpo.level; + /* handle btpo_cycleid after rightpage buffer acquired */ /* * Copy the original page's LSN into leftpage, which will become the @@ -1176,62 +1196,13 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, * examine the LSN and possibly dump it in a page image. */ PageSetLSN(leftpage, PageGetLSN(origpage)); - - /* init btree private data */ - oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); - lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); - ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); - isleaf = P_ISLEAF(oopaque); - /* if we're splitting this page, it won't be the root when we're done */ - /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ - lopaque->btpo_flags = oopaque->btpo_flags; - lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); - ropaque->btpo_flags = lopaque->btpo_flags; - /* set flag in left page indicating that the right page has no downlink */ - lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; - lopaque->btpo_prev = oopaque->btpo_prev; - lopaque->btpo_next = rightpagenumber; - ropaque->btpo_prev = origpagenumber; - ropaque->btpo_next = oopaque->btpo_next; - lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; - /* Since we already have write-lock on both pages, ok to read cycleid */ - lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); - ropaque->btpo_cycleid = lopaque->btpo_cycleid; - - /* - * If the page we're splitting is not the rightmost page at its level in - * the tree, then the first entry on the page is the high key for the - * page. We need to copy that to the right half. Otherwise (meaning the - * rightmost page case), all the items on the right half will be user - * data. - */ - rightoff = P_HIKEY; - - if (!P_RIGHTMOST(oopaque)) - { - itemid = PageGetItemId(origpage, P_HIKEY); - itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(origpage, itemid); - Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts); - if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, - false, false) == InvalidOffsetNumber) - { - memset(rightpage, 0, BufferGetPageSize(rbuf)); - elog(ERROR, "failed to add hikey to the right sibling" - " while splitting block %u of index \"%s\"", - origpagenumber, RelationGetRelationName(rel)); - } - rightoff = OffsetNumberNext(rightoff); - } - /* * The "high key" for the new left page will be the first key that's going * to go into the new right page. This might be either the existing data * item at position firstright, or the incoming tuple. */ - leftoff = P_HIKEY; if (!newitemonleft && newitemoff == firstright) { /* incoming tuple will become first on right page */ @@ -1265,22 +1236,89 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, else lefthikey = item; + /* + * Add new high key to leftpage + */ + leftoff = P_HIKEY; + Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts); if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, false, false) == InvalidOffsetNumber) - { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add hikey to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); - } leftoff = OffsetNumberNext(leftoff); /* be tidy */ if (lefthikey != item) pfree(lefthikey); /* - * Now transfer all the data items to the appropriate page. + * Acquire a new right page to split into, now that left page has a new + * high key. From here on, it's not okay to throw an error without + * zeroing rightpage first. This coding rule ensures that we won't + * confuse future VACUUM operations, which might otherwise try to re-find + * a downlink to a leftover junk page as the page undergoes deletion. + * + * It would be reasonable to start the critical section just after the new + * rightpage buffer is acquired instead; that would allow us to avoid + * leftover junk pages without bothering to zero rightpage. We do it this + * way because it avoids an unnecessary PANIC when either origpage or its + * existing sibling page are corrupt. + */ + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rightpage = BufferGetPage(rbuf); + rightpagenumber = BufferGetBlockNumber(rbuf); + /* rightpage was initialized by _bt_getbuf */ + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* + * Finish off remaining leftpage special area fields. They cannot be set + * before both origpage (leftpage) and rightpage buffers are acquired and + * locked. + */ + lopaque->btpo_next = rightpagenumber; + lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); + + /* + * rightpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + ropaque->btpo_flags = oopaque->btpo_flags; + ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = oopaque->btpo_next; + ropaque->btpo.level = oopaque->btpo.level; + ropaque->btpo_cycleid = lopaque->btpo_cycleid; + + /* + * Add new high key to rightpage where necessary. + * + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key from + * origpage. + */ + rightoff = P_HIKEY; + + if (!P_RIGHTMOST(oopaque)) + { + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts); + if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + + /* + * Now transfer all the data items (non-pivot tuples in isleaf case, or + * additional pivot tuples in !isleaf case) to the appropriate page. * * Note: we *must* insert at least the right page's items in item-number * order, for the benefit of _bt_restore_page(). @@ -1298,6 +1336,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, { if (newitemonleft) { + Assert(newitemoff <= firstright); if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); @@ -1309,6 +1348,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, } else { + Assert(newitemoff >= firstright); if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); @@ -1371,7 +1411,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, * all readers release locks on a page before trying to fetch its * neighbors. */ - if (!P_RIGHTMOST(oopaque)) { sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);