diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b87a3cb471..635ece73b3 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); +static void bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent); static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, @@ -787,17 +790,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) */ } - /* - * readonly mode can only ever land on live pages and half-dead pages, - * so sibling pointers should always be in mutual agreement - */ - if (state->readonly && opaque->btpo_prev != leftcurrent) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("left link/right link pair in index \"%s\" not in agreement", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u left block=%u left link from block=%u.", - current, leftcurrent, opaque->btpo_prev))); + /* Sibling links should be in mutual agreement */ + if (opaque->btpo_prev != leftcurrent) + bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); /* Check level, which must be valid for non-ignorable page */ if (level.level != opaque->btpo.level) @@ -877,6 +872,140 @@ nextpage: return nextleveldown; } +/* + * Raise an error when target page's left link does not point back to the + * previous target page, called leftcurrent here. The leftcurrent page's + * right link was followed to get to the current target page, and we expect + * mutual agreement among leftcurrent and the current target page. Make sure + * that this condition has definitely been violated in the !readonly case, + * where concurrent page splits are something that we need to deal with. + * + * Cross-page inconsistencies involving pages that don't agree about being + * siblings are known to be a particularly good indicator of corruption + * involving partial writes/lost updates. The bt_right_page_check_scankey + * check also provides a way of detecting cross-page inconsistencies for + * !readonly callers, but it can only detect sibling pages that have an + * out-of-order keyspace, which can't catch many of the problems that we + * expect to catch here. + * + * The classic example of the kind of inconsistency that we can only catch + * with this check (when in !readonly mode) involves three sibling pages that + * were affected by a faulty page split at some point in the past. The + * effects of the split are reflected in the original page and its new right + * sibling page, with a lack of any accompanying changes for the _original_ + * right sibling page. The original right sibling page's left link fails to + * point to the new right sibling page (its left link still points to the + * original page), even though the first phase of a page split is supposed to + * work as a single atomic action. This subtle inconsistency will probably + * only break backwards scans in practice. + * + * Note that this is the only place where amcheck will "couple" buffer locks + * (and only for !readonly callers). In general we prefer to avoid more + * thorough cross-page checks in !readonly mode, but it seems worth the + * complexity here. Also, the performance overhead of performing lock + * coupling here is negligible in practice. Control only reaches here with a + * non-corrupt index when there is a concurrent page split at the instant + * caller crossed over to target page from leftcurrent page. + */ +static void +bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent) +{ + if (!state->readonly) + { + Buffer lbuf; + Buffer newtargetbuf; + Page page; + BTPageOpaque opaque; + BlockNumber newtargetblock; + + /* Couple locks in the usual order for nbtree: Left to right */ + lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent, + RBM_NORMAL, state->checkstrategy); + LockBuffer(lbuf, BT_READ); + _bt_checkpage(state->rel, lbuf); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * Cannot reason about concurrently deleted page -- the left link + * in the page to the right is expected to point to some other + * page to the left (not leftcurrent page). + * + * Note that we deliberately don't give up with a half-dead page. + */ + UnlockReleaseBuffer(lbuf); + return; + } + + newtargetblock = opaque->btpo_next; + /* Avoid self-deadlock when newtargetblock == leftcurrent */ + if (newtargetblock != leftcurrent) + { + newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, + newtargetblock, RBM_NORMAL, + state->checkstrategy); + LockBuffer(newtargetbuf, BT_READ); + _bt_checkpage(state->rel, newtargetbuf); + page = BufferGetPage(newtargetbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* btpo_prev_from_target may have changed; update it */ + btpo_prev_from_target = opaque->btpo_prev; + } + else + { + /* + * leftcurrent right sibling points back to leftcurrent block. + * Index is corrupt. Easiest way to handle this is to pretend + * that we actually read from a distinct page that has an invalid + * block number in its btpo_prev. + */ + newtargetbuf = InvalidBuffer; + btpo_prev_from_target = InvalidBlockNumber; + } + + /* + * No need to check P_ISDELETED here, since new target block cannot be + * marked deleted as long as we hold a lock on lbuf + */ + if (BufferIsValid(newtargetbuf)) + UnlockReleaseBuffer(newtargetbuf); + UnlockReleaseBuffer(lbuf); + + if (btpo_prev_from_target == leftcurrent) + { + /* Report split in left sibling, not target (or new target) */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("harmless concurrent page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.", + leftcurrent, newtargetblock, + state->targetblock))); + return; + } + + /* + * Index is corrupt. Make sure that we report correct target page. + * + * This could have changed in cases where there was a concurrent page + * split, as well as index corruption (at least in theory). Note that + * btpo_prev_from_target was already updated above. + */ + state->targetblock = newtargetblock; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("left link/right link pair in index \"%s\" not in agreement", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + state->targetblock, leftcurrent, + btpo_prev_from_target))); +} + /* * Function performs the following checks on target page, or pages ancillary to * target page: @@ -1965,18 +2094,14 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * downlink, which was concurrently physically removed in target/parent as * part of deletion's first phase.) * - * Note that while the cross-page-same-level last item check uses a trick - * that allows it to perform verification for !readonly callers, a similar - * trick seems difficult here. The trick that that other check uses is, - * in essence, to lock down race conditions to those that occur due to - * concurrent page deletion of the target; that's a race that can be - * reliably detected before actually reporting corruption. - * - * On the other hand, we'd need to lock down race conditions involving - * deletion of child's left page, for long enough to read the child page - * into memory (in other words, a scheme with concurrently held buffer - * locks on both child and left-of-child pages). That's unacceptable for - * amcheck functions on general principle, though. + * While we use various techniques elsewhere to perform cross-page + * verification for !readonly callers, a similar trick seems difficult + * here. The tricks used by bt_recheck_sibling_links and by + * bt_right_page_check_scankey both involve verification of a same-level, + * cross-sibling invariant. Cross-level invariants are far more squishy, + * though. The nbtree REDO routines do not actually couple buffer locks + * across levels during page splits, so making any cross-level check work + * reliably in !readonly mode may be impossible. */ Assert(state->readonly); @@ -2785,6 +2910,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, * There is never an attempt to get a consistent view of multiple pages using * multiple concurrent buffer locks; in general, we only acquire a single pin * and buffer lock at a time, which is often all that the nbtree code requires. + * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only + * exception to this general rule.) * * Operating on a copy of the page is useful because it prevents control * getting stuck in an uninterruptible state when an underlying operator class