diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9d5fc424a5..abce31a5a9 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -572,23 +572,12 @@ replay of page deletion records does not hold a write lock on the target leaf page throughout; only the primary needs to block out concurrent writers that insert on to the page being deleted.) -There are also locking differences between the primary and WAL replay -for the first stage of a page split (i.e. same-level differences in -locking). Replay of the first phase of a page split can get away with -locking and updating the original right sibling page (which is also the -new right sibling page's right sibling) after locks on the original page -and its new right sibling have been released. Again, this is okay -because there are no writers. Page deletion WAL replay cannot get away -with being lax about same-level locking during replay, though -- doing -so risks confusing concurrent backwards scans. - -Page deletion's second phase locks the left sibling page, target page, -and right page in order on the standby, just like on the primary. This -allows backwards scans running on a standby to reason about page -deletion on the leaf level; a page cannot appear deleted without that -being reflected in the sibling pages. It's probably possible to be more -lax about how locks are acquired on the standby during the second phase -of page deletion, but that hardly seems worth it. +WAL replay holds same-level locks in a way that matches the approach +taken during original execution, though. This prevent readers from +observing same-level inconsistencies. It's probably possible to be more +lax about how same-level locks are acquired during recovery (most kinds +of readers could still move right to recover if we didn't couple +same-level locks), but we prefer to be conservative here. During recovery all index scans start with ignore_killed_tuples = false and we never set kill_prior_tuple. We do this because the oldest xmin diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 1fd6392463..dbec58d524 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -172,10 +172,10 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting, * Insertion to an internal page finishes an incomplete split at the child * level. Clear the incomplete-split flag in the child. Note: during * normal operation, the child and parent pages are locked at the same - * time, so that clearing the flag and inserting the downlink appear - * atomic to other backends. We don't bother with that during replay, - * because readers don't care about the incomplete-split flag and there - * cannot be updates happening. + * time (the locks are coupled), so that clearing the flag and inserting + * the downlink appear atomic to other backends. We don't bother with + * that during replay, because readers don't care about the + * incomplete-split flag and there cannot be updates happening. */ if (!isleaf) _bt_clear_incomplete_split(record, 1); @@ -272,9 +272,17 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) spagenumber = P_NONE; /* - * Clear the incomplete split flag on the left sibling of the child page - * this is a downlink for. (Like in btree_xlog_insert, this can be done - * before locking the other pages) + * Clear the incomplete split flag on the appropriate child page one level + * down when origpage/buf is an internal page (there must have been + * cascading page splits during original execution in the event of an + * internal page split). This is like the corresponding btree_xlog_insert + * call for internal pages. We're not clearing the incomplete split flag + * for the current page split here (you can think of this as part of the + * insert of newitem that the page split action needs to perform in + * passing). + * + * Like in btree_xlog_insert, this can be done before locking other pages. + * We never need to couple cross-level locks in REDO routines. */ if (!isleaf) _bt_clear_incomplete_split(record, 3); @@ -427,22 +435,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) MarkBufferDirty(buf); } - /* - * We no longer need the buffers. They must be released together, so that - * readers cannot observe two inconsistent halves. - */ - if (BufferIsValid(buf)) - UnlockReleaseBuffer(buf); - UnlockReleaseBuffer(rbuf); - - /* - * Fix left-link of the page to the right of the new right sibling. - * - * Note: in normal operation, we do this while still holding lock on the - * two split pages. However, that's not necessary for correctness in WAL - * replay, because no other index update can be in progress, and readers - * will cope properly when following an obsolete left-link. - */ + /* Fix left-link of the page to the right of the new right sibling */ if (spagenumber != P_NONE) { Buffer sbuf; @@ -460,6 +453,14 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) if (BufferIsValid(sbuf)) UnlockReleaseBuffer(sbuf); } + + /* + * Finally, release the remaining buffers. sbuf, rbuf, and buf must be + * released together, so that readers cannot observe inconsistencies. + */ + UnlockReleaseBuffer(rbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); } static void @@ -733,6 +734,11 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(buffer); } + + /* + * Don't need to couple cross-level locks in REDO routines, so release + * lock on internal page immediately + */ if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); @@ -789,12 +795,6 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) * the pages in the same standard left-to-right order (leftsib, target, * rightsib), and don't release the sibling locks until the target is * marked deleted. - * - * btree_xlog_split() can get away with fixing its right sibling page's - * left link last of all, after dropping all other locks. We prefer to - * avoid dropping locks on same-level pages early compared to normal - * operation. This keeps things simple for backwards scans. See - * nbtree/README. */ /* Fix right-link of left sibling, if any */