WAL-log inplace update before revealing it to other sessions.

A buffer lock won't stop a reader having already checked tuple
visibility.  If a vac_update_datfrozenid() and then a crash happened
during inplace update of a relfrozenxid value, datfrozenxid could
overtake relfrozenxid.  That could lead to "could not access status of
transaction" errors.  Back-patch to v12 (all supported versions).  In
v14 and earlier, this also back-patches the assertion removal from
commit 7fcf2faf9c.

Discussion: https://postgr.es/m/20240620012908.92.nmisch@google.com
This commit is contained in:
Noah Misch 2024-10-25 06:51:03 -07:00
parent 95c5acb3fc
commit bfd5c6e279
2 changed files with 46 additions and 16 deletions

View File

@ -203,6 +203,4 @@ Inplace updates create an exception to the rule that tuple data won't change
under a reader holding a pin. A reader of a heap_fetch() result tuple may
witness a torn read. Current inplace-updated fields are aligned and are no
wider than four bytes, and current readers don't need consistency across
fields. Hence, they get by with just fetching each field once. XXX such a
caller may also read a value that has not reached WAL; see
systable_inplace_update_finish().
fields. Hence, they get by with just fetching each field once.

View File

@ -6344,6 +6344,8 @@ heap_inplace_update_and_unlock(Relation relation,
HeapTupleHeader htup = oldtup->t_data;
uint32 oldlen;
uint32 newlen;
char *dst;
char *src;
Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
oldlen = oldtup->t_len - htup->t_hoff;
@ -6351,6 +6353,9 @@ heap_inplace_update_and_unlock(Relation relation,
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
elog(ERROR, "wrong tuple length");
dst = (char *) htup + htup->t_hoff;
src = (char *) tuple->t_data + tuple->t_data->t_hoff;
/*
* Construct shared cache inval if necessary. Note that because we only
* pass the new version of the tuple, this mustn't be used for any
@ -6369,15 +6374,15 @@ heap_inplace_update_and_unlock(Relation relation,
*/
PreInplace_Inval();
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
memcpy((char *) htup + htup->t_hoff,
(char *) tuple->t_data + tuple->t_data->t_hoff,
newlen);
/*----------
* XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
* NO EREPORT(ERROR) from here till changes are complete
*
* Our buffer lock won't stop a reader having already pinned and checked
* visibility for this tuple. Hence, we write WAL first, then mutate the
* buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
* checkpoint delay makes that acceptable. With the usual order of
* changes, a crash after memcpy() and before XLogInsert() could allow
* datfrozenxid to overtake relfrozenxid:
*
* ["D" is a VACUUM (ONLY_DATABASE_STATS)]
* ["R" is a VACUUM tbl]
@ -6387,14 +6392,28 @@ heap_inplace_update_and_unlock(Relation relation,
* D: raise pg_database.datfrozenxid, XLogInsert(), finish
* [crash]
* [recovery restores datfrozenxid w/o relfrozenxid]
*
* Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
* the buffer to the stack before logging. Here, that facilitates a FPI
* of the post-mutation block before we accept other sessions seeing it.
*/
MarkBufferDirty(buffer);
Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
START_CRIT_SECTION();
MyProc->delayChkptFlags |= DELAY_CHKPT_START;
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
xl_heap_inplace xlrec;
PGAlignedBlock copied_buffer;
char *origdata = (char *) BufferGetBlock(buffer);
Page page = BufferGetPage(buffer);
uint16 lower = ((PageHeader) page)->pd_lower;
uint16 upper = ((PageHeader) page)->pd_upper;
uintptr_t dst_offset_in_block;
RelFileLocator rlocator;
ForkNumber forkno;
BlockNumber blkno;
XLogRecPtr recptr;
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@ -6402,16 +6421,28 @@ heap_inplace_update_and_unlock(Relation relation,
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
/* register block matching what buffer will look like after changes */
memcpy(copied_buffer.data, origdata, lower);
memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
dst_offset_in_block = dst - origdata;
memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
BufferGetTag(buffer, &rlocator, &forkno, &blkno);
Assert(forkno == MAIN_FORKNUM);
XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
REGBUF_STANDARD);
XLogRegisterBufData(0, src, newlen);
/* inplace updates aren't decoded atm, don't log the origin */
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
PageSetLSN(BufferGetPage(buffer), recptr);
PageSetLSN(page, recptr);
}
memcpy(dst, src, newlen);
MarkBufferDirty(buffer);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/*
@ -6424,6 +6455,7 @@ heap_inplace_update_and_unlock(Relation relation,
*/
AtInplace_Inval();
MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
END_CRIT_SECTION();
UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);