diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 91cfae1603..db683b1217 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5409,14 +5409,282 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
 		CacheInvalidateHeapTuple(relation, tuple, NULL);
 }
 
+#define FRM_NOOP				0x0001
+#define FRM_INVALIDATE_XMAX		0x0002
+#define FRM_RETURN_IS_XID		0x0004
+#define FRM_RETURN_IS_MULTI		0x0008
+#define FRM_MARK_COMMITTED		0x0010
+
 /*
- * heap_freeze_tuple
+ * FreezeMultiXactId
+ *		Determine what to do during freezing when a tuple is marked by a
+ *		MultiXactId.
+ *
+ * NB -- this might have the side-effect of creating a new MultiXactId!
+ *
+ * "flags" is an output value; it's used to tell caller what to do on return.
+ * Possible flags are:
+ * FRM_NOOP
+ *		don't do anything -- keep existing Xmax
+ * FRM_INVALIDATE_XMAX
+ *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
+ * FRM_RETURN_IS_XID
+ *		The Xid return value is a single update Xid to set as xmax.
+ * FRM_MARK_COMMITTED
+ *		Xmax can be marked as HEAP_XMAX_COMMITTED
+ * FRM_RETURN_IS_MULTI
+ *		The return value is a new MultiXactId to set as new Xmax.
+ *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
+ */
+static TransactionId
+FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
+				  TransactionId cutoff_xid, MultiXactId cutoff_multi,
+				  uint16 *flags)
+{
+	TransactionId xid = InvalidTransactionId;
+	int			i;
+	MultiXactMember *members;
+	int			nmembers;
+	bool		need_replace;
+	int			nnewmembers;
+	MultiXactMember *newmembers;
+	bool		has_lockers;
+	TransactionId update_xid;
+	bool		update_committed;
+
+	*flags = 0;
+
+	/* We should only be called in Multis */
+	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
+
+	if (!MultiXactIdIsValid(multi))
+	{
+		/* Ensure infomask bits are appropriately set/reset */
+		*flags |= FRM_INVALIDATE_XMAX;
+		return InvalidTransactionId;
+	}
+	else if (MultiXactIdPrecedes(multi, cutoff_multi))
+	{
+		/*
+		 * This old multi cannot possibly have members still running.  If it
+		 * was a locker only, it can be removed without any further
+		 * consideration; but if it contained an update, we might need to
+		 * preserve it.
+		 */
+		Assert(!MultiXactIdIsRunning(multi));
+		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
+		{
+			*flags |= FRM_INVALIDATE_XMAX;
+			xid = InvalidTransactionId; /* not strictly necessary */
+		}
+		else
+		{
+			/* replace multi by update xid */
+			xid = MultiXactIdGetUpdateXid(multi, t_infomask);
+
+			/* wasn't only a lock, xid needs to be valid */
+			Assert(TransactionIdIsValid(xid));
+
+			/*
+			 * If the xid is older than the cutoff, it has to have aborted,
+			 * otherwise the tuple would have gotten pruned away.
+			 */
+			if (TransactionIdPrecedes(xid, cutoff_xid))
+			{
+				Assert(!TransactionIdDidCommit(xid));
+				*flags |= FRM_INVALIDATE_XMAX;
+				xid = InvalidTransactionId; /* not strictly necessary */
+			}
+			else
+			{
+				*flags |= FRM_RETURN_IS_XID;
+			}
+		}
+
+		return xid;
+	}
+
+	/*
+	 * This multixact might have or might not have members still running, but
+	 * we know it's valid and is newer than the cutoff point for multis.
+	 * However, some member(s) of it may be below the cutoff for Xids, so we
+	 * need to walk the whole members array to figure out what to do, if
+	 * anything.
+	 */
+
+	nmembers = GetMultiXactIdMembers(multi, &members, false);
+	if (nmembers <= 0)
+	{
+		/* Nothing worth keeping */
+		*flags |= FRM_INVALIDATE_XMAX;
+		return InvalidTransactionId;
+	}
+
+	/* is there anything older than the cutoff? */
+	need_replace = false;
+	for (i = 0; i < nmembers; i++)
+	{
+		if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
+		{
+			need_replace = true;
+			break;
+		}
+	}
+
+	/*
+	 * In the simplest case, there is no member older than the cutoff; we can
+	 * keep the existing MultiXactId as is.
+	 */
+	if (!need_replace)
+	{
+		*flags |= FRM_NOOP;
+		pfree(members);
+		return InvalidTransactionId;
+	}
+
+	/*
+	 * If the multi needs to be updated, figure out which members we need to
+	 * keep.
+	 */
+	nnewmembers = 0;
+	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
+	has_lockers = false;
+	update_xid = InvalidTransactionId;
+	update_committed = false;
+
+	for (i = 0; i < nmembers; i++)
+	{
+		/*
+		 * Determine whether to keep this member or ignore it.
+		 */
+		if (ISUPDATE_from_mxstatus(members[i].status))
+		{
+			TransactionId xid = members[i].xid;
+
+			/*
+			 * It's an update; should we keep it?  If the transaction is known
+			 * aborted then it's okay to ignore it, otherwise not.  However,
+			 * if the Xid is older than the cutoff_xid, we must remove it.
+			 * Note that such an old updater cannot possibly be committed,
+			 * because HeapTupleSatisfiesVacuum would have returned
+			 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
+			 *
+			 * Note the TransactionIdDidAbort() test is just an optimization
+			 * and not strictly necessary for correctness.
+			 *
+			 * As with all tuple visibility routines, it's critical to test
+			 * TransactionIdIsInProgress before the transam.c routines,
+			 * because of race conditions explained in detail in tqual.c.
+			 */
+			if (TransactionIdIsCurrentTransactionId(xid) ||
+				TransactionIdIsInProgress(xid))
+			{
+				Assert(!TransactionIdIsValid(update_xid));
+				update_xid = xid;
+			}
+			else if (!TransactionIdDidAbort(xid))
+			{
+				/*
+				 * Test whether to tell caller to set HEAP_XMAX_COMMITTED
+				 * while we have the Xid still in cache.  Note this can only
+				 * be done if the transaction is known not running.
+				 */
+				if (TransactionIdDidCommit(xid))
+					update_committed = true;
+				Assert(!TransactionIdIsValid(update_xid));
+				update_xid = xid;
+			}
+
+			/*
+			 * If we determined that it's an Xid corresponding to an update
+			 * that must be retained, additionally add it to the list of
+			 * members of the new Multi, in case we end up using that.  (We
+			 * might still decide to use only an update Xid and not a multi,
+			 * but it's easier to maintain the list as we walk the old members
+			 * list.)
+			 *
+			 * It is possible to end up with a very old updater Xid that
+			 * crashed and thus did not mark itself as aborted in pg_clog.
+			 * That would manifest as a pre-cutoff Xid.  Make sure to ignore
+			 * it.
+			 */
+			if (TransactionIdIsValid(update_xid))
+			{
+				if (!TransactionIdPrecedes(update_xid, cutoff_xid))
+				{
+					newmembers[nnewmembers++] = members[i];
+				}
+				else
+				{
+					/* cannot have committed: would be HEAPTUPLE_DEAD */
+					Assert(!TransactionIdDidCommit(update_xid));
+					update_xid = InvalidTransactionId;
+					update_committed = false;
+				}
+			}
+		}
+		else
+		{
+			/* We only keep lockers if they are still running */
+			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
+				TransactionIdIsInProgress(members[i].xid))
+			{
+				/* running locker cannot possibly be older than the cutoff */
+				Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
+				newmembers[nnewmembers++] = members[i];
+				has_lockers = true;
+			}
+		}
+	}
+
+	pfree(members);
+
+	if (nnewmembers == 0)
+	{
+		/* nothing worth keeping!?  Tell caller to remove the whole thing */
+		*flags |= FRM_INVALIDATE_XMAX;
+		xid = InvalidTransactionId;
+	}
+	else if (TransactionIdIsValid(update_xid) && !has_lockers)
+	{
+		/*
+		 * If there's a single member and it's an update, pass it back alone
+		 * without creating a new Multi.  (XXX we could do this when there's a
+		 * single remaining locker, too, but that would complicate the API too
+		 * much; moreover, the case with the single updater is more
+		 * interesting, because those are longer-lived.)
+		 */
+		Assert(nnewmembers == 1);
+		*flags |= FRM_RETURN_IS_XID;
+		if (update_committed)
+			*flags |= FRM_MARK_COMMITTED;
+		xid = update_xid;
+	}
+	else
+	{
+		/*
+		 * Create a new multixact with the surviving members of the previous
+		 * one, to set as new Xmax in the tuple.
+		 */
+		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
+		*flags |= FRM_RETURN_IS_MULTI;
+	}
+
+	pfree(newmembers);
+
+	return xid;
+}
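
For illustration only (not part of the patch): the flags contract above
implies a caller of roughly the following shape. heap_prepare_freeze_tuple,
further down, is the real consumer; "multi", "t_infomask" and the two
cutoffs stand for whatever the caller has at hand.

	uint16		flags;
	TransactionId newxmax;

	newxmax = FreezeMultiXactId(multi, t_infomask,
								cutoff_xid, cutoff_multi, &flags);
	if (flags & FRM_NOOP)
		;					/* keep the existing xmax unchanged */
	else if (flags & FRM_INVALIDATE_XMAX)
		;					/* set xmax invalid, plus HEAP_XMAX_INVALID */
	else if (flags & FRM_RETURN_IS_XID)
		;					/* newxmax is a bare updater Xid; FRM_MARK_COMMITTED
							 * additionally allows HEAP_XMAX_COMMITTED */
	else if (flags & FRM_RETURN_IS_MULTI)
		;					/* newxmax is a new multi; fetch infomask bits
							 * with GetMultiXactIdHintBits */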
+
+/*
+ * heap_prepare_freeze_tuple
 *
 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
- * are older than the specified cutoff XID.  If so, replace them with
- * FrozenTransactionId or InvalidTransactionId as appropriate, and return
- * TRUE.  Return FALSE if nothing was changed.
+ * are older than the specified cutoff XID and cutoff MultiXactId.  If so,
+ * set up enough state (in the *frz output argument) to later execute and
+ * WAL-log what we would need to do, and return TRUE.  Return FALSE if nothing
+ * is to be changed.
+ *
+ * Caller is responsible for setting the offset field, if appropriate.
 *
 * It is assumed that the caller has checked the tuple with
 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
@@ -5425,54 +5693,44 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
 * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
 * XID older than it could neither be running nor seen as running by any
 * open transaction.  This ensures that the replacement will not change
- * anyone's idea of the tuple state.  Also, since we assume the tuple is
- * not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us
- * to assume that it is either committed good or aborted, as appropriate;
- * so we need no external state checks to decide what to do.  (This is good
- * because this function is applied during WAL recovery, when we don't have
- * access to any such state, and can't depend on the hint bits to be set.)
- * There is an exception we make which is to assume GetMultiXactIdMembers can
- * be called during recovery.
- *
+ * anyone's idea of the tuple state.
 * Similarly, cutoff_multi must be less than or equal to the smallest
 * MultiXactId used by any transaction currently open.
 *
 * If the tuple is in a shared buffer, caller must hold an exclusive lock on
 * that buffer.
 *
- * Note: it might seem we could make the changes without exclusive lock, since
- * TransactionId read/write is assumed atomic anyway.  However there is a race
- * condition: someone who just fetched an old XID that we overwrite here could
- * conceivably not finish checking the XID against pg_clog before we finish
- * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
- * exclusive lock ensures no other backend is in process of checking the
- * tuple status.  Also, getting exclusive lock makes it safe to adjust the
- * infomask bits.
- *
- * NB: Cannot rely on hint bits here, they might not be set after a crash or
- * on a standby.
+ * NB: It is not enough to set hint bits to indicate something is
+ * committed/invalid -- they might not be set on a standby, or after crash
+ * recovery.  We really need to remove old xids.
 */
 bool
-heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
-				  MultiXactId cutoff_multi)
+heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+						  TransactionId cutoff_multi,
+						  xl_heap_freeze_tuple *frz)
 {
 	bool		changed = false;
 	bool		freeze_xmax = false;
 	TransactionId xid;
 
+	frz->frzflags = 0;
+	frz->t_infomask2 = tuple->t_infomask2;
+	frz->t_infomask = tuple->t_infomask;
+	frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
+
 	/* Process xmin */
 	xid = HeapTupleHeaderGetXmin(tuple);
 	if (TransactionIdIsNormal(xid) &&
 		TransactionIdPrecedes(xid, cutoff_xid))
 	{
-		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
+		frz->frzflags |= XLH_FREEZE_XMIN;
 
 		/*
 		 * Might as well fix the hint bits too; usually XMIN_COMMITTED will
 		 * already be set here, but there's a small chance not.
 		 */
-		Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
-		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+		frz->t_infomask |= HEAP_XMIN_COMMITTED;
 		changed = true;
 	}
@@ -5489,91 +5747,53 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 
 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 	{
-		if (!MultiXactIdIsValid(xid))
-		{
-			/* no xmax set, ignore */
-			;
-		}
-		else if (MultiXactIdPrecedes(xid, cutoff_multi))
+		TransactionId newxmax;
+		uint16		flags;
+
+		newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
+									cutoff_xid, cutoff_multi, &flags);
+
+		if (flags & FRM_INVALIDATE_XMAX)
+			freeze_xmax = true;
+		else if (flags & FRM_RETURN_IS_XID)
 		{
 			/*
-			 * This old multi cannot possibly be running.  If it was a locker
-			 * only, it can be removed without much further thought; but if it
-			 * contained an update, we need to preserve it.
+			 * NB -- some of these transformations are only valid because we
+			 * know the return Xid is a tuple updater (i.e. not merely a
+			 * locker.)  Also note that the only reason we don't explicitly
+			 * worry about HEAP_KEYS_UPDATED is because it lives in
+			 * t_infomask2 rather than t_infomask.
 			 */
-			if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
-				freeze_xmax = true;
-			else
-			{
-				TransactionId update_xid;
-
-				update_xid = HeapTupleGetUpdateXid(tuple);
-
-				/*
-				 * The multixact has an update hidden within.  Get rid of it.
-				 *
-				 * If the update_xid is below the cutoff_xid, it necessarily
-				 * must be an aborted transaction.  In a primary server, such
-				 * an Xmax would have gotten marked invalid by
-				 * HeapTupleSatisfiesVacuum, but in a replica that is not
-				 * called before we are, so deal with it in the same way.
-				 *
-				 * If not below the cutoff_xid, then the tuple would have been
-				 * pruned by vacuum, if the update committed long enough ago,
-				 * and we wouldn't be freezing it; so it's either recently
-				 * committed, or in-progress.  Deal with this by setting the
-				 * Xmax to the update Xid directly and remove the IS_MULTI
-				 * bit.  (We know there cannot be running lockers in this
-				 * multi, because it's below the cutoff_multi value.)
-				 */
-
-				if (TransactionIdPrecedes(update_xid, cutoff_xid))
-				{
-					Assert(InRecovery || TransactionIdDidAbort(update_xid));
-					freeze_xmax = true;
-				}
-				else
-				{
-					Assert(InRecovery || !TransactionIdIsInProgress(update_xid));
-					tuple->t_infomask &= ~HEAP_XMAX_BITS;
-					HeapTupleHeaderSetXmax(tuple, update_xid);
-					changed = true;
-				}
-			}
+			frz->t_infomask &= ~HEAP_XMAX_BITS;
+			frz->xmax = newxmax;
+			if (flags & FRM_MARK_COMMITTED)
+				frz->t_infomask |= HEAP_XMAX_COMMITTED;
+			changed = true;
 		}
-		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
+		else if (flags & FRM_RETURN_IS_MULTI)
 		{
-			/* newer than the cutoff, so don't touch it */
-			;
+			uint16		newbits;
+			uint16		newbits2;
+
+			/*
+			 * We can't use GetMultiXactIdHintBits directly on the new multi
+			 * here; that routine initializes the masks to all zeroes, which
+			 * would lose other bits we need.  Doing it this way ensures all
+			 * unrelated bits remain untouched.
+			 */
+			frz->t_infomask &= ~HEAP_XMAX_BITS;
+			frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+			GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
+			frz->t_infomask |= newbits;
+			frz->t_infomask2 |= newbits2;
+
+			frz->xmax = newxmax;
+
+			changed = true;
 		}
 		else
 		{
-			TransactionId update_xid;
-
-			/*
-			 * This is a multixact which is not marked LOCK_ONLY, but which
-			 * is newer than the cutoff_multi.  If the update_xid is below the
-			 * cutoff_xid point, then we can just freeze the Xmax in the
-			 * tuple, removing it altogether.  This seems simple, but there
-			 * are several underlying assumptions:
-			 *
-			 * 1. A tuple marked by an multixact containing a very old
-			 * committed update Xid would have been pruned away by vacuum; we
-			 * wouldn't be freezing this tuple at all.
-			 *
-			 * 2. There cannot possibly be any live locking members remaining
-			 * in the multixact.  This is because if they were alive, the
-			 * update's Xid would had been considered, via the lockers'
-			 * snapshot's Xmin, as part the cutoff_xid.
-			 *
-			 * 3. We don't create new MultiXacts via MultiXactIdExpand() that
-			 * include a very old aborted update Xid: in that function we only
-			 * include update Xids corresponding to transactions that are
-			 * committed or in-progress.
-			 */
-			update_xid = HeapTupleGetUpdateXid(tuple);
-			if (TransactionIdPrecedes(update_xid, cutoff_xid))
-				freeze_xmax = true;
+			Assert(flags & FRM_NOOP);
 		}
 	}
 	else if (TransactionIdIsNormal(xid) &&
@@ -5584,17 +5804,17 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 
 	if (freeze_xmax)
 	{
-		HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
+		frz->xmax = InvalidTransactionId;
 
 		/*
 		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
 		 * LOCKED.  Normalize to INVALID just to be sure no one gets confused.
 		 * Also get rid of the HEAP_KEYS_UPDATED bit.
 		 */
-		tuple->t_infomask &= ~HEAP_XMAX_BITS;
-		tuple->t_infomask |= HEAP_XMAX_INVALID;
-		HeapTupleHeaderClearHotUpdated(tuple);
-		tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		frz->t_infomask &= ~HEAP_XMAX_BITS;
+		frz->t_infomask |= HEAP_XMAX_INVALID;
+		frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
+		frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
 		changed = true;
 	}
 
@@ -5614,16 +5834,16 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 		 * xvac transaction succeeded.
		 */
 		if (tuple->t_infomask & HEAP_MOVED_OFF)
-			HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
+			frz->frzflags |= XLH_INVALID_XVAC;
 		else
-			HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
+			frz->frzflags |= XLH_FREEZE_XVAC;
 
 		/*
 		 * Might as well fix the hint bits too; usually XMIN_COMMITTED
 		 * will already be set here, but there's a small chance not.
 		 */
 		Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
-		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
+		frz->t_infomask |= HEAP_XMIN_COMMITTED;
 		changed = true;
 		}
 	}
@@ -5631,6 +5851,70 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
 	return changed;
 }
 
+/*
+ * heap_execute_freeze_tuple
+ *		Execute the prepared freezing of a tuple.
+ *
+ * Caller is responsible for ensuring that no other backend can access the
+ * storage underlying this tuple, either by holding an exclusive lock on the
+ * buffer containing it (which is what lazy VACUUM does), or by having it in
+ * private storage (which is what CLUSTER and friends do).
+ *
+ * Note: it might seem we could make the changes without exclusive lock, since
+ * TransactionId read/write is assumed atomic anyway.  However there is a race
+ * condition: someone who just fetched an old XID that we overwrite here could
+ * conceivably not finish checking the XID against pg_clog before we finish
+ * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
+ * exclusive lock ensures no other backend is in process of checking the
+ * tuple status.  Also, getting exclusive lock makes it safe to adjust the
+ * infomask bits.
+ *
+ * NB: All code in here must be safe to execute during crash recovery!
+ */
+void
+heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
+{
+	if (frz->frzflags & XLH_FREEZE_XMIN)
+		HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
+
+	HeapTupleHeaderSetXmax(tuple, frz->xmax);
+
+	if (frz->frzflags & XLH_FREEZE_XVAC)
+		HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
+
+	if (frz->frzflags & XLH_INVALID_XVAC)
+		HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
+
+	tuple->t_infomask = frz->t_infomask;
+	tuple->t_infomask2 = frz->t_infomask2;
+}
+
+/*
+ * heap_freeze_tuple
+ *		Freeze tuple in place, without WAL logging.
+ *
+ * Useful for callers like CLUSTER that perform their own WAL logging.
+ */
+bool
+heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+				  TransactionId cutoff_multi)
+{
+	xl_heap_freeze_tuple frz;
+	bool		do_freeze;
+
+	do_freeze = heap_prepare_freeze_tuple(tuple, cutoff_xid, cutoff_multi,
+										  &frz);
+
+	/*
+	 * Note that because this is not a WAL-logged operation, we don't need to
+	 * fill in the offset in the freeze record.
+	 */
+	if (do_freeze)
+		heap_execute_freeze_tuple(tuple, &frz);
+
+	return do_freeze;
+}
+
 /*
  * For a given MultiXactId, return the hint bits that should be set in the
  * tuple's infomask.
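
Taken together, the prepare/execute split is meant to be used against a
shared buffer roughly as in the condensed, hypothetical outline below
(rel/buf/page/offnum bookkeeping elided; the vacuumlazy.c hunk later in this
patch is the real version):

	xl_heap_freeze_tuple frz[MaxHeapTuplesPerPage];
	int			nfrozen = 0;
	int			i;

	/* phase 1: with buffer exclusively locked, outside any critical section */
	if (heap_prepare_freeze_tuple(htup, cutoff_xid, cutoff_multi,
								  &frz[nfrozen]))
		frz[nfrozen++].offset = offnum;		/* caller fills in the offset */

	/* phase 2: apply every plan and WAL-log them atomically */
	if (nfrozen > 0)
	{
		START_CRIT_SECTION();
		MarkBufferDirty(buf);
		for (i = 0; i < nfrozen; i++)
		{
			ItemId		itemid = PageGetItemId(page, frz[i].offset);

			heap_execute_freeze_tuple((HeapTupleHeader) PageGetItem(page, itemid),
									  &frz[i]);
		}
		if (RelationNeedsWAL(rel))
			PageSetLSN(page, log_heap_freeze(rel, buf, cutoff_xid,
											 frz, nfrozen));
		END_CRIT_SECTION();
	}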
@@ -5934,16 +6218,26 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
 		}
 		else if (MultiXactIdPrecedes(multi, cutoff_multi))
 			return true;
-		else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
-		{
-			/* only-locker multis don't need internal examination */
-			;
-		}
 		else
 		{
-			if (TransactionIdPrecedes(HeapTupleGetUpdateXid(tuple),
-									  cutoff_xid))
-				return true;
+			MultiXactMember *members;
+			int			nmembers;
+			int			i;
+
+			/* need to check whether any member of the mxact is too old */
+
+			nmembers = GetMultiXactIdMembers(multi, &members, false);
+
+			for (i = 0; i < nmembers; i++)
+			{
+				if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
+				{
+					pfree(members);
+					return true;
+				}
+			}
+
+			if (nmembers > 0)
+				pfree(members);
 		}
 	}
 	else
@@ -6193,45 +6487,44 @@ log_heap_clean(Relation reln, Buffer buffer,
 }
 
 /*
- * Perform XLogInsert for a heap-freeze operation.  Caller must already
- * have modified the buffer and marked it dirty.
+ * Perform XLogInsert for a heap-freeze operation.  Caller must have already
+ * modified the buffer and marked it dirty.
 */
 XLogRecPtr
-log_heap_freeze(Relation reln, Buffer buffer,
-				TransactionId cutoff_xid, MultiXactId cutoff_multi,
-				OffsetNumber *offsets, int offcnt)
+log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
+				xl_heap_freeze_tuple *tuples, int ntuples)
 {
-	xl_heap_freeze xlrec;
+	xl_heap_freeze_page xlrec;
 	XLogRecPtr	recptr;
 	XLogRecData rdata[2];
 
 	/* Caller should not call me on a non-WAL-logged relation */
 	Assert(RelationNeedsWAL(reln));
 	/* nor when there are no tuples to freeze */
-	Assert(offcnt > 0);
+	Assert(ntuples > 0);
 
 	xlrec.node = reln->rd_node;
 	xlrec.block = BufferGetBlockNumber(buffer);
 	xlrec.cutoff_xid = cutoff_xid;
-	xlrec.cutoff_multi = cutoff_multi;
+	xlrec.ntuples = ntuples;
 
 	rdata[0].data = (char *) &xlrec;
-	rdata[0].len = SizeOfHeapFreeze;
+	rdata[0].len = SizeOfHeapFreezePage;
 	rdata[0].buffer = InvalidBuffer;
 	rdata[0].next = &(rdata[1]);
 
 	/*
-	 * The tuple-offsets array is not actually in the buffer, but pretend that
-	 * it is.  When XLogInsert stores the whole buffer, the offsets array need
+	 * The freeze plan array is not actually in the buffer, but pretend that
+	 * it is.  When XLogInsert stores the whole buffer, the freeze plans need
 	 * not be stored too.
	 */
-	rdata[1].data = (char *) offsets;
-	rdata[1].len = offcnt * sizeof(OffsetNumber);
+	rdata[1].data = (char *) tuples;
+	rdata[1].len = ntuples * sizeof(xl_heap_freeze_tuple);
 	rdata[1].buffer = buffer;
 	rdata[1].buffer_std = true;
 	rdata[1].next = NULL;
 
-	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata);
+	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE, rdata);
 
 	return recptr;
 }
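
A sizing note on the record just assembled (a sketch, ignoring the headers
and alignment that XLogInsert adds): because rdata[1] is attached to the
buffer, the plan array is dropped whenever a full-page image is taken, so
the extra WAL paid for the per-tuple plans is bounded by

	Size		payload = SizeOfHeapFreezePage +
		ntuples * sizeof(xl_heap_freeze_tuple);

and disappears entirely in the full-page-image case.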
@@ -6848,64 +7141,6 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
 		XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace);
 }
 
-static void
-heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
-	TransactionId cutoff_xid = xlrec->cutoff_xid;
-	MultiXactId cutoff_multi = xlrec->cutoff_multi;
-	Buffer		buffer;
-	Page		page;
-
-	/*
-	 * In Hot Standby mode, ensure that there's no queries running which still
-	 * consider the frozen xids as running.
-	 */
-	if (InHotStandby)
-		ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node);
-
-	/* If we have a full-page image, restore it and we're done */
-	if (record->xl_info & XLR_BKP_BLOCK(0))
-	{
-		(void) RestoreBackupBlock(lsn, record, 0, false, false);
-		return;
-	}
-
-	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
-	if (!BufferIsValid(buffer))
-		return;
-	page = (Page) BufferGetPage(buffer);
-
-	if (lsn <= PageGetLSN(page))
-	{
-		UnlockReleaseBuffer(buffer);
-		return;
-	}
-
-	if (record->xl_len > SizeOfHeapFreeze)
-	{
-		OffsetNumber *offsets;
-		OffsetNumber *offsets_end;
-
-		offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapFreeze);
-		offsets_end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
-
-		while (offsets < offsets_end)
-		{
-			/* offsets[] entries are one-based */
-			ItemId		lp = PageGetItemId(page, *offsets);
-			HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);
-
-			(void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi);
-			offsets++;
-		}
-	}
-
-	PageSetLSN(page, lsn);
-	MarkBufferDirty(buffer);
-	UnlockReleaseBuffer(buffer);
-}
-
 /*
  * Replay XLOG_HEAP2_VISIBLE record.
  *
@@ -7020,6 +7255,63 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
 	}
 }
 
+/*
+ * Replay XLOG_HEAP2_FREEZE_PAGE records
+ */
+static void
+heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
+	TransactionId cutoff_xid = xlrec->cutoff_xid;
+	Buffer		buffer;
+	Page		page;
+	int			ntup;
+
+	/*
+	 * In Hot Standby mode, ensure that there are no queries running which
+	 * still consider the frozen xids as running.
+	 */
+	if (InHotStandby)
+		ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node);
+
+	/* If we have a full-page image, restore it and we're done */
+	if (record->xl_info & XLR_BKP_BLOCK(0))
+	{
+		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+		return;
+	}
+
+	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+	if (!BufferIsValid(buffer))
+		return;
+
+	page = (Page) BufferGetPage(buffer);
+
+	if (lsn <= PageGetLSN(page))
+	{
+		UnlockReleaseBuffer(buffer);
+		return;
+	}
+
+	/* now execute freeze plan for each frozen tuple */
+	for (ntup = 0; ntup < xlrec->ntuples; ntup++)
+	{
+		xl_heap_freeze_tuple *xlrec_tp;
+		ItemId		lp;
+		HeapTupleHeader tuple;
+
+		xlrec_tp = &xlrec->tuples[ntup];
+		lp = PageGetItemId(page, xlrec_tp->offset);	/* offsets are one-based */
+		tuple = (HeapTupleHeader) PageGetItem(page, lp);
+
+		heap_execute_freeze_tuple(tuple, xlrec_tp);
+	}
+
+	PageSetLSN(page, lsn);
+	MarkBufferDirty(buffer);
+	UnlockReleaseBuffer(buffer);
+}
+
 static void
 heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -7883,12 +8175,12 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
 
 	switch (info & XLOG_HEAP_OPMASK)
 	{
-		case XLOG_HEAP2_FREEZE:
-			heap_xlog_freeze(lsn, record);
-			break;
 		case XLOG_HEAP2_CLEAN:
 			heap_xlog_clean(lsn, record);
 			break;
+		case XLOG_HEAP2_FREEZE_PAGE:
+			heap_xlog_freeze_page(lsn, record);
+			break;
 		case XLOG_HEAP2_CLEANUP_INFO:
 			heap_xlog_cleanup_info(lsn, record);
 			break;
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 39c53d0022..4a86b8527d 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -131,16 +131,7 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
 	uint8		info = xl_info & ~XLR_INFO_MASK;
 
 	info &= XLOG_HEAP_OPMASK;
-	if (info == XLOG_HEAP2_FREEZE)
-	{
-		xl_heap_freeze *xlrec = (xl_heap_freeze *) rec;
-
-		appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff xid %u multi %u",
-						 xlrec->node.spcNode, xlrec->node.dbNode,
-						 xlrec->node.relNode, xlrec->block,
-						 xlrec->cutoff_xid, xlrec->cutoff_multi);
-	}
-	else if (info == XLOG_HEAP2_CLEAN)
+	if (info == XLOG_HEAP2_CLEAN)
 	{
 		xl_heap_clean *xlrec = (xl_heap_clean *) rec;
 
@@ -149,6 +140,15 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
 						 xlrec->node.relNode, xlrec->block,
 						 xlrec->latestRemovedXid);
 	}
+	else if (info == XLOG_HEAP2_FREEZE_PAGE)
+	{
+		xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec;
+
+		appendStringInfo(buf, "freeze_page: rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u",
+						 xlrec->node.spcNode, xlrec->node.dbNode,
+						 xlrec->node.relNode, xlrec->block,
+						 xlrec->cutoff_xid, xlrec->ntuples);
+	}
 	else if (info == XLOG_HEAP2_CLEANUP_INFO)
 	{
 		xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
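
With illustrative values, the new description line comes out as, e.g.:

	freeze_page: rel 1663/16384/16385; blk 42; cutoff xid 1234 ntuples 7

The per-record cutoff multi is gone because any multi has already been
resolved into each tuple's freeze plan when the record was built; cutoff_xid
is still carried for standby conflict resolution.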
"freeze: rel %u/%u/%u; blk %u; cutoff xid %u multi %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, - xlrec->cutoff_xid, xlrec->cutoff_multi); - } - else if (info == XLOG_HEAP2_CLEAN) + if (info == XLOG_HEAP2_CLEAN) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; @@ -149,6 +140,15 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->latestRemovedXid); } + else if (info == XLOG_HEAP2_FREEZE_PAGE) + { + xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec; + + appendStringInfo(buf, "freeze_page: rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block, + xlrec->cutoff_xid, xlrec->ntuples); + } else if (info == XLOG_HEAP2_CLEANUP_INFO) { xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 05e1dcb49c..55a8ca7ac4 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -289,7 +289,6 @@ static MemoryContext MXactContext = NULL; /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); -static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, MultiXactMember *members); static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); @@ -336,6 +335,9 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + /* * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs * are still running. In typical usage, xid2 will be our own XID and the @@ -347,7 +349,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, members[1].xid = xid2; members[1].status = status2; - newMulti = CreateMultiXactId(2, members); + newMulti = MultiXactIdCreateFromMembers(2, members); debug_elog3(DEBUG2, "Create: %s", mxid_to_string(newMulti, 2, members)); @@ -387,6 +389,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) AssertArg(MultiXactIdIsValid(multi)); AssertArg(TransactionIdIsValid(xid)); + /* MultiXactIdSetOldestMember() must have been called already. 
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 28e98e8b48..8dd3de5e8e 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -424,6 +424,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	Buffer		vmbuffer = InvalidBuffer;
 	BlockNumber next_not_all_visible_block;
 	bool		skipping_all_visible_blocks;
+	xl_heap_freeze_tuple *frozen;
 
 	pg_rusage_init(&ru0);
@@ -446,6 +447,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	vacrelstats->latestRemovedXid = InvalidTransactionId;
 
 	lazy_space_alloc(vacrelstats, nblocks);
+	frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
 
 	/*
 	 * We want to skip pages that don't require vacuuming according to the
@@ -500,7 +502,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		bool		tupgone,
 					hastup;
 		int			prev_dead_count;
-		OffsetNumber frozen[MaxOffsetNumber];
 		int			nfrozen;
 		Size		freespace;
 		bool		all_visible_according_to_vm;
@@ -890,9 +891,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 				 * Each non-removable tuple must be checked to see if it needs
 				 * freezing.  Note we already have exclusive buffer lock.
 				 */
-				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
-									  MultiXactCutoff))
-					frozen[nfrozen++] = offnum;
+				if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
+											  MultiXactCutoff, &frozen[nfrozen]))
+					frozen[nfrozen++].offset = offnum;
 			}
 		}						/* scan along page */
@@ -903,15 +904,33 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		 */
 		if (nfrozen > 0)
 		{
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buf);
+
+			/* execute collected freezes */
+			for (i = 0; i < nfrozen; i++)
+			{
+				ItemId		itemid;
+				HeapTupleHeader htup;
+
+				itemid = PageGetItemId(page, frozen[i].offset);
+				htup = (HeapTupleHeader) PageGetItem(page, itemid);
+
+				heap_execute_freeze_tuple(htup, &frozen[i]);
+			}
+
+			/* Now WAL-log freezing if necessary */
 			if (RelationNeedsWAL(onerel))
 			{
 				XLogRecPtr	recptr;
 
 				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
-										 MultiXactCutoff, frozen, nfrozen);
+										 frozen, nfrozen);
 				PageSetLSN(page, recptr);
 			}
+
+			END_CRIT_SECTION();
 		}
 
 		/*
@@ -1012,6 +1031,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			RecordPageWithFreeSpace(onerel, blkno, freespace);
 	}
 
+	pfree(frozen);
+
 	/* save stats for use later */
 	vacrelstats->scanned_tuples = num_tuples;
 	vacrelstats->tuples_deleted = tups_vacuumed;
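
Two orientation notes on the new shape of this code. The freezes are
executed and WAL-logged inside a single critical section, so a crash cannot
leave the page modified without the record, or vice versa. And the plan
array becomes one palloc per heap scan instead of a per-page stack array; a
rough sketch of the arithmetic, assuming default 8 kB pages:

	/* 291 tuples/page max * ~12-byte plans: roughly 3.5 kB, once per scan */
	frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);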
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 438e79db48..4062b422a7 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -48,9 +48,9 @@
 * the ones above associated with RM_HEAP_ID.  XLOG_HEAP_OPMASK applies to
 * these, too.
 */
-#define XLOG_HEAP2_FREEZE		0x00
+/* 0x00 is free, was XLOG_HEAP2_FREEZE */
 #define XLOG_HEAP2_CLEAN		0x10
-/* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */
+#define XLOG_HEAP2_FREEZE_PAGE	0x20
 #define XLOG_HEAP2_CLEANUP_INFO 0x30
 #define XLOG_HEAP2_VISIBLE		0x40
 #define XLOG_HEAP2_MULTI_INSERT 0x50
@@ -270,17 +270,36 @@ typedef struct xl_heap_inplace
 
 #define SizeOfHeapInplace	(offsetof(xl_heap_inplace, target) + SizeOfHeapTid)
 
-/* This is what we need to know about tuple freezing during vacuum */
-typedef struct xl_heap_freeze
+/*
+ * This struct represents a 'freeze plan', which is what we need to know about
+ * a single tuple being frozen during vacuum.
+ */
+#define XLH_FREEZE_XMIN		0x01
+#define XLH_FREEZE_XVAC		0x02
+#define XLH_INVALID_XVAC	0x04
+
+typedef struct xl_heap_freeze_tuple
+{
+	TransactionId xmax;
+	OffsetNumber offset;
+	uint16		t_infomask2;
+	uint16		t_infomask;
+	uint8		frzflags;
+} xl_heap_freeze_tuple;
+
+/*
+ * This is what we need to know about a block being frozen during vacuum
+ */
+typedef struct xl_heap_freeze_page
 {
 	RelFileNode node;
 	BlockNumber block;
 	TransactionId cutoff_xid;
-	MultiXactId cutoff_multi;
-	/* TUPLE OFFSET NUMBERS FOLLOW AT THE END */
-} xl_heap_freeze;
+	uint16		ntuples;
+	xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER];
+} xl_heap_freeze_page;
 
-#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId))
+#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples)
 
 /* This is what we need to know about setting a visibility map bit */
 typedef struct xl_heap_visible
@@ -331,8 +350,14 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
 				OffsetNumber *nowunused, int nunused,
 				TransactionId latestRemovedXid);
 extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
-				TransactionId cutoff_xid, MultiXactId cutoff_multi,
-				OffsetNumber *offsets, int offcnt);
+				TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
+				int ntuples);
+extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
+						  TransactionId cutoff_xid,
+						  TransactionId cutoff_multi,
+						  xl_heap_freeze_tuple *frz);
+extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
+						  xl_heap_freeze_tuple *xlrec_tp);
 extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
 				Buffer vm_buffer, TransactionId cutoff_xid);
 extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
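
For scale: the five fields of a freeze plan occupy 11 bytes and the struct
rounds up to 12 on typical ABIs, something a compile-time check could pin
down (a sketch, not in the patch):

	StaticAssertStmt(sizeof(xl_heap_freeze_tuple) == 12,
					 "xl_heap_freeze_tuple is expected to pack to 12 bytes");

so even a full page's worth of plans (ntuples up to MaxHeapTuplesPerPage)
costs a few kilobytes of WAL, and the buffer-attached rdata entry elides the
array entirely whenever a full-page image is logged anyway.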
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 6085ea3ec1..0e3b273b9e 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -81,6 +81,9 @@ extern MultiXactId MultiXactIdCreate(TransactionId xid1,
 					MultiXactStatus status2);
 extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
 				  MultiXactStatus status);
+extern MultiXactId MultiXactIdCreateFromMembers(int nmembers,
+							MultiXactMember *members);
+
 extern MultiXactId ReadNextMultiXactId(void);
 extern bool MultiXactIdIsRunning(MultiXactId multi);
 extern void MultiXactIdSetOldestMember(void);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index c78a2fbfae..d0022b3751 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -55,7 +55,7 @@ typedef struct BkpBlock
 /*
 * Each page of XLOG file has a header like this:
 */
-#define XLOG_PAGE_MAGIC 0xD079	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD07A	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {