diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 37d044be98..ae8f68d948 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2323,16 +2323,19 @@ include_dir 'conf.d' levels. This parameter can only be set at server start. - In minimal level, WAL-logging of some bulk - operations can be safely skipped, which can make those - operations much faster (see ). - Operations in which this optimization can be applied include: + In minimal level, no information is logged for + permanent relations for the remainder of a transaction that creates or + rewrites them. This can make operations much faster (see + ). Operations that initiate this + optimization include: - CREATE TABLE AS - CREATE INDEX + ALTER ... SET TABLESPACE CLUSTER - COPY into tables that were created or truncated in the same - transaction + CREATE TABLE + REFRESH MATERIALIZED VIEW + (without CONCURRENTLY) + REINDEX + TRUNCATE But minimal WAL does not contain enough information to reconstruct the data from a base backup and the WAL logs, so replica or @@ -2721,6 +2724,26 @@ include_dir 'conf.d' + + wal_skip_threshold (integer) + + wal_skip_threshold configuration parameter + + + + + When wal_level is minimal and a + transaction commits after creating or rewriting a permanent relation, + this setting determines how to persist the new data. If the data is + smaller than this setting, write it to the WAL log; otherwise, use an + fsync of affected files. Depending on the properties of your storage, + raising or lowering this value might help if such commits are slowing + concurrent transactions. The default is two megabytes + (2MB). + + + + commit_delay (integer) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 992486eabe..83fa63071f 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1536,8 +1536,8 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; needs to be written, because in case of an error, the files containing the newly loaded data will be removed anyway. However, this consideration only applies when - is minimal for - non-partitioned tables as all commands must write WAL otherwise. + is minimal + as all commands must write WAL otherwise. @@ -1637,42 +1637,13 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; - Aside from avoiding the time for the archiver or WAL sender to - process the WAL data, - doing this will actually make certain commands faster, because they - are designed not to write WAL at all if wal_level - is minimal. (They can guarantee crash safety more cheaply - by doing an fsync at the end than by writing WAL.) - This applies to the following commands: - - - - CREATE TABLE AS SELECT - - - - - CREATE INDEX (and variants such as - ALTER TABLE ADD PRIMARY KEY) - - - - - ALTER TABLE SET TABLESPACE - - - - - CLUSTER - - - - - COPY FROM, when the target table has been - created or truncated earlier in the same transaction - - - + Aside from avoiding the time for the archiver or WAL sender to process the + WAL data, doing this will actually make certain commands faster, because + they do not write WAL at all if wal_level + is minimal and the current subtransaction (or top-level + transaction) created or truncated the table or index they change. (They + can guarantee crash safety more cheaply by doing + an fsync at the end than by writing WAL.)
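To make the commit-time rule in the wal_skip_threshold documentation above concrete, here is a standalone sketch of the decision smgrDoPendingSyncs() applies later in this patch. This is an editor's model, not code from the patch; choose_wal_at_commit and its parameter names are illustrative, and BLCKSZ is assumed to be the default 8kB:

#include <stdbool.h>
#include <stdint.h>

#define BLCKSZ 8192				/* assumed default block size */

/*
 * Decide how to persist a WAL-skipped relfilenode at commit: data smaller
 * than wal_skip_threshold (in kilobytes) is written to WAL; larger data,
 * or any file that has been truncated, is fsync'd instead.
 */
static bool
choose_wal_at_commit(uint64_t total_blocks, int wal_skip_threshold_kb,
					 bool experienced_truncation)
{
	if (experienced_truncation)
		return false;			/* always fsync after a truncation */

	return total_blocks * BLCKSZ / 1024 < (uint64_t) wal_skip_threshold_kb;
}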
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index b9c4e27e1a..46d150f9d8 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -191,7 +191,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) PageSetLSN(page, recptr); } else - PageSetLSN(page, gistGetFakeLSN(heap)); + PageSetLSN(page, gistGetFakeLSN(index)); UnlockReleaseBuffer(buffer); diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 55cccd247a..d17965aa4b 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -972,23 +972,44 @@ gistproperty(Oid index_oid, int attno, } /* - * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs - * to detect concurrent page splits anyway. This function provides a fake - * sequence of LSNs for that purpose. + * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page + * splits anyway. This function provides a fake sequence of LSNs for that + * purpose. */ XLogRecPtr gistGetFakeLSN(Relation rel) { - static XLogRecPtr counter = 1; - if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) { /* * Temporary relations are only accessible in our session, so a simple * backend-local counter will do. */ + static XLogRecPtr counter = 1; + return counter++; } + else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + { + /* + * WAL-logging on this relation will start after commit, so its LSNs + * must be distinct numbers smaller than the LSN at the next commit. + * Emit a dummy WAL record if insert-LSN hasn't advanced after the + * last call. + */ + static XLogRecPtr lastlsn = InvalidXLogRecPtr; + XLogRecPtr currlsn = GetXLogInsertRecPtr(); + + /* Shouldn't be called for WAL-logging relations */ + Assert(!RelationNeedsWAL(rel)); + + /* No need for an actual record if we already have a distinct LSN */ + if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn) + currlsn = gistXLogAssignLSN(); + + lastlsn = currlsn; + return currlsn; + } else { /* diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 17e213967b..93e6468213 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -505,6 +505,9 @@ gist_redo(XLogReaderState *record) case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(record); break; + case XLOG_GIST_ASSIGN_LSN: + /* nop. See gistGetFakeLSN(). */ + break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -623,6 +626,24 @@ gistXLogSplit(bool page_is_leaf, return recptr; } +/* + * Write an empty XLOG record to assign a distinct LSN. + */ +XLogRecPtr +gistXLogAssignLSN(void) +{ + int dummy = 0; + + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN); +} + /* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page. 
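The gistGetFakeLSN() change above must hand out strictly increasing LSNs for a WAL-skipped permanent index while emitting as little WAL as possible. A standalone model of that pattern follows (editor's sketch; current_insert_lsn and emit_dummy_record stand in for GetXLogInsertRecPtr() and gistXLogAssignLSN() and are not backend symbols):

#include <stdint.h>

typedef uint64_t FakeLSN;

static FakeLSN insert_pos = 100;	/* pretend WAL insert position */

static FakeLSN current_insert_lsn(void) { return insert_pos; }
static FakeLSN emit_dummy_record(void) { return ++insert_pos; }

/*
 * Return an LSN distinct from the previous call's result, writing a dummy
 * record only when the insert position has not advanced by itself.
 */
static FakeLSN
next_fake_lsn(void)
{
	static FakeLSN lastlsn = 0;	/* analogue of InvalidXLogRecPtr */
	FakeLSN		currlsn = current_insert_lsn();

	if (lastlsn != 0 && lastlsn == currlsn)
		currlsn = emit_dummy_record();

	lastlsn = currlsn;
	return currlsn;
}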
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 8ebf86f687..af16d2d280 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -27,7 +27,6 @@ * heap_multi_insert - insert multiple tuples into a relation * heap_delete - delete a tuple from a relation * heap_update - replace a tuple in a relation with another tuple - * heap_sync - sync heap, for when no WAL has been written * * NOTES * This file contains the heap_ routines which implement @@ -2396,12 +2395,6 @@ ReleaseBulkInsertStatePin(BulkInsertState bistate) * The new tuple is stamped with current transaction ID and the specified * command ID. * - * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not - * logged in WAL, even for a non-temp relation. Safe usage of this behavior - * requires that we arrange that all new tuples go into new pages not - * containing any tuples from other transactions, and that the relation gets - * fsync'd before commit. (See also heap_sync() comments) - * * The HEAP_INSERT_SKIP_FSM option is passed directly to * RelationGetBufferForTuple, which see for more info. * @@ -2510,7 +2503,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) + if (RelationNeedsWAL(relation)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2720,7 +2713,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ AssertArg(!(options & HEAP_INSERT_NO_LOGICAL)); - needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); + needwal = RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -9420,18 +9413,13 @@ heap2_redo(XLogReaderState *record) } /* - * heap_sync - sync a heap, for use when no WAL has been written + * heap_sync - for binary compatibility * - * This forces the heap contents (including TOAST heap if any) down to disk. - * If we skipped using WAL, and WAL is otherwise needed, we must force the - * relation down to disk before it's safe to commit the transaction. This - * requires writing out any dirty buffers and then doing a forced fsync. - * - * Indexes are not touched. (Currently, index operations associated with - * the commands that use this are WAL-logged and so do not need fsync. - * That behavior might change someday, but in any case it's likely that - * any fsync decisions required would be per-index and hence not appropriate - * to be done here.) + * A newer PostgreSQL version removes this function. It exists here just in + * case an extension calls it. See "Skipping WAL for New RelFileNode" in + * src/backend/access/transam/README for the system that superseded it, + * allowing removal of most calls. Cases like copy_relation_data() should + * call smgrimmedsync() directly. */ void heap_sync(Relation rel) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 9f0b586b5b..dcb4d6877f 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -145,7 +145,6 @@ typedef struct RewriteStateData Page rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ bool rs_buffer_valid; /* T if any tuples in buffer */ - bool rs_use_wal; /* must we WAL-log inserts? 
*/ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine * tuple visibility */ @@ -239,15 +238,13 @@ static void logical_end_heap_rewrite(RewriteState state); * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen * min_multi multixact before which multis will be removed - * use_wal should the inserts to the new heap be WAL-logged? * * Returns an opaque RewriteState, allocated in current memory context, * to be used in subsequent calls to the other functions. */ RewriteState begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, - TransactionId freeze_xid, MultiXactId cutoff_multi, - bool use_wal) + TransactionId freeze_xid, MultiXactId cutoff_multi) { RewriteState state; MemoryContext rw_cxt; @@ -272,7 +269,6 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; - state->rs_use_wal = use_wal; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; @@ -331,7 +327,7 @@ end_heap_rewrite(RewriteState state) /* Write the last page, if any */ if (state->rs_buffer_valid) { - if (state->rs_use_wal) + if (RelationNeedsWAL(state->rs_new_rel)) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, @@ -346,18 +342,14 @@ end_heap_rewrite(RewriteState state) } /* - * If the rel is WAL-logged, must fsync before commit. We use heap_sync - * to ensure that the toast table gets fsync'd too. - * - * It's obvious that we must do this when not WAL-logging. It's less - * obvious that we have to do it even if we did WAL-log the pages. The + * When we WAL-logged rel pages, we must nonetheless fsync them. The * reason is the same as in tablecmds.c's copy_relation_data(): we're * writing data that's not in shared buffers, and so a CHECKPOINT * occurring during the rewriteheap operation won't have fsync'd data we * wrote before the checkpoint. */ if (RelationNeedsWAL(state->rs_new_rel)) - heap_sync(state->rs_new_rel); + smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM); logical_end_heap_rewrite(state); @@ -655,9 +647,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup) { int options = HEAP_INSERT_SKIP_FSM; - if (!state->rs_use_wal) - options |= HEAP_INSERT_SKIP_WAL; - /* * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data * for the TOAST table are not logically decoded. The main heap is @@ -696,7 +685,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Doesn't fit, so write out the existing page */ /* XLOG stuff */ - if (state->rs_use_wal) + if (RelationNeedsWAL(state->rs_new_rel)) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index dab41ea298..58c702d7a0 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -31,18 +31,6 @@ * them. They will need to be re-read into shared buffers on first use after * the build finishes. * - * Since the index will never be used unless it is completely built, - * from a crash-recovery point of view there is no need to WAL-log the - * steps of the build. After completing the index build, we can just sync - * the whole file to disk using smgrimmedsync() before exiting this module. 
- * This can be seen to be sufficient for crash recovery by considering that - * it's effectively equivalent to what would happen if a CHECKPOINT occurred - * just after the index build. However, it is clearly not sufficient if the - * DBA is using the WAL log for PITR or replication purposes, since another - * machine would not be able to reconstruct the index from WAL. Therefore, - * we log the completed index pages to WAL if and only if WAL archiving is - * active. - * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. * @@ -530,12 +518,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; - - /* - * We need to log index creation in WAL iff WAL archiving/streaming is - * enabled UNLESS the index isn't WAL-logged anyway. - */ - wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); + wstate.btws_use_wal = RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; @@ -1190,21 +1173,15 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * If the index is WAL-logged, we must fsync it down to disk before it's - * safe to commit the transaction. (For a non-WAL-logged index we don't - * care since the index will be uninteresting after a crash anyway.) - * - * It's obvious that we must do this when not WAL-logging the build. It's - * less obvious that we have to do it even if we did WAL-log the index - * pages. The reason is that since we're building outside shared buffers, - * a CHECKPOINT occurring during the build has no way to flush the - * previously written data to disk (indeed it won't know the index even - * exists). A crash later on would replay WAL from the checkpoint, - * therefore it wouldn't replay our earlier WAL entries. If we do not - * fsync those pages here, they might still not be on disk when the crash - * occurs. + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. */ - if (RelationNeedsWAL(wstate->index)) + if (wstate->btws_use_wal) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index e5e925e0c5..8c44925442 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -46,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record) break; case XLOG_GIST_CREATE_INDEX: break; + case XLOG_GIST_ASSIGN_LSN: + /* No details to write out */ + break; } } @@ -65,6 +68,9 @@ gist_identify(uint8 info) case XLOG_GIST_CREATE_INDEX: id = "CREATE_INDEX"; break; + case XLOG_GIST_ASSIGN_LSN: + id = "ASSIGN_LSN"; + break; } return id; diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index ad4083eb6b..28045f3087 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -717,6 +717,38 @@ then restart recovery. 
This is part of the reason for not writing a WAL entry until we've successfully done the original action. +Skipping WAL for New RelFileNode +-------------------------------- + +Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK +would unlink, in-tree access methods write no WAL for that change. Code that +writes WAL without calling RelationNeedsWAL() must check for this case. This +skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change +for the same block, REDO could overwrite the WAL-skipping change. If a +WAL-writing change followed a WAL-skipping change for the same block, a +related problem would arise. When a WAL record contains no full-page image, +REDO expects the page to match its contents from just before record insertion. +A WAL-skipping change may not reach disk at all, violating REDO's expectation +under full_page_writes=off. For any access method, CommitTransaction() writes +and fsyncs affected blocks before recording the commit. + +Prefer to do the same in future access methods. However, two other approaches +can work. First, an access method can irreversibly transition a given fork +from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and +smgrimmedsync(). Second, an access method can opt to write WAL +unconditionally for permanent relations. Under these approaches, the access +method callbacks must not call functions that react to RelationNeedsWAL(). + +This applies only to WAL records whose replay would modify bytes stored in the +new relfilenode. It does not apply to other records about the relfilenode, +such as XLOG_SMGR_CREATE. Because it operates at the level of individual +relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations. +Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which +ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while +the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table +to skip WAL, but that won't affect its indexes. + + Asynchronous Commit ------------------- @@ -820,13 +852,12 @@ Changes to a temp table are not WAL-logged, hence could reach disk in advance of T1's commit, but we don't care since temp table contents don't survive crashes anyway. -Database writes made via any of the paths we have introduced to avoid WAL -overhead for bulk updates are also safe. In these cases it's entirely -possible for the data to reach disk before T1's commit, because T1 will -fsync it down to disk without any sort of interlock, as soon as it finishes -the bulk update. However, all these paths are designed to write data that -no other transaction can see until after T1 commits. The situation is thus -not different from ordinary WAL-logged updates. +Database writes that skip WAL for new relfilenodes are also safe. In these +cases it's entirely possible for the data to reach disk before T1's commit, +because T1 will fsync it down to disk without any sort of interlock. However, +all these paths are designed to write data that no other transaction can see +until after T1 commits. The situation is thus not different from ordinary +WAL-logged updates. 
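For authors of future access methods, the rule in the new "Skipping WAL for New RelFileNode" section above reduces to the following shape in a page-write path. This is a hedged sketch assuming backend headers and a full-page-image record; am_write_page is a hypothetical function, not part of this patch:

static void
am_write_page(Relation rel, Buffer buf)
{
	/* buf is pinned, exclusively locked, and initialized by the caller */
	START_CRIT_SECTION();
	MarkBufferDirty(buf);

	if (RelationNeedsWAL(rel))
		log_newpage_buffer(buf, true);	/* also stamps the page LSN */

	END_CRIT_SECTION();

	/*
	 * When RelationNeedsWAL() is false we write no WAL at all;
	 * smgrDoPendingSyncs() durably persists this relfilenode before the
	 * creating transaction commits.
	 */
}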
Transaction Emulation during Recovery ------------------------------------- diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 7c1771eae7..134c95c9be 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2035,6 +2035,13 @@ CommitTransaction(void) */ PreCommit_on_commit_actions(); + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before AtEOXact_RelationMap(), so that we + * don't see committed-but-broken files after a crash. + */ + smgrDoPendingSyncs(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2264,6 +2271,13 @@ PrepareTransaction(void) */ PreCommit_on_commit_actions(); + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before EndPrepare(), so that we don't see + * committed-but-broken files after a crash and COMMIT PREPARED. + */ + smgrDoPendingSyncs(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2588,6 +2602,7 @@ AbortTransaction(void) */ AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); + smgrDoPendingSyncs(false); AtEOXact_LargeObject(false); AtAbort_Notify(); AtEOXact_RelationMap(false); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 4ecdc9220f..e18abdfe52 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -544,6 +544,8 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry; * fields related to physical storage, like rd_rel, are initialized, so the * fake entry is only usable in low-level operations like ReadBuffer(). * + * This is also used for syncing WAL-skipped files. + * * Caller must free the returned entry with FreeFakeRelcacheEntry(). */ Relation @@ -552,18 +554,20 @@ CreateFakeRelcacheEntry(RelFileNode rnode) FakeRelCacheEntry fakeentry; Relation rel; - Assert(InRecovery); - /* Allocate the Relation struct and all related space in one block. */ fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); rel = (Relation) fakeentry; rel->rd_rel = &fakeentry->pgc; rel->rd_node = rnode; - /* We will never be working with temp rels during recovery */ + + /* + * We will never be working with temp rels during recovery or while + * syncing WAL-skipped files. + */ rel->rd_backend = InvalidBackendId; - /* It must be a permanent table if we're in recovery. */ + /* It must be a permanent table here */ rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; /* We don't know the name of the relation; use relfilenode instead */ @@ -572,9 +576,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode) /* * We set up the lockRelId in case anything tries to lock the dummy * relation. Note that this is fairly bogus since relNode may be - * different from the relation's OID. It shouldn't really matter though, - * since we are presumably running by ourselves and can't have any lock - * conflicts ... + * different from the relation's OID. It shouldn't really matter though. + * In recovery, we are running by ourselves and can't have any lock + * conflicts. While syncing, we already hold AccessExclusiveLock. 
*/ rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; rel->rd_lockInfo.lockRelId.relId = rnode.relNode; diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 4c72989cc2..f6015408e8 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -304,6 +304,8 @@ Boot_DeclareIndexStmt: stmt->idxcomment = NULL; stmt->indexOid = InvalidOid; stmt->oldNode = InvalidOid; + stmt->oldCreateSubid = InvalidSubTransactionId; + stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId; stmt->unique = false; stmt->primary = false; stmt->isconstraint = false; @@ -353,6 +355,8 @@ Boot_DeclareUniqueIndexStmt: stmt->idxcomment = NULL; stmt->indexOid = InvalidOid; stmt->oldNode = InvalidOid; + stmt->oldCreateSubid = InvalidSubTransactionId; + stmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId; stmt->unique = true; stmt->primary = false; stmt->isconstraint = false; diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 5df4382b7e..d828b07a2e 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -28,9 +28,13 @@ #include "catalog/storage_xlog.h" #include "storage/freespace.h" #include "storage/smgr.h" +#include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/rel.h" +/* GUC variables */ +int wal_skip_threshold = 2048; /* in kilobytes */ + /* * We keep a list of all relations (represented as RelFileNode values) * that have been created or deleted in the current transaction. When @@ -60,7 +64,14 @@ typedef struct PendingRelDelete struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; +typedef struct pendingSync +{ + RelFileNode rnode; + bool is_truncated; /* Has the file experienced truncation? */ +} pendingSync; + static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ +HTAB *pendingSyncHash = NULL; /* * RelationCreateStorage @@ -115,6 +126,32 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) pending->nestLevel = GetCurrentTransactionNestLevel(); pending->next = pendingDeletes; pendingDeletes = pending; + + /* Queue an at-commit sync. */ + if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded()) + { + pendingSync *pending; + bool found; + + /* we sync only permanent relations */ + Assert(backend == InvalidBackendId); + + if (!pendingSyncHash) + { + HASHCTL ctl; + + ctl.keysize = sizeof(RelFileNode); + ctl.entrysize = sizeof(pendingSync); + ctl.hcxt = TopTransactionContext; + pendingSyncHash = + hash_create("pending sync hash", + 16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } + + pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found); + Assert(!found); + pending->is_truncated = false; + } } /* @@ -248,6 +285,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (vm) visibilitymap_truncate(rel, nblocks); + RelationPreTruncate(rel); + /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay @@ -290,6 +329,49 @@ RelationTruncate(Relation rel, BlockNumber nblocks) smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks); } +/* + * RelationPreTruncate + * Perform AM-independent work before a physical truncation. + * + * If an access method's relation_nontransactional_truncate does not call + * RelationTruncate(), it must call this before decreasing the table size. 
+ */ +void +RelationPreTruncate(Relation rel) +{ + pendingSync *pending; + + if (!pendingSyncHash) + return; + RelationOpenSmgr(rel); + + pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node), + HASH_FIND, NULL); + if (pending) + pending->is_truncated = true; +} + +/* + * RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL + * + * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for + * New RelFileNode" in src/backend/access/transam/README. Though this is + * known efficiently from a Relation, this function is intended for code + * paths that have no Relation at hand. + */ +bool +RelFileNodeSkippingWAL(RelFileNode rnode) +{ + if (XLogIsNeeded()) + return false; /* no permanent relfilenode skips WAL */ + + if (!pendingSyncHash || + hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL) + return false; + + return true; +} + /* * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact. * @@ -367,6 +449,144 @@ smgrDoPendingDeletes(bool isCommit) } } +/* + * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact. + */ +void +smgrDoPendingSyncs(bool isCommit) +{ + PendingRelDelete *pending; + int nrels = 0, + maxrels = 0; + SMgrRelation *srels = NULL; + HASH_SEQ_STATUS scan; + pendingSync *pendingsync; + + if (XLogIsNeeded()) + return; /* no relation can use this */ + + Assert(GetCurrentTransactionNestLevel() == 1); + + if (!pendingSyncHash) + return; /* no relation needs sync */ + + /* At rollback, just throw away any pending syncs */ + if (!isCommit) + { + pendingSyncHash = NULL; + return; + } + + AssertPendingSyncs_RelationCache(); + + /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */ + for (pending = pendingDeletes; pending != NULL; pending = pending->next) + { + if (!pending->atCommit) + continue; + + (void) hash_search(pendingSyncHash, (void *) &pending->relnode, + HASH_REMOVE, NULL); + } + + hash_seq_init(&scan, pendingSyncHash); + while ((pendingsync = (pendingSync *) hash_seq_search(&scan))) + { + ForkNumber fork; + BlockNumber nblocks[MAX_FORKNUM + 1]; + BlockNumber total_blocks = 0; + SMgrRelation srel; + + srel = smgropen(pendingsync->rnode, InvalidBackendId); + + /* + * We emit newpage WAL records for smaller relations. + * + * Small WAL records have a chance to be emitted along with other + * backends' WAL records. We emit WAL records instead of syncing for + * files that are smaller than a certain threshold, expecting faster + * commit. The threshold is defined by the GUC wal_skip_threshold. + */ + if (!pendingsync->is_truncated) + { + for (fork = 0; fork <= MAX_FORKNUM; fork++) + { + if (smgrexists(srel, fork)) + { + BlockNumber n = smgrnblocks(srel, fork); + + /* we shouldn't come here for unlogged relations */ + Assert(fork != INIT_FORKNUM); + nblocks[fork] = n; + total_blocks += n; + } + else + nblocks[fork] = InvalidBlockNumber; + } + } + + /* + * Sync file or emit WAL records for its contents. + * + * Although we emit a WAL record if the file is small enough, do a + * file sync regardless of size if the file has experienced a + * truncation. Otherwise, if a checkpoint had already flushed a past, + * longer version of the file and we emitted WAL instead of syncing, + * crash recovery could leave trailing garbage blocks beyond the new + * length. You might think that we could choose WAL if + * the current main fork is longer than ever, but there's a case where + * main fork is longer than ever but FSM fork gets shorter.
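+		 * For example, with the default wal_skip_threshold of 2048kB and
+		 * 8kB blocks, a 255-block (2040kB) file is WAL-logged at commit,
+		 * while a 256-block (2048kB) file is fsync'd.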
+ */ + if (pendingsync->is_truncated || + total_blocks * BLCKSZ / 1024 >= wal_skip_threshold) + { + /* allocate the initial array, or extend it, if needed */ + if (maxrels == 0) + { + maxrels = 8; + srels = palloc(sizeof(SMgrRelation) * maxrels); + } + else if (maxrels <= nrels) + { + maxrels *= 2; + srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); + } + + srels[nrels++] = srel; + } + else + { + /* Emit WAL records for all blocks. The file is small enough. */ + for (fork = 0; fork <= MAX_FORKNUM; fork++) + { + int n = nblocks[fork]; + Relation rel; + + if (!BlockNumberIsValid(n)) + continue; + + /* + * Emit WAL for the whole file. Unfortunately we don't know + * what kind of a page this is, so we have to log the full + * page including any unused space. ReadBufferExtended() + * counts some pgstat events; unfortunately, we discard them. + */ + rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node); + log_newpage_range(rel, fork, 0, n, false); + FreeFakeRelcacheEntry(rel); + } + } + } + + pendingSyncHash = NULL; + + if (nrels > 0) + { + smgrdosyncall(srels, nrels); + pfree(srels); + } +} + /* * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted. * diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 0a425e1018..a9732c266f 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -763,7 +763,6 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, bool *isnull; IndexScanDesc indexScan; HeapScanDesc heapScan; - bool use_wal; bool is_system_catalog; TransactionId OldestXmin; TransactionId FreezeXid; @@ -820,12 +819,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock); /* - * We need to log the copied data in WAL iff WAL archiving/streaming is - * enabled AND it's a WAL-logged rel. + * Valid smgr_targblock implies something already wrote to the relation. + * This may be harmless, but this function hasn't planned for it. */ - use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap); - - /* use_wal off requires smgr_targblock be initially invalid */ Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); /* @@ -893,7 +889,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, /* Initialize the rewrite operation */ rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid, - MultiXactCutoff, use_wal); + MultiXactCutoff); /* * Decide whether to use an indexscan or seqscan-and-optional-sort to scan @@ -1285,6 +1281,25 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, *mapped_tables++ = r2; } + /* + * Recognize that rel1's relfilenode (swapped from rel2) is new in this + * subtransaction. The rel2 storage (swapped from rel1) may or may not be + * new. 
+ */ + { + Relation rel1, + rel2; + + rel1 = relation_open(r1, NoLock); + rel2 = relation_open(r2, NoLock); + rel2->rd_createSubid = rel1->rd_createSubid; + rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid; + rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid; + RelationAssumeNewRelfilenode(rel1); + relation_close(rel1, NoLock); + relation_close(rel2, NoLock); + } + /* * In the case of a shared catalog, these next few steps will only affect * our own database's pg_class row; but that's okay, because they are all diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index f4653485e2..2cde2541e9 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2382,65 +2382,17 @@ CopyFrom(CopyState cstate) tupDesc = RelationGetDescr(cstate->rel); - /*---------- - * Check to see if we can avoid writing WAL - * - * If archive logging/streaming is not enabled *and* either - * - table was created in same transaction as this COPY - * - data is being written to relfilenode created in this transaction - * then we can skip writing WAL. It's safe because if the transaction - * doesn't commit, we'll discard the table (or the new relfilenode file). - * If it does commit, we'll have done the heap_sync at the bottom of this - * routine first. - * - * As mentioned in comments in utils/rel.h, the in-same-transaction test - * is not always set correctly, since in rare cases rd_newRelfilenodeSubid - * can be cleared before the end of the transaction. The exact case is - * when a relation sets a new relfilenode twice in same transaction, yet - * the second one fails in an aborted subtransaction, e.g. - * - * BEGIN; - * TRUNCATE t; - * SAVEPOINT save; - * TRUNCATE t; - * ROLLBACK TO save; - * COPY ... - * - * Also, if the target file is new-in-transaction, we assume that checking - * FSM for free space is a waste of time, even if we must use WAL because - * of archiving. This could possibly be wrong, but it's unlikely. - * - * The comments for heap_insert and RelationGetBufferForTuple specify that - * skipping WAL logging is only safe if we ensure that our tuples do not - * go into pages containing tuples from any other transactions --- but this - * must be the case if we have a new table or new relfilenode, so we need - * no additional work to enforce that. - * - * We currently don't support this optimization if the COPY target is a - * partitioned table as we currently only lazily initialize partition - * information when routing the first tuple to the partition. We cannot - * know at this stage if we can perform this optimization. It should be - * possible to improve on this, but it does mean maintaining heap insert - * option flags per partition and setting them when we first open the - * partition. - * - * This optimization is not supported for relation types which do not - * have any physical storage, with foreign tables and views using - * INSTEAD OF triggers entering in this category. Partitioned tables - * are not supported as per the description above. - *---------- + /* + * If the target file is new-in-transaction, we assume that checking FSM + * for free space is a waste of time. This could possibly be wrong, but + * it's unlikely. 
*/ - /* createSubid is creation check, newRelfilenodeSubid is truncation check */ if (cstate->rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && cstate->rel->rd_rel->relkind != RELKIND_VIEW && (cstate->rel->rd_createSubid != InvalidSubTransactionId || - cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)) - { + cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId)) hi_options |= HEAP_INSERT_SKIP_FSM; - if (!XLogIsNeeded()) - hi_options |= HEAP_INSERT_SKIP_WAL; - } /* * Optimize if new relfilenode was created in this subxact or one of its @@ -2932,13 +2884,6 @@ next_tuple: FreeExecutorState(estate); - /* - * If we skipped writing WAL, then we need to sync the heap (but not - * indexes since those use WAL anyway) - */ - if (hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(cstate->rel); - return processed; } diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index 3d82edbf58..1f99b5ceec 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -562,16 +562,13 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) myState->rel = intoRelationDesc; myState->reladdr = intoRelationAddr; myState->output_cid = GetCurrentCommandId(true); - - /* - * We can skip WAL-logging the insertions, unless PITR or streaming - * replication is in use. We can skip the FSM in any case. - */ - myState->hi_options = HEAP_INSERT_SKIP_FSM | - (XLogIsNeeded() ? 0 : HEAP_INSERT_SKIP_WAL); + myState->hi_options = HEAP_INSERT_SKIP_FSM; myState->bistate = GetBulkInsertState(); - /* Not using WAL requires smgr_targblock be initially invalid */ + /* + * Valid smgr_targblock implies something already wrote to the relation. + * This may be harmless, but this function hasn't planned for it. + */ Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber); } @@ -617,10 +614,6 @@ intorel_shutdown(DestReceiver *self) FreeBulkInsertState(myState->bistate); - /* If we skipped using WAL, must heap_sync before commit */ - if (myState->hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(myState->rel); - /* close rel, but keep lock until commit */ heap_close(myState->rel, NoLock); myState->rel = NULL; diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 568a32bb4e..f6c2891f50 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1050,6 +1050,8 @@ DefineIndex(Oid relationId, childStmt->relationId = childRelid; childStmt->indexOid = InvalidOid; childStmt->oldNode = InvalidOid; + childStmt->oldCreateSubid = InvalidSubTransactionId; + childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId; /* * Adjust any Vars (both in expressions and in the index's diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index e1eb7c374b..f1b87a9ff8 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -457,17 +457,13 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) */ myState->transientrel = transientrel; myState->output_cid = GetCurrentCommandId(true); - - /* - * We can skip WAL-logging the insertions, unless PITR or streaming - * replication is in use. We can skip the FSM in any case. 
- */ myState->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_FROZEN; - if (!XLogIsNeeded()) - myState->hi_options |= HEAP_INSERT_SKIP_WAL; myState->bistate = GetBulkInsertState(); - /* Not using WAL requires smgr_targblock be initially invalid */ + /* + * Valid smgr_targblock implies something already wrote to the relation. + * This may be harmless, but this function hasn't planned for it. + */ Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber); } @@ -507,10 +503,6 @@ transientrel_shutdown(DestReceiver *self) FreeBulkInsertState(myState->bistate); - /* If we skipped using WAL, must heap_sync before commit */ - if (myState->hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(myState->transientrel); - /* close transientrel, but keep lock until commit */ heap_close(myState->transientrel, NoLock); myState->transientrel = NULL; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 3a2e2ecc4f..c0073d6129 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -4650,19 +4650,14 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) newrel = NULL; /* - * Prepare a BulkInsertState and options for heap_insert. Because we're - * building a new heap, we can skip WAL-logging and fsync it to disk at - * the end instead (unless WAL-logging is required for archiving or - * streaming replication). The FSM is empty too, so don't bother using it. + * Prepare a BulkInsertState and options for heap_insert. The FSM is + * empty, so don't bother using it. */ if (newrel) { mycid = GetCurrentCommandId(true); bistate = GetBulkInsertState(); - hi_options = HEAP_INSERT_SKIP_FSM; - if (!XLogIsNeeded()) - hi_options |= HEAP_INSERT_SKIP_WAL; } else { @@ -4934,10 +4929,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) { FreeBulkInsertState(bistate); - /* If we skipped writing WAL, then we need to sync the heap. */ - if (hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(newrel); - heap_close(newrel, NoLock); } } @@ -7081,14 +7072,19 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, /* * If TryReuseIndex() stashed a relfilenode for us, we used it for the new - * index instead of building from scratch. The DROP of the old edition of - * this index will have scheduled the storage for deletion at commit, so - * cancel that pending deletion. + * index instead of building from scratch. Restore associated fields. + * This may store InvalidSubTransactionId in both fields, in which case + * relcache.c will assume it can rebuild the relcache entry. Hence, do + * this after the CCI that made catalog rows visible to any rebuild. The + * DROP of the old edition of this index will have scheduled the storage + * for deletion at commit, so cancel that pending deletion. */ if (OidIsValid(stmt->oldNode)) { Relation irel = index_open(address.objectId, NoLock); + irel->rd_createSubid = stmt->oldCreateSubid; + irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid; RelationPreserveStorage(irel->rd_node, true); index_close(irel, NoLock); } @@ -10880,7 +10876,11 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) /* If it's a partitioned index, there is no storage to share. 
*/ if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) + { stmt->oldNode = irel->rd_node.relNode; + stmt->oldCreateSubid = irel->rd_createSubid; + stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid; + } index_close(irel, NoLock); } } @@ -11735,6 +11735,8 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) heap_close(pg_class, RowExclusiveLock); + RelationAssumeNewRelfilenode(rel); + relation_close(rel, NoLock); /* Make sure the reltablespace change is visible */ @@ -12006,7 +12008,9 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst, /* * We need to log the copied data in WAL iff WAL archiving/streaming is - * enabled AND it's a permanent relation. + * enabled AND it's a permanent relation. This gives the same answer as + * "RelationNeedsWAL(rel) || copying_initfork", because we know the + * current operation created a new relfilenode. */ use_wal = XLogIsNeeded() && (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork); @@ -12048,21 +12052,15 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst, } /* - * If the rel is WAL-logged, must fsync before commit. We use heap_sync - * to ensure that the toast table gets fsync'd too. (For a temp or - * unlogged rel we don't care since the data will be gone after a crash - * anyway.) - * - * It's obvious that we must do this when not WAL-logging the copy. It's - * less obvious that we have to do it even if we did WAL-log the copied - * pages. The reason is that since we're copying outside shared buffers, a - * CHECKPOINT occurring during the copy has no way to flush the previously - * written data to disk (indeed it won't know the new rel even exists). A - * crash later on would replay WAL from the checkpoint, therefore it - * wouldn't replay our earlier WAL entries. If we do not fsync those pages - * here, they might still not be on disk when the crash occurs. + * When we WAL-logged rel pages, we must nonetheless fsync them. The + * reason is that since we're copying outside shared buffers, a CHECKPOINT + * occurring during the copy has no way to flush the previously written + * data to disk (indeed it won't know the new rel even exists). A crash + * later on would replay WAL from the checkpoint, therefore it wouldn't + * replay our earlier WAL entries. If we do not fsync those pages here, + * they might still not be on disk when the crash occurs. 
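+	 * (At wal_level=minimal with a permanent relation, use_wal is false and
+	 * this sync is skipped as well; the new relfilenode was queued for an
+	 * at-commit sync when it was created.)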
*/ - if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork) + if (use_wal || copying_initfork) smgrimmedsync(dst, forkNum); } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index d2fc5dc8dc..180e425816 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3457,6 +3457,8 @@ _copyIndexStmt(const IndexStmt *from) COPY_STRING_FIELD(idxcomment); COPY_SCALAR_FIELD(indexOid); COPY_SCALAR_FIELD(oldNode); + COPY_SCALAR_FIELD(oldCreateSubid); + COPY_SCALAR_FIELD(oldFirstRelfilenodeSubid); COPY_SCALAR_FIELD(unique); COPY_SCALAR_FIELD(primary); COPY_SCALAR_FIELD(isconstraint); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 88ed816808..2021fd65ba 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1336,6 +1336,8 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b) COMPARE_STRING_FIELD(idxcomment); COMPARE_SCALAR_FIELD(indexOid); COMPARE_SCALAR_FIELD(oldNode); + COMPARE_SCALAR_FIELD(oldCreateSubid); + COMPARE_SCALAR_FIELD(oldFirstRelfilenodeSubid); COMPARE_SCALAR_FIELD(unique); COMPARE_SCALAR_FIELD(primary); COMPARE_SCALAR_FIELD(isconstraint); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f5d786d79a..6d6925725e 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2732,6 +2732,8 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_STRING_FIELD(idxcomment); WRITE_OID_FIELD(indexOid); WRITE_OID_FIELD(oldNode); + WRITE_UINT_FIELD(oldCreateSubid); + WRITE_UINT_FIELD(oldFirstRelfilenodeSubid); WRITE_BOOL_FIELD(unique); WRITE_BOOL_FIELD(primary); WRITE_BOOL_FIELD(isconstraint); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index bc65319c2c..038a004b48 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -7398,6 +7398,8 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->idxcomment = NULL; n->indexOid = InvalidOid; n->oldNode = InvalidOid; + n->oldCreateSubid = InvalidSubTransactionId; + n->oldFirstRelfilenodeSubid = InvalidSubTransactionId; n->primary = false; n->isconstraint = false; n->deferrable = false; @@ -7426,6 +7428,8 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->idxcomment = NULL; n->indexOid = InvalidOid; n->oldNode = InvalidOid; + n->oldCreateSubid = InvalidSubTransactionId; + n->oldFirstRelfilenodeSubid = InvalidSubTransactionId; n->primary = false; n->isconstraint = false; n->deferrable = false; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 46d2803aa8..3b8c098815 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1355,6 +1355,8 @@ generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, index->idxcomment = NULL; index->indexOid = InvalidOid; index->oldNode = InvalidOid; + index->oldCreateSubid = InvalidSubTransactionId; + index->oldFirstRelfilenodeSubid = InvalidSubTransactionId; index->unique = idxrec->indisunique; index->primary = idxrec->indisprimary; index->transformed = true; /* don't need transformIndexStmt */ @@ -1937,6 +1939,8 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) index->idxcomment = NULL; index->indexOid = InvalidOid; index->oldNode = InvalidOid; + index->oldCreateSubid = InvalidSubTransactionId; + index->oldFirstRelfilenodeSubid = InvalidSubTransactionId; index->transformed = false; index->concurrent = false; index->if_not_exists = false; diff 
--git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 01eabe5706..1e944d0f10 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -65,7 +65,7 @@ #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 -#define DROP_RELS_BSEARCH_THRESHOLD 20 +#define RELS_BSEARCH_THRESHOLD 20 typedef struct PrivateRefCountEntry { @@ -104,6 +104,19 @@ typedef struct CkptTsStatus int index; } CkptTsStatus; +/* + * Type for array used to sort SMgrRelations + * + * FlushRelationsAllBuffers shares the same comparator function with + * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be + * compatible. + */ +typedef struct SMgrSortArray +{ + RelFileNode rnode; /* This must be the first member */ + SMgrRelation srel; +} SMgrSortArray; + /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; @@ -2977,7 +2990,7 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes) * an exactly determined value, as it depends on many factors (CPU and RAM * speeds, amount of shared buffers etc.). */ - use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD; + use_bsearch = n > RELS_BSEARCH_THRESHOLD; /* sort the list of rnodes if necessary */ if (use_bsearch) @@ -3227,6 +3240,104 @@ FlushRelationBuffers(Relation rel) } } +/* --------------------------------------------------------------------- + * FlushRelationsAllBuffers + * + * This function flushes out of the buffer pool all the pages of all + * forks of the specified smgr relations. It's equivalent to calling + * FlushRelationBuffers once per fork per relation. The relations are + * assumed not to use local buffers. + * -------------------------------------------------------------------- + */ +void +FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) +{ + int i; + SMgrSortArray *srels; + bool use_bsearch; + + if (nrels == 0) + return; + + /* fill-in array for qsort */ + srels = palloc(sizeof(SMgrSortArray) * nrels); + + for (i = 0; i < nrels; i++) + { + Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode)); + + srels[i].rnode = smgrs[i]->smgr_rnode.node; + srels[i].srel = smgrs[i]; + } + + /* + * Save the bsearch overhead for low number of relations to sync. See + * DropRelFileNodesAllBuffers for details. + */ + use_bsearch = nrels > RELS_BSEARCH_THRESHOLD; + + /* sort the list of SMgrRelations if necessary */ + if (use_bsearch) + pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator); + + /* Make sure we can handle the pin inside the loop */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + SMgrSortArray *srelent = NULL; + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * As in DropRelFileNodeBuffers, an unlocked precheck should be safe + * and saves some cycles. 
+ */ + + if (!use_bsearch) + { + int j; + + for (j = 0; j < nrels; j++) + { + if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode)) + { + srelent = &srels[j]; + break; + } + } + + } + else + { + srelent = bsearch((const void *) &(bufHdr->tag.rnode), + srels, nrels, sizeof(SMgrSortArray), + rnode_comparator); + } + + /* buffer doesn't belong to any of the given relfilenodes; skip it */ + if (srelent == NULL) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) && + (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + FlushBuffer(bufHdr, srelent->srel); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr, true); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + + pfree(srels); +} + /* --------------------------------------------------------------------- * FlushDatabaseBuffers * @@ -3428,13 +3539,15 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) { /* - * If we're in recovery we cannot dirty a page because of a hint. - * We can set the hint, just not dirty the page as a result so the - * hint is lost when we evict the page or shutdown. + * If we must not write WAL, due to a relfilenode-specific + * condition or being in recovery, don't dirty the page. We can + * set the hint, just not dirty the page as a result so the hint + * is lost when we evict the page or shutdown. * * See src/backend/storage/page/README for longer discussion. */ - if (RecoveryInProgress()) + if (RecoveryInProgress() || + RelFileNodeSkippingWAL(bufHdr->tag.rnode)) return; /* diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 831ae62525..eaf747be65 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -563,6 +563,18 @@ DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) return false; } +#ifdef USE_ASSERT_CHECKING +/* + * GetLockMethodLocalHash -- return the hash of local locks, for modules that + * evaluate assertions based on all locks held. + */ +HTAB * +GetLockMethodLocalHash(void) +{ + return LockMethodLocalHash; +} +#endif + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 200cc7f657..09c7253959 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -357,11 +357,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * During replay, we would delete the file and then recreate it, which is fine * if the contents of the file were repopulated by subsequent WAL entries. * But if we didn't WAL-log insertions, but instead relied on fsyncing the - * file after populating it (as for instance CLUSTER and CREATE INDEX do), - * the contents of the file would be lost forever. By leaving the empty file - * until after the next checkpoint, we prevent reassignment of the relfilenode - * number until it's safe, because relfilenode assignment skips over any - * existing file. + * file after populating it (as we do at wal_level=minimal), the contents of + * the file would be lost forever. By leaving the empty file until after the + * next checkpoint, we prevent reassignment of the relfilenode number until + * it's safe, because relfilenode assignment skips over any existing file. 
* * We do not need to go through this dance for temp relations, though, because * we never make WAL entries for temp rels, and so a temp rel poses no threat @@ -1011,12 +1010,18 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * mdimmedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows - * nothing of dirty buffers that may exist inside the buffer manager. + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. */ void mdimmedsync(SMgrRelation reln, ForkNumber forknum) { int segno; + int min_inactive_seg; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1024,7 +1029,16 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) */ mdnblocks(reln, forknum); - segno = reln->md_num_open_segs[forknum]; + min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + + /* + * Temporarily open inactive segments, then close them after sync. There + * may be some inactive segments left open after an fsync() error, but that + * is harmless; we don't bother to clean them up, lest we risk further + * trouble. The next mdclose() will soon close them. + */ + while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + segno++; while (segno > 0) { @@ -1035,6 +1049,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd)))); + + /* Close inactive segments immediately */ + if (segno > min_inactive_seg) + { + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, segno - 1); + } + segno--; } } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index da91196085..6d53390c61 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -408,6 +408,41 @@ smgrdounlink(SMgrRelation reln, bool isRedo) smgrsw[which].smgr_unlink(rnode, InvalidForkNumber, isRedo); } +/* + * smgrdosyncall() -- Immediately sync all forks of all given relations + * + * All forks of all given relations are synced out to the store. + * + * This is equivalent to FlushRelationBuffers() for each smgr relation, + * then calling smgrimmedsync() for all forks of each relation, but it's + * significantly quicker so should be preferred when possible. + */ +void +smgrdosyncall(SMgrRelation *rels, int nrels) +{ + int i = 0; + ForkNumber forknum; + + if (nrels == 0) + return; + + FlushRelationsAllBuffers(rels, nrels); + + /* + * Sync the physical file(s).
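+	 * FlushRelationsAllBuffers() above has already written out any dirty
+	 * shared buffers, so fsyncing each existing fork below makes every
+	 * previously-written page durable.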
+ */ + for (i = 0; i < nrels; i++) + { + int which = rels[i]->smgr_which; + + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + if (smgrsw[which].smgr_exists(rels[i], forknum)) + smgrsw[which].smgr_immedsync(rels[i], forknum); + } + } +} + /* * smgrdounlinkall() -- Immediately unlink all forks of all given relations * diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 462b1a9636..a1bc260164 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -253,6 +253,9 @@ static void RelationReloadIndexInfo(Relation relation); static void RelationReloadNailed(Relation relation); static void RelationFlushRelation(Relation relation); static void RememberToFreeTupleDescAtEOX(TupleDesc td); +#ifdef USE_ASSERT_CHECKING +static void AssertPendingSyncConsistency(Relation relation); +#endif static void AtEOXact_cleanup(Relation relation, bool isCommit); static void AtEOSubXact_cleanup(Relation relation, bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid); @@ -1114,6 +1117,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) relation->rd_isnailed = false; relation->rd_createSubid = InvalidSubTransactionId; relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_droppedSubid = InvalidSubTransactionId; switch (relation->rd_rel->relpersistence) { case RELPERSISTENCE_UNLOGGED: @@ -1756,6 +1761,8 @@ formrdesc(const char *relationName, Oid relationReltype, relation->rd_isnailed = true; relation->rd_createSubid = InvalidSubTransactionId; relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_droppedSubid = InvalidSubTransactionId; relation->rd_backend = InvalidBackendId; relation->rd_islocaltemp = false; @@ -1922,6 +1929,13 @@ RelationIdGetRelation(Oid relationId) if (RelationIsValid(rd)) { + /* return NULL for dropped relations */ + if (rd->rd_droppedSubid != InvalidSubTransactionId) + { + Assert(!rd->rd_isvalid); + return NULL; + } + RelationIncrementReferenceCount(rd); /* revalidate cache entry if necessary */ if (!rd->rd_isvalid) @@ -2015,7 +2029,7 @@ RelationClose(Relation relation) #ifdef RELCACHE_FORCE_RELEASE if (RelationHasReferenceCountZero(relation) && relation->rd_createSubid == InvalidSubTransactionId && - relation->rd_newRelfilenodeSubid == InvalidSubTransactionId) + relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId) RelationClearRelation(relation, false); #endif } @@ -2054,10 +2068,11 @@ RelationReloadIndexInfo(Relation relation) HeapTuple pg_class_tuple; Form_pg_class relp; - /* Should be called only for invalidated indexes */ + /* Should be called only for invalidated, live indexes */ Assert((relation->rd_rel->relkind == RELKIND_INDEX || relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && - !relation->rd_isvalid); + !relation->rd_isvalid && + relation->rd_droppedSubid == InvalidSubTransactionId); /* Ensure it's closed at smgr level */ RelationCloseSmgr(relation); @@ -2348,6 +2363,13 @@ RelationClearRelation(Relation relation, bool rebuild) return; } + /* Mark it invalid until we've finished rebuild */ + relation->rd_isvalid = false; + + /* See RelationForgetRelation(). */ + if (relation->rd_droppedSubid != InvalidSubTransactionId) + return; + /* * Even non-system indexes should not be blown away if they are open and * have valid index support information. 
This avoids problems with active @@ -2360,15 +2382,11 @@ RelationClearRelation(Relation relation, bool rebuild) relation->rd_refcnt > 0 && relation->rd_indexcxt != NULL) { - relation->rd_isvalid = false; /* needs to be revalidated */ if (IsTransactionState()) RelationReloadIndexInfo(relation); return; } - /* Mark it invalid until we've finished rebuild */ - relation->rd_isvalid = false; - /* * If we're really done with the relcache entry, blow it away. But if * someone is still using it, reconstruct the whole deal without moving @@ -2426,13 +2444,13 @@ RelationClearRelation(Relation relation, bool rebuild) * problem. * * When rebuilding an open relcache entry, we must preserve ref count, - * rd_createSubid/rd_newRelfilenodeSubid, and rd_toastoid state. Also - * attempt to preserve the pg_class entry (rd_rel), tupledesc, - * rewrite-rule, partition key, and partition descriptor substructures - * in place, because various places assume that these structures won't - * move while they are working with an open relcache entry. (Note: - * the refcount mechanism for tupledescs might someday allow us to - * remove this hack for the tupledesc.) + * rd_*Subid, and rd_toastoid state. Also attempt to preserve the + * pg_class entry (rd_rel), tupledesc, rewrite-rule, partition key, + * and partition descriptor substructures in place, because various + * places assume that these structures won't move while they are + * working with an open relcache entry. (Note: the refcount + * mechanism for tupledescs might someday allow us to remove this hack + * for the tupledesc.) * * Note that this process does not touch CurrentResourceOwner; which * is good because whatever ref counts the entry may have do not @@ -2516,6 +2534,8 @@ RelationClearRelation(Relation relation, bool rebuild) /* creation sub-XIDs must be preserved */ SWAPFIELD(SubTransactionId, rd_createSubid); SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid); + SWAPFIELD(SubTransactionId, rd_firstRelfilenodeSubid); + SWAPFIELD(SubTransactionId, rd_droppedSubid); /* un-swap rd_rel pointers, swap contents instead */ SWAPFIELD(Form_pg_class, rd_rel); /* ... but actually, we don't have to update newrel->rd_rel */ @@ -2563,12 +2583,12 @@ static void RelationFlushRelation(Relation relation) { if (relation->rd_createSubid != InvalidSubTransactionId || - relation->rd_newRelfilenodeSubid != InvalidSubTransactionId) + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId) { /* * New relcache entries are always rebuilt, not flushed; else we'd - * forget the "new" status of the relation, which is a useful - * optimization to have. Ditto for the new-relfilenode status. + * forget the "new" status of the relation. Ditto for the + * new-relfilenode status. * * The rel could have zero refcnt here, so temporarily increment the * refcnt to ensure it's safe to rebuild it. We can assume that the @@ -2590,10 +2610,7 @@ RelationFlushRelation(Relation relation) } /* - * RelationForgetRelation - unconditionally remove a relcache entry - * - * External interface for destroying a relcache entry when we - * drop the relation. 
+ * RelationForgetRelation - caller reports that it dropped the relation */ void RelationForgetRelation(Oid rid) @@ -2608,7 +2625,19 @@ RelationForgetRelation(Oid rid) if (!RelationHasReferenceCountZero(relation)) elog(ERROR, "relation %u is still open", rid); - /* Unconditionally destroy the relcache entry */ + Assert(relation->rd_droppedSubid == InvalidSubTransactionId); + if (relation->rd_createSubid != InvalidSubTransactionId || + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId) + { + /* + * In the event of subtransaction rollback, we must not forget + * rd_*Subid. Mark the entry "dropped" so RelationClearRelation() + * invalidates it in lieu of destroying it. (If we're in a top + * transaction, we could opt to destroy the entry.) + */ + relation->rd_droppedSubid = GetCurrentSubTransactionId(); + } + RelationClearRelation(relation, false); } @@ -2648,11 +2677,10 @@ RelationCacheInvalidateEntry(Oid relationId) * relation cache and re-read relation mapping data. * * This is currently used only to recover from SI message buffer overflow, - * so we do not touch new-in-transaction relations; they cannot be targets - * of cross-backend SI updates (and our own updates now go through a - * separate linked list that isn't limited by the SI message buffer size). - * Likewise, we need not discard new-relfilenode-in-transaction hints, - * since any invalidation of those would be a local event. + * so we do not touch relations having new-in-transaction relfilenodes; they + * cannot be targets of cross-backend SI updates (and our own updates now go + * through a separate linked list that isn't limited by the SI message + * buffer size). * * We do this in two phases: the first pass deletes deletable items, and * the second one rebuilds the rebuildable items. This is essential for @@ -2703,7 +2731,7 @@ RelationCacheInvalidate(void) * pending invalidations. */ if (relation->rd_createSubid != InvalidSubTransactionId || - relation->rd_newRelfilenodeSubid != InvalidSubTransactionId) + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId) continue; relcacheInvalsReceived++; @@ -2815,6 +2843,96 @@ RememberToFreeTupleDescAtEOX(TupleDesc td) EOXactTupleDescArray[NextEOXactTupleDescNum++] = td; } +#ifdef USE_ASSERT_CHECKING +/* + * Relation kinds that have physical storage. These relations normally have + * relfilenode set to non-zero, but it can also be zero if the relation is + * mapped. + */ +#define RELKIND_HAS_STORAGE(relkind) \ + ((relkind) == RELKIND_RELATION || \ + (relkind) == RELKIND_INDEX || \ + (relkind) == RELKIND_SEQUENCE || \ + (relkind) == RELKIND_TOASTVALUE || \ + (relkind) == RELKIND_MATVIEW) + +static void +AssertPendingSyncConsistency(Relation relation) +{ + bool relcache_verdict = + relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && + ((relation->rd_createSubid != InvalidSubTransactionId && + RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) || + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId); + + Assert(relcache_verdict == RelFileNodeSkippingWAL(relation->rd_node)); + + if (relation->rd_droppedSubid != InvalidSubTransactionId) + Assert(!relation->rd_isvalid && + (relation->rd_createSubid != InvalidSubTransactionId || + relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)); +} + +/* + * AssertPendingSyncs_RelationCache + * + * Assert that relcache.c and storage.c agree on whether to skip WAL. 
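+ *
+ * (This is a no-op in non-assert builds; see the fallback definition in
+ * utils/relcache.h.)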
+ */ +void +AssertPendingSyncs_RelationCache(void) +{ + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + Relation *rels; + int maxrels; + int nrels; + RelIdCacheEnt *idhentry; + int i; + + /* + * Open every relation that this transaction has locked. If, for some + * relation, storage.c is skipping WAL and relcache.c is not skipping WAL, + * a CommandCounterIncrement() typically yields a local invalidation + * message that destroys the relcache entry. By recreating such entries + * here, we detect the problem. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + maxrels = 1; + rels = palloc(maxrels * sizeof(*rels)); + nrels = 0; + hash_seq_init(&status, GetLockMethodLocalHash()); + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + Oid relid; + Relation r; + + if (locallock->nLocks <= 0) + continue; + if ((LockTagType) locallock->tag.lock.locktag_type != + LOCKTAG_RELATION) + continue; + relid = ObjectIdGetDatum(locallock->tag.lock.locktag_field2); + r = RelationIdGetRelation(relid); + if (!RelationIsValid(r)) + continue; + if (nrels >= maxrels) + { + maxrels *= 2; + rels = repalloc(rels, maxrels * sizeof(*rels)); + } + rels[nrels++] = r; + } + + hash_seq_init(&status, RelationIdCache); + while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL) + AssertPendingSyncConsistency(idhentry->reldesc); + + for (i = 0; i < nrels; i++) + RelationClose(rels[i]); + PopActiveSnapshot(); +} +#endif + /* * AtEOXact_RelationCache * @@ -2897,6 +3015,8 @@ AtEOXact_RelationCache(bool isCommit) static void AtEOXact_cleanup(Relation relation, bool isCommit) { + bool clear_relcache = false; + /* * The relcache entry's ref count should be back to its normal * not-in-a-transaction state: 0 unless it's nailed in cache. @@ -2922,17 +3042,31 @@ AtEOXact_cleanup(Relation relation, bool isCommit) #endif /* - * Is it a relation created in the current transaction? + * Is the relation live after this transaction ends? * - * During commit, reset the flag to zero, since we are now out of the - * creating transaction. During abort, simply delete the relcache entry - * --- it isn't interesting any longer. + * During commit, clear the relcache entry if it is preserved after + * relation drop, in order not to orphan the entry. During rollback, + * clear the relcache entry if the relation is created in the current + * transaction since it isn't interesting any longer once we are out of + * the transaction. */ - if (relation->rd_createSubid != InvalidSubTransactionId) + clear_relcache = + (isCommit ? + relation->rd_droppedSubid != InvalidSubTransactionId : + relation->rd_createSubid != InvalidSubTransactionId); + + /* + * Since we are now out of the transaction, reset the subids to zero. + * That also lets RelationClearRelation() drop the relcache entry. + */ + relation->rd_createSubid = InvalidSubTransactionId; + relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId; + relation->rd_droppedSubid = InvalidSubTransactionId; + + if (clear_relcache) { - if (isCommit) - relation->rd_createSubid = InvalidSubTransactionId; - else if (RelationHasReferenceCountZero(relation)) + if (RelationHasReferenceCountZero(relation)) { RelationClearRelation(relation, false); return; @@ -2947,17 +3081,11 @@ AtEOXact_cleanup(Relation relation, bool isCommit) * eventually. This must be just a WARNING to avoid * error-during-error-recovery loops. 
 */
- relation->rd_createSubid = InvalidSubTransactionId;
 elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
 RelationGetRelationName(relation));
 }
 }

- /*
- * Likewise, reset the hint about the relfilenode being new.
- */
- relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-
 /*
 * Flush any temporary index list.
 */
@@ -3033,15 +3161,28 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 /*
 * Is it a relation created in the current subtransaction?
 *
- * During subcommit, mark it as belonging to the parent, instead. During
- * subabort, simply delete the relcache entry.
+ * During subcommit, mark it as belonging to the parent, instead, as long
+ * as it has not been dropped. Otherwise simply delete the relcache entry
+ * --- it isn't interesting any longer.
 */
 if (relation->rd_createSubid == mySubid)
 {
- if (isCommit)
+ /*
+ * Valid rd_droppedSubid means the corresponding relation is dropped
+ * but the relcache entry is preserved for at-commit pending sync. We
+ * need to drop it explicitly here so as not to orphan the entry.
+ */
+ Assert(relation->rd_droppedSubid == mySubid ||
+ relation->rd_droppedSubid == InvalidSubTransactionId);
+ if (isCommit && relation->rd_droppedSubid == InvalidSubTransactionId)
 relation->rd_createSubid = parentSubid;
 else if (RelationHasReferenceCountZero(relation))
 {
+ /* allow the entry to be removed */
+ relation->rd_createSubid = InvalidSubTransactionId;
+ relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_droppedSubid = InvalidSubTransactionId;
 RelationClearRelation(relation, false);
 return;
 }
@@ -3061,7 +3202,8 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 }

 /*
- * Likewise, update or drop any new-relfilenode-in-subtransaction hint.
+ * Likewise, update or drop any new-relfilenode-in-subtransaction record
+ * and any drop record.
 */
 if (relation->rd_newRelfilenodeSubid == mySubid)
 {
@@ -3071,6 +3213,22 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
 relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
 }

+ if (relation->rd_firstRelfilenodeSubid == mySubid)
+ {
+ if (isCommit)
+ relation->rd_firstRelfilenodeSubid = parentSubid;
+ else
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ }
+
+ if (relation->rd_droppedSubid == mySubid)
+ {
+ if (isCommit)
+ relation->rd_droppedSubid = parentSubid;
+ else
+ relation->rd_droppedSubid = InvalidSubTransactionId;
+ }
+
 /*
 * Flush any temporary index list.
 */
@@ -3171,6 +3329,7 @@ RelationBuildLocalRelation(const char *relname,
 /* it's being created in this transaction */
 rel->rd_createSubid = GetCurrentSubTransactionId();
 rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;

 /*
 * create a new tuple descriptor from the one passed in. We do this
@@ -3446,14 +3605,29 @@ RelationSetNewRelfilenode(Relation relation, char persistence,
 */
 CommandCounterIncrement();

- /*
- * Mark the rel as having been given a new relfilenode in the current
- * (sub) transaction. This is a hint that can be used to optimize later
- * operations on the rel in the same transaction.
- */
- relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
+ RelationAssumeNewRelfilenode(relation);
+}

- /* Flag relation as needing eoxact cleanup (to remove the hint) */
+/*
+ * RelationAssumeNewRelfilenode
+ *
+ * Code that modifies pg_class.reltablespace or pg_class.relfilenode must call
+ * this.
The call shall precede any code that might insert WAL records whose + * replay would modify bytes in the new RelFileNode, and the call shall follow + * any WAL modifying bytes in the prior RelFileNode. See struct RelationData. + * Ideally, call this as near as possible to the CommandCounterIncrement() + * that makes the pg_class change visible (before it or after it); that + * minimizes the chance of future development adding a forbidden WAL insertion + * between RelationAssumeNewRelfilenode() and CommandCounterIncrement(). + */ +void +RelationAssumeNewRelfilenode(Relation relation) +{ + relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId(); + if (relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId) + relation->rd_firstRelfilenodeSubid = relation->rd_newRelfilenodeSubid; + + /* Flag relation as needing eoxact cleanup (to clear these fields) */ EOXactListAdd(relation); } @@ -5753,6 +5927,8 @@ load_relcache_init_file(bool shared) rel->rd_statlist = NIL; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; + rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId; + rel->rd_droppedSubid = InvalidSubTransactionId; rel->rd_amcache = NULL; MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info)); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index eca51a7c4e..62d7a829e2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -35,6 +35,7 @@ #include "access/xlog_internal.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" +#include "catalog/storage.h" #include "commands/async.h" #include "commands/prepare.h" #include "commands/user.h" @@ -2518,6 +2519,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_skip_threshold", PGC_USERSET, WAL_SETTINGS, + gettext_noop("Size of new file to fsync instead of writing WAL."), + NULL, + GUC_UNIT_KB + }, + &wal_skip_threshold, + 2048, 0, MAX_KILOBYTES, + NULL, NULL, NULL + }, + { /* see max_connections and superuser_reserved_connections */ {"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 41191d4f45..9d512a1550 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -203,6 +203,7 @@ # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds #wal_writer_flush_after = 1MB # measured in pages, 0 disables +#wal_skip_threshold = 2MB #commit_delay = 0 # range 0-100000, in microseconds #commit_siblings = 5 # range 1-1000 diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 41f496c258..355572d3f7 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -425,6 +425,8 @@ extern XLogRecPtr gistXLogSplit(bool page_is_leaf, BlockNumber origrlink, GistNSN oldnsn, Buffer leftchild, bool markfollowright); +extern XLogRecPtr gistXLogAssignLSN(void); + /* gistget.c */ extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index 1a2b9496d0..9ff3dee519 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -23,6 +23,7 @@ /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ #define XLOG_GIST_CREATE_INDEX 0x50 /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ 
+#define XLOG_GIST_ASSIGN_LSN 0x70 /* nop, assign new LSN */ /* * Backup Blk 0: updated page. diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 40e153f71a..96b8466a20 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -25,7 +25,6 @@ /* "options" flag bits for heap_insert */ -#define HEAP_INSERT_SKIP_WAL 0x0001 #define HEAP_INSERT_SKIP_FSM 0x0002 #define HEAP_INSERT_FROZEN 0x0004 #define HEAP_INSERT_SPECULATIVE 0x0008 diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h index 6d7f669cbc..c3cc36897b 100644 --- a/src/include/access/rewriteheap.h +++ b/src/include/access/rewriteheap.h @@ -23,7 +23,7 @@ typedef struct RewriteStateData *RewriteState; extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap, TransactionId OldestXmin, TransactionId FreezeXid, - MultiXactId MultiXactCutoff, bool use_wal); + MultiXactId MultiXactCutoff); extern void end_heap_rewrite(RewriteState state); extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple, HeapTuple newTuple); diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index ef52d85803..cb7a42ba39 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -18,16 +18,22 @@ #include "storage/relfilenode.h" #include "utils/relcache.h" +/* GUC variables */ +extern int wal_skip_threshold; + extern void RelationCreateStorage(RelFileNode rnode, char relpersistence); extern void RelationDropStorage(Relation rel); extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit); +extern void RelationPreTruncate(Relation rel); extern void RelationTruncate(Relation rel, BlockNumber nblocks); +extern bool RelFileNodeSkippingWAL(RelFileNode rnode); /* * These functions used to be in storage/smgr/smgr.c, which explains the * naming */ extern void smgrDoPendingDeletes(bool isCommit); +extern void smgrDoPendingSyncs(bool isCommit); extern int smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr); extern void AtSubCommit_smgr(void); extern void AtSubAbort_smgr(void); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 62d3683be5..42acb722ae 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2738,6 +2738,9 @@ typedef struct IndexStmt bool transformed; /* true when transformIndexStmt is finished */ bool concurrent; /* should this be a concurrent index build? */ bool if_not_exists; /* just do nothing if index already exists? */ + SubTransactionId oldCreateSubid; /* rd_createSubid of oldNode */ + SubTransactionId oldFirstRelfilenodeSubid; /* rd_firstRelfilenodeSubid of + * oldNode */ } IndexStmt; /* ---------------------- diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 3cce3906a0..7a11c241a8 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -50,6 +50,9 @@ typedef enum /* forward declared, to avoid having to expose buf_internals.h here */ struct WritebackContext; +/* forward declared, to avoid including smgr.h here */ +struct SMgrRelationData; + /* in globals.c ... 
this duplicates miscadmin.h */ extern PGDLLIMPORT int NBuffers; @@ -190,6 +193,7 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum); extern void FlushOneBuffer(Buffer buffer); extern void FlushRelationBuffers(Relation rel); +extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels); extern void FlushDatabaseBuffers(Oid dbid); extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index ff4df7fec5..57863722ce 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -540,6 +540,9 @@ extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks); extern void LockReleaseSession(LOCKMETHODID lockmethodid); extern void LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks); extern void LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks); +#ifdef USE_ASSERT_CHECKING +extern HTAB *GetLockMethodLocalHash(void); +#endif extern bool LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock); extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag, diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 0298ed1a2b..af2e96525d 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -91,6 +91,7 @@ extern void smgrcloseall(void); extern void smgrclosenode(RelFileNodeBackend rnode); extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdounlink(SMgrRelation reln, bool isRedo); +extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 11a635964f..f02d2b561b 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -66,25 +66,43 @@ typedef struct RelationData /*---------- * rd_createSubid is the ID of the highest subtransaction the rel has - * survived into; or zero if the rel was not created in the current top - * transaction. This can be now be relied on, whereas previously it could - * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is - * the ID of the highest subtransaction the relfilenode change has - * survived into, or zero if not changed in the current transaction (or we - * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten - * when a relation has multiple new relfilenodes within a single - * transaction, with one of them occurring in a subsequently aborted - * subtransaction, e.g. + * survived into or zero if the rel or its rd_node was created before the + * current top transaction. (IndexStmt.oldNode leads to the case of a new + * rel with an old rd_node.) rd_firstRelfilenodeSubid is the ID of the + * highest subtransaction an rd_node change has survived into or zero if + * rd_node matches the value it had at the start of the current top + * transaction. (Rolling back the subtransaction that + * rd_firstRelfilenodeSubid denotes would restore rd_node to the value it + * had at the start of the current top transaction. Rolling back any + * lower subtransaction would not.) Their accuracy is critical to + * RelationNeedsWAL(). 
+ * + * rd_newRelfilenodeSubid is the ID of the highest subtransaction the + * most-recent relfilenode change has survived into or zero if not changed + * in the current transaction (or we have forgotten changing it). This + * field is accurate when non-zero, but it can be zero when a relation has + * multiple new relfilenodes within a single transaction, with one of them + * occurring in a subsequently aborted subtransaction, e.g. * BEGIN; * TRUNCATE t; * SAVEPOINT save; * TRUNCATE t; * ROLLBACK TO save; * -- rd_newRelfilenodeSubid is now forgotten + * + * If every rd_*Subid field is zero, they are read-only outside + * relcache.c. Files that trigger rd_node changes by updating + * pg_class.reltablespace and/or pg_class.relfilenode call + * RelationAssumeNewRelfilenode() to update rd_*Subid. + * + * rd_droppedSubid is the ID of the highest subtransaction that a drop of + * the rel has survived into. In entries visible outside relcache.c, this + * is always zero. */ SubTransactionId rd_createSubid; /* rel was created in current xact */ - SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in - * current xact */ + SubTransactionId rd_newRelfilenodeSubid; /* highest subxact changing + * rd_node to current value */ + /* see end for rd_firstRelfilenodeSubid and rd_droppedSubid */ Form_pg_class rd_rel; /* RELATION tuple */ TupleDesc rd_att; /* tuple descriptor */ @@ -197,6 +215,10 @@ typedef struct RelationData /* placed here to avoid ABI break before v12: */ bool rd_partcheckvalid; /* true if list has been computed */ MemoryContext rd_partcheckcxt; /* private cxt for rd_partcheck, if any */ + + SubTransactionId rd_firstRelfilenodeSubid; /* highest subxact changing + * rd_node to any value */ + SubTransactionId rd_droppedSubid; /* dropped with another Subid set */ } RelationData; @@ -516,9 +538,16 @@ typedef struct ViewOptions /* * RelationNeedsWAL * True if relation needs WAL. + * + * Returns false if wal_level = minimal and this relation is created or + * truncated in the current transaction. See "Skipping WAL for New + * RelFileNode" in src/backend/access/transam/README. 
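+ *
+ * For example, with wal_level = minimal:
+ *   BEGIN;
+ *   CREATE TABLE t (c int);
+ *   COPY t FROM ...;   -- RelationNeedsWAL(t) is false; WAL is skipped
+ *   COMMIT;            -- data persisted by fsync or end-of-xact WAL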
*/ -#define RelationNeedsWAL(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) +#define RelationNeedsWAL(relation) \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \ + (XLogIsNeeded() || \ + (relation->rd_createSubid == InvalidSubTransactionId && \ + relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId))) /* * RelationUsesLocalBuffers diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 54394303a8..50005f9166 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -108,10 +108,11 @@ extern Relation RelationBuildLocalRelation(const char *relname, char relkind); /* - * Routine to manage assignment of new relfilenode to a relation + * Routines to manage assignment of new relfilenode to a relation */ extern void RelationSetNewRelfilenode(Relation relation, char persistence, TransactionId freezeXid, MultiXactId minmulti); +extern void RelationAssumeNewRelfilenode(Relation relation); /* * Routines for flushing/rebuilding relcache entries in various scenarios @@ -124,6 +125,11 @@ extern void RelationCacheInvalidate(void); extern void RelationCloseSmgrByOid(Oid relationId); +#ifdef USE_ASSERT_CHECKING +extern void AssertPendingSyncs_RelationCache(void); +#else +#define AssertPendingSyncs_RelationCache() do {} while (0) +#endif extern void AtEOXact_RelationCache(bool isCommit); extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid); diff --git a/src/test/recovery/t/018_wal_optimize.pl b/src/test/recovery/t/018_wal_optimize.pl new file mode 100644 index 0000000000..50bb2fef61 --- /dev/null +++ b/src/test/recovery/t/018_wal_optimize.pl @@ -0,0 +1,372 @@ +# Test WAL replay when some operation has skipped WAL. +# +# These tests exercise code that once violated the mandate described in +# src/backend/access/transam/README section "Skipping WAL for New +# RelFileNode". The tests work by committing some transactions, initiating an +# immediate shutdown, and confirming that the expected data survives recovery. +# For many years, individual commands made the decision to skip WAL, hence the +# frequent appearance of COPY in these tests. +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use Test::More tests => 34; + +sub check_orphan_relfilenodes +{ + my ($node, $test_name) = @_; + + my $db_oid = $node->safe_psql('postgres', + "SELECT oid FROM pg_database WHERE datname = 'postgres'"); + my $prefix = "base/$db_oid/"; + my $filepaths_referenced = $node->safe_psql( + 'postgres', " + SELECT pg_relation_filepath(oid) FROM pg_class + WHERE reltablespace = 0 AND relpersistence <> 't' AND + pg_relation_filepath(oid) IS NOT NULL;"); + is_deeply( + [ + sort(map { "$prefix$_" } + grep(/^[0-9]+$/, slurp_dir($node->data_dir . "/$prefix"))) + ], + [ sort split /\n/, $filepaths_referenced ], + $test_name); + return; +} + +# We run this same test suite for both wal_level=minimal and replica. +sub run_wal_optimize +{ + my $wal_level = shift; + + my $node = get_new_node("node_$wal_level"); + $node->init; + $node->append_conf( + 'postgresql.conf', qq( +wal_level = $wal_level +max_prepared_transactions = 1 +wal_log_hints = on +wal_skip_threshold = 0 +#wal_debug = on +)); + $node->start; + + # Setup + my $tablespace_dir = $node->basedir . 
'/tablespace_other';
+ mkdir($tablespace_dir);
+ $tablespace_dir = TestLib::perl2host($tablespace_dir);
+ $node->safe_psql('postgres',
+ "CREATE TABLESPACE other LOCATION '$tablespace_dir';");
+
+ # Test direct truncation optimization. No tuples.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE trunc (id serial PRIMARY KEY);
+ TRUNCATE trunc;
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ my $result = $node->safe_psql('postgres', "SELECT count(*) FROM trunc;");
+ is($result, qq(0), "wal_level = $wal_level, TRUNCATE with empty table");
+
+ # Test truncation with inserted tuples within the same transaction.
+ # Tuples inserted after the truncation should be seen.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE trunc_ins (id serial PRIMARY KEY);
+ INSERT INTO trunc_ins VALUES (DEFAULT);
+ TRUNCATE trunc_ins;
+ INSERT INTO trunc_ins VALUES (DEFAULT);
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres',
+ "SELECT count(*), min(id) FROM trunc_ins;");
+ is($result, qq(1|2), "wal_level = $wal_level, TRUNCATE INSERT");
+
+ # Same for prepared transaction.
+ # Tuples inserted after the truncation should be seen.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE twophase (id serial PRIMARY KEY);
+ INSERT INTO twophase VALUES (DEFAULT);
+ TRUNCATE twophase;
+ INSERT INTO twophase VALUES (DEFAULT);
+ PREPARE TRANSACTION 't';
+ COMMIT PREPARED 't';");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres',
+ "SELECT count(*), min(id) FROM twophase;");
+ is($result, qq(1|2), "wal_level = $wal_level, TRUNCATE INSERT PREPARE");
+
+ # Writing WAL at end of xact, instead of syncing.
+ $node->safe_psql(
+ 'postgres', "
+ SET wal_skip_threshold = '1TB';
+ BEGIN;
+ CREATE TABLE noskip (id serial PRIMARY KEY);
+ INSERT INTO noskip (SELECT FROM generate_series(1, 20000) a);
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM noskip;");
+ is($result, qq(20000), "wal_level = $wal_level, end-of-xact WAL");
+
+ # Data file for COPY query in subsequent tests
+ my $basedir = $node->basedir;
+ my $copy_file = "$basedir/copy_data.txt";
+ TestLib::append_to_file(
+ $copy_file, qq(20000,30000
+20001,30001
+20002,30002));
+
+ # Test truncation with inserted tuples using both INSERT and COPY. Tuples
+ # inserted after the truncation should be seen.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE ins_trunc (id serial PRIMARY KEY, id2 int);
+ INSERT INTO ins_trunc VALUES (DEFAULT, generate_series(1,10000));
+ TRUNCATE ins_trunc;
+ INSERT INTO ins_trunc (id, id2) VALUES (DEFAULT, 10000);
+ COPY ins_trunc FROM '$copy_file' DELIMITER ',';
+ INSERT INTO ins_trunc (id, id2) VALUES (DEFAULT, 10000);
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_trunc;");
+ is($result, qq(5), "wal_level = $wal_level, TRUNCATE COPY INSERT");
+
+ # Test truncation with inserted tuples using COPY. Tuples copied after
+ # the truncation should be seen.
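+ # (With wal_skip_threshold = 0, as set above, the skipped-WAL data is
+ # persisted by sync at commit rather than by end-of-xact WAL.)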
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE trunc_copy (id serial PRIMARY KEY, id2 int);
+ INSERT INTO trunc_copy VALUES (DEFAULT, generate_series(1,3000));
+ TRUNCATE trunc_copy;
+ COPY trunc_copy FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM trunc_copy;");
+ is($result, qq(3), "wal_level = $wal_level, TRUNCATE COPY");
+
+ # Like previous test, but rollback SET TABLESPACE in a subtransaction.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE spc_abort (id serial PRIMARY KEY, id2 int);
+ INSERT INTO spc_abort VALUES (DEFAULT, generate_series(1,3000));
+ TRUNCATE spc_abort;
+ SAVEPOINT s;
+ ALTER TABLE spc_abort SET TABLESPACE other; ROLLBACK TO s;
+ COPY spc_abort FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_abort;");
+ is($result, qq(3),
+ "wal_level = $wal_level, SET TABLESPACE abort subtransaction");
+
+ # Like previous test, but commit the SET TABLESPACE subtransaction.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE spc_commit (id serial PRIMARY KEY, id2 int);
+ INSERT INTO spc_commit VALUES (DEFAULT, generate_series(1,3000));
+ TRUNCATE spc_commit;
+ SAVEPOINT s; ALTER TABLE spc_commit SET TABLESPACE other; RELEASE s;
+ COPY spc_commit FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM spc_commit;");
+ is($result, qq(3),
+ "wal_level = $wal_level, SET TABLESPACE commit subtransaction");
+
+ # Like previous tests, but with nested subtransactions.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE spc_nest (id serial PRIMARY KEY, id2 int);
+ INSERT INTO spc_nest VALUES (DEFAULT, generate_series(1,3000));
+ TRUNCATE spc_nest;
+ SAVEPOINT s;
+ ALTER TABLE spc_nest SET TABLESPACE other;
+ SAVEPOINT s2;
+ ALTER TABLE spc_nest SET TABLESPACE pg_default;
+ ROLLBACK TO s2;
+ SAVEPOINT s2;
+ ALTER TABLE spc_nest SET TABLESPACE pg_default;
+ RELEASE s2;
+ ROLLBACK TO s;
+ COPY spc_nest FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_nest;");
+ is($result, qq(3),
+ "wal_level = $wal_level, SET TABLESPACE nested subtransaction");
+
+ $node->safe_psql(
+ 'postgres', "
+ CREATE TABLE spc_hint (id int);
+ INSERT INTO spc_hint VALUES (1);
+ BEGIN;
+ ALTER TABLE spc_hint SET TABLESPACE other;
+ CHECKPOINT;
+ SELECT * FROM spc_hint; -- set hint bit
+ INSERT INTO spc_hint VALUES (2);
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM spc_hint;");
+ is($result, qq(2), "wal_level = $wal_level, SET TABLESPACE, hint bit");
+
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE idx_hint (c int PRIMARY KEY);
+ SAVEPOINT q; INSERT INTO idx_hint VALUES (1); ROLLBACK TO q;
+ CHECKPOINT;
+ INSERT INTO idx_hint VALUES (1); -- set index hint bit
+ INSERT INTO idx_hint VALUES (2);
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ my ($ret, $stdout, $stderr) =
+ $node->psql('postgres', "INSERT INTO idx_hint VALUES (2);");
+ is($ret, qq(3), "wal_level = $wal_level, unique index LP_DEAD");
+ like(
+ $stderr,
+ qr/violates unique/,
+ "wal_level = $wal_level, unique index LP_DEAD message");
+
+ # UPDATE touches two buffers for one row.
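+ # (An UPDATE can place the new tuple version on a different block than
+ # the old version, so recovery must get both modified buffers right.)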
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE upd (id serial PRIMARY KEY, id2 int);
+ INSERT INTO upd (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+ COPY upd FROM '$copy_file' DELIMITER ',';
+ UPDATE upd SET id2 = id2 + 1;
+ DELETE FROM upd;
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM upd;");
+ is($result, qq(0),
+ "wal_level = $wal_level, UPDATE touches two buffers for one row");
+
+ # Test consistency of COPY with INSERT for table created in the same
+ # transaction.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE ins_copy (id serial PRIMARY KEY, id2 int);
+ INSERT INTO ins_copy VALUES (DEFAULT, 1);
+ COPY ins_copy FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_copy;");
+ is($result, qq(4), "wal_level = $wal_level, INSERT COPY");
+
+ # Test consistency of COPY that inserts more rows into the same table via
+ # triggers. If the trigger's INSERTs go to the same block the data is
+ # copied to, and those INSERTs are WAL-logged while the COPY is not, WAL
+ # replay would fail: the WAL record's \"before\" image would not match the
+ # page, because not all changes were WAL-logged.
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE ins_trig (id serial PRIMARY KEY, id2 text);
+ CREATE FUNCTION ins_trig_before_row_trig() RETURNS trigger
+ LANGUAGE plpgsql as \$\$
+ BEGIN
+ IF new.id2 NOT LIKE 'triggered%' THEN
+ INSERT INTO ins_trig
+ VALUES (DEFAULT, 'triggered row before' || NEW.id2);
+ END IF;
+ RETURN NEW;
+ END; \$\$;
+ CREATE FUNCTION ins_trig_after_row_trig() RETURNS trigger
+ LANGUAGE plpgsql as \$\$
+ BEGIN
+ IF new.id2 NOT LIKE 'triggered%' THEN
+ INSERT INTO ins_trig
+ VALUES (DEFAULT, 'triggered row after' || NEW.id2);
+ END IF;
+ RETURN NEW;
+ END; \$\$;
+ CREATE TRIGGER ins_trig_before_row_insert
+ BEFORE INSERT ON ins_trig
+ FOR EACH ROW EXECUTE PROCEDURE ins_trig_before_row_trig();
+ CREATE TRIGGER ins_trig_after_row_insert
+ AFTER INSERT ON ins_trig
+ FOR EACH ROW EXECUTE PROCEDURE ins_trig_after_row_trig();
+ COPY ins_trig FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result = $node->safe_psql('postgres', "SELECT count(*) FROM ins_trig;");
+ is($result, qq(9), "wal_level = $wal_level, COPY with INSERT triggers");
+
+ # Test consistency of INSERT, COPY and TRUNCATE in same transaction block
+ # with TRUNCATE triggers.
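+ # (The BEFORE trigger's row is removed by the TRUNCATE itself; the AFTER
+ # trigger's row plus the three COPY rows should survive, hence four rows.)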
+ $node->safe_psql(
+ 'postgres', "
+ BEGIN;
+ CREATE TABLE trunc_trig (id serial PRIMARY KEY, id2 text);
+ CREATE FUNCTION trunc_trig_before_stat_trig() RETURNS trigger
+ LANGUAGE plpgsql as \$\$
+ BEGIN
+ INSERT INTO trunc_trig VALUES (DEFAULT, 'triggered stat before');
+ RETURN NULL;
+ END; \$\$;
+ CREATE FUNCTION trunc_trig_after_stat_trig() RETURNS trigger
+ LANGUAGE plpgsql as \$\$
+ BEGIN
+ INSERT INTO trunc_trig VALUES (DEFAULT, 'triggered stat after');
+ RETURN NULL;
+ END; \$\$;
+ CREATE TRIGGER trunc_trig_before_stat_truncate
+ BEFORE TRUNCATE ON trunc_trig
+ FOR EACH STATEMENT EXECUTE PROCEDURE trunc_trig_before_stat_trig();
+ CREATE TRIGGER trunc_trig_after_stat_truncate
+ AFTER TRUNCATE ON trunc_trig
+ FOR EACH STATEMENT EXECUTE PROCEDURE trunc_trig_after_stat_trig();
+ INSERT INTO trunc_trig VALUES (DEFAULT, 1);
+ TRUNCATE trunc_trig;
+ COPY trunc_trig FROM '$copy_file' DELIMITER ',';
+ COMMIT;");
+ $node->stop('immediate');
+ $node->start;
+ $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM trunc_trig;");
+ is($result, qq(4),
+ "wal_level = $wal_level, TRUNCATE COPY with TRUNCATE triggers");
+
+ # Test redo of temp table creation.
+ $node->safe_psql(
+ 'postgres', "
+ CREATE TEMP TABLE temp (id serial PRIMARY KEY, id2 text);");
+ $node->stop('immediate');
+ $node->start;
+ check_orphan_relfilenodes($node,
+ "wal_level = $wal_level, no orphan relfilenode remains");
+
+ return;
+}
+
+# Run same test suite for multiple wal_level values.
+run_wal_optimize("minimal");
+run_wal_optimize("replica");
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index 4e9c15bbd4..f2f9c0a3f8 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -2073,6 +2073,12 @@ select * from another;
 (3 rows)

 drop table another;
+-- Create an index that skips WAL, then perform a SET DATA TYPE that skips
+-- rewriting the index.
+begin;
+create table skip_wal_skip_rewrite_index (c varchar(10) primary key);
+alter table skip_wal_skip_rewrite_index alter c type varchar(20);
+commit;
 -- table's row type
 create table tab1 (a int, b text);
 create table tab2 (x int, y tab1);
diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out
index f44024aedc..ac3e1b0353 100644
--- a/src/test/regress/expected/create_table.out
+++ b/src/test/regress/expected/create_table.out
@@ -290,6 +290,19 @@ SELECT firstc, lastc FROM extra_wide_table;
 -- check that the oid column is added before the primary key is checked
 CREATE TABLE oid_pk (f1 INT, PRIMARY KEY(oid)) WITH OIDS;
 DROP TABLE oid_pk;
+-- Verify that subtransaction rollback restores rd_createSubid.
+BEGIN;
+CREATE TABLE remember_create_subid (c int);
+SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q;
+COMMIT;
+DROP TABLE remember_create_subid;
+-- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid.
+CREATE TABLE remember_node_subid (c int); +BEGIN; +ALTER TABLE remember_node_subid ALTER c TYPE bigint; +SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_node_subid; -- -- Partitioned tables -- diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index cfc09e859b..db3738b19e 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1398,6 +1398,13 @@ select * from another; drop table another; +-- Create an index that skips WAL, then perform a SET DATA TYPE that skips +-- rewriting the index. +begin; +create table skip_wal_skip_rewrite_index (c varchar(10) primary key); +alter table skip_wal_skip_rewrite_index alter c type varchar(20); +commit; + -- table's row type create table tab1 (a int, b text); create table tab2 (x int, y tab1); diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index dc5b729ad8..5c71914640 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -300,6 +300,21 @@ SELECT firstc, lastc FROM extra_wide_table; CREATE TABLE oid_pk (f1 INT, PRIMARY KEY(oid)) WITH OIDS; DROP TABLE oid_pk; +-- Verify that subtransaction rollback restores rd_createSubid. +BEGIN; +CREATE TABLE remember_create_subid (c int); +SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_create_subid; + +-- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid. +CREATE TABLE remember_node_subid (c int); +BEGIN; +ALTER TABLE remember_node_subid ALTER c TYPE bigint; +SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_node_subid; + -- -- Partitioned tables --