From fba8113c1b74b9508cf2e6b7a18b0fb3637d9ba0 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 29 Mar 2007 00:15:39 +0000 Subject: [PATCH] Teach CLUSTER to skip writing WAL if not needed (ie, not using archiving) --- Simon. Also, code review and cleanup for the previous COPY-no-WAL patches --- Tom. --- doc/src/sgml/perform.sgml | 116 ++++++++++++++++++++------- src/backend/access/heap/heapam.c | 73 ++++++++++------- src/backend/access/heap/tuptoaster.c | 22 +++-- src/backend/catalog/index.c | 9 +-- src/backend/commands/cluster.c | 20 ++++- src/backend/commands/copy.c | 112 +++++++++++++------------- src/backend/executor/execMain.c | 12 +-- src/backend/utils/cache/relcache.c | 75 +++++++++-------- src/include/access/heapam.h | 5 +- src/include/access/tuptoaster.h | 5 +- src/include/utils/rel.h | 8 +- src/include/utils/relcache.h | 6 +- 12 files changed, 281 insertions(+), 182 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5c2a959945..dcd0d1d2d3 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1,4 +1,4 @@ - + Performance Tips @@ -801,7 +801,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; EXECUTE as many times as required. This avoids some of the overhead of repeatedly parsing and planning INSERT. Different interfaces provide this facility - in different ways; look for Prepared Statements in the interface + in different ways; look for prepared statements in the interface documentation. @@ -815,14 +815,12 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; COPY is fastest when used within the same transaction as an earlier CREATE TABLE or - TRUNCATE command. In those cases, no WAL - needs to be written because in case of an error, the files - containing the newly loaded data will be removed automatically. - CREATE TABLE AS SELECT is also optimized - to avoid writing WAL. 
COPY and - CREATE TABLE AS SELECT will write WAL - when archive_command is set and will not - therefore be optimized in that case. + TRUNCATE command. In such cases no WAL + needs to be written, because in case of an error, the files + containing the newly loaded data will be removed anyway. + However, this consideration does not apply when + archive_command is set, as all commands + must write WAL in that case. @@ -897,23 +895,51 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; Turn off <varname>archive_command</varname> - When loading large amounts of data you might want to unset the - archive_command before loading. It might be - faster to take a new base backup once the load has completed - than to allow a large archive to accumulate. + When loading large amounts of data into an installation that uses + WAL archiving, you might want to disable archiving (unset the + archive_command configuration variable) + while loading. It might be + faster to take a new base backup after the load has completed + than to process a large amount of incremental WAL data. - This is particularly important advice because certain commands - will perform more slowly when archive_command - is set, as a result of their needing to write large amounts of WAL. + Aside from avoiding the time for the archiver to process the WAL data, + doing this will actually make certain commands faster, because they + are designed not to write WAL at all if archive_command + is unset. (They can guarantee crash safety more cheaply by doing an + fsync at the end than by writing WAL.) This applies to the following commands: - CREATE TABLE AS SELECT, - CREATE INDEX and also COPY, when - it is executed in the same transaction as a prior - CREATE TABLE or TRUNCATE command. 
+ + + + CREATE TABLE AS SELECT + + + + + CREATE INDEX (and variants such as + ALTER TABLE ADD PRIMARY KEY) + + + + + ALTER TABLE SET TABLESPACE + + + + + CLUSTER + + + + + COPY FROM, when the target table has been + created or truncated earlier in the same transaction + + + - @@ -950,15 +976,43 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; By default, pg_dump uses COPY, and when it is generating a complete schema-and-data dump, it is careful to load data before creating indexes and foreign keys. So in this case - the first several guidelines are handled automatically. What is left - for you to do is to set appropriate (i.e., larger than normal) values - for maintenance_work_mem and - checkpoint_segments, as well as unsetting - archive_command before loading the dump script, - and then to run ANALYZE afterwards and resetting - archive_command if required. All of the - parameters can be reset once the load has completed without needing - to restart the server, as described in . + several guidelines are handled automatically. What is left + for you to do is to: + + + + Set appropriate (i.e., larger than normal) values for + maintenance_work_mem and + checkpoint_segments. + + + + + If using WAL archiving, consider disabling it during the restore. + To do that, unset archive_command before loading the + dump script, and afterwards restore archive_command + and take a fresh base backup. + + + + + Consider whether the whole dump should be restored as a single + transaction. To do that, pass the -1 or + --single-transaction command-line option to + psql or pg_restore. When using this + mode, even the smallest of errors will roll back the entire restore, + possibly discarding many hours of processing. Depending on how + interrelated the data is, that might seem preferable to manual cleanup, + or not. COPY commands will run fastest if you use a single + transaction and have WAL archiving turned off. + + + + + Run ANALYZE afterwards. 
+ + + diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 367831a515..a99aa4ced0 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.229 2007/03/25 19:45:13 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.230 2007/03/29 00:15:37 tgl Exp $ * * * INTERFACE ROUTINES @@ -1360,11 +1360,14 @@ heap_get_latest_tid(Relation relation, * that all new tuples go into new pages not containing any tuples from other * transactions, that the relation gets fsync'd before commit, and that the * transaction emits at least one WAL record to ensure RecordTransactionCommit - * will decide to WAL-log the commit. (see heap_sync() comments also) + * will decide to WAL-log the commit. (See also heap_sync() comments) * * use_fsm is passed directly to RelationGetBufferForTuple, which see for * more info. * + * Note that use_wal and use_fsm will be applied when inserting into the + * heap's TOAST table, too, if the tuple requires any out-of-line data. + * * The return value is the OID assigned to the tuple (either here or by the * caller), or InvalidOid if no OID. The header fields of *tup are updated * to match the stored tuple; in particular tup->t_self receives the actual @@ -1418,7 +1421,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * into the relation; tup is the caller's original untoasted data. */ if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) - heaptup = toast_insert_or_update(relation, tup, NULL, use_wal); + heaptup = toast_insert_or_update(relation, tup, NULL, + use_wal, use_fsm); else heaptup = tup; @@ -1526,8 +1530,10 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * simple_heap_insert - insert a tuple * * Currently, this routine differs from heap_insert only in supplying - * a default command ID. 
But it should be used rather than using - * heap_insert directly in most places where we are modifying system catalogs. + * a default command ID and not allowing access to the speedup options. + * + * This should be used rather than using heap_insert directly in most places + * where we are modifying system catalogs. */ Oid simple_heap_insert(Relation relation, HeapTuple tup) @@ -1535,18 +1541,6 @@ simple_heap_insert(Relation relation, HeapTuple tup) return heap_insert(relation, tup, GetCurrentCommandId(), true, true); } -/* - * fast_heap_insert - insert a tuple with options to improve speed - * - * Currently, this routine allows specifying additional options for speed - * in certain cases, such as WAL-avoiding COPY command - */ -Oid -fast_heap_insert(Relation relation, HeapTuple tup, bool use_wal) -{ - return heap_insert(relation, tup, GetCurrentCommandId(), use_wal, use_wal); -} - /* * heap_delete - delete a tuple * @@ -2112,7 +2106,9 @@ l2: */ if (need_toast) { - heaptup = toast_insert_or_update(relation, newtup, &oldtup, true); + /* Note we always use WAL and FSM during updates */ + heaptup = toast_insert_or_update(relation, newtup, &oldtup, + true, true); newtupsize = MAXALIGN(heaptup->t_len); } else @@ -3988,23 +3984,40 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "UNKNOWN"); } -/* ---------------- - * heap_sync - sync a heap, for use when no WAL has been written +/* + * heap_sync - sync a heap, for use when no WAL has been written * - * ---------------- + * This forces the heap contents (including TOAST heap if any) down to disk. + * If we skipped using WAL, and it's not a temp relation, we must force the + * relation down to disk before it's safe to commit the transaction. This + * requires writing out any dirty buffers and then doing a forced fsync. + * + * Indexes are not touched. (Currently, index operations associated with + * the commands that use this are WAL-logged and so do not need fsync. 
+ * That behavior might change someday, but in any case it's likely that + * any fsync decisions required would be per-index and hence not appropriate + * to be done here.) */ void heap_sync(Relation rel) { - if (!rel->rd_istemp) + /* temp tables never need fsync */ + if (rel->rd_istemp) + return; + + /* main heap */ + FlushRelationBuffers(rel); + /* FlushRelationBuffers will have opened rd_smgr */ + smgrimmedsync(rel->rd_smgr); + + /* toast heap, if any */ + if (OidIsValid(rel->rd_rel->reltoastrelid)) { - /* - * If we skipped using WAL, and it's not a temp relation, - * we must force the relation down to disk before it's - * safe to commit the transaction. This requires forcing - * out any dirty buffers and then doing a forced fsync. - */ - FlushRelationBuffers(rel); - smgrimmedsync(rel->rd_smgr); + Relation toastrel; + + toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock); + FlushRelationBuffers(toastrel); + smgrimmedsync(toastrel->rd_smgr); + heap_close(toastrel, AccessShareLock); } } diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c index b1eb8aea4d..b1e02e1375 100644 --- a/src/backend/access/heap/tuptoaster.c +++ b/src/backend/access/heap/tuptoaster.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.71 2007/02/27 23:48:07 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.72 2007/03/29 00:15:37 tgl Exp $ * * * INTERFACE ROUTINES @@ -33,6 +33,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/tuptoaster.h" +#include "access/xact.h" #include "catalog/catalog.h" #include "utils/fmgroids.h" #include "utils/pg_lzcompress.h" @@ -42,7 +43,8 @@ #undef TOAST_DEBUG static void toast_delete_datum(Relation rel, Datum value); -static Datum toast_save_datum(Relation rel, Datum value, bool use_wal); +static Datum toast_save_datum(Relation rel, Datum value, + bool use_wal, bool use_fsm); static varattrib 
*toast_fetch_datum(varattrib *attr); static varattrib *toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length); @@ -333,6 +335,7 @@ toast_delete(Relation rel, HeapTuple oldtup) * Inputs: * newtup: the candidate new tuple to be inserted * oldtup: the old row version for UPDATE, or NULL for INSERT + * use_wal, use_fsm: flags to be passed to heap_insert() for toast rows * Result: * either newtup if no toasting is needed, or a palloc'd modified tuple * that is what should actually get stored @@ -342,7 +345,8 @@ toast_delete(Relation rel, HeapTuple oldtup) * ---------- */ HeapTuple -toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, bool use_wal) +toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, + bool use_wal, bool use_fsm) { HeapTuple result_tuple; TupleDesc tupleDesc; @@ -618,7 +622,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, bool us i = biggest_attno; old_value = toast_values[i]; toast_action[i] = 'p'; - toast_values[i] = toast_save_datum(rel, toast_values[i], use_wal); + toast_values[i] = toast_save_datum(rel, toast_values[i], + use_wal, use_fsm); if (toast_free[i]) pfree(DatumGetPointer(old_value)); @@ -729,7 +734,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, bool us i = biggest_attno; old_value = toast_values[i]; toast_action[i] = 'p'; - toast_values[i] = toast_save_datum(rel, toast_values[i], use_wal); + toast_values[i] = toast_save_datum(rel, toast_values[i], + use_wal, use_fsm); if (toast_free[i]) pfree(DatumGetPointer(old_value)); @@ -977,7 +983,8 @@ toast_compress_datum(Datum value) * ---------- */ static Datum -toast_save_datum(Relation rel, Datum value, bool use_wal) +toast_save_datum(Relation rel, Datum value, + bool use_wal, bool use_fsm) { Relation toastrel; Relation toastidx; @@ -985,6 +992,7 @@ toast_save_datum(Relation rel, Datum value, bool use_wal) TupleDesc toasttupDesc; Datum t_values[3]; bool t_isnull[3]; + CommandId 
mycid = GetCurrentCommandId(); varattrib *result; struct { @@ -1063,7 +1071,7 @@ toast_save_datum(Relation rel, Datum value, bool use_wal) if (!HeapTupleIsValid(toasttup)) elog(ERROR, "failed to build TOAST tuple"); - fast_heap_insert(toastrel, toasttup, use_wal); + heap_insert(toastrel, toasttup, mycid, use_wal, use_fsm); /* * Create the index entry. We cheat a little here by not using diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index b660a94aab..377bc9f4f2 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.281 2007/03/25 19:45:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.282 2007/03/29 00:15:37 tgl Exp $ * * * INTERFACE ROUTINES @@ -1248,12 +1248,11 @@ setNewRelfilenode(Relation relation) heap_close(pg_class, RowExclusiveLock); - /* Remember we did this in current transaction, to allow later optimisations */ - relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId(); - RelationCacheResetAtEOXact(); - /* Make sure the relfilenode change is visible */ CommandCounterIncrement(); + + /* Mark the rel as having a new relfilenode in current transaction */ + RelationCacheMarkNewRelfilenode(relation); } diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index aa91136940..ac771b77a6 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.157 2007/03/13 00:33:39 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.158 2007/03/29 00:15:37 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -653,6 +653,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) char *nulls; IndexScanDesc scan; HeapTuple tuple; + CommandId mycid = GetCurrentCommandId(); + bool use_wal; /* * Open the relations we 
need. @@ -675,6 +677,17 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) nulls = (char *) palloc(natts * sizeof(char)); memset(nulls, 'n', natts * sizeof(char)); + /* + * We need to log the copied data in WAL iff WAL archiving is enabled AND + * it's not a temp rel. (Since we know the target relation is new and + * can't have any FSM data, we can always tell heap_insert to ignore FSM, + * even when using WAL.) + */ + use_wal = XLogArchivingActive() && !NewHeap->rd_istemp; + + /* use_wal off requires rd_targblock be initially invalid */ + Assert(NewHeap->rd_targblock == InvalidBlockNumber); + /* * Scan through the OldHeap on the OldIndex and copy each tuple into the * NewHeap. @@ -722,7 +735,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) if (NewHeap->rd_rel->relhasoids) HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple)); - simple_heap_insert(NewHeap, copiedTuple); + heap_insert(NewHeap, copiedTuple, mycid, use_wal, false); heap_freetuple(copiedTuple); @@ -734,6 +747,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex) pfree(values); pfree(nulls); + if (!use_wal) + heap_sync(NewHeap); + index_close(OldIndex, NoLock); heap_close(OldHeap, NoLock); heap_close(NewHeap, NoLock); diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index a2e1939ea2..99d347f590 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.278 2007/03/13 00:33:39 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.279 2007/03/29 00:15:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1125,11 +1125,10 @@ DoCopy(const CopyStmt *stmt, const char *queryString) cstate->copy_dest = COPY_FILE; /* default */ cstate->filename = stmt->filename; - if (is_from) /* copy from file to database */ - CopyFrom(cstate); + if (is_from) + CopyFrom(cstate); /* copy from file 
to database */ else - /* copy from database to file */ - DoCopyTo(cstate); + DoCopyTo(cstate); /* copy from database to file */ /* * Close the relation or query. If reading, we can release the @@ -1640,7 +1639,9 @@ CopyFrom(CopyState cstate) ExprContext *econtext; /* used for ExecEvalExpr for default atts */ MemoryContext oldcontext = CurrentMemoryContext; ErrorContextCallback errcontext; - bool use_wal = true; /* By default, we use WAL to log db changes */ + CommandId mycid = GetCurrentCommandId(); + bool use_wal = true; /* by default, use WAL logging */ + bool use_fsm = true; /* by default, use FSM for free space */ Assert(cstate->rel); @@ -1663,6 +1664,48 @@ CopyFrom(CopyState cstate) RelationGetRelationName(cstate->rel)))); } + /*---------- + * Check to see if we can avoid writing WAL + * + * If archive logging is not enabled *and* either + * - table was created in same transaction as this COPY + * - data is being written to relfilenode created in this transaction + * then we can skip writing WAL. It's safe because if the transaction + * doesn't commit, we'll discard the table (or the new relfilenode file). + * If it does commit, we'll have done the heap_sync at the bottom of this + * routine first. + * + * As mentioned in comments in utils/rel.h, the in-same-transaction test + * is not completely reliable, since in rare cases rd_createSubid or + * rd_newRelfilenodeSubid can be cleared before the end of the transaction. + * However this is OK since at worst we will fail to make the optimization. + * + * When skipping WAL it's entirely possible that COPY itself will write no + * WAL records at all. This is of concern because RecordTransactionCommit + * might decide it doesn't need to log our eventual commit, which we + * certainly need it to do. However, we need no special action here for + * that, because if we have a new table or new relfilenode then there + * must have been a WAL-logged pg_class update earlier in the transaction. 
+ * + * Also, if the target file is new-in-transaction, we assume that checking + * FSM for free space is a waste of time, even if we must use WAL because + * of archiving. This could possibly be wrong, but it's unlikely. + * + * The comments for heap_insert and RelationGetBufferForTuple specify that + * skipping WAL logging is only safe if we ensure that our tuples do not + * go into pages containing tuples from any other transactions --- but this + * must be the case if we have a new table or new relfilenode, so we need + * no additional work to enforce that. + *---------- + */ + if (cstate->rel->rd_createSubid != InvalidSubTransactionId || + cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId) + { + use_fsm = false; + if (!XLogArchivingActive()) + use_wal = false; + } + if (pipe) { if (whereToSendOutput == DestRemote) @@ -1832,28 +1875,6 @@ CopyFrom(CopyState cstate) nfields = file_has_oids ? (attr_count + 1) : attr_count; field_strings = (char **) palloc(nfields * sizeof(char *)); - /* - * Check for performance optimization by avoiding WAL writes - * - * If archive logging is not be enabled *and* either - * - table is created in same transaction as this COPY - * - table data is now being written to new relfilenode - * then we can safely avoid writing WAL. Why? - * The data files for the table plus toast table/index, plus any indexes - * will all be dropped at the end of the transaction if it fails, so we - * do not need to worry about inconsistent states. - * As mentioned in comments in utils/rel.h, the in-same-transaction test is - * not completely reliable, since rd_createSubId can be reset to zero in - * certain cases before the end of the creating transaction. - * We are doing this for performance only, so we only need to know: - * if rd_createSubid != InvalidSubTransactionId then it is *always* just - * created. 
If we have PITR enabled, then we *must* use_wal - */ - if ((cstate->rel->rd_createSubid != InvalidSubTransactionId || - cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId) - && !XLogArchivingActive()) - use_wal = false; - /* Initialize state variables */ cstate->fe_eof = false; cstate->eol_type = EOL_UNKNOWN; @@ -2087,7 +2108,7 @@ CopyFrom(CopyState cstate) ExecConstraints(resultRelInfo, slot, estate); /* OK, store the tuple and create index entries for it */ - fast_heap_insert(cstate->rel, tuple, use_wal); + heap_insert(cstate->rel, tuple, mycid, use_wal, use_fsm); if (resultRelInfo->ri_NumIndices > 0) ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false); @@ -2104,32 +2125,6 @@ CopyFrom(CopyState cstate) } } - /* - * If we skipped writing WAL for heaps, then we need to sync - */ - if (!use_wal) - { - /* main heap */ - heap_sync(cstate->rel); - - /* main heap indexes, if any */ - /* we always use WAL for index inserts, so no need to sync */ - - /* toast heap, if any */ - if (OidIsValid(cstate->rel->rd_rel->reltoastrelid)) - { - Relation toastrel; - - toastrel = heap_open(cstate->rel->rd_rel->reltoastrelid, - AccessShareLock); - heap_sync(toastrel); - heap_close(toastrel, AccessShareLock); - } - - /* toast index, if toast heap */ - /* we always use WAL for index inserts, so no need to sync */ - } - /* Done, clean up */ error_context_stack = errcontext.previous; @@ -2164,6 +2159,13 @@ CopyFrom(CopyState cstate) errmsg("could not read from file \"%s\": %m", cstate->filename))); } + + /* + * If we skipped writing WAL, then we need to sync the heap (but not + * indexes since those use WAL anyway) + */ + if (!use_wal) + heap_sync(cstate->rel); } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index b2f7159e8c..cacd7c6fe7 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -26,7 +26,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.291 2007/03/25 
19:45:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.292 2007/03/29 00:15:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2541,14 +2541,8 @@ CloseIntoRel(QueryDesc *queryDesc) /* OpenIntoRel might never have gotten called */ if (estate->es_into_relation_descriptor) { - /* - * If we skipped using WAL, and it's not a temp relation, we must - * force the relation down to disk before it's safe to commit the - * transaction. This requires forcing out any dirty buffers and then - * doing a forced fsync. - */ - if (!estate->es_into_relation_use_wal && - !estate->es_into_relation_descriptor->rd_istemp) + /* If we skipped using WAL, must heap_sync before commit */ + if (!estate->es_into_relation_use_wal) heap_sync(estate->es_into_relation_descriptor); /* close rel, but keep lock until commit */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 91b7f146b4..d8bd36bc94 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.258 2007/03/19 23:38:29 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.259 2007/03/29 00:15:38 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1572,7 +1572,8 @@ RelationClose(Relation relation) #ifdef RELCACHE_FORCE_RELEASE if (RelationHasReferenceCountZero(relation) && - relation->rd_createSubid == InvalidSubTransactionId) + relation->rd_createSubid == InvalidSubTransactionId && + relation->rd_newRelfilenodeSubid == InvalidSubTransactionId) RelationClearRelation(relation, false); #endif } @@ -1759,11 +1760,12 @@ RelationClearRelation(Relation relation, bool rebuild) { /* * When rebuilding an open relcache entry, must preserve ref count and - * rd_createSubid state. 
Also attempt to preserve the tupledesc and - * rewrite-rule substructures in place. (Note: the refcount mechanism - * for tupledescs may eventually ensure that we don't really need to - * preserve the tupledesc in-place, but for now there are still a lot - * of places that assume an open rel's tupledesc won't move.) + * rd_createSubid/rd_newRelfilenodeSubid state. Also attempt to + * preserve the tupledesc and rewrite-rule substructures in place. + * (Note: the refcount mechanism for tupledescs may eventually ensure + * that we don't really need to preserve the tupledesc in-place, but + * for now there are still a lot of places that assume an open rel's + * tupledesc won't move.) * * Note that this process does not touch CurrentResourceOwner; which * is good because whatever ref counts the entry may have do not @@ -1839,7 +1841,7 @@ RelationFlushRelation(Relation relation) /* * New relcache entries are always rebuilt, not flushed; else we'd * forget the "new" status of the relation, which is a useful - * optimization to have. + * optimization to have. Ditto for the new-relfilenode status. */ rebuild = true; } @@ -1916,6 +1918,8 @@ RelationCacheInvalidateEntry(Oid relationId) * so we do not touch new-in-transaction relations; they cannot be targets * of cross-backend SI updates (and our own updates now go through a * separate linked list that isn't limited by the SI message buffer size). + * Likewise, we need not discard new-relfilenode-in-transaction hints, + * since any invalidation of those would be a local event. * * We do this in two phases: the first pass deletes deletable items, and * the second one rebuilds the rebuildable items. This is essential for @@ -1958,14 +1962,6 @@ RelationCacheInvalidate(void) if (relation->rd_createSubid != InvalidSubTransactionId) continue; - /* - * Reset newRelfilenode hint. It is never used for correctness, only - * for performance optimization. 
An incorrectly set hint can lead - * to data loss in some circumstances, so play safe. - */ - if (relation->rd_newRelfilenodeSubid != InvalidSubTransactionId) - relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; - relcacheInvalsReceived++; if (RelationHasReferenceCountZero(relation)) @@ -2017,17 +2013,6 @@ RelationCacheInvalidate(void) list_free(rebuildList); } -/* - * RelationCacheResetAtEOXact - * - * Register that work will be required at main-transaction commit or abort - */ -void -RelationCacheResetAtEOXact(void) -{ - need_eoxact_work = true; -} - /* * AtEOXact_RelationCache * @@ -2056,9 +2041,10 @@ AtEOXact_RelationCache(bool isCommit) * the debug-only Assert checks, most transactions don't create any work * for us to do here, so we keep a static flag that gets set if there is * anything to do. (Currently, this means either a relation is created in - * the current xact, or an index list is forced.) For simplicity, the - * flag remains set till end of top-level transaction, even though we - * could clear it at subtransaction end in some cases. + * the current xact, or one is given a new relfilenode, or an index list + * is forced.) For simplicity, the flag remains set till end of top-level + * transaction, even though we could clear it at subtransaction end in + * some cases. */ if (!need_eoxact_work #ifdef USE_ASSERT_CHECKING @@ -2111,6 +2097,10 @@ AtEOXact_RelationCache(bool isCommit) continue; } } + + /* + * Likewise, reset the hint about the relfilenode being new. + */ relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; /* @@ -2173,6 +2163,10 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, continue; } } + + /* + * Likewise, update or drop any new-relfilenode-in-subtransaction hint. 
+ */ if (relation->rd_newRelfilenodeSubid == mySubid) { if (isCommit) @@ -2194,6 +2188,23 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, } } +/* + * RelationCacheMarkNewRelfilenode + * + * Mark the rel as having been given a new relfilenode in the current + * (sub) transaction. This is a hint that can be used to optimize + * later operations on the rel in the same transaction. + */ +void +RelationCacheMarkNewRelfilenode(Relation rel) +{ + /* Mark it... */ + rel->rd_newRelfilenodeSubid = GetCurrentSubTransactionId(); + /* ... and now we have eoxact cleanup work to do */ + need_eoxact_work = true; +} + + /* * RelationBuildLocalRelation * Build a relcache entry for an about-to-be-created relation, @@ -2272,7 +2283,7 @@ RelationBuildLocalRelation(const char *relname, rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; /* must flag that we have rels created in this transaction */ - RelationCacheResetAtEOXact(); + need_eoxact_work = true; /* is it a temporary relation? 
*/ rel->rd_istemp = isTempNamespace(relnamespace); @@ -2928,7 +2939,7 @@ RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex) relation->rd_oidindex = oidIndex; relation->rd_indexvalid = 2; /* mark list as forced */ /* must flag that we have a forced index list */ - RelationCacheResetAtEOXact(); + need_eoxact_work = true; } /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 3a68959957..6c7c98b3f2 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.120 2007/01/25 02:17:26 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.121 2007/03/29 00:15:39 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -178,9 +178,6 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup); -extern Oid fast_heap_insert(Relation relation, HeapTuple tup, bool use_wal); - - extern void heap_markpos(HeapScanDesc scan); extern void heap_restrpos(HeapScanDesc scan); diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h index 3693379dba..6cc0bdcbe8 100644 --- a/src/include/access/tuptoaster.h +++ b/src/include/access/tuptoaster.h @@ -6,7 +6,7 @@ * * Copyright (c) 2000-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/access/tuptoaster.h,v 1.32 2007/02/05 04:22:18 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/tuptoaster.h,v 1.33 2007/03/29 00:15:39 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -79,7 +79,8 @@ * ---------- */ extern HeapTuple toast_insert_or_update(Relation rel, - HeapTuple newtup, HeapTuple oldtup, bool use_wal); + HeapTuple newtup, HeapTuple 
oldtup, + bool use_wal, bool use_fsm); /* ---------- * toast_delete - diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 2963cc6616..33795de2bf 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.99 2007/03/19 23:38:32 wieck Exp $ + * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.100 2007/03/29 00:15:39 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -138,13 +138,17 @@ typedef struct RelationData char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 = * valid, 2 = temporarily forced */ SubTransactionId rd_createSubid; /* rel was created in current xact */ - SubTransactionId rd_newRelfilenodeSubid; /* rel had new relfilenode in current xact */ + SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned + * in current xact */ /* * rd_createSubid is the ID of the highest subtransaction the rel has * survived into; or zero if the rel was not created in the current top * transaction. This should be relied on only for optimization purposes; * it is possible for new-ness to be "forgotten" (eg, after CLUSTER). + * Likewise, rd_newRelfilenodeSubid is the ID of the highest subtransaction + * the relfilenode change has survived into, or zero if not changed in + * the current transaction (or we have forgotten changing it). 
*/ Form_pg_class rd_rel; /* RELATION tuple */ TupleDesc rd_att; /* tuple descriptor */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index c7b549d1cf..25b60082a0 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.58 2007/03/03 20:08:41 momjian Exp $ + * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.59 2007/03/29 00:15:39 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -60,12 +60,12 @@ extern void RelationCacheInvalidateEntry(Oid relationId); extern void RelationCacheInvalidate(void); -extern void RelationCacheResetAtEOXact(void); - extern void AtEOXact_RelationCache(bool isCommit); extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid); +extern void RelationCacheMarkNewRelfilenode(Relation rel); + /* * Routines to help manage rebuilding of relcache init file */