diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 9f70a88a7a..6a65909d9a 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -863,16 +863,6 @@ PostgreSQL documentation
         and the two systems have different definitions of the
         collation used to sort the partitioning column.
        </para>
-
-       <para>
-        It is best not to use parallelism when restoring from an archive made
-        with this option, because <application>pg_restore</application> will
-        not know exactly which partition(s) a given archive data item will
-        load data into.  This could result in inefficiency due to lock
-        conflicts between parallel jobs, or perhaps even restore failures due
-        to foreign key constraints being set up before all the relevant data
-        is loaded.
-       </para>
       </listitem>
      </varlistentry>
 
diff --git a/doc/src/sgml/ref/pg_dumpall.sgml b/doc/src/sgml/ref/pg_dumpall.sgml
index bedc5a1c9a..412a6bb64a 100644
--- a/doc/src/sgml/ref/pg_dumpall.sgml
+++ b/doc/src/sgml/ref/pg_dumpall.sgml
@@ -360,10 +360,6 @@ PostgreSQL documentation
         and the two systems have different definitions of the
         collation used to sort the partitioning column.
        </para>
-
-
       </listitem>
      </varlistentry>
 
diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c
index 1f24e79665..3cabd82e9c 100644
--- a/src/bin/pg_dump/common.c
+++ b/src/bin/pg_dump/common.c
@@ -228,6 +228,9 @@ getSchemaData(Archive *fout, int *numTablesPtr)
     pg_log_info("flagging inherited columns in subtables");
     flagInhAttrs(fout->dopt, tblinfo, numTables);
 
+    pg_log_info("reading partitioning data");
+    getPartitioningInfo(fout);
+
     pg_log_info("reading indexes");
     getIndexes(fout, tblinfo, numTables);
 
@@ -280,7 +283,6 @@ static void
 flagInhTables(Archive *fout, TableInfo *tblinfo, int numTables,
               InhInfo *inhinfo, int numInherits)
 {
-    DumpOptions *dopt = fout->dopt;
     int         i,
                 j;
 
@@ -296,18 +298,18 @@ flagInhTables(Archive *fout, TableInfo *tblinfo, int numTables,
             continue;
 
         /*
-         * Normally, we don't bother computing anything for non-target tables,
-         * but if load-via-partition-root is specified, we gather information
-         * on every partition in the system so that getRootTableInfo can trace
-         * from any given leaf partition all the way up to the root.  (We
-         * don't need to mark them as interesting for getTableAttrs, though.)
+         * Normally, we don't bother computing anything for non-target tables.
+         * However, we must find the parents of non-root partitioned tables in
+         * any case, so that we can trace from leaf partitions up to the root
+         * (in case a leaf is to be dumped but its parents are not).  We need
+         * not mark such parents interesting for getTableAttrs, though.
         */
         if (!tblinfo[i].dobj.dump)
         {
             mark_parents = false;
 
-            if (!dopt->load_via_partition_root ||
-                !tblinfo[i].ispartition)
+            if (!(tblinfo[i].relkind == RELKIND_PARTITIONED_TABLE &&
+                  tblinfo[i].ispartition))
                 find_parents = false;
         }
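
For illustration of the flagInhTables change: the parent links gathered here are what allow a leaf to be traced to its root even when the ancestors are not themselves dumped. A hedged SQL sketch (hypothetical names, not part of the patch):

    -- Only the leaf is selected for dumping, yet its ancestry matters.
    create table root (en text, data int) partition by hash (en);
    create table root_p1 partition of root
        for values with (modulus 2, remainder 0) partition by range (data);
    create table leaf partition of root_p1 for values from (0) to (1000);
    -- "pg_dump -t leaf" dumps neither root nor root_p1, but flagInhTables
    -- still records leaf -> root_p1 -> root, so the data can be addressed
    -- to root if load-via-partition-root applies.
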
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 87113cfd44..049b79fba9 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -91,6 +91,7 @@ static RestorePass _tocEntryRestorePass(TocEntry *te);
 static bool _tocEntryIsACL(TocEntry *te);
 static void _disableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te);
 static void _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te);
+static bool is_load_via_partition_root(TocEntry *te);
 static void buildTocEntryArrays(ArchiveHandle *AH);
 static void _moveBefore(TocEntry *pos, TocEntry *te);
 static int  _discoverArchiveFormat(ArchiveHandle *AH);
@@ -878,6 +879,8 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, bool is_parallel)
                 }
                 else
                 {
+                    bool        use_truncate;
+
                     _disableTriggersIfNecessary(AH, te);
 
                     /* Select owner and schema as necessary */
@@ -889,13 +892,24 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, bool is_parallel)
 
                     /*
                      * In parallel restore, if we created the table earlier in
-                     * the run then we wrap the COPY in a transaction and
-                     * precede it with a TRUNCATE.  If archiving is not on
-                     * this prevents WAL-logging the COPY.  This obtains a
-                     * speedup similar to that from using single_txn mode in
-                     * non-parallel restores.
+                     * this run (so that we know it is empty) and we are not
+                     * restoring a load-via-partition-root data item then we
+                     * wrap the COPY in a transaction and precede it with a
+                     * TRUNCATE.  If wal_level is set to minimal this prevents
+                     * WAL-logging the COPY.  This obtains a speedup similar
+                     * to that from using single_txn mode in non-parallel
+                     * restores.
+                     *
+                     * We mustn't do this for load-via-partition-root cases
+                     * because some data might get moved across partition
+                     * boundaries, risking deadlock and/or loss of previously
+                     * loaded data.  (We assume that all partitions of a
+                     * partitioned table will be treated the same way.)
                      */
-                    if (is_parallel && te->created)
+                    use_truncate = is_parallel && te->created &&
+                        !is_load_via_partition_root(te);
+
+                    if (use_truncate)
                     {
                         /*
                          * Parallel restore is always talking directly to a
@@ -936,7 +950,7 @@ restore_toc_entry(ArchiveHandle *AH, TocEntry *te, bool is_parallel)
                     AH->outputKind = OUTPUT_SQLCMDS;
 
                     /* close out the transaction started above */
-                    if (is_parallel && te->created)
+                    if (use_truncate)
                         CommitTransaction(&AH->public);
 
                     _enableTriggersIfNecessary(AH, te);
@@ -1028,6 +1042,43 @@ _enableTriggersIfNecessary(ArchiveHandle *AH, TocEntry *te)
                      fmtQualifiedId(te->namespace, te->tag));
 }
 
+/*
+ * Detect whether a TABLE DATA TOC item is performing "load via partition
+ * root", that is, the target table is an ancestor partition rather than the
+ * table the TOC item is nominally for.
+ *
+ * In newer archive files this can be detected by checking for a special
+ * comment placed in te->defn.  In older files we have to fall back to seeing
+ * if the COPY statement targets the named table or some other one.  This
+ * will not work for data dumped as INSERT commands, so we could give a false
+ * negative in that case; fortunately, that's a rarely-used option.
+ */
+static bool
+is_load_via_partition_root(TocEntry *te)
+{
+    if (te->defn &&
+        strncmp(te->defn, "-- load via partition root ", 27) == 0)
+        return true;
+    if (te->copyStmt && *te->copyStmt)
+    {
+        PQExpBuffer copyStmt = createPQExpBuffer();
+        bool        result;
+
+        /*
+         * Build the initial part of the COPY as it would appear if the
+         * nominal target table is the actual target.  If we see anything
+         * else, it must be a load-via-partition-root case.
+         */
+        appendPQExpBuffer(copyStmt, "COPY %s ",
+                          fmtQualifiedId(te->namespace, te->tag));
+        result = strncmp(te->copyStmt, copyStmt->data, copyStmt->len) != 0;
+        destroyPQExpBuffer(copyStmt);
+        return result;
+    }
+    /* Assume it's not load-via-partition-root */
+    return false;
+}
+
 /*
  * This is a routine that is part of the dumper interface, hence the
  * 'Archive*' parameter.
  */
@@ -2956,8 +3007,12 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH)
             res = res & ~REQ_DATA;
     }
 
-    /* If there's no definition command, there's no schema component */
-    if (!te->defn || !te->defn[0])
+    /*
+     * If there's no definition command, there's no schema component.  Treat
+     * "load via partition root" comments as not schema.
+     */
+    if (!te->defn || !te->defn[0] ||
+        strncmp(te->defn, "-- load via partition root ", 27) == 0)
         res = res & ~REQ_SCHEMA;
 
     /*
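
A sketch of what is_load_via_partition_root compares, written as SQL comments; the field values are illustrative, not literal archive contents:

    -- Newer archives: te->defn carries the marker comment, e.g.
    --     -- load via partition root public.tht
    -- Older archives: the COPY target betrays the redirection, e.g.
    --     te->tag      = 'tht_p1'
    --     te->copyStmt = 'COPY public.tht (en, data) FROM stdin;'
    -- Rebuilding the expected prefix 'COPY public.tht_p1 ' and comparing it
    -- against te->copyStmt exposes the mismatch; INSERT-style dumps have no
    -- copyStmt, hence the possible false negative noted in the comment above.
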
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 345a16d489..3f63b34395 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -302,6 +302,7 @@ static void appendReloptionsArrayAH(PQExpBuffer buffer, const char *reloptions,
 static char *get_synchronized_snapshot(Archive *fout);
 static void setupDumpWorker(Archive *AHX);
 static TableInfo *getRootTableInfo(const TableInfo *tbinfo);
+static bool forcePartitionRootLoad(const TableInfo *tbinfo);
 
 
 int
@@ -2217,11 +2218,13 @@ dumpTableData_insert(Archive *fout, const void *dcontext)
             insertStmt = createPQExpBuffer();
 
             /*
-             * When load-via-partition-root is set, get the root table name
-             * for the partition table, so that we can reload data through the
-             * root table.
+             * When load-via-partition-root is set or forced, get the root
+             * table name for the partition table, so that we can reload data
+             * through the root table.
              */
-            if (dopt->load_via_partition_root && tbinfo->ispartition)
+            if (tbinfo->ispartition &&
+                (dopt->load_via_partition_root ||
+                 forcePartitionRootLoad(tbinfo)))
                 targettab = getRootTableInfo(tbinfo);
             else
                 targettab = tbinfo;
@@ -2419,6 +2422,35 @@ getRootTableInfo(const TableInfo *tbinfo)
     return parentTbinfo;
 }
 
+/*
+ * forcePartitionRootLoad
+ *     Check if we must force load_via_partition_root for this partition.
+ *
+ * This is required if any level of ancestral partitioned table has an
+ * unsafe partitioning scheme.
+ */
+static bool
+forcePartitionRootLoad(const TableInfo *tbinfo)
+{
+    TableInfo  *parentTbinfo;
+
+    Assert(tbinfo->ispartition);
+    Assert(tbinfo->numParents == 1);
+
+    parentTbinfo = tbinfo->parents[0];
+    if (parentTbinfo->unsafe_partitions)
+        return true;
+    while (parentTbinfo->ispartition)
+    {
+        Assert(parentTbinfo->numParents == 1);
+        parentTbinfo = parentTbinfo->parents[0];
+        if (parentTbinfo->unsafe_partitions)
+            return true;
+    }
+
+    return false;
+}
+
 /*
  * dumpTableData -
  *    dump the contents of a single table
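
forcePartitionRootLoad returns true as soon as any ancestor, at any level, is flagged unsafe. A hedged SQL sketch (hypothetical names; assumes the digit enum from the test below) where the unsafe level sits in the middle of the hierarchy:

    -- The hash-on-enum level is "mid", but data for mid_p1 must still be
    -- routed through the topmost root "top", per getRootTableInfo.
    create type digit as enum ('0','1','2','3','4','5','6','7','8','9');
    create table top (en digit, data int) partition by range (data);
    create table mid partition of top
        for values from (0) to (1000) partition by hash (en);
    create table mid_p1 partition of mid for values with (modulus 2, remainder 0);
    create table mid_p2 partition of mid for values with (modulus 2, remainder 1);
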
@@ -2433,34 +2465,40 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
     PQExpBuffer copyBuf = createPQExpBuffer();
     PQExpBuffer clistBuf = createPQExpBuffer();
     DataDumperPtr dumpFn;
+    char       *tdDefn = NULL;
     char       *copyStmt;
     const char *copyFrom;
 
     /* We had better have loaded per-column details about this table */
     Assert(tbinfo->interesting);
 
+    /*
+     * When load-via-partition-root is set or forced, get the root table name
+     * for the partition table, so that we can reload data through the root
+     * table.  Then construct a comment to be inserted into the TOC entry's
+     * defn field, so that such cases can be identified reliably.
+     */
+    if (tbinfo->ispartition &&
+        (dopt->load_via_partition_root ||
+         forcePartitionRootLoad(tbinfo)))
+    {
+        TableInfo  *parentTbinfo;
+
+        parentTbinfo = getRootTableInfo(tbinfo);
+        copyFrom = fmtQualifiedDumpable(parentTbinfo);
+        printfPQExpBuffer(copyBuf, "-- load via partition root %s",
+                          copyFrom);
+        tdDefn = pg_strdup(copyBuf->data);
+    }
+    else
+        copyFrom = fmtQualifiedDumpable(tbinfo);
+
     if (dopt->dump_inserts == 0)
     {
         /* Dump/restore using COPY */
         dumpFn = dumpTableData_copy;
-
-        /*
-         * When load-via-partition-root is set, get the root table name for
-         * the partition table, so that we can reload data through the root
-         * table.
-         */
-        if (dopt->load_via_partition_root && tbinfo->ispartition)
-        {
-            TableInfo  *parentTbinfo;
-
-            parentTbinfo = getRootTableInfo(tbinfo);
-            copyFrom = fmtQualifiedDumpable(parentTbinfo);
-        }
-        else
-            copyFrom = fmtQualifiedDumpable(tbinfo);
-
         /* must use 2 steps here 'cause fmtId is nonreentrant */
-        appendPQExpBuffer(copyBuf, "COPY %s ",
+        printfPQExpBuffer(copyBuf, "COPY %s ",
                           copyFrom);
         appendPQExpBuffer(copyBuf, "%s FROM stdin;\n",
                           fmtCopyColumnList(tbinfo, clistBuf));
@@ -2488,6 +2526,7 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
                                    .owner = tbinfo->rolname,
                                    .description = "TABLE DATA",
                                    .section = SECTION_DATA,
+                                   .createStmt = tdDefn,
                                    .copyStmt = copyStmt,
                                    .deps = &(tbinfo->dobj.dumpId),
                                    .nDeps = 1,
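
How the new TOC fields come out, sketched as SQL comments against the test tables introduced at the end of this patch (values illustrative, not captured from a real archive):

    -- TABLE DATA item for tht_p1 (forced, because tht hashes an enum):
    --     defn:     -- load via partition root public.tht
    --     copyStmt: COPY public.tht (en, data) FROM stdin;
    -- TABLE DATA item for ths_p1 (safe, hashes an int; unchanged behavior):
    --     defn:     (empty)
    --     copyStmt: COPY public.ths_p1 (mod, data) FROM stdin;
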
@@ -7196,6 +7235,76 @@ getInherits(Archive *fout, int *numInherits)
     return inhinfo;
 }
 
+/*
+ * getPartitioningInfo
+ *    get information about partitioning
+ *
+ * For the most part, we only collect partitioning info about tables we
+ * intend to dump.  However, this function has to consider all partitioned
+ * tables in the database, because we need to know about parents of partitions
+ * we are going to dump even if the parents themselves won't be dumped.
+ *
+ * Specifically, what we need to know is whether each partitioned table
+ * has an "unsafe" partitioning scheme that requires us to force
+ * load-via-partition-root mode for its children.  Currently the only case
+ * for which we force that is hash partitioning on enum columns, since the
+ * hash codes depend on enum value OIDs which won't be replicated across
+ * dump-and-reload.  There are other cases in which load-via-partition-root
+ * might be necessary, but we expect users to cope with them.
+ */
+void
+getPartitioningInfo(Archive *fout)
+{
+    PQExpBuffer query;
+    PGresult   *res;
+    int         ntups;
+
+    /* hash partitioning didn't exist before v11 */
+    if (fout->remoteVersion < 110000)
+        return;
+    /* needn't bother if schema-only dump */
+    if (fout->dopt->schemaOnly)
+        return;
+
+    query = createPQExpBuffer();
+
+    /*
+     * Unsafe partitioning schemes are exactly those for which hash enum_ops
+     * appears among the partition opclasses.  We needn't check partstrat.
+     *
+     * Note that this query may well retrieve info about tables we aren't
+     * going to dump and hence have no lock on.  That's okay since we need
+     * not invoke any unsafe server-side functions.
+     */
+    appendPQExpBufferStr(query,
+                         "SELECT partrelid FROM pg_partitioned_table WHERE\n"
+                         "(SELECT c.oid FROM pg_opclass c JOIN pg_am a "
+                         "ON c.opcmethod = a.oid\n"
+                         "WHERE opcname = 'enum_ops' "
+                         "AND opcnamespace = 'pg_catalog'::regnamespace "
+                         "AND amname = 'hash') = ANY(partclass)");
+
+    res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK);
+
+    ntups = PQntuples(res);
+
+    for (int i = 0; i < ntups; i++)
+    {
+        Oid         tabrelid = atooid(PQgetvalue(res, i, 0));
+        TableInfo  *tbinfo;
+
+        tbinfo = findTableByOid(tabrelid);
+        if (tbinfo == NULL)
+            fatal("failed sanity check, table OID %u appearing in pg_partitioned_table not found",
+                  tabrelid);
+        tbinfo->unsafe_partitions = true;
+    }
+
+    PQclear(res);
+
+    destroyPQExpBuffer(query);
+}
+
 /*
  * getIndexes
  *    get information about every index on a dumpable table
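
The catalog query can be exercised by hand; against a database containing the test tables added below, only the enum-hashed table should be reported. The expected output is shown as comments (illustrative, not captured):

    SELECT partrelid::regclass
    FROM pg_partitioned_table
    WHERE (SELECT c.oid
           FROM pg_opclass c
                JOIN pg_am a ON c.opcmethod = a.oid
           WHERE c.opcname = 'enum_ops'
             AND c.opcnamespace = 'pg_catalog'::regnamespace
             AND a.amname = 'hash') = ANY (partclass);
    --  partrelid
    -- -----------
    --  tht
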
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 0d03a8d2ec..55eecc6c94 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -298,6 +298,7 @@ typedef struct _tableInfo
     bool        dummy_view;     /* view's real definition must be postponed */
     bool        postponed_def;  /* matview must be postponed into post-data */
     bool        ispartition;    /* is table a partition? */
+    bool        unsafe_partitions;  /* is it an unsafe partitioned table? */
 
     /*
      * These fields are computed only if we decide the table is interesting
@@ -703,6 +704,7 @@ extern ConvInfo *getConversions(Archive *fout, int *numConversions);
 extern TableInfo *getTables(Archive *fout, int *numTables);
 extern void getOwnedSeqs(Archive *fout, TableInfo tblinfo[], int numTables);
 extern InhInfo *getInherits(Archive *fout, int *numInherits);
+extern void getPartitioningInfo(Archive *fout);
 extern void getIndexes(Archive *fout, TableInfo tblinfo[], int numTables);
 extern void getExtendedStatistics(Archive *fout);
 extern void getConstraints(Archive *fout, TableInfo tblinfo[], int numTables);
diff --git a/src/bin/pg_dump/t/004_pg_dump_parallel.pl b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
new file mode 100644
index 0000000000..f41c2fa223
--- /dev/null
+++ b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
@@ -0,0 +1,81 @@
+
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $dbname1 = 'regression_src';
+my $dbname2 = 'regression_dest1';
+my $dbname3 = 'regression_dest2';
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->start;
+
+my $backupdir = $node->backup_dir;
+
+$node->run_log([ 'createdb', $dbname1 ]);
+$node->run_log([ 'createdb', $dbname2 ]);
+$node->run_log([ 'createdb', $dbname3 ]);
+
+$node->safe_psql(
+    $dbname1,
+    qq{
+create type digit as enum ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9');
+
+-- plain table with index
+create table tplain (en digit, data int unique);
+insert into tplain select (x%10)::text::digit, x from generate_series(1,1000) x;
+
+-- non-troublesome hashed partitioning
+create table ths (mod int, data int, unique(mod, data)) partition by hash(mod);
+create table ths_p1 partition of ths for values with (modulus 3, remainder 0);
+create table ths_p2 partition of ths for values with (modulus 3, remainder 1);
+create table ths_p3 partition of ths for values with (modulus 3, remainder 2);
+insert into ths select (x%10), x from generate_series(1,1000) x;
+
+-- dangerous hashed partitioning
+create table tht (en digit, data int, unique(en, data)) partition by hash(en);
+create table tht_p1 partition of tht for values with (modulus 3, remainder 0);
+create table tht_p2 partition of tht for values with (modulus 3, remainder 1);
+create table tht_p3 partition of tht for values with (modulus 3, remainder 2);
+insert into tht select (x%10)::text::digit, x from generate_series(1,1000) x;
+    });
+
+$node->command_ok(
+    [
+        'pg_dump', '-Fd', '--no-sync', '-j2', '-f', "$backupdir/dump1",
+        $node->connstr($dbname1)
+    ],
+    'parallel dump');
+
+$node->command_ok(
+    [
+        'pg_restore', '-v',
+        '-d', $node->connstr($dbname2),
+        '-j3', "$backupdir/dump1"
+    ],
+    'parallel restore');
+
+$node->command_ok(
+    [
+        'pg_dump', '-Fd',
+        '--no-sync', '-j2',
+        '-f', "$backupdir/dump2",
+        '--inserts', $node->connstr($dbname1)
+    ],
+    'parallel dump as inserts');
+
+$node->command_ok(
+    [
+        'pg_restore', '-v',
+        '-d', $node->connstr($dbname3),
+        '-j3', "$backupdir/dump2"
+    ],
+    'parallel restore as inserts');
+
+done_testing();
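
Why hash partitioning on enum columns is unsafe can be demonstrated directly: the hash support function for enums (pg_catalog.hashenum) hashes the value's OID, and enum OIDs are assigned afresh whenever the type is created. A small illustration, assuming the digit type from the test above:

    -- The label's OID is assigned at CREATE TYPE time:
    select oid, enumlabel from pg_enum where enumtypid = 'digit'::regtype;
    -- The partition hash code derives from that OID, not from the label, so
    -- '3'::digit can route to a different partition after dump-and-reload:
    select hashenum('3'::digit);
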