From c900c15269f0f900d666bd1b0c6df3eff5098678 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 29 Mar 2019 08:09:39 -0400 Subject: [PATCH] Warn more strongly about the dangers of exclusive backup mode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Especially, warn about the hazards of mishandling the backup_label file. Adjust a couple of server messages to be more clear about the hazards associated with removing backup_label files, too. David Steele and Robert Haas, reviewed by Laurenz Albe, Martín Marqués, Peter Eisentraut, and Magnus Hagander. Discussion: http://postgr.es/m/7d85c387-000e-16f0-e00b-50bf83c22127@pgmasters.net --- doc/src/sgml/backup.sgml | 52 +++++++++++++++++++++++-------- src/backend/access/transam/xlog.c | 10 ++++-- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index a73fd4d044..b67da8916a 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -948,13 +948,26 @@ SELECT * FROM pg_stop_backup(false, true); Making an exclusive low level backup + + + + The exclusive backup method is deprecated and should be avoided. + Prior to PostgreSQL 9.6, this was the only + low-level method available, but it is now recommended that all users + upgrade their scripts to use non-exclusive backups. + + + The process for an exclusive backup is mostly the same as for a - non-exclusive one, but it differs in a few key steps. This type of backup - can only be taken on a primary and does not allow concurrent backups. - Prior to PostgreSQL 9.6, this - was the only low-level method available, but it is now recommended that - all users upgrade their scripts to use non-exclusive backups if possible. + non-exclusive one, but it differs in a few key steps. This type of + backup can only be taken on a primary and does not allow concurrent + backups. 
Moreover, because it writes a backup_label file on the + master, it can cause the master to fail to restart automatically after + a crash. On the other hand, the erroneous removal of a backup_label + file from a backup or standby is a common mistake which can result + in serious data corruption. If it is necessary to use this method, + the following steps may be used. @@ -1011,9 +1024,17 @@ SELECT pg_start_backup('label', true); consider during this backup. - Note that if the server crashes during the backup it may not be - possible to restart until the backup_label file has been - manually deleted from the PGDATA directory. + As noted above, if the server crashes during the backup it may not be + possible to restart until the backup_label file has + been manually deleted from the PGDATA directory. Note + that it is very important to never remove the + backup_label file when restoring a backup, because + this will result in corruption. Confusion about when it is appropriate + to remove this file is a common cause of data corruption when using this + method; be very certain that you remove the file only on an existing + master and never when building a standby or restoring a backup, even if + you are building a standby that will subsequently be promoted to a new + master. @@ -1045,11 +1066,16 @@ SELECT pg_stop_backup(); If the archive process has fallen behind because of failures of the archive command, it will keep retrying until the archive succeeds and the backup is complete. - If you wish to place a time limit on the execution of - pg_stop_backup, set an appropriate - statement_timeout value, but make note that if - pg_stop_backup terminates because of this your backup - may not be valid. + + + + When using exclusive backup mode, it is absolutely imperative to ensure + that pg_stop_backup completes successfully at the + end of the backup. 
Even if the backup itself fails, for example due to + lack of disk space, failure to call pg_stop_backup + will leave the server in backup mode indefinitely, causing future backups + to fail and increasing the risk of a restart failure during the time that + backup_label exists. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 19d7911ec5..9840a55c10 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6364,14 +6364,20 @@ StartupXLOG(void) if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false)) ereport(FATAL, (errmsg("could not find redo location referenced by checkpoint record"), - errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir))); + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); } } else { ereport(FATAL, (errmsg("could not locate required checkpoint record"), - errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir))); + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); wasShutdown = false; /* keep compiler quiet */ }