Prevent re-use of a deleted relation's relfilenode until after the next

checkpoint. This guards against an unlikely data-loss scenario in which we re-use the relfilenode, then crash, then replay the deletion and recreation of the file. Even then we'd be OK if all insertions into the new relation had been WAL-logged ... but that's not guaranteed given all the no-WAL-logging optimizations that have recently been added. Patch by Heikki Linnakangas, per a discussion last month.
2007-11-15 20:36:40 +00:00 · 2007-11-15 20:36:40 +00:00 · 6cc4451b5c
commit 6cc4451b5c
parent 7a550cb95c
5 changed files with 274 additions and 25 deletions
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.286 2007/10/12 19:39:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.287 2007/11/15 20:36:40 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -45,6 +45,7 @@
 #include "storage/fd.h"
 #include "storage/pmsignal.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
 #include "utils/pg_locale.h"
@ -5663,6 +5664,14 @@ CreateCheckPoint(int flags)
 		UpdateControlFile();
 	}
 	/*
 	 * Let smgr prepare for checkpoint; this has to happen before we
 	 * determine the REDO pointer.  Note that smgr must not do anything
 	 * that'd have to be undone if we decide no checkpoint is needed.
 	 */
 	smgrpreckpt();
 	/* Begin filling in the checkpoint WAL record */
 	MemSet(&checkPoint, 0, sizeof(checkPoint));
 	checkPoint.ThisTimeLineID = ThisTimeLineID;
 	checkPoint.time = time(NULL);
@ -5886,6 +5895,11 @@ CreateCheckPoint(int flags)
 	 */
 	END_CRIT_SECTION();
 	/*
 	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
 	 */
 	smgrpostckpt();
 	/*
 	 * Delete old log files (those no longer needed even for previous
 	 * checkpoint).
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -57,6 +57,7 @@
 #include "commands/comment.h"
 #include "commands/tablespace.h"
 #include "miscadmin.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
@ -460,13 +461,29 @@ DropTableSpace(DropTableSpaceStmt *stmt)
 	LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
 	/*
-	 * Try to remove the physical infrastructure
+	 * Try to remove the physical infrastructure. 
 	 */
 	if (!remove_tablespace_directories(tablespaceoid, false))
-		ereport(ERROR,
+	{
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+		/*
-				 errmsg("tablespace \"%s\" is not empty",
+		 * Not all files deleted?  However, there can be lingering empty files
-						tablespacename)));
+		 * in the directories, left behind by for example DROP TABLE, that
 		 * have been scheduled for deletion at next checkpoint (see comments
 		 * in mdunlink() for details).  We could just delete them immediately,
 		 * but we can't tell them apart from important data files that we
 		 * mustn't delete.  So instead, we force a checkpoint which will clean
 		 * out any lingering files, and try again.
 		 */
 		RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
 		if (!remove_tablespace_directories(tablespaceoid, false))
 		{
 			/* Still not empty, the files must be important then */
 			ereport(ERROR,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					 errmsg("tablespace \"%s\" is not empty",
 							tablespacename)));
 		}
 	}
 	/* Record the filesystem change in XLOG */
 	{
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -34,6 +34,7 @@
 /* special values for the segno arg to RememberFsyncRequest */
 #define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST	(InvalidBlockNumber-2)
 /*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
@ -113,6 +114,10 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
 * table remembers the pending operations.	We use a hash table mostly as
 * a convenient way of eliminating duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
 */
@ -131,9 +136,17 @@ typedef struct
 	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
 } PendingOperationEntry;
 typedef struct
 {
 	RelFileNode rnode;			/* the dead relation to delete */
 	CycleCtr cycle_ctr;			/* mdckpt_cycle_ctr when request was made */
 } PendingUnlinkEntry;
 static HTAB *pendingOpsTable = NULL;
 static List *pendingUnlinks = NIL;
 static CycleCtr mdsync_cycle_ctr = 0;
 static CycleCtr mdckpt_cycle_ctr = 0;
 typedef enum					/* behavior for mdopen & _mdfd_getseg */
@ -146,6 +159,7 @@ typedef enum					/* behavior for mdopen & _mdfd_getseg */
 /* local routines */
 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 static void register_unlink(RelFileNode rnode);
 static MdfdVec *_fdvec_alloc(void);
 #ifndef LET_OS_MANAGE_FILESIZE
@ -188,6 +202,7 @@ mdinit(void)
 									  100L,
 									  &hash_ctl,
 								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 		pendingUnlinks = NIL;
 	}
 }
@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *    the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
- * Also, any failure should be reported as WARNING not ERROR, because
+ * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
 void
 mdunlink(RelFileNode rnode, bool isRedo)
 {
 	char	   *path;
 	int ret;
 	/*
 	 * We have to clean out any pending fsync requests for the doomed relation,
@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)
 	path = relpath(rnode);
-	/* Delete the first segment, or only segment if not doing segmenting */
+	/*
-	if (unlink(path) < 0)
+	 * Delete or truncate the first segment, or only segment if not doing
 	 * segmenting
 	 */
 	if (isRedo)
 		ret = unlink(path);
 	else
 		ret = truncate(path, 0);
 	if (ret < 0)
 	{
 		if (!isRedo || errno != ENOENT)
 			ereport(WARNING,
@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
 #endif
 	pfree(path);
 	/* Register request to unlink first segment later */
 	if (!isRedo)
 		register_unlink(rnode);
 }
 /*
@ -1063,6 +1112,91 @@ mdsync(void)
 	mdsync_in_progress = false;
 }
 /*
 * mdpreckpt() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests. That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
 void
 mdpreckpt(void)
 {
 	ListCell *cell;
 	/*
 	 * In case the prior checkpoint wasn't completed, stamp all entries in
 	 * the list with the current cycle counter.  Anything that's in the
 	 * list at the start of checkpoint can surely be deleted after the
 	 * checkpoint is finished, regardless of when the request was made.
 	 */
 	foreach(cell, pendingUnlinks)
 	{
 		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
 		entry->cycle_ctr = mdckpt_cycle_ctr;
 	}
 	/*
 	 * Any unlink requests arriving after this point will be assigned the
 	 * next cycle counter, and won't be unlinked until next checkpoint.
 	 */
 	mdckpt_cycle_ctr++;
 }
 /*
 * mdpostckpt() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
 void
 mdpostckpt(void)
 {
 	while (pendingUnlinks != NIL)
 	{
 		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
 		char *path;
 		/*
 		 * New entries are appended to the end, so if the entry is new
 		 * we've reached the end of old entries.
 		 */
 		if (entry->cycle_ctr == mdsync_cycle_ctr)
 			break;
 		/* Else assert we haven't missed it */
 		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
 		/* Unlink the file */
 		path = relpath(entry->rnode);
 		if (unlink(path) < 0)
 		{
 			/*
 			 * ENOENT shouldn't happen either, but it doesn't really matter
 			 * because we would've deleted it now anyway.
 			 */
 			if (errno != ENOENT)
 				ereport(WARNING,
 						(errcode_for_file_access(),
 						 errmsg("could not remove relation %u/%u/%u: %m",
 								entry->rnode.spcNode,
 								entry->rnode.dbNode,
 								entry->rnode.relNode)));
 		}
 		pfree(path);
 		pendingUnlinks = list_delete_first(pendingUnlinks);
 		pfree(entry);
 	}
 }
 /*
 * register_dirty_segment() -- Mark a relation segment as needing fsync
 *
@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 	}
 }
 /*
 * register_unlink() -- Schedule a file to be deleted after next checkpoint
 *
 * As with register_dirty_segment, this could involve either a local or
 * a remote pending-ops table.
 */
 static void
 register_unlink(RelFileNode rnode)
 {
 	if (pendingOpsTable)
 	{
 		/* push it into local pending-ops table */
 		RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
 	}
 	else
 	{
 		/*
 		 * Notify the bgwriter about it.  If we fail to queue the request
 		 * message, we have to sleep and try again, because we can't simply
 		 * delete the file now.  Ugly, but hopefully won't happen often.
 		 *
 		 * XXX should we just leave the file orphaned instead?
 		 */
 		Assert(IsUnderPostmaster);
 		while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
 			pg_usleep(10000L);	/* 10 msec seems a good number */
 	}
 }
 /*
 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
 *
- * We stuff the fsync request into the local hash table for execution
+ * We stuff most fsync requests into the local hash table for execution
- * during the bgwriter's next checkpoint.
+ * during the bgwriter's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
+ * We define three:
- * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
- * a whole database.  (These are a tad slow because the hash table has to be
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * searched linearly, but it doesn't seem worth rethinking the table structure
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- * for them.)
+ *   checkpoint.
 *
 * (Handling the FORGET_* requests is a tad slow because the hash table has
 * to be searched linearly, but it doesn't seem worth rethinking the table
 * structure for them.)
 */
 void
 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 			}
 		}
 	}
 	else if (segno == UNLINK_RELATION_REQUEST)
 	{
 		/* Unlink request: put it in the linked list */
 		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
 		PendingUnlinkEntry *entry;
 		entry = palloc(sizeof(PendingUnlinkEntry));
 		entry->rnode = rnode;
 		entry->cycle_ctr = mdckpt_cycle_ctr;
 		pendingUnlinks = lappend(pendingUnlinks, entry);
 		MemoryContextSwitchTo(oldcxt);
 	}
 	else
 	{
 		/* Normal case: enter a request to fsync this segment */
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@ -11,7 +11,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.106 2007/09/05 18:10:48 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.107 2007/11/15 20:36:40 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -55,9 +55,11 @@ typedef struct f_smgr
 	void		(*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
 								  bool isTemp);
 	void		(*smgr_immedsync) (SMgrRelation reln);
-	void		(*smgr_commit) (void);	/* may be NULL */
+	void		(*smgr_commit) (void);		/* may be NULL */
-	void		(*smgr_abort) (void);	/* may be NULL */
+	void		(*smgr_abort) (void);		/* may be NULL */
-	void		(*smgr_sync) (void);	/* may be NULL */
+	void		(*smgr_pre_ckpt) (void);	/* may be NULL */
 	void		(*smgr_sync) (void);		/* may be NULL */
 	void		(*smgr_post_ckpt) (void);	/* may be NULL */
 } f_smgr;
@ -65,7 +67,7 @@ static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
 		mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
-		NULL, NULL, mdsync
+		NULL, NULL, mdpreckpt, mdsync, mdpostckpt
 	}
 };
@ -778,7 +780,22 @@ smgrabort(void)
 }
 /*
- *	smgrsync() -- Sync files to disk at checkpoint time.
+ *	smgrpreckpt() -- Prepare for checkpoint.
 */
 void
 smgrpreckpt(void)
 {
 	int			i;
 	for (i = 0; i < NSmgr; i++)
 	{
 		if (smgrsw[i].smgr_pre_ckpt)
 			(*(smgrsw[i].smgr_pre_ckpt)) ();
 	}
 }
 /*
 *	smgrsync() -- Sync files to disk during checkpoint.
 */
 void
 smgrsync(void)
@ -792,6 +809,21 @@ smgrsync(void)
 	}
 }
 /*
 *	smgrpostckpt() -- Post-checkpoint cleanup.
 */
 void
 smgrpostckpt(void)
 {
 	int			i;
 	for (i = 0; i < NSmgr; i++)
 	{
 		if (smgrsw[i].smgr_post_ckpt)
 			(*(smgrsw[i].smgr_post_ckpt)) ();
 	}
 }
 void
 smgr_redo(XLogRecPtr lsn, XLogRecord *record)
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.59 2007/09/05 18:10:48 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.60 2007/11/15 20:36:40 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -83,7 +83,9 @@ extern void AtSubAbort_smgr(void);
 extern void PostPrepare_smgr(void);
 extern void smgrcommit(void);
 extern void smgrabort(void);
 extern void smgrpreckpt(void);
 extern void smgrsync(void);
 extern void smgrpostckpt(void);
 extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec);
@ -104,7 +106,9 @@ extern void mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer,
 extern BlockNumber mdnblocks(SMgrRelation reln);
 extern void mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp);
 extern void mdimmedsync(SMgrRelation reln);
 extern void mdpreckpt(void);
 extern void mdsync(void);
 extern void mdpostckpt(void);
 extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
 extern void ForgetRelationFsyncRequests(RelFileNode rnode);