Backport fsync queue compaction logic to all supported branches.
This backports commit 7f242d880b5b5d9642675517466d31373961cf98, except for the counter in pg_stat_bgwriter. The underlying problem (namely, that a full fsync request queue causes terrible checkpoint behavior) continues to be reported in the wild, and this code seems to be safe and robust enough to risk back-porting the fix.
parent 248f1ac813
commit 5cea1b0a8c
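In essence, the new CompactBgwriterRequestQueue() makes a single forward pass over the shared request queue, remembers the most recent slot holding each distinct request, marks any earlier occurrence as skippable, and then slides the survivors down so that only the last copy of each duplicate remains, in their original relative order. A minimal standalone sketch of that idea, using a made-up Req type and an O(n^2) scan in place of the real BgWriterRequest and dynahash table (illustrative only, not the PostgreSQL code):

/* compact_sketch.c -- illustrative only, not PostgreSQL code */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified stand-in for the real BgWriterRequest. */
typedef struct { unsigned rel; unsigned fork; unsigned segno; } Req;

/*
 * Keep only the *last* occurrence of each duplicate request, preserving the
 * relative order of the survivors -- the same invariant the patch maintains
 * with its hash table and skip_slot[] array.  Returns the new length.
 */
static int
compact_queue(Req *req, int n)
{
    bool    skip[64] = {false};     /* assume n <= 64 in this sketch */
    int     i, j, kept = 0;

    /* Forward scan: an entry is skippable if an identical one follows it. */
    for (i = 0; i < n; i++)
        for (j = i + 1; j < n; j++)
            if (memcmp(&req[i], &req[j], sizeof(Req)) == 0)
            {
                skip[i] = true;
                break;
            }

    /* Slide the surviving entries down, preserving their order. */
    for (i = 0; i < n; i++)
        if (!skip[i])
            req[kept++] = req[i];
    return kept;
}

int
main(void)
{
    Req q[] = {{1, 0, 1}, {2, 0, 7}, {1, 0, 1}, {2, 0, 7}, {3, 0, 0}};
    int n = compact_queue(q, 5);

    printf("compacted from 5 entries to %d entries\n", n);  /* prints 3 */
    return 0;
}

Run, the sketch reports the queue shrinking from 5 entries to 3, mirroring the DEBUG1 message the real function emits.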
@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1058,14 +1059,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false. That forces
- * the backend to do its own fsync. We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue. The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates. The bgwriter
+ * will have to eliminate dups internally anyway. However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it. This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice. It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries. In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1083,8 +1085,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     /* we count non-bgwriter writes even when the request queue overflows */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request. But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
         LWLockRelease(BgWriterCommLock);
         return false;
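The return-false path above is what ultimately forces a backend to fsync for itself. For context, the caller in md.c (register_dirty_segment) reacts roughly as sketched below; this is a simplified reconstruction from memory of the branch code, so treat the details as illustrative rather than authoritative:

/* Sketch of the caller's fallback path, based on md.c's
 * register_dirty_segment(); simplified, not the literal branch code. */
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
    if (pendingOpsTable)
    {
        /* bgwriter or standalone backend: just record it locally */
        RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
    }
    else if (!ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
    {
        /*
         * No bgwriter, or the queue was full and could not be compacted:
         * fall back to an immediate fsync in this backend -- the expensive
         * case the new compaction logic tries hard to avoid.
         */
        if (FileSync(seg->mdfd_vfd) < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync file \"%s\": %m",
                            FilePathName(seg->mdfd_vfd))));
    }
}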
@@ -1097,6 +1106,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *     Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen. So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint. Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries. But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue()
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int         slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request. It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying. But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way. It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *     Retrieve queued fsync requests and pass them to local smgr.
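A concrete example of the ordering concern mentioned in the comment above: suppose the queue holds, in order, (1) an fsync request for segment 1 of relation R, (2) a FORGET_RELATION_FSYNC for R, and (3) the identical fsync request again. Absorbed in order, the net result is that the fsync issued after the FORGET remains pending. A backward-working compaction that kept entry (1) and dropped entry (3) would let the intervening FORGET cancel the sole surviving copy, silently losing an fsync that was requested after the FORGET; dropping the earlier copy, as the forward scan does, leaves the outcome unchanged.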
@@ -33,7 +33,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB       10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
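These sentinel values travel through the same shared queue that CompactBgwriterRequestQueue() scans, which is why the compaction pass may only drop an entry in favor of a later duplicate. A simplified, from-memory sketch of how md.c forwards one of them; names, structure, and the retry interval are illustrative and may differ by branch:

/* Sketch based on md.c's ForgetRelationFsyncRequests(); illustrative only. */
void
ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
    if (pendingOpsTable)
    {
        /* We are the bgwriter (or a standalone backend): handle it locally. */
        RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
    }
    else
    {
        /*
         * Ask the bgwriter to drop pending fsyncs for this relation.  The
         * sentinel must pass through the queue in order with the ordinary
         * requests; retry if the queue is momentarily full.
         */
        while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
            pg_usleep(10000L);  /* wait a bit and try again */
    }
}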