From 5cea1b0a8cfa25bdd31dcc1498e815b8aa72c49d Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Tue, 26 Jun 2012 06:40:58 -0400
Subject: [PATCH] Backport fsync queue compaction logic to all supported
 branches.

This backports commit 7f242d880b5b5d9642675517466d31373961cf98, except
for the counter in pg_stat_bgwriter.

The underlying problem (namely, that a full fsync request queue causes
terrible checkpoint behavior) continues to be reported in the wild, and
this code seems to be safe and robust enough to risk back-porting the
fix.
---
 src/backend/postmaster/bgwriter.c | 129 +++++++++++++++++++++++++++---
 src/backend/storage/smgr/md.c     |   8 +-
 2 files changed, 127 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 831ea9478a..183edbb605 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1058,14 +1059,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false.  That forces
- * the backend to do its own fsync.  We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue.  The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates.  The bgwriter
+ * will have to eliminate dups internally anyway.  However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it.  This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice.  It
+ * is theoretically possible that a backend fsync might still be necessary,
+ * if the queue is full and contains no duplicate entries.  In that case,
+ * we let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1083,8 +1085,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	/* we count non-bgwriter writes even when the request queue overflows */
 	BgWriterShmem->num_backend_writes++;
 
+	/*
+	 * If the background writer isn't running or the request queue is full,
+	 * the backend will have to perform its own fsync request.  But before
+	 * forcing that to happen, we can try to compact the background writer
+	 * request queue.
+	 */
 	if (BgWriterShmem->bgwriter_pid == 0 ||
-		BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+		(BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+		 && !CompactBgwriterRequestQueue()))
 	{
 		LWLockRelease(BgWriterCommLock);
 		return false;
@@ -1097,6 +1106,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 	return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *		Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen.  So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint.  Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries.  But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue(void)
+{
+	struct BgWriterSlotMapping {
+		BgWriterRequest request;
+		int			slot;
+	};
+
+	int			n,
+				preserve_count;
+	int			num_skipped = 0;
+	HASHCTL		ctl;
+	HTAB	   *htab;
+	bool	   *skip_slot;
+
+	/* must hold BgWriterCommLock in exclusive mode */
+	Assert(LWLockHeldByMe(BgWriterCommLock));
+
+	/* Initialize temporary hash table */
+	MemSet(&ctl, 0, sizeof(ctl));
+	ctl.keysize = sizeof(BgWriterRequest);
+	ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+	ctl.hash = tag_hash;
+	htab = hash_create("CompactBgwriterRequestQueue",
+					   BgWriterShmem->num_requests,
+					   &ctl,
+					   HASH_ELEM | HASH_FUNCTION);
+
+	/* Initialize skip_slot array */
+	skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+	/*
+	 * The basic idea here is that a request can be skipped if it's followed
+	 * by a later, identical request.  It might seem more sensible to work
+	 * backwards from the end of the queue and check whether a request is
+	 * *preceded* by an earlier, identical request, in the hopes of doing less
+	 * copying.  But that might change the semantics, if there's an
+	 * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+	 * we do it this way.  It would be possible to be even smarter if we made
+	 * the code below understand the specific semantics of such requests (it
+	 * could blow away preceding entries that would end up being cancelled
+	 * anyhow), but it's not clear that the extra complexity would buy us
+	 * anything.
+	 */
+	for (n = 0; n < BgWriterShmem->num_requests; ++n)
+	{
+		BgWriterRequest *request;
+		struct BgWriterSlotMapping *slotmap;
+		bool		found;
+
+		request = &BgWriterShmem->requests[n];
+		slotmap = hash_search(htab, request, HASH_ENTER, &found);
+		if (found)
+		{
+			skip_slot[slotmap->slot] = true;
+			++num_skipped;
+		}
+		slotmap->slot = n;
+	}
+
+	/* Done with the hash table. */
+	hash_destroy(htab);
+
+	/* If no duplicates, we're out of luck. */
+	if (!num_skipped)
+	{
+		pfree(skip_slot);
+		return false;
+	}
+
+	/* We found some duplicates; remove them. */
+	for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+	{
+		if (skip_slot[n])
+			continue;
+		BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+	}
+	ereport(DEBUG1,
+			(errmsg("compacted fsync request queue from %d entries to %d entries",
+					BgWriterShmem->num_requests, preserve_count)));
+	BgWriterShmem->num_requests = preserve_count;
+
+	/* Cleanup. */
+	pfree(skip_slot);
+	return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *		Retrieve queued fsync requests and pass them to local smgr.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e69c7d3bb8..78feae112b 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -33,7 +33,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB		10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST	(InvalidBlockNumber-2)
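
A note for readers following the compaction logic above: the comment in
CompactBgwriterRequestQueue states the rule but never shows it in
isolation, so here is a minimal standalone sketch of the
last-occurrence-wins idea.  This is not PostgreSQL code: FsyncRequest and
compact_queue are illustrative stand-ins for BgWriterRequest and
CompactBgwriterRequestQueue, and a simple quadratic rescan replaces the
dynahash table the patch uses to find duplicates in a single pass.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative stand-in for BgWriterRequest: a relation id plus a
 * segment number.  (The real key is a RelFileNode, a ForkNumber, and a
 * segment number.)
 */
typedef struct FsyncRequest
{
    int rel;
    int segno;
} FsyncRequest;

/*
 * Drop every request that is followed by a later, identical request,
 * then slide the survivors down, preserving their relative order.
 * Returns the new queue length.  The patch detects duplicates with a
 * hash table in one forward pass; the nested loop here trades speed for
 * brevity.
 */
static int
compact_queue(FsyncRequest *req, int n)
{
    bool skip[n];               /* C99 VLA; the patch palloc0()s this */
    int i, j, keep = 0;

    memset(skip, 0, sizeof(skip));
    for (i = 0; i < n; i++)
        for (j = i + 1; j < n; j++)
            if (memcmp(&req[i], &req[j], sizeof(FsyncRequest)) == 0)
            {
                skip[i] = true; /* an identical request follows */
                break;
            }

    for (i = 0; i < n; i++)
        if (!skip[i])
            req[keep++] = req[i];
    return keep;
}

int
main(void)
{
    FsyncRequest q[] = {{1, 0}, {2, 3}, {1, 0}, {2, 3}, {1, 7}};
    int n = compact_queue(q, 5);

    /* Only the later copies of {1,0} and {2,3} survive. */
    printf("compacted from 5 to %d entries\n", n);  /* prints 3 */
    return 0;
}

The patched function reaches the same result in linear time: each request
is entered into a hash table keyed on the whole request struct, and when
the key is already present, the previously recorded slot is marked
skippable and the mapping is pointed at the current slot.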
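The md.c note is terse about why the scan direction matters, so here is a
hedged illustration of the hazard.  The FORGET sentinel and the
replay_pending helper below are hypothetical stand-ins for
FORGET_RELATION_FSYNC and the bgwriter's absorb logic; they model just
enough behavior to show how dropping the *later* duplicate, instead of
the earlier one, would lose a request.

#include <stdbool.h>
#include <stdio.h>

#define FORGET (-1)   /* hypothetical stand-in for FORGET_RELATION_FSYNC */

typedef struct
{
    int rel;
    int segno;
} Req;

/*
 * Replay a queue the way the absorb path consumes it: a FORGET entry
 * cancels every pending fsync for its relation.  Returns whether an
 * fsync of (rel 1, segno 0) is still pending at the end.
 */
static bool
replay_pending(const Req *q, int n)
{
    bool pending = false;

    for (int i = 0; i < n; i++)
    {
        if (q[i].rel == 1 && q[i].segno == FORGET)
            pending = false;    /* cancellation */
        else if (q[i].rel == 1 && q[i].segno == 0)
            pending = true;     /* fsync request */
    }
    return pending;
}

int
main(void)
{
    /* fsync(1,0), FORGET(1), fsync(1,0): the fsync must stay pending */
    Req original[] = {{1, 0}, {1, FORGET}, {1, 0}};

    /* forward compaction drops the earlier duplicate: same outcome */
    Req forward[] = {{1, FORGET}, {1, 0}};

    /* dropping the later duplicate instead would lose the request */
    Req backward[] = {{1, 0}, {1, FORGET}};

    printf("original=%d forward=%d backward=%d\n",
           replay_pending(original, 3),
           replay_pending(forward, 2),
           replay_pending(backward, 2));    /* prints 1 1 0 */
    return 0;
}

Keeping only the earlier copy would let the intervening FORGET cancel an
fsync that was requested after it; dropping the earlier copy, as the
patch does, leaves the queue's observable behavior unchanged.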