Backport fsync queue compaction logic to all supported branches.
This backports commit 7f242d880b5b5d9642675517466d31373961cf98, except for the counter in pg_stat_bgwriter. The underlying problem (namely, that a full fsync request queue causes terrible checkpoint behavior) continues to be reported in the wild, and this code seems to be safe and robust enough to risk back-porting the fix.
parent 248f1ac813
commit 5cea1b0a8c
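In essence, the new CompactBgwriterRequestQueue() makes a single forward pass over the shared request queue, remembers the most recent slot holding each distinct request, marks any earlier occurrence as skippable, and then slides the survivors down so that only the last copy of each duplicate remains, in their original relative order. A minimal standalone sketch of that idea, using a made-up Req type and an O(n^2) scan in place of the real BgWriterRequest and dynahash table (illustrative only, not the PostgreSQL code):

/* compact_sketch.c -- illustrative only, not PostgreSQL code */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified stand-in for the real BgWriterRequest. */
typedef struct { unsigned rel; unsigned fork; unsigned segno; } Req;

/*
 * Keep only the *last* occurrence of each duplicate request, preserving the
 * relative order of the survivors -- the same invariant the patch maintains
 * with its hash table and skip_slot[] array.  Returns the new length.
 */
static int
compact_queue(Req *req, int n)
{
    bool    skip[64] = {false};     /* assume n <= 64 in this sketch */
    int     i, j, kept = 0;

    /* Forward scan: an entry is skippable if an identical one follows it. */
    for (i = 0; i < n; i++)
        for (j = i + 1; j < n; j++)
            if (memcmp(&req[i], &req[j], sizeof(Req)) == 0)
            {
                skip[i] = true;
                break;
            }

    /* Slide the surviving entries down, preserving their order. */
    for (i = 0; i < n; i++)
        if (!skip[i])
            req[kept++] = req[i];
    return kept;
}

int
main(void)
{
    Req q[] = {{1, 0, 1}, {2, 0, 7}, {1, 0, 1}, {2, 0, 7}, {3, 0, 0}};
    int n = compact_queue(q, 5);

    printf("compacted from 5 entries to %d entries\n", n);  /* prints 3 */
    return 0;
}

Run, the sketch reports the queue shrinking from 5 entries to 3, mirroring the DEBUG1 message the real function emits.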
@@ -179,6 +179,7 @@ static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
 static bool IsCheckpointOnSchedule(double progress);
 static bool ImmediateCheckpointRequested(void);
+static bool CompactBgwriterRequestQueue(void);
 
 /* Signal handlers */
 
@@ -1058,14 +1059,15 @@ RequestCheckpoint(int flags)
  * use high values for special flags; that's all internal to md.c, which
  * see for details.)
  *
- * If we are unable to pass over the request (at present, this can happen
- * if the shared memory queue is full), we return false. That forces
- * the backend to do its own fsync. We hope that will be even more seldom.
- *
- * Note: we presently make no attempt to eliminate duplicate requests
- * in the requests[] queue. The bgwriter will have to eliminate dups
- * internally anyway, so we may as well avoid holding the lock longer
- * than we have to here.
+ * To avoid holding the lock for longer than necessary, we normally write
+ * to the requests[] queue without checking for duplicates. The bgwriter
+ * will have to eliminate dups internally anyway. However, if we discover
+ * that the queue is full, we make a pass over the entire queue to compact
+ * it. This is somewhat expensive, but the alternative is for the backend
+ * to perform its own fsync, which is far more expensive in practice. It
+ * is theoretically possible a backend fsync might still be necessary, if
+ * the queue is full and contains no duplicate entries. In that case, we
+ * let the backend know by returning false.
  */
 bool
 ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1083,8 +1085,15 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     /* we count non-bgwriter writes even when the request queue overflows */
     BgWriterShmem->num_backend_writes++;
 
+    /*
+     * If the background writer isn't running or the request queue is full,
+     * the backend will have to perform its own fsync request. But before
+     * forcing that to happen, we can try to compact the background writer
+     * request queue.
+     */
     if (BgWriterShmem->bgwriter_pid == 0 ||
-        BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+        (BgWriterShmem->num_requests >= BgWriterShmem->max_requests
+         && !CompactBgwriterRequestQueue()))
     {
         LWLockRelease(BgWriterCommLock);
         return false;
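The return-false path above is what ultimately forces a backend to fsync for itself. For context, the caller in md.c (register_dirty_segment) reacts roughly as sketched below; this is a simplified reconstruction from memory of the branch code, so treat the details as illustrative rather than authoritative:

/* Sketch of the caller's fallback path, based on md.c's
 * register_dirty_segment(); simplified, not the literal branch code. */
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
    if (pendingOpsTable)
    {
        /* bgwriter or standalone backend: just record it locally */
        RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
    }
    else if (!ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
    {
        /*
         * No bgwriter, or the queue was full and could not be compacted:
         * fall back to an immediate fsync in this backend -- the expensive
         * case the new compaction logic tries hard to avoid.
         */
        if (FileSync(seg->mdfd_vfd) < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync file \"%s\": %m",
                            FilePathName(seg->mdfd_vfd))));
    }
}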
@@ -1097,6 +1106,108 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     return true;
 }
 
+/*
+ * CompactBgwriterRequestQueue
+ *     Remove duplicates from the request queue to avoid backend fsyncs.
+ *
+ * Although a full fsync request queue is not common, it can lead to severe
+ * performance problems when it does happen. So far, this situation has
+ * only been observed to occur when the system is under heavy write load,
+ * and especially during the "sync" phase of a checkpoint. Without this
+ * logic, each backend begins doing an fsync for every block written, which
+ * gets very expensive and can slow down the whole system.
+ *
+ * Trying to do this every time the queue is full could lose if there
+ * aren't any removable entries. But that should be vanishingly rare in
+ * practice: there's one queue entry per shared buffer.
+ */
+static bool
+CompactBgwriterRequestQueue()
+{
+    struct BgWriterSlotMapping {
+        BgWriterRequest request;
+        int         slot;
+    };
+
+    int         n,
+                preserve_count;
+    int         num_skipped = 0;
+    HASHCTL     ctl;
+    HTAB       *htab;
+    bool       *skip_slot;
+
+    /* must hold BgWriterCommLock in exclusive mode */
+    Assert(LWLockHeldByMe(BgWriterCommLock));
+
+    /* Initialize temporary hash table */
+    MemSet(&ctl, 0, sizeof(ctl));
+    ctl.keysize = sizeof(BgWriterRequest);
+    ctl.entrysize = sizeof(struct BgWriterSlotMapping);
+    ctl.hash = tag_hash;
+    htab = hash_create("CompactBgwriterRequestQueue",
+                       BgWriterShmem->num_requests,
+                       &ctl,
+                       HASH_ELEM | HASH_FUNCTION);
+
+    /* Initialize skip_slot array */
+    skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
+    /*
+     * The basic idea here is that a request can be skipped if it's followed
+     * by a later, identical request. It might seem more sensible to work
+     * backwards from the end of the queue and check whether a request is
+     * *preceded* by an earlier, identical request, in the hopes of doing less
+     * copying. But that might change the semantics, if there's an
+     * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
+     * we do it this way. It would be possible to be even smarter if we made
+     * the code below understand the specific semantics of such requests (it
+     * could blow away preceding entries that would end up being cancelled
+     * anyhow), but it's not clear that the extra complexity would buy us
+     * anything.
+     */
+    for (n = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        BgWriterRequest *request;
+        struct BgWriterSlotMapping *slotmap;
+        bool        found;
+
+        request = &BgWriterShmem->requests[n];
+        slotmap = hash_search(htab, request, HASH_ENTER, &found);
+        if (found)
+        {
+            skip_slot[slotmap->slot] = true;
+            ++num_skipped;
+        }
+        slotmap->slot = n;
+    }
+
+    /* Done with the hash table. */
+    hash_destroy(htab);
+
+    /* If no duplicates, we're out of luck. */
+    if (!num_skipped)
+    {
+        pfree(skip_slot);
+        return false;
+    }
+
+    /* We found some duplicates; remove them. */
+    for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+    {
+        if (skip_slot[n])
+            continue;
+        BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
+    }
+    ereport(DEBUG1,
+            (errmsg("compacted fsync request queue from %d entries to %d entries",
+                    BgWriterShmem->num_requests, preserve_count)));
+    BgWriterShmem->num_requests = preserve_count;
+
+    /* Cleanup. */
+    pfree(skip_slot);
+    return true;
+}
+
 /*
  * AbsorbFsyncRequests
  *     Retrieve queued fsync requests and pass them to local smgr.
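A concrete example of the ordering concern mentioned in the comment above: suppose the queue holds, in order, (1) an fsync request for segment 1 of relation R, (2) a FORGET_RELATION_FSYNC for R, and (3) the identical fsync request again. Absorbed in order, the net result is that the fsync issued after the FORGET remains pending. A backward-working compaction that kept entry (1) and dropped entry (3) would let the intervening FORGET cancel the sole surviving copy, silently losing an fsync that was requested after the FORGET; dropping the earlier copy, as the forward scan does, leaves the outcome unchanged.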
@@ -33,7 +33,13 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB       10
 
-/* special values for the segno arg to RememberFsyncRequest */
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
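These sentinel values travel through the same shared queue that CompactBgwriterRequestQueue() scans, which is why the compaction pass may only drop an entry in favor of a later duplicate. A simplified, from-memory sketch of how md.c forwards one of them; names, structure, and the retry interval are illustrative and may differ by branch:

/* Sketch based on md.c's ForgetRelationFsyncRequests(); illustrative only. */
void
ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
{
    if (pendingOpsTable)
    {
        /* We are the bgwriter (or a standalone backend): handle it locally. */
        RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
    }
    else
    {
        /*
         * Ask the bgwriter to drop pending fsyncs for this relation.  The
         * sentinel must pass through the queue in order with the ordinary
         * requests; retry if the queue is momentarily full.
         */
        while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
            pg_usleep(10000L);  /* wait a bit and try again */
    }
}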