From 9c2626acac80296bff94863756fb7affe85907a6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 17 Jul 2012 16:56:08 -0400 Subject: [PATCH] Improve coding around the fsync request queue. In all branches back to 8.3, this patch fixes a questionable assumption in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are no uninitialized pad bytes in the request queue structs. This would only cause trouble if (a) there were such pad bytes, which could happen in 8.4 and up if the compiler makes enum ForkNumber narrower than 32 bits, but otherwise would require not-currently-planned changes in the widths of other typedefs; and (b) the kernel has not uniformly initialized the contents of shared memory to zeroes. Still, it seems a tad risky, and we can easily remove any risk by pre-zeroing the request array for ourselves. In addition to that, we need to establish a coding rule that struct RelFileNode can't contain any padding bytes, since such structs are copied into the request array verbatim. (There are other places that are assuming this anyway, it turns out.) In 9.1 and up, the risk was a bit larger because we were also effectively assuming that struct RelFileNodeBackend contained no pad bytes, and with fields of different types in there, that would be much easier to break. However, there is no good reason to ever transmit fsync or delete requests for temp files to the bgwriter/checkpointer, so we can revert the request structs to plain RelFileNode, getting rid of the padding risk and saving some marginal number of bytes and cycles in fsync queue manipulation while we are at it. The savings might be more than marginal during deletion of a temp relation, because the old code transmitted an entirely useless but nonetheless expensive-to-process ForgetRelationFsync request to the background process, and also had the background process perform the file deletion even though that can safely be done immediately. In addition, make some cleanup of nearby comments and small improvements to the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue. --- src/backend/postmaster/bgwriter.c | 71 ++++++++++++++++++++----------- src/include/storage/relfilenode.h | 4 ++ 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 8d6687584e..77970eead0 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -827,22 +827,29 @@ BgWriterShmemSize(void) void BgWriterShmemInit(void) { + Size size = BgWriterShmemSize(); bool found; BgWriterShmem = (BgWriterShmemStruct *) ShmemInitStruct("Background Writer Data", - BgWriterShmemSize(), + size, &found); if (BgWriterShmem == NULL) ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("not enough shared memory for background writer"))); - if (found) - return; /* already initialized */ - MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct)); - SpinLockInit(&BgWriterShmem->ckpt_lck); - BgWriterShmem->max_requests = NBuffers; + if (!found) + { + /* + * First time through, so initialize. Note that we zero the whole + * requests array; this is so that CompactBgwriterRequestQueue + * can assume that any pad bytes in the request structs are zeroes. + */ + MemSet(BgWriterShmem, 0, size); + SpinLockInit(&BgWriterShmem->ckpt_lck); + BgWriterShmem->max_requests = NBuffers; + } } /* @@ -1028,25 +1035,27 @@ ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno) /* * CompactBgwriterRequestQueue - * Remove duplicates from the request queue to avoid backend fsyncs. + * Remove duplicates from the request queue to avoid backend fsyncs. + * Returns "true" if any entries were removed. * * Although a full fsync request queue is not common, it can lead to severe * performance problems when it does happen. So far, this situation has * only been observed to occur when the system is under heavy write load, - * and especially during the "sync" phase of a checkpoint. Without this + * and especially during the "sync" phase of a checkpoint. Without this * logic, each backend begins doing an fsync for every block written, which * gets very expensive and can slow down the whole system. * * Trying to do this every time the queue is full could lose if there - * aren't any removable entries. But should be vanishingly rare in + * aren't any removable entries. But that should be vanishingly rare in * practice: there's one queue entry per shared buffer. */ static bool -CompactBgwriterRequestQueue() +CompactBgwriterRequestQueue(void) { - struct BgWriterSlotMapping { - BgWriterRequest request; - int slot; + struct BgWriterSlotMapping + { + BgWriterRequest request; + int slot; }; int n, @@ -1059,20 +1068,22 @@ CompactBgwriterRequestQueue() /* must hold BgWriterCommLock in exclusive mode */ Assert(LWLockHeldByMe(BgWriterCommLock)); + /* Initialize skip_slot array */ + skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests); + /* Initialize temporary hash table */ MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(BgWriterRequest); ctl.entrysize = sizeof(struct BgWriterSlotMapping); ctl.hash = tag_hash; + ctl.hcxt = CurrentMemoryContext; + htab = hash_create("CompactBgwriterRequestQueue", BgWriterShmem->num_requests, &ctl, - HASH_ELEM | HASH_FUNCTION); + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); - /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests); - - /* + /* * The basic idea here is that a request can be skipped if it's followed * by a later, identical request. It might seem more sensible to work * backwards from the end of the queue and check whether a request is @@ -1081,23 +1092,32 @@ CompactBgwriterRequestQueue() * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so * we do it this way. It would be possible to be even smarter if we made * the code below understand the specific semantics of such requests (it - * could blow away preceding entries that would end up being cancelled + * could blow away preceding entries that would end up being canceled * anyhow), but it's not clear that the extra complexity would buy us * anything. */ - for (n = 0; n < BgWriterShmem->num_requests; ++n) + for (n = 0; n < BgWriterShmem->num_requests; n++) { BgWriterRequest *request; struct BgWriterSlotMapping *slotmap; - bool found; + bool found; + /* + * We use the request struct directly as a hashtable key. This + * assumes that any padding bytes in the structs are consistently the + * same, which should be okay because we zeroed them in + * BgWriterShmemInit. Note also that RelFileNode had better + * contain no pad bytes. + */ request = &BgWriterShmem->requests[n]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { + /* Duplicate, so mark the previous occurrence as skippable */ skip_slot[slotmap->slot] = true; - ++num_skipped; + num_skipped++; } + /* Remember slot containing latest occurrence of this request value */ slotmap->slot = n; } @@ -1112,15 +1132,16 @@ CompactBgwriterRequestQueue() } /* We found some duplicates; remove them. */ - for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n) + preserve_count = 0; + for (n = 0; n < BgWriterShmem->num_requests; n++) { if (skip_slot[n]) continue; BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n]; } ereport(DEBUG1, - (errmsg("compacted fsync request queue from %d entries to %d entries", - BgWriterShmem->num_requests, preserve_count))); + (errmsg("compacted fsync request queue from %d entries to %d entries", + BgWriterShmem->num_requests, preserve_count))); BgWriterShmem->num_requests = preserve_count; /* Cleanup. */ diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index 9638294b4a..48f36e050e 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -38,6 +38,10 @@ * identified by pg_database.dattablespace). However this shorthand * is NOT allowed in RelFileNode structs --- the real tablespace ID * must be supplied when setting spcNode. + * + * Note: various places use RelFileNode in hashtable keys. Therefore, + * there *must not* be any unused padding bytes in this struct. That + * should be safe as long as all the fields are of type Oid. */ typedef struct RelFileNode {