diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 183edbb605..0bc596ce13 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -880,22 +880,29 @@ BgWriterShmemSize(void)
 void
 BgWriterShmemInit(void)
 {
+	Size		size = BgWriterShmemSize();
 	bool		found;
 
 	BgWriterShmem = (BgWriterShmemStruct *)
 		ShmemInitStruct("Background Writer Data",
-						BgWriterShmemSize(),
+						size,
 						&found);
 	if (BgWriterShmem == NULL)
 		ereport(FATAL,
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("not enough shared memory for background writer")));
 
-	if (found)
-		return;					/* already initialized */
-	MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
-	SpinLockInit(&BgWriterShmem->ckpt_lck);
-	BgWriterShmem->max_requests = NBuffers;
+	if (!found)
+	{
+		/*
+		 * First time through, so initialize.  Note that we zero the whole
+		 * requests array; this is so that CompactBgwriterRequestQueue
+		 * can assume that any pad bytes in the request structs are zeroes.
+		 */
+		MemSet(BgWriterShmem, 0, size);
+		SpinLockInit(&BgWriterShmem->ckpt_lck);
+		BgWriterShmem->max_requests = NBuffers;
+	}
 }
 
 /*
@@ -1108,25 +1115,27 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 
 /*
  * CompactBgwriterRequestQueue
- *		Remove duplicates from the request queue to avoid backend fsyncs.
+ *		Remove duplicates from the request queue to avoid backend fsyncs.
+ *		Returns "true" if any entries were removed.
  *
  * Although a full fsync request queue is not common, it can lead to severe
  * performance problems when it does happen.  So far, this situation has
  * only been observed to occur when the system is under heavy write load,
- * and especially during the "sync" phase of a checkpoint. Without this
+ * and especially during the "sync" phase of a checkpoint.  Without this
  * logic, each backend begins doing an fsync for every block written, which
  * gets very expensive and can slow down the whole system.
  *
  * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
+ * aren't any removable entries.  But that should be vanishingly rare in
  * practice: there's one queue entry per shared buffer.
  */
 static bool
-CompactBgwriterRequestQueue()
+CompactBgwriterRequestQueue(void)
 {
-	struct BgWriterSlotMapping {
-		BgWriterRequest request;
-		int		slot;
+	struct BgWriterSlotMapping
+	{
+		BgWriterRequest request;
+		int			slot;
 	};
 
 	int			n,
@@ -1139,20 +1148,22 @@ CompactBgwriterRequestQueue()
 	/* must hold BgWriterCommLock in exclusive mode */
 	Assert(LWLockHeldByMe(BgWriterCommLock));
 
+	/* Initialize skip_slot array */
+	skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
+
 	/* Initialize temporary hash table */
 	MemSet(&ctl, 0, sizeof(ctl));
 	ctl.keysize = sizeof(BgWriterRequest);
 	ctl.entrysize = sizeof(struct BgWriterSlotMapping);
 	ctl.hash = tag_hash;
+	ctl.hcxt = CurrentMemoryContext;
+
 	htab = hash_create("CompactBgwriterRequestQueue",
 					   BgWriterShmem->num_requests,
 					   &ctl,
-					   HASH_ELEM | HASH_FUNCTION);
+					   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
-	/* Initialize skip_slot array */
-	skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
-
-	/*
+	/*
 	 * The basic idea here is that a request can be skipped if it's followed
 	 * by a later, identical request.  It might seem more sensible to work
 	 * backwards from the end of the queue and check whether a request is
@@ -1161,23 +1172,32 @@ CompactBgwriterRequestQueue()
 	 * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
 	 * we do it this way.  It would be possible to be even smarter if we made
 	 * the code below understand the specific semantics of such requests (it
-	 * could blow away preceding entries that would end up being cancelled
+	 * could blow away preceding entries that would end up being canceled
 	 * anyhow), but it's not clear that the extra complexity would buy us
 	 * anything.
	 */
-	for (n = 0; n < BgWriterShmem->num_requests; ++n)
+	for (n = 0; n < BgWriterShmem->num_requests; n++)
 	{
 		BgWriterRequest *request;
 		struct BgWriterSlotMapping *slotmap;
-		bool	found;
+		bool		found;
 
+		/*
+		 * We use the request struct directly as a hashtable key.  This
+		 * assumes that any padding bytes in the structs are consistently the
+		 * same, which should be okay because we zeroed them in
+		 * BgWriterShmemInit.  Note also that RelFileNode had better
+		 * contain no pad bytes.
+		 */
 		request = &BgWriterShmem->requests[n];
 		slotmap = hash_search(htab, request, HASH_ENTER, &found);
 		if (found)
 		{
+			/* Duplicate, so mark the previous occurrence as skippable */
 			skip_slot[slotmap->slot] = true;
-			++num_skipped;
+			num_skipped++;
 		}
+		/* Remember slot containing latest occurrence of this request value */
 		slotmap->slot = n;
 	}
 
@@ -1192,15 +1212,16 @@ CompactBgwriterRequestQueue()
 	}
 
 	/* We found some duplicates; remove them. */
-	for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
+	preserve_count = 0;
+	for (n = 0; n < BgWriterShmem->num_requests; n++)
 	{
 		if (skip_slot[n])
 			continue;
 		BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
 	}
 	ereport(DEBUG1,
-			(errmsg("compacted fsync request queue from %d entries to %d entries",
-					BgWriterShmem->num_requests, preserve_count)));
+			(errmsg("compacted fsync request queue from %d entries to %d entries",
+					BgWriterShmem->num_requests, preserve_count)));
 	BgWriterShmem->num_requests = preserve_count;
 
 	/* Cleanup. */
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 78feae112b..5a7dcda151 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -126,7 +126,7 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
 
 typedef struct
 {
 	RelFileNode rnode;			/* the targeted relation */
-	ForkNumber	forknum;
+	ForkNumber	forknum;		/* which fork */
 	BlockNumber segno;			/* which segment */
 } PendingOperationTag;
@@ -1209,7 +1209,7 @@ mdpostckpt(void)
  * If there is a local pending-ops table, just make an entry in it for
  * mdsync to process later.  Otherwise, try to pass off the fsync request
  * to the background writer process.  If that fails, just do the fsync
- * locally before returning (we expect this will not happen often enough
+ * locally before returning (we hope this will not happen often enough
  * to be a performance problem).
  */
 static void
@@ -1237,6 +1237,9 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 /*
  * register_unlink() -- Schedule a file to be deleted after next checkpoint
  *
+ * We don't bother passing in the fork number, because this is only used
+ * with main forks.
+ *
  * As with register_dirty_segment, this could involve either a local or
  * a remote pending-ops table.
  */
@@ -1347,6 +1350,9 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
 		PendingUnlinkEntry *entry;
 
+		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+		Assert(forknum == MAIN_FORKNUM);
+
 		entry = palloc(sizeof(PendingUnlinkEntry));
 		entry->rnode = rnode;
 		entry->cycle_ctr = mdckpt_cycle_ctr;
@@ -1396,7 +1402,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 }
 
 /*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
  */
 void
 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h
index 9f77ef9470..92cfb3ef1f 100644
--- a/src/include/storage/relfilenode.h
+++ b/src/include/storage/relfilenode.h
@@ -61,6 +61,10 @@ typedef enum ForkNumber
  * identified by pg_database.dattablespace).  However this shorthand
  * is NOT allowed in RelFileNode structs --- the real tablespace ID
  * must be supplied when setting spcNode.
+ *
+ * Note: various places use RelFileNode in hashtable keys.  Therefore,
+ * there *must not* be any unused padding bytes in this struct.  That
+ * should be safe as long as all the fields are of type Oid.
  */
 typedef struct RelFileNode
 {
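
Reviewer note (not part of the patch): the heart of the change is the dedup pass in
CompactBgwriterRequestQueue, which scans the queue front to back, remembers the latest
slot seen for each distinct request, marks earlier duplicates as skippable, and then
slides the survivors forward so relative order is preserved. The standalone sketch below
illustrates that idea only; the Request struct, QUEUE_MAX, and compact_queue are invented
here, and a linear lookup stands in for the dynahash table the real code uses.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define QUEUE_MAX 64			/* toy capacity; the real queue has one slot per shared buffer */

typedef struct
{
	unsigned	rnode;			/* stand-in for RelFileNode */
	unsigned	segno;			/* stand-in for the segment number */
} Request;

/* Compact 'queue' in place, keeping only the latest copy of each duplicate. */
static int
compact_queue(Request *queue, int nreq)
{
	bool		skip[QUEUE_MAX] = {false};	/* plays the role of skip_slot */
	int			latest[QUEUE_MAX];	/* latest slot seen per distinct request */
	int			ndistinct = 0;
	int			preserve = 0;
	int			n;

	for (n = 0; n < nreq; n++)
	{
		int			d;

		/* linear lookup stands in for hash_search(..., HASH_ENTER, ...) */
		for (d = 0; d < ndistinct; d++)
		{
			if (memcmp(&queue[latest[d]], &queue[n], sizeof(Request)) == 0)
			{
				/* duplicate: the earlier occurrence becomes skippable */
				skip[latest[d]] = true;
				break;
			}
		}
		if (d == ndistinct)
			ndistinct++;		/* first time we see this request value */
		latest[d] = n;			/* remember the latest occurrence */
	}

	/* Slide the survivors forward; their relative order is preserved. */
	for (n = 0; n < nreq; n++)
	{
		if (!skip[n])
			queue[preserve++] = queue[n];
	}
	return preserve;
}

int
main(void)
{
	Request		queue[] = {{1, 0}, {2, 3}, {1, 0}, {2, 3}, {1, 0}, {5, 7}};
	int			nreq = 6;
	int			i;

	nreq = compact_queue(queue, nreq);
	printf("compacted to %d entries\n", nreq);	/* 3: (2,3) (1,0) (5,7) */
	for (i = 0; i < nreq; i++)
		printf("(%u,%u)\n", queue[i].rnode, queue[i].segno);
	return 0;
}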
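A second note on the padding hazard that the new comments in BgWriterShmemInit and
relfilenode.h call out: tag_hash and raw-byte key comparison see a struct's pad bytes as
well as its fields, so two field-wise-equal keys can hash or compare as different unless
the padding is made consistent. The toy program below is again not part of the patch;
PaddedKey is invented for illustration, and what ends up in pad bytes is
compiler-dependent, so the first line typically (not necessarily) prints DIFFERENT. It is
only meant to show why zeroing the storage up front makes byte-wise comparison dependable.

#include <stdio.h>
#include <string.h>

typedef struct
{
	char		a;				/* usually followed by pad bytes before 'b' */
	int			b;
} PaddedKey;

int
main(void)
{
	PaddedKey	x,
				y;

	/* Field-wise identical values ... */
	memset(&x, 0xAA, sizeof(x));	/* garbage pad bytes */
	memset(&y, 0x00, sizeof(y));	/* zeroed pad bytes */
	x.a = y.a = 'k';
	x.b = y.b = 42;

	/* ... can still look unequal when compared as raw memory. */
	printf("field-equal keys, memcmp says %s\n",
		   memcmp(&x, &y, sizeof(x)) == 0 ? "equal" : "DIFFERENT");

	/* Zeroing the storage first (cf. MemSet in BgWriterShmemInit) fixes it. */
	memset(&x, 0, sizeof(x));
	x.a = 'k';
	x.b = 42;
	printf("after zeroing, memcmp says %s\n",
		   memcmp(&x, &y, sizeof(x)) == 0 ? "equal" : "DIFFERENT");
	return 0;
}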