diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index b1be98f470..d00f985908 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -37,7 +37,7 @@ typedef struct
 	/*
 	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
 	 * being pinned by too many backends and each backend will only pin once
-	 * because of bufmgr.c's PrivateRefCount array.
+	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
 } BufferCachePagesRec;
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e03394c08b..ff6c713b4f 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -20,7 +20,6 @@
 
 BufferDesc *BufferDescriptors;
 char	   *BufferBlocks;
-int32	   *PrivateRefCount;
 
 
 /*
@@ -50,16 +49,9 @@ int32	   *PrivateRefCount;
  *
  * refcount -- Counts the number of processes holding pins on a buffer.
  * A buffer is pinned during IO and immediately after a BufferAlloc().
- * Pins must be released before end of transaction.
- *
- * PrivateRefCount -- Each buffer also has a private refcount that keeps
- * track of the number of times the buffer is pinned in the current
- * process.  This is used for two purposes: first, if we pin a
- * a buffer more than once, we only need to change the shared refcount
- * once, thus only lock the shared state once; second, when a transaction
- * aborts, it should only unpin the buffers exactly the number of times it
- * has pinned them, so that it will not blow away buffers of another
- * backend.
+ * Pins must be released before end of transaction.  For efficiency the
+ * shared refcount isn't increased if an individual backend pins a buffer
+ * multiple times; see the PrivateRefCount infrastructure in bufmgr.c.
  */
@@ -129,31 +121,6 @@ InitBufferPool(void)
 	StrategyInitialize(!foundDescs);
 }
 
-/*
- * Initialize access to shared buffer pool
- *
- * This is called during backend startup (whether standalone or under the
- * postmaster).  It sets up for this backend's access to the already-existing
- * buffer pool.
- *
- * NB: this is called before InitProcess(), so we do not have a PGPROC and
- * cannot do LWLockAcquire; hence we can't actually access stuff in
- * shared memory yet.  We are only initializing local data here.
- * (See also InitBufferPoolBackend, over in bufmgr.c.)
- */
-void
-InitBufferPoolAccess(void)
-{
-	/*
-	 * Allocate and zero local arrays of per-buffer info.
-	 */
-	PrivateRefCount = (int32 *) calloc(NBuffers, sizeof(int32));
-	if (!PrivateRefCount)
-		ereport(FATAL,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-}
-
 /*
  * BufferShmemSize
  *
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 938c554987..32404327cf 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -65,6 +65,15 @@
 
 #define DROP_RELS_BSEARCH_THRESHOLD		20
 
+typedef struct PrivateRefCountEntry
+{
+	Buffer		buffer;
+	int32		refcount;
+} PrivateRefCountEntry;
+
+/* 64 bytes, about the size of a cache line on common systems */
+#define REFCOUNT_ARRAY_ENTRIES 8
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -85,6 +94,281 @@ static bool IsForInput;
 
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
+/*
+ * Backend-Private refcount management:
+ *
+ * Each buffer also has a private refcount that keeps track of the number of
+ * times the buffer is pinned in the current process.  This is so that the
+ * shared refcount needs to be modified only once if a buffer is pinned more
+ * than once by an individual backend.  It's also used to check that no
+ * buffers are still pinned at the end of transactions and when exiting.
+ *
+ * To avoid - as we used to - requiring an array with NBuffers entries, we
+ * use a small sequentially searched array (PrivateRefCountArray) and an
+ * overflow hash table (PrivateRefCountHash) to keep track of backend local
+ * pins.
+ *
+ * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
+ * all refcounts are kept in the array; after that, new array entries
+ * displace old ones into the hash table.  That way a frequently used entry
+ * can't get "stuck" in the hashtable while infrequent ones clog the array.
+ *
+ * Note that in most scenarios the number of pinned buffers will not exceed
+ * REFCOUNT_ARRAY_ENTRIES.
+ */
+static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+static uint32 PrivateRefCountClock = 0;
+
+static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool create, bool do_move);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if create = false is passed and the buffer doesn't have a
+ * PrivateRefCount entry; allocates a new PrivateRefCountEntry if currently
+ * none exists and create = true is passed.
+ *
+ * If do_move is true, an entry found in the hash table is moved into the
+ * array, to optimize frequent access.  create = true requires do_move =
+ * true; do_move = false is only allowed together with create = false (cf.
+ * the Assert below).
+ *
+ * When a returned refcount entry isn't needed anymore it has to be
+ * forgotten, using ForgetPrivateRefCountEntry().
+ *
+ * Only works for shared buffers.
+ */
+static PrivateRefCountEntry *
+GetPrivateRefCountEntry(Buffer buffer, bool create, bool do_move)
+{
+	PrivateRefCountEntry *res;
+	PrivateRefCountEntry *free = NULL;
+	bool		found = false;
+	int			i;
+
+	Assert(!create || do_move);
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * First search for references in the array; that'll be sufficient in the
+	 * majority of cases.
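+	 * The array is only REFCOUNT_ARRAY_ENTRIES entries wide, so the linear
+	 * scan stays within roughly a single cache line.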
+	 */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	{
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer == buffer)
+			return res;
+
+		/* Remember where to put a new refcount, should it become necessary. */
+		if (free == NULL && res->buffer == InvalidBuffer)
+			free = res;
+	}
+
+	/*
+	 * By here we know that the buffer, if already pinned, isn't residing in
+	 * the array.
+	 */
+	res = NULL;
+	found = false;
+
+	/*
+	 * Look up the buffer in the hashtable if we've previously overflowed
+	 * into it.
+	 */
+	if (PrivateRefCountOverflowed > 0)
+	{
+		res = hash_search(PrivateRefCountHash,
+						  (void *) &buffer,
+						  HASH_FIND,
+						  &found);
+	}
+
+	if (!found)
+	{
+		if (!create)
+		{
+			/* Neither array nor hash have an entry and no new entry is needed */
+			return NULL;
+		}
+		else if (free != NULL)
+		{
+			/* add entry into the free array slot */
+			free->buffer = buffer;
+			free->refcount = 0;
+
+			return free;
+		}
+		else
+		{
+			/*
+			 * Move the entry at the current clock position in the array into
+			 * the hashtable, and use its slot.
+			 */
+			PrivateRefCountEntry *arrayent;
+			PrivateRefCountEntry *hashent;
+
+			/* select victim slot */
+			arrayent = &PrivateRefCountArray[
+				PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+			Assert(arrayent->buffer != InvalidBuffer);
+
+			/* enter victim array entry into hashtable */
+			hashent = hash_search(PrivateRefCountHash,
+								  (void *) &arrayent->buffer,
+								  HASH_ENTER,
+								  &found);
+			Assert(!found);
+			hashent->refcount = arrayent->refcount;
+
+			/* fill the now free array slot */
+			arrayent->buffer = buffer;
+			arrayent->refcount = 0;
+
+			PrivateRefCountOverflowed++;
+
+			return arrayent;
+		}
+	}
+	else
+	{
+		if (!do_move)
+		{
+			return res;
+		}
+		else if (free != NULL)
+		{
+			/* move buffer from hashtable into the free array slot */
+
+			/* fill array slot */
+			free->buffer = buffer;
+			free->refcount = res->refcount;
+
+			/* delete from hashtable */
+			hash_search(PrivateRefCountHash,
+						(void *) &buffer,
+						HASH_REMOVE,
+						&found);
+			Assert(found);
+			Assert(PrivateRefCountOverflowed > 0);
+			PrivateRefCountOverflowed--;
+
+			return free;
+		}
+		else
+		{
+			/*
+			 * Swap the entry in the hash table with the one in the array at
+			 * the current clock position.
+			 */
+			PrivateRefCountEntry *arrayent;
+			PrivateRefCountEntry *hashent;
+
+			/* select victim slot */
+			arrayent = &PrivateRefCountArray[
+				PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+			Assert(arrayent->buffer != InvalidBuffer);
+
+			/* enter victim entry into the hashtable */
+			hashent = hash_search(PrivateRefCountHash,
+								  (void *) &arrayent->buffer,
+								  HASH_ENTER,
+								  &found);
+			Assert(!found);
+			hashent->refcount = arrayent->refcount;
+
+			/* fill the now free array entry with the previously searched entry */
+			arrayent->buffer = res->buffer;
+			arrayent->refcount = res->refcount;
+
+			/* and remove the old hash entry */
+			hash_search(PrivateRefCountHash,
+						(void *) &arrayent->buffer,
+						HASH_REMOVE,
+						&found);
+			Assert(found);
+
+			/* PrivateRefCountOverflowed stays the same: -1 + +1 = 0 */
+
+			return arrayent;
+		}
+	}
+
+	Assert(false);				/* unreachable */
+	return NULL;
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
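+ * Pins on backend-local buffers are counted separately, in localbuf.c's
+ * LocalRefCount array; see the BufferIsPinned macro below.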
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+	PrivateRefCountEntry *ref;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	ref = GetPrivateRefCountEntry(buffer, false, false);
+
+	if (ref == NULL)
+		return 0;
+	return ref->refcount;
+}
+
+/*
+ * Release resources used to track the reference count of a buffer which we
+ * no longer have pinned and don't want to pin again immediately.
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
+{
+	Assert(ref->refcount == 0);
+
+	if (ref >= &PrivateRefCountArray[0] &&
+		ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+	{
+		ref->buffer = InvalidBuffer;
+	}
+	else
+	{
+		bool		found;
+		Buffer		buffer = ref->buffer;
+
+		hash_search(PrivateRefCountHash,
+					(void *) &buffer,
+					HASH_REMOVE,
+					&found);
+		Assert(found);
+		Assert(PrivateRefCountOverflowed > 0);
+		PrivateRefCountOverflowed--;
+	}
+}
+
+/*
+ * BufferIsPinned
+ *		True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ *		NOTE: what we check here is that *this* backend holds a pin on
+ *		the buffer.  We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+	!BufferIsValid(bufnum) ? \
+		false \
+	: \
+		BufferIsLocal(bufnum) ? \
+			(LocalRefCount[-(bufnum) - 1] > 0) \
+		: \
+			(GetPrivateRefCount(bufnum) > 0) \
+)
+
 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
@@ -940,7 +1224,7 @@ retry:
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
 		/* safety check: should definitely not be our *own* pin */
-		if (PrivateRefCount[buf->buf_id] != 0)
+		if (GetPrivateRefCount(buf->buf_id + 1) > 0)
 			elog(ERROR, "buffer is pinned in InvalidateBuffer");
 		WaitIO(buf);
 		goto retry;
@@ -999,7 +1283,7 @@ MarkBufferDirty(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(BufferIsPinned(buffer));
 	/* unfortunately we can't check if the lock is held exclusively */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
@@ -1046,9 +1330,9 @@ ReleaseAndReadBuffer(Buffer buffer,
 
 	if (BufferIsValid(buffer))
 	{
+		Assert(BufferIsPinned(buffer));
 		if (BufferIsLocal(buffer))
 		{
-			Assert(LocalRefCount[-buffer - 1] > 0);
 			bufHdr = &LocalBufferDescriptors[-buffer - 1];
 			if (bufHdr->tag.blockNum == blockNum &&
 				RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
@@ -1059,7 +1343,6 @@ ReleaseAndReadBuffer(Buffer buffer,
 		}
 		else
 		{
-			Assert(PrivateRefCount[buffer - 1] > 0);
 			bufHdr = &BufferDescriptors[buffer - 1];
 			/* we have pin, so it's ok to examine tag without spinlock */
 			if (bufHdr->tag.blockNum == blockNum &&
@@ -1096,8 +1379,11 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 {
 	int			b = buf->buf_id;
 	bool		result;
+	PrivateRefCountEntry *ref;
 
-	if (PrivateRefCount[b] == 0)
+	ref = GetPrivateRefCountEntry(b + 1, true, true);
+
+	if (ref->refcount == 0)
 	{
 		LockBufHdr(buf);
 		buf->refcount++;
@@ -1119,8 +1405,9 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		/* If we previously pinned the buffer, it must surely be valid */
 		result = true;
 	}
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 	return result;
@@ -1143,12 +1430,15 @@ static void
 PinBuffer_Locked(volatile BufferDesc *buf)
 {
 	int			b = buf->buf_id;
+	PrivateRefCountEntry *ref;
 
-	if (PrivateRefCount[b] == 0)
+	ref = GetPrivateRefCountEntry(b + 1, true, true);
+
+	if (ref->refcount == 0)
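+		/* first pin of this buffer by this backend: bump the shared count */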
 		buf->refcount++;
 	UnlockBufHdr(buf);
-	PrivateRefCount[b]++;
-	Assert(PrivateRefCount[b] > 0);
+	ref->refcount++;
+	Assert(ref->refcount > 0);
 	ResourceOwnerRememberBuffer(CurrentResourceOwner,
 								BufferDescriptorGetBuffer(buf));
 }
@@ -1164,15 +1454,19 @@ PinBuffer_Locked(volatile BufferDesc *buf)
 static void
 UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 {
+	PrivateRefCountEntry *ref;
 	int			b = buf->buf_id;
 
+	ref = GetPrivateRefCountEntry(b + 1, false, false);
+	Assert(ref != NULL);
+
 	if (fixOwner)
 		ResourceOwnerForgetBuffer(CurrentResourceOwner,
 								  BufferDescriptorGetBuffer(buf));
 
-	Assert(PrivateRefCount[b] > 0);
-	PrivateRefCount[b]--;
-	if (PrivateRefCount[b] == 0)
+	Assert(ref->refcount > 0);
+	ref->refcount--;
+	if (ref->refcount == 0)
 	{
 		/* I'd better not still hold any locks on the buffer */
 		Assert(!LWLockHeldByMe(buf->content_lock));
@@ -1197,6 +1491,8 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		}
 		else
 			UnlockBufHdr(buf);
+
+		ForgetPrivateRefCountEntry(ref);
 	}
 }
@@ -1702,6 +1998,10 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 
 /*
  * AtEOXact_Buffers - clean up at end of transaction.
+ *
+ * As of PostgreSQL 8.0, buffer pins should get released by the
+ * ResourceOwner mechanism.  This routine is just a debugging
+ * cross-check that no pins remain.
  */
 void
 AtEOXact_Buffers(bool isCommit)
@@ -1709,6 +2009,36 @@ AtEOXact_Buffers(bool isCommit)
 	CheckForBufferLeaks();
 
 	AtEOXact_LocalBuffers(isCommit);
+
+	Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster).  It sets up for this backend's access to the already-existing
+ * buffer pool.
+ *
+ * NB: this is called before InitProcess(), so we do not have a PGPROC and
+ * cannot do LWLockAcquire; hence we can't actually access stuff in
+ * shared memory yet.  We are only initializing local data here.
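+ * In particular, the PrivateRefCount array and hash table are plain
+ * backend-local memory, so setting them up at this point is safe.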
+ * (See also InitBufferPoolBackend)
+ */
+void
+InitBufferPoolAccess(void)
+{
+	HASHCTL		hash_ctl;
+
+	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
+	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+	hash_ctl.keysize = sizeof(int32);
+	hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
+	hash_ctl.hash = oid_hash;	/* a bit more efficient than tag_hash */
+
+	PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION);
 }
 
 /*
@@ -1754,16 +2084,34 @@ CheckForBufferLeaks(void)
 {
 #ifdef USE_ASSERT_CHECKING
 	int			RefCountErrors = 0;
-	Buffer		b;
+	PrivateRefCountEntry *res;
+	int			i;
 
-	for (b = 1; b <= NBuffers; b++)
+	/* check the array */
+	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
 	{
-		if (PrivateRefCount[b - 1] != 0)
+		res = &PrivateRefCountArray[i];
+
+		if (res->buffer != InvalidBuffer)
 		{
-			PrintBufferLeakWarning(b);
+			PrintBufferLeakWarning(res->buffer);
 			RefCountErrors++;
 		}
 	}
+
+	/* if necessary, search the hash as well */
+	if (PrivateRefCountOverflowed)
+	{
+		HASH_SEQ_STATUS hstat;
+
+		hash_seq_init(&hstat, PrivateRefCountHash);
+		while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			PrintBufferLeakWarning(res->buffer);
+			RefCountErrors++;
+		}
+	}
+
 	Assert(RefCountErrors == 0);
 #endif
 }
@@ -1789,7 +2137,7 @@ PrintBufferLeakWarning(Buffer buffer)
 	else
 	{
 		buf = &BufferDescriptors[buffer - 1];
-		loccount = PrivateRefCount[buffer - 1];
+		loccount = GetPrivateRefCount(buffer);
 		backend = InvalidBackendId;
 	}
@@ -2329,7 +2677,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 			 relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, PrivateRefCount[i]);
+			 buf->refcount, GetPrivateRefCount(i + 1));
 	}
 }
 #endif
@@ -2343,7 +2691,7 @@ PrintPinnedBufs(void)
 
 	for (i = 0; i < NBuffers; ++i, ++buf)
 	{
-		if (PrivateRefCount[i] > 0)
+		if (GetPrivateRefCount(i + 1) > 0)
 		{
 			/* theoretically we should lock the bufhdr here */
 			elog(LOG,
@@ -2352,7 +2700,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpath(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, PrivateRefCount[i]);
+				 buf->refcount, GetPrivateRefCount(i + 1));
 		}
 	}
 }
@@ -2509,6 +2857,7 @@ void
 ReleaseBuffer(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	PrivateRefCountEntry *ref;
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -2524,10 +2873,12 @@ ReleaseBuffer(Buffer buffer)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	ref = GetPrivateRefCountEntry(buffer, false, false);
+	Assert(ref != NULL);
+	Assert(ref->refcount > 0);
 
-	if (PrivateRefCount[buffer - 1] > 1)
-		PrivateRefCount[buffer - 1]--;
+	if (ref->refcount > 1)
+		ref->refcount--;
 	else
 		UnpinBuffer(bufHdr, false);
 }
@@ -2561,7 +2912,12 @@ IncrBufferRefCount(Buffer buffer)
 	if (BufferIsLocal(buffer))
 		LocalRefCount[-buffer - 1]++;
 	else
-		PrivateRefCount[buffer - 1]++;
+	{
+		PrivateRefCountEntry *ref;
+		ref = GetPrivateRefCountEntry(buffer, false, true);
+		Assert(ref != NULL);
+		ref->refcount++;
+	}
 }
 
 /*
@@ -2595,7 +2951,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
-	Assert(PrivateRefCount[buffer - 1] > 0);
+	Assert(GetPrivateRefCount(buffer) > 0);
 	/* here, either share or exclusive lock is OK */
 	Assert(LWLockHeldByMe(bufHdr->content_lock));
@@ -2813,9 +3169,9 @@ LockBufferForCleanup(Buffer buffer)
 	}
 
 	/* There should be exactly one local pin */
-	if (PrivateRefCount[buffer - 1] != 1)
+	if (GetPrivateRefCount(buffer) != 1)
 		elog(ERROR, "incorrect local pin count: %d",
local pin count: %d", - PrivateRefCount[buffer - 1]); + GetPrivateRefCount(buffer)); bufHdr = &BufferDescriptors[buffer - 1]; @@ -2880,7 +3236,7 @@ HoldingBufferPinThatDelaysRecovery(void) if (bufid < 0) return false; - if (PrivateRefCount[bufid] > 0) + if (GetPrivateRefCount(bufid + 1) > 0) return true; return false; @@ -2910,8 +3266,8 @@ ConditionalLockBufferForCleanup(Buffer buffer) } /* There should be exactly one local pin */ - Assert(PrivateRefCount[buffer - 1] > 0); - if (PrivateRefCount[buffer - 1] != 1) + Assert(GetPrivateRefCount(buffer) > 0); + if (GetPrivateRefCount(buffer) != 1) return false; /* Try to acquire lock */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 89447d0b3d..42d9120ed3 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -55,7 +55,6 @@ extern int target_prefetch_pages; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; -extern PGDLLIMPORT int32 *PrivateRefCount; /* in localbuf.c */ extern PGDLLIMPORT int NLocBuffer; @@ -101,24 +100,6 @@ extern PGDLLIMPORT int32 *LocalRefCount; (bufnum) != InvalidBuffer \ ) -/* - * BufferIsPinned - * True iff the buffer is pinned (also checks for valid buffer number). - * - * NOTE: what we check here is that *this* backend holds a pin on - * the buffer. We do not care whether some other backend does. - */ -#define BufferIsPinned(bufnum) \ -( \ - !BufferIsValid(bufnum) ? \ - false \ - : \ - BufferIsLocal(bufnum) ? \ - (LocalRefCount[-(bufnum) - 1] > 0) \ - : \ - (PrivateRefCount[(bufnum) - 1] > 0) \ -) - /* * BufferGetBlock * Returns a reference to a disk page image associated with a buffer.