diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 182c1d4234..f475e63a78 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.6 2003/11/29 19:51:56 pgsql Exp $ +$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $ Notes about shared buffer access rules -------------------------------------- @@ -97,153 +97,149 @@ for VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single relation anyway. -Buffer replacement strategy interface: +Buffer replacement strategy interface +------------------------------------- -The two files freelist.c and buf_table.c contain the buffer cache -replacement strategy. The interface to the strategy is: +The file freelist.c contains the buffer cache replacement strategy. +The interface to the strategy is: - BufferDesc * - StrategyBufferLookup(BufferTag *tagPtr, bool recheck) + BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck, + int *cdb_found_index) - This is allways the first call made by the buffer manager - to check if a disk page is in memory. If so, the function - returns the buffer descriptor and no further action is - required. +This is always the first call made by the buffer manager to check if a disk +page is in memory. If so, the function returns the buffer descriptor and no +further action is required. If the page is not in memory, +StrategyBufferLookup() returns NULL. - If the page is not in memory, StrategyBufferLookup() - returns NULL. +The flag recheck tells the strategy that this is a second lookup after +flushing a dirty block. If the buffer manager has to evict another buffer, +it will release the bufmgr lock while doing the write IO. During this time, +another backend could possibly fault in the same page this backend is after, +so we have to check again after the IO is done if the page is in memory now. - The flag recheck tells the strategy that this is a second - lookup after flushing a dirty block. If the buffer manager - has to evict another buffer, he will release the bufmgr lock - while doing the write IO. During this time, another backend - could possibly fault in the same page this backend is after, - so we have to check again after the IO is done if the page - is in memory now. +*cdb_found_index is set to the index of the found CDB, or -1 if none. +This is not intended to be used by the caller, except to pass to +StrategyReplaceBuffer(). - BufferDesc * - StrategyGetBuffer(void) + BufferDesc *StrategyGetBuffer(int *cdb_replace_index) - The buffer manager calls this function to get an unpinned - cache buffer who's content can be evicted. The returned - buffer might be empty, clean or dirty. +The buffer manager calls this function to get an unpinned cache buffer whose +content can be evicted. The returned buffer might be empty, clean or dirty. - The returned buffer is only a cadidate for replacement. - It is possible that while the buffer is written, another - backend finds and modifies it, so that it is dirty again. - The buffer manager will then call StrategyGetBuffer() - again to ask for another candidate. +The returned buffer is only a candidate for replacement. It is possible that +while the buffer is being written, another backend finds and modifies it, so +that it is dirty again. The buffer manager will then have to call +StrategyGetBuffer() again to ask for another candidate. 
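To make the calling protocol concrete, here is a minimal sketch (illustrative only, not part of this patch) of how a caller like BufferAlloc() might chain StrategyBufferLookup(), StrategyGetBuffer(), and StrategyReplaceBuffer() (described next). The helpers flush_candidate() and read_page() are hypothetical placeholders, and all BufMgrLock and pinning choreography is omitted.

    /*
     * Sketch of the replacement loop, under the assumptions above.
     * flush_candidate() and read_page() are hypothetical stand-ins
     * for the real write-out and read-in paths.
     */
    static BufferDesc *
    sketch_alloc(BufferTag *tag)
    {
        int         cdb_found_index;
        int         cdb_replace_index;
        BufferDesc *buf;
        BufferDesc *buf2;

        /* 1. Is the page already cached (T1/T2 hit)? */
        buf = StrategyBufferLookup(tag, false, &cdb_found_index);
        if (buf != NULL)
            return buf;             /* hit: no replacement needed */

        for (;;)
        {
            /* 2. Ask for an unpinned replacement candidate. */
            buf = StrategyGetBuffer(&cdb_replace_index);
            if (!(buf->flags & BM_DIRTY))
                break;              /* clean candidate: use it */

            /* 3. Flushing releases the lock, so recheck afterwards. */
            flush_candidate(buf);
            buf2 = StrategyBufferLookup(tag, true, &cdb_found_index);
            if (buf2 != NULL)
                return buf2;        /* another backend faulted it in */
            if (!(buf->flags & BM_DIRTY))
                break;              /* flush stuck: candidate is usable */
            /* otherwise the candidate was re-dirtied; ask again */
        }

        /* 4. Commit the buffer to its new identity, then read the page. */
        StrategyReplaceBuffer(buf, tag, cdb_found_index, cdb_replace_index);
        buf->tag = *tag;
        read_page(buf);
        return buf;
    }
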
- void - StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, - BlockNumber blockNum) - - Called by the buffer manager at the time it is about to - change the association of a buffer with a disk page. +*cdb_replace_index is set to the index of the candidate CDB, or -1 if none +(meaning we are using a previously free buffer). This is not intended to be +used by the caller, except to pass to StrategyReplaceBuffer(). - Before this call, StrategyBufferLookup() still has to find - the buffer even if it was returned by StrategyGetBuffer() - as a candidate for replacement. + void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, + int cdb_found_index, int cdb_replace_index) - After this call, this buffer must be returned for a - lookup of the new page identified by rnode and blockNum. +Called by the buffer manager at the time it is about to change the association +of a buffer with a disk page. - void - StrategyInvalidateBuffer(BufferDesc *buf) +Before this call, StrategyBufferLookup() still has to find the buffer under +its old tag, even if it was returned by StrategyGetBuffer() as a candidate +for replacement. - Called from various parts to inform that the content of - this buffer has been thrown away. This happens for example - in the case of dropping a relation. +After this call, this buffer must be returned for a lookup of the new page +identified by *newTag. - The buffer must be clean and unpinned on call. +cdb_found_index and cdb_replace_index must be the auxiliary values +returned by previous calls to StrategyBufferLookup and StrategyGetBuffer. - If the buffer associated with a disk page, StrategyBufferLookup() - must not return it for this page after the call. + void StrategyInvalidateBuffer(BufferDesc *buf) - void - StrategyHintVacuum(bool vacuum_active) +Called by the buffer manager to inform the strategy that the content of this +buffer is being thrown away. This happens for example in the case of dropping +a relation. The buffer must be clean and unpinned on call. - Because vacuum reads all relations of the entire database - through the buffer manager, it can greatly disturb the - buffer replacement strategy. This function is used by vacuum - to inform that all subsequent buffer lookups are caused - by vacuum scanning relations. +If the buffer was associated with a disk page, StrategyBufferLookup() +must not return it for this page after the call. - -Buffer replacement strategy: + void StrategyHintVacuum(bool vacuum_active) -The buffer replacement strategy actually used in freelist.c is a -version of the Adaptive Replacement Cache (ARC) special tailored for -PostgreSQL. +Because VACUUM reads all relations of the entire database through the buffer +manager, it can greatly disturb the buffer replacement strategy. This function +is used by VACUUM to inform the strategy that subsequent buffer lookups are +(or are not) caused by VACUUM scanning relations. + + +Buffer replacement strategy +--------------------------- + +The buffer replacement strategy actually used in freelist.c is a version of +the Adaptive Replacement Cache (ARC) specially tailored for PostgreSQL. The algorithm works as follows: - C is the size of the cache in number of pages (conf: shared_buffers) - ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block - is allwayt associated with one unique file page and "can" point to - one shared buffer. +C is the size of the cache in number of pages (a/k/a shared_buffers or +NBuffers). ARC uses 2*C Cache Directory Blocks (CDB). 
A cache directory block +is always associated with one unique file page. It may point to one shared +buffer, or may indicate that the file page is not in a buffer but has been +accessed recently. - All file pages known in by the directory are managed in 4 LRU lists - named B1, T1, T2 and B2. The T1 and T2 lists are the "real" cache - entries, linking a file page to a memory buffer where the page is - currently cached. Consequently T1len+T2len <= C. B1 and B2 are - ghost cache directories that extend T1 and T2 so that the strategy - remembers pages longer. The strategy tries to keep B1len+T1len and - B2len+T2len both at C. T1len and T2 len vary over the runtime - depending on the lookup pattern and its resulting cache hits. The - desired size of T1len is called T1target. +All CDB entries are managed in 4 LRU lists named T1, T2, B1 and B2. The T1 and +T2 lists are the "real" cache entries, linking a file page to a memory buffer +where the page is currently cached. Consequently T1len+T2len <= C. B1 and B2 +are ghost cache directories that extend T1 and T2 so that the strategy +remembers pages longer. The strategy tries to keep B1len+T1len and B2len+T2len +both at C. T1len and T2len vary over the runtime depending on the lookup +pattern and its resulting cache hits. The desired size of T1len is called +T1target. - Assuming we have a full cache, one of 5 cases happens on a lookup: +Assuming we have a full cache, one of 5 cases happens on a lookup: - MISS On a cache miss, depending on T1target and the actual T1len - the LRU buffer of T1 or T2 is evicted. Its CDB is removed - from the T list and added as MRU of the corresponding B list. - The now free buffer is replaced with the requested page - and added as MRU of T1. +MISS On a cache miss, depending on T1target and the actual T1len + the LRU buffer of either T1 or T2 is evicted. Its CDB is removed + from the T list and added as MRU of the corresponding B list. + The now free buffer is replaced with the requested page + and added as MRU of T1. - T1 hit The T1 CDB is moved to the MRU position of the T2 list. +T1 hit The T1 CDB is moved to the MRU position of the T2 list. - T2 hit The T2 CDB is moved to the MRU position of the T2 list. +T2 hit The T2 CDB is moved to the MRU position of the T2 list. - B1 hit This means that a buffer that was evicted from the T1 - list is now requested again, indicating that T1target is - too small (otherwise it would still be in T1 and thus in - memory). The strategy raises T1target, evicts a buffer - depending on T1target and T1len and places the CDB at - MRU of T2. +B1 hit This means that a buffer that was evicted from the T1 + list is now requested again, indicating that T1target is + too small (otherwise it would still be in T1 and thus in + memory). The strategy raises T1target, evicts a buffer + depending on T1target and T1len and places the CDB at + MRU of T2. - B2 hit This means the opposite of B1, the T2 list is probably too - small. So the strategy lowers T1target, evicts a buffer - and places the CDB at MRU of T2. +B2 hit This means the opposite of B1, the T2 list is probably too + small. So the strategy lowers T1target, evicts a buffer + and places the CDB at MRU of T2. - Thus, every page that is found on lookup in any of the four lists - ends up as the MRU of the T2 list. The T2 list therefore is the - "frequency" cache, holding frequently requested pages. +Thus, every page that is found on lookup in any of the four lists +ends up as the MRU of the T2 list. 
The T2 list therefore is the +"frequency" cache, holding frequently requested pages. - Every page that is seen for the first time ends up as the MRU of - the T1 list. The T1 list is the "recency" cache, holding recent - newcomers. +Every page that is seen for the first time ends up as the MRU of the T1 +list. The T1 list is the "recency" cache, holding recent newcomers. - The tailoring done for PostgreSQL has to do with the way, the - query executor works. A typical UPDATE or DELETE first scans the - relation, searching for the tuples and then calls heap_update() or - heap_delete(). This causes at least 2 lookups for the block in the - same statement. In the case of multiple matches in one block even - more often. As a result, every block touched in an UPDATE or DELETE - would directly jump into the T2 cache, which is wrong. To prevent - this the strategy remembers which transaction added a buffer to the - T1 list and will not promote it from there into the T2 cache during - the same transaction. - - Another specialty is the change of the strategy during VACUUM. - Lookups during VACUUM do not represent application needs, so it - would be wrong to change the cache balance T1target due to that - or to cause massive cache evictions. Therefore, a page read in to - satisfy vacuum (not those that actually cause a hit on any list) - is placed at the LRU position of the T1 list, for immediate - reuse. Since Vacuum usually requests many pages very fast, the - natural side effect of this is that it will get back the very - buffers it filled and possibly modified on the next call and will - therefore do it's work in a few shared memory buffers, while using - whatever it finds in the cache already. +The tailoring done for PostgreSQL has to do with the way the query executor +works. A typical UPDATE or DELETE first scans the relation, searching for the +tuples and then calls heap_update() or heap_delete(). This causes at least 2 +lookups for the block in the same statement. In the case of multiple matches +in one block, even more lookups occur. As a result, every block touched in an UPDATE or +DELETE would directly jump into the T2 cache, which is wrong. To prevent this +the strategy remembers which transaction added a buffer to the T1 list and +will not promote it from there into the T2 cache during the same transaction. +Another specialty is the change of the strategy during VACUUM. Lookups during +VACUUM do not represent application needs, and do not suggest that the page +will be hit again soon, so it would be wrong to change the cache balance +T1target due to that or to cause massive cache evictions. Therefore, a page +read in to satisfy vacuum is placed at the LRU position of the T1 list, for +immediate reuse. Also, if we happen to get a hit on a CDB entry during +VACUUM, we do not promote the page above its current position in the list. +Since VACUUM usually requests many pages very fast, the effect of this is that +it will get back the very buffers it filled and possibly modified on the next +call and will therefore do its work in a few shared memory buffers, while +being able to use whatever it finds in the cache already. This also implies +that most of the write traffic caused by a VACUUM will be done by the VACUUM +itself and not pushed off onto other processes.
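
Before moving on to the code changes, the ghost-hit feedback rule is worth seeing in isolation. The sketch below is illustrative only: the function, the standalone Min/Max macros, and the example numbers are not from the tree, but the arithmetic mirrors the T1_TARGET adjustments freelist.c makes on B1 and B2 hits (visible later in this patch).

    #include <stdio.h>

    #define Max(a,b)  ((a) > (b) ? (a) : (b))
    #define Min(a,b)  ((a) < (b) ? (a) : (b))

    /*
     * ARC T1target adaptation on ghost-list hits: a B1 hit means T1 is
     * too small, so grow T1target (capped at NBuffers); a B2 hit means
     * T2 is too small, so shrink T1target (floored at 0).
     */
    static int
    adapt_t1_target(int t1_target, int b1_len, int b2_len,
                    int nbuffers, int hit_list)     /* 1 = B1, 2 = B2 */
    {
        if (hit_list == 1)
            return Min(t1_target + Max(b2_len / b1_len, 1), nbuffers);
        return Max(t1_target - Max(b1_len / b2_len, 1), 0);
    }

    int
    main(void)
    {
        /* Example: C = 1000 buffers, 50 ghosts in B1, 200 in B2. */
        int     t1_target = 300;

        /* A B1 hit grows T1target by max(200/50, 1) = 4 ... */
        t1_target = adapt_t1_target(t1_target, 50, 200, 1000, 1);
        printf("after B1 hit: %d\n", t1_target);    /* prints 304 */

        /* ... while a B2 hit shrinks it by max(50/200, 1) = 1. */
        t1_target = adapt_t1_target(t1_target, 50, 200, 1000, 2);
        printf("after B2 hit: %d\n", t1_target);    /* prints 303 */
        return 0;
    }
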
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index a671bf9f7f..e0aa0e93e8 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -8,35 +8,15 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.62 2004/02/12 15:06:56 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.63 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" -#include -#include -#include - -#include "catalog/catalog.h" -#include "executor/execdebug.h" -#include "miscadmin.h" -#include "storage/buf.h" -#include "storage/buf_internals.h" #include "storage/bufmgr.h" -#include "storage/fd.h" -#include "storage/ipc.h" -#include "storage/lmgr.h" -#include "storage/shmem.h" -#include "storage/smgr.h" -#include "storage/lwlock.h" -#include "utils/builtins.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" +#include "storage/buf_internals.h" -int ShowPinTrace = 0; - -int Data_Descriptors; BufferDesc *BufferDescriptors; Block *BufferBlockPointers; @@ -44,6 +24,14 @@ Block *BufferBlockPointers; long *PrivateRefCount; /* also used in freelist.c */ bits8 *BufferLocks; /* flag bits showing locks I have set */ +/* statistics counters */ +long int ReadBufferCount; +long int ReadLocalBufferCount; +long int BufferHitCount; +long int LocalBufferHitCount; +long int BufferFlushCount; +long int LocalBufferFlushCount; + /* * Data Structures: @@ -61,48 +49,35 @@ bits8 *BufferLocks; /* flag bits showing locks I have set */ * see freelist.c. A buffer cannot be replaced while in * use either by data manager or during IO. * - * WriteBufferBack: - * currently, a buffer is only written back at the time - * it is selected for replacement. It should - * be done sooner if possible to reduce latency of - * BufferAlloc(). Maybe there should be a daemon process. * * Synchronization/Locking: * * BufMgrLock lock -- must be acquired before manipulating the - * buffer queues (lookup/freelist). Must be released + * buffer search datastructures (lookup/freelist, as well as the + * flag bits of any buffer). Must be released * before exit and before doing any IO. * * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. * It must be set when an IO is initiated and cleared at - * the end of the IO. It is there to make sure that one + * the end of the IO. It is there to make sure that one * process doesn't start to use a buffer while another is * faulting it in. see IOWait/IOSignal. * - * refcount -- A buffer is pinned during IO and immediately - * after a BufferAlloc(). A buffer is always either pinned - * or on the freelist but never both. The buffer must be - * released, written, or flushed before the end of - * transaction. + * refcount -- Counts the number of processes holding pins on a buffer. + * A buffer is pinned during IO and immediately after a BufferAlloc(). + * Pins must be released before end of transaction. * - * PrivateRefCount -- Each buffer also has a private refcount the keeps + * PrivateRefCount -- Each buffer also has a private refcount that keeps * track of the number of times the buffer is pinned in the current - * processes. This is used for two purposes, first, if we pin a + * process. 
This is used for two purposes: first, if we pin a * a buffer more than once, we only need to change the shared refcount - * once, thus only lock the buffer pool once, second, when a transaction + * once, thus only lock the shared state once; second, when a transaction * aborts, it should only unpin the buffers exactly the number of times it * has pinned them, so that it will not blow away buffers of another * backend. * */ -long int ReadBufferCount; -long int ReadLocalBufferCount; -long int BufferHitCount; -long int LocalBufferHitCount; -long int BufferFlushCount; -long int LocalBufferFlushCount; - /* * Initialize shared buffer pool @@ -118,8 +93,6 @@ InitBufferPool(void) foundDescs; int i; - Data_Descriptors = NBuffers; - /* * It's probably not really necessary to grab the lock --- if there's * anyone else attached to the shmem at this point, we've got @@ -131,7 +104,7 @@ InitBufferPool(void) BufferDescriptors = (BufferDesc *) ShmemInitStruct("Buffer Descriptors", - Data_Descriptors * sizeof(BufferDesc), &foundDescs); + NBuffers * sizeof(BufferDesc), &foundDescs); BufferBlocks = (char *) ShmemInitStruct("Buffer Blocks", @@ -152,9 +125,9 @@ InitBufferPool(void) /* * link the buffers into a single linked list. This will become the - * LiFo list of unused buffers returned by StragegyGetBuffer(). + * LIFO list of unused buffers returned by StrategyGetBuffer(). */ - for (i = 0; i < Data_Descriptors; block += BLCKSZ, buf++, i++) + for (i = 0; i < NBuffers; block += BLCKSZ, buf++, i++) { Assert(ShmemIsValid((unsigned long) block)); @@ -173,7 +146,7 @@ InitBufferPool(void) } /* Correct last entry */ - BufferDescriptors[Data_Descriptors - 1].bufNext = -1; + BufferDescriptors[NBuffers - 1].bufNext = -1; } /* Init other shared buffer-management stuff */ @@ -215,35 +188,31 @@ InitBufferPoolAccess(void) BufferBlockPointers[i] = (Block) MAKE_PTR(BufferDescriptors[i].data); } -/* ----------------------------------------------------- +/* * BufferShmemSize * * compute the size of shared memory for the buffer pool including * data pages, buffer descriptors, hash tables, etc. - * ---------------------------------------------------- */ int BufferShmemSize(void) { int size = 0; - /* size of shmem index hash table */ - size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt)); - /* size of buffer descriptors */ size += MAXALIGN(NBuffers * sizeof(BufferDesc)); - /* size of the shared replacement strategy control block */ - size += MAXALIGN(sizeof(BufferStrategyControl)); - - /* size of the ARC directory blocks */ - size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB)); - /* size of data pages */ size += NBuffers * MAXALIGN(BLCKSZ); /* size of buffer hash table */ size += hash_estimate_size(NBuffers * 2, sizeof(BufferLookupEnt)); + /* size of the shared replacement strategy control block */ + size += MAXALIGN(sizeof(BufferStrategyControl)); + + /* size of the ARC directory blocks */ + size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB)); + return size; } diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 33590b65fd..3829444195 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -3,46 +3,42 @@ * buf_table.c * routines for finding buffers in the buffer pool. * + * NOTE: these days, what this table actually provides is a mapping from + * BufferTags to CDB indexes, not directly to buffers. The function names + * are thus slight misnomers. 
+ * + * Note: all routines in this file assume that the BufMgrLock is held + * by the caller, so no synchronization is needed. + * + * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.34 2003/12/14 00:34:47 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.35 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ -/* - * OLD COMMENTS - * - * Data Structures: - * - * Buffers are identified by their BufferTag (buf.h). This - * file contains routines for allocating a shmem hash table to - * map buffer tags to buffer descriptors. - * - * Synchronization: - * - * All routines in this file assume BufMgrLock is held by their caller. - */ - #include "postgres.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" + static HTAB *SharedBufHash; /* * Initialize shmem hash table for mapping buffers + * size is the desired hash table size (2*NBuffers for ARC algorithm) */ void InitBufTable(int size) { HASHCTL info; - /* assume lock is held */ + /* assume no locking is needed yet */ /* BufferTag maps to Buffer */ info.keysize = sizeof(BufferTag); @@ -60,6 +56,7 @@ InitBufTable(int size) /* * BufTableLookup + * Lookup the given BufferTag; return CDB index, or -1 if not found */ int BufTableLookup(BufferTag *tagPtr) @@ -78,10 +75,11 @@ BufTableLookup(BufferTag *tagPtr) } /* - * BufTableDelete + * BufTableInsert + * Insert a hashtable entry for given tag and CDB index */ -bool -BufTableInsert(BufferTag *tagPtr, Buffer buf_id) +void +BufTableInsert(BufferTag *tagPtr, int cdb_id) { BufferLookupEnt *result; bool found; @@ -97,14 +95,14 @@ BufTableInsert(BufferTag *tagPtr, Buffer buf_id) if (found) /* found something else in the table? 
*/ elog(ERROR, "shared buffer hash table corrupted"); - result->id = buf_id; - return TRUE; + result->id = cdb_id; } /* * BufTableDelete + * Delete the hashtable entry for given tag */ -bool +void BufTableDelete(BufferTag *tagPtr) { BufferLookupEnt *result; @@ -114,6 +112,4 @@ BufTableDelete(BufferTag *tagPtr) if (!result) /* shouldn't happen */ elog(ERROR, "shared buffer hash table corrupted"); - - return TRUE; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d515a7a259..a80435b7ec 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.160 2004/02/12 20:07:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -54,9 +54,9 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "utils/relcache.h" - #include "pgstat.h" + #define BufferGetLSN(bufHdr) \ (*((XLogRecPtr*) MAKE_PTR((bufHdr)->data))) @@ -64,15 +64,17 @@ /* GUC variable */ bool zero_damaged_pages = false; +#ifdef NOT_USED +int ShowPinTrace = 0; +#endif + int BgWriterDelay = 200; int BgWriterPercent = 1; int BgWriterMaxpages = 100; -static void WaitIO(BufferDesc *buf); -static void StartBufferIO(BufferDesc *buf, bool forInput); -static void TerminateBufferIO(BufferDesc *buf); -static void ContinueBufferIO(BufferDesc *buf, bool forInput); -static void buffer_write_error_callback(void *arg); +long NDirectFileRead; /* some I/O's are direct file access. + * bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ /* * Macro : BUFFER_IS_BROKEN @@ -80,18 +82,22 @@ static void buffer_write_error_callback(void *arg); */ #define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) + +static void PinBuffer(BufferDesc *buf); +static void UnpinBuffer(BufferDesc *buf); +static void WaitIO(BufferDesc *buf); +static void StartBufferIO(BufferDesc *buf, bool forInput); +static void TerminateBufferIO(BufferDesc *buf); +static void ContinueBufferIO(BufferDesc *buf, bool forInput); +static void buffer_write_error_callback(void *arg); static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr); static void BufferReplace(BufferDesc *bufHdr); - -#ifdef NOT_USED -void PrintBufferDescs(void); -#endif - static void write_buffer(Buffer buffer, bool unpin); + /* * ReadBuffer -- returns a buffer containing the requested * block of the requested relation. 
If the blknum @@ -282,14 +288,15 @@ BufferAlloc(Relation reln, BufferDesc *buf, *buf2; BufferTag newTag; /* identity of requested block */ + int cdb_found_index, + cdb_replace_index; bool inProgress; /* buffer undergoing IO */ - /* create a new tag so we can lookup the buffer */ - /* assume that the relation is already open */ + /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(&newTag, reln, blockNum); /* see if the block is in the buffer pool already */ - buf = StrategyBufferLookup(&newTag, false); + buf = StrategyBufferLookup(&newTag, false, &cdb_found_index); if (buf != NULL) { /* @@ -332,6 +339,13 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf; } @@ -345,16 +359,16 @@ BufferAlloc(Relation reln, inProgress = FALSE; for (buf = NULL; buf == NULL;) { - buf = StrategyGetBuffer(); + buf = StrategyGetBuffer(&cdb_replace_index); - /* GetFreeBuffer will abort if it can't find a free buffer */ + /* StrategyGetBuffer will elog if it can't find a free buffer */ Assert(buf); /* * There should be exactly one pin on the buffer after it is * allocated -- ours. If it had a pin it wouldn't have been on * the free list. No one else could have pinned it between - * GetFreeBuffer and here because we have the BufMgrLock. + * StrategyGetBuffer and here because we have the BufMgrLock. */ Assert(buf->refcount == 0); buf->refcount = 1; @@ -438,7 +452,7 @@ BufferAlloc(Relation reln, * we haven't gotten around to insert the new tag into the * buffer table. So we need to check here. -ay 3/95 */ - buf2 = StrategyBufferLookup(&newTag, true); + buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index); if (buf2 != NULL) { /* @@ -471,6 +485,15 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum. (XXX perhaps better + * to consider this a miss? We didn't have to do the read, + * but we did have to write ...) + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf2; } } @@ -485,8 +508,8 @@ BufferAlloc(Relation reln, * Tell the buffer replacement strategy that we are replacing the * buffer content. Then rename the buffer. */ - StrategyReplaceBuffer(buf, reln, blockNum); - INIT_BUFFERTAG(&(buf->tag), reln, blockNum); + StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index); + buf->tag = newTag; /* * Buffer contents are currently invalid. Have to mark IO IN PROGRESS @@ -501,6 +524,12 @@ BufferAlloc(Relation reln, LWLockRelease(BufMgrLock); + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageMiss; + return buf; } @@ -624,20 +653,93 @@ ReleaseAndReadBuffer(Buffer buffer, } /* - * BufferSync -- Write all dirty buffers in the pool. + * PinBuffer -- make buffer unavailable for replacement. * - * This is called at checkpoint time and writes out all dirty shared buffers, + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. + */ +static void +PinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + if (PrivateRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; + Assert(PrivateRefCount[b] > 0); +} + +/* + * UnpinBuffer -- make buffer available for replacement. + * + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. 
+ */ +static void +UnpinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount > 0); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0) + buf->refcount--; + + if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 && + buf->refcount == 1) + { + /* we just released the last pin other than the waiter's */ + buf->flags &= ~BM_PIN_COUNT_WAITER; + ProcSendSignal(buf->wait_backend_id); + } + else + { + /* do nothing */ + } +} + +/* + * BufferSync -- Write out dirty buffers in the pool. + * + * This is called at checkpoint time to write out all dirty shared buffers, * and by the background writer process to write out some of the dirty blocks. + * percent/maxpages should be zero in the former case, and nonzero limit + * values in the latter. */ int BufferSync(int percent, int maxpages) { + BufferDesc **dirty_buffers; + BufferTag *buftags; + int num_buffer_dirty; int i; - BufferDesc *bufHdr; ErrorContextCallback errcontext; - int num_buffer_dirty; - int *buffer_dirty; + /* + * Get a list of all currently dirty buffers and how many there are. + * We do not flush buffers that get dirtied after we started. They + * have to wait until the next checkpoint. + */ + dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *)); + buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag)); + + LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); + num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags, + NBuffers); + + /* + * If called by the background writer, we are usually asked to + * only write out some portion of dirty buffers now, to prevent + * the IO storm at checkpoint time. + */ + if (percent > 0) + { + Assert(percent <= 100); + num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100; + } + if (maxpages > 0 && num_buffer_dirty > maxpages) + num_buffer_dirty = maxpages; /* Setup error traceback support for ereport() */ errcontext.callback = buffer_write_error_callback; @@ -646,47 +748,22 @@ BufferSync(int percent, int maxpages) error_context_stack = &errcontext; /* - * Get a list of all currently dirty buffers and how many there are. - * We do not flush buffers that get dirtied after we started. They - * have to wait until the next checkpoint. + * Loop over buffers to be written. Note the BufMgrLock is held at + * loop top, but is released and reacquired intraloop, so we aren't + * holding it long. */ - buffer_dirty = (int *)palloc(NBuffers * sizeof(int)); - - LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - num_buffer_dirty = StrategyDirtyBufferList(buffer_dirty, NBuffers); - LWLockRelease(BufMgrLock); - - /* - * If called by the background writer, we are usually asked to - * only write out some percentage of dirty buffers now, to prevent - * the IO storm at checkpoint time. - */ - if (percent > 0 && num_buffer_dirty > 10) - { - Assert(percent <= 100); - num_buffer_dirty = (num_buffer_dirty * percent) / 100; - if (maxpages > 0 && num_buffer_dirty > maxpages) - num_buffer_dirty = maxpages; - } - for (i = 0; i < num_buffer_dirty; i++) { + BufferDesc *bufHdr = dirty_buffers[i]; Buffer buffer; XLogRecPtr recptr; SMgrRelation reln; - LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - - bufHdr = &BufferDescriptors[buffer_dirty[i]]; errcontext.arg = bufHdr; - if (!(bufHdr->flags & BM_VALID)) - { - LWLockRelease(BufMgrLock); - continue; - } - /* + * Check it is still the same page and still needs writing. 
+ * * We can check bufHdr->cntxDirty here *without* holding any lock * on buffer context as long as we set this flag in access methods * *before* logging changes with XLogInsert(): if someone will set @@ -694,11 +771,12 @@ BufferSync(int percent, int maxpages) * checkpoint.redo points before log record for upcoming changes * and so we are not required to write such dirty buffer. */ - if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) - { - LWLockRelease(BufMgrLock); + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; + if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) continue; - } /* * IO synchronization. Note that we do it with unpinned buffer to @@ -707,12 +785,13 @@ BufferSync(int percent, int maxpages) if (bufHdr->flags & BM_IO_IN_PROGRESS) { WaitIO(bufHdr); - if (!(bufHdr->flags & BM_VALID) || - (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))) - { - LWLockRelease(BufMgrLock); + /* Still need writing? */ + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; + if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) continue; - } } /* @@ -723,10 +802,11 @@ BufferSync(int percent, int maxpages) PinBuffer(bufHdr); StartBufferIO(bufHdr, false); /* output IO start */ - buffer = BufferDescriptorGetBuffer(bufHdr); - + /* Release BufMgrLock while doing xlog work */ LWLockRelease(BufMgrLock); + buffer = BufferDescriptorGetBuffer(bufHdr); + /* * Protect buffer content against concurrent update */ @@ -740,8 +820,12 @@ BufferSync(int percent, int maxpages) /* * Now it's safe to write buffer to disk. Note that no one else - * should not be able to write it while we were busy with locking - * and log flushing because of we setted IO flag. + * should have been able to write it while we were busy with + * locking and log flushing because we set the IO flag. + * + * Before we issue the actual write command, clear the just-dirtied + * flag. This lets us recognize concurrent changes (note that only + * hint-bit changes are possible since we hold the buffer shlock). */ LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty); @@ -767,12 +851,12 @@ BufferSync(int percent, int maxpages) * Release the per-buffer readlock, reacquire BufMgrLock. */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferFlushCount++; LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ TerminateBufferIO(bufHdr); /* Sync IO finished */ + BufferFlushCount++; /* * If this buffer was marked by someone as DIRTY while we were @@ -781,14 +865,16 @@ BufferSync(int percent, int maxpages) if (!(bufHdr->flags & BM_JUST_DIRTIED)) bufHdr->flags &= ~BM_DIRTY; UnpinBuffer(bufHdr); - LWLockRelease(BufMgrLock); } - pfree(buffer_dirty); + LWLockRelease(BufMgrLock); /* Pop the error context stack */ error_context_stack = errcontext.previous; + pfree(dirty_buffers); + pfree(buftags); + return num_buffer_dirty; } @@ -818,11 +904,6 @@ WaitIO(BufferDesc *buf) } -long NDirectFileRead; /* some I/O's are direct file access. - * bypass bufmgr */ -long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ - - /* * Return a palloc'd string containing buffer usage statistics. 
*/ @@ -892,9 +973,9 @@ AtEOXact_Buffers(bool isCommit) if (isCommit) elog(WARNING, - "buffer refcount leak: [%03d] (bufNext=%d, " - "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", - i, buf->bufNext, + "buffer refcount leak: [%03d] " + "(rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", + i, buf->tag.rnode.tblNode, buf->tag.rnode.relNode, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); @@ -1021,6 +1102,26 @@ BufferGetBlockNumber(Buffer buffer) return BufferDescriptors[buffer - 1].tag.blockNum; } +/* + * BufferGetFileNode + * Returns the relation ID (RelFileNode) associated with a buffer. + * + * This should make the same checks as BufferGetBlockNumber, but since the + * two are generally called together, we don't bother. + */ +RelFileNode +BufferGetFileNode(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + bufHdr = &(LocalBufferDescriptors[-buffer - 1]); + else + bufHdr = &BufferDescriptors[buffer - 1]; + + return (bufHdr->tag.rnode); +} + /* * BufferReplace * @@ -1663,7 +1764,11 @@ refcount = %ld, file: %s, line: %d\n", * * This routine might get called many times on the same page, if we are making * the first scan after commit of an xact that added/deleted many tuples. - * So, be as quick as we can if the buffer is already dirty. + * So, be as quick as we can if the buffer is already dirty. We do this by + * not acquiring BufMgrLock if it looks like the status bits are already OK. + * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after + * we look, because the buffer content update is already done and will be + * reflected in the I/O.) */ void SetBufferCommitInfoNeedsSave(Buffer buffer) @@ -2008,19 +2113,6 @@ AbortBufferIO(void) } } -RelFileNode -BufferGetFileNode(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - bufHdr = &(LocalBufferDescriptors[-buffer - 1]); - else - bufHdr = &BufferDescriptors[buffer - 1]; - - return (bufHdr->tag.rnode); -} - /* * Error context callback for errors occurring during buffer writes. */ diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 595e4905a8..c14d446497 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -3,210 +3,208 @@ * freelist.c * routines for manipulating the buffer pool's replacement strategy. * + * Note: all routines in this file assume that the BufMgrLock is held + * by the caller, so no synchronization is needed. + * + * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.41 2004/02/12 15:06:56 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.42 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ -/* - * OLD COMMENTS - * - * Data Structures: - * SharedFreeList is a circular queue. Notice that this - * is a shared memory queue so the next/prev "ptrs" are - * buffer ids, not addresses. - * - * Sync: all routines in this file assume that the buffer - * semaphore has been acquired by the caller. - */ - #include "postgres.h" +#include "access/xact.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" -#include "storage/ipc.h" -#include "storage/proc.h" -#include "access/xact.h" -#include "miscadmin.h" -#ifndef MAX -#define MAX(a,b) (((a) > (b)) ? 
(a) : (b)) -#endif -#ifndef MIN -#define MIN(a,b) (((a) < (b)) ? (a) : (b)) -#endif +/* GUC variable: time in seconds between statistics reports */ +int DebugSharedBuffers = 0; + +/* Pointers to shared state */ static BufferStrategyControl *StrategyControl = NULL; static BufferStrategyCDB *StrategyCDB = NULL; -static int strategy_cdb_found; -static int strategy_cdb_replace; -static int strategy_get_from; - -int DebugSharedBuffers = 0; - -static bool strategy_hint_vacuum; +/* Backend-local state about whether currently vacuuming */ +static bool strategy_hint_vacuum = false; static TransactionId strategy_vacuum_xid; -#define T1_TARGET StrategyControl->target_T1_size -#define B1_LENGTH StrategyControl->listSize[STRAT_LIST_B1] -#define T1_LENGTH StrategyControl->listSize[STRAT_LIST_T1] -#define T2_LENGTH StrategyControl->listSize[STRAT_LIST_T2] -#define B2_LENGTH StrategyControl->listSize[STRAT_LIST_B2] +#define T1_TARGET (StrategyControl->target_T1_size) +#define B1_LENGTH (StrategyControl->listSize[STRAT_LIST_B1]) +#define T1_LENGTH (StrategyControl->listSize[STRAT_LIST_T1]) +#define T2_LENGTH (StrategyControl->listSize[STRAT_LIST_T2]) +#define B2_LENGTH (StrategyControl->listSize[STRAT_LIST_B2]) /* * Macro to remove a CDB from whichever list it currently is on */ #define STRAT_LIST_REMOVE(cdb) \ -{ \ - AssertMacro((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \ - if ((cdb)->prev < 0) \ - StrategyControl->listHead[(cdb)->list] = (cdb)->next; \ - else \ - StrategyCDB[(cdb)->prev].next = (cdb)->next; \ - if ((cdb)->next < 0) \ - StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \ - else \ - StrategyCDB[(cdb)->next].prev = (cdb)->prev; \ - StrategyControl->listSize[(cdb)->list]--; \ - (cdb)->list = STRAT_LIST_UNUSED; \ -} +do { \ + Assert((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \ + if ((cdb)->prev < 0) \ + StrategyControl->listHead[(cdb)->list] = (cdb)->next; \ + else \ + StrategyCDB[(cdb)->prev].next = (cdb)->next; \ + if ((cdb)->next < 0) \ + StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \ + else \ + StrategyCDB[(cdb)->next].prev = (cdb)->prev; \ + StrategyControl->listSize[(cdb)->list]--; \ + (cdb)->list = STRAT_LIST_UNUSED; \ +} while(0) /* * Macro to add a CDB to the tail of a list (MRU position) */ #define STRAT_MRU_INSERT(cdb,l) \ -{ \ - AssertMacro((cdb)->list == STRAT_LIST_UNUSED); \ - if (StrategyControl->listTail[(l)] < 0) \ - { \ - (cdb)->prev = (cdb)->next = -1; \ - StrategyControl->listHead[(l)] = \ - StrategyControl->listTail[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - else \ - { \ - (cdb)->next = -1; \ - (cdb)->prev = StrategyControl->listTail[(l)]; \ - StrategyCDB[StrategyControl->listTail[(l)]].next = \ - ((cdb) - StrategyCDB); \ - StrategyControl->listTail[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - StrategyControl->listSize[(l)]++; \ - (cdb)->list = (l); \ -} +do { \ + Assert((cdb)->list == STRAT_LIST_UNUSED); \ + if (StrategyControl->listTail[(l)] < 0) \ + { \ + (cdb)->prev = (cdb)->next = -1; \ + StrategyControl->listHead[(l)] = \ + StrategyControl->listTail[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + else \ + { \ + (cdb)->next = -1; \ + (cdb)->prev = StrategyControl->listTail[(l)]; \ + StrategyCDB[StrategyControl->listTail[(l)]].next = \ + ((cdb) - StrategyCDB); \ + StrategyControl->listTail[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + StrategyControl->listSize[(l)]++; \ + (cdb)->list = (l); \ +} while(0) /* * Macro to add a CDB to the head of a list (LRU position) */ #define STRAT_LRU_INSERT(cdb,l) \ -{ \ - AssertMacro((cdb)->list == 
STRAT_LIST_UNUSED); \ - if (StrategyControl->listHead[(l)] < 0) \ - { \ - (cdb)->prev = (cdb)->next = -1; \ - StrategyControl->listHead[(l)] = \ - StrategyControl->listTail[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - else \ - { \ - (cdb)->prev = -1; \ - (cdb)->next = StrategyControl->listHead[(l)]; \ - StrategyCDB[StrategyControl->listHead[(l)]].prev = \ - ((cdb) - StrategyCDB); \ - StrategyControl->listHead[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - StrategyControl->listSize[(l)]++; \ - (cdb)->list = (l); \ -} +do { \ + Assert((cdb)->list == STRAT_LIST_UNUSED); \ + if (StrategyControl->listHead[(l)] < 0) \ + { \ + (cdb)->prev = (cdb)->next = -1; \ + StrategyControl->listHead[(l)] = \ + StrategyControl->listTail[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + else \ + { \ + (cdb)->prev = -1; \ + (cdb)->next = StrategyControl->listHead[(l)]; \ + StrategyCDB[StrategyControl->listHead[(l)]].prev = \ + ((cdb) - StrategyCDB); \ + StrategyControl->listHead[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + StrategyControl->listSize[(l)]++; \ + (cdb)->list = (l); \ +} while(0) +/* + * Printout for use when DebugSharedBuffers is enabled + */ +static void +StrategyStatsDump(void) +{ + time_t now = time(NULL); + + if (StrategyControl->stat_report + DebugSharedBuffers < now) + { + long all_hit, b1_hit, t1_hit, t2_hit, b2_hit; + int id, t1_clean, t2_clean; + ErrorContextCallback *errcxtold; + + id = StrategyControl->listHead[STRAT_LIST_T1]; + t1_clean = 0; + while (id >= 0) + { + if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) + break; + t1_clean++; + id = StrategyCDB[id].next; + } + id = StrategyControl->listHead[STRAT_LIST_T2]; + t2_clean = 0; + while (id >= 0) + { + if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) + break; + t2_clean++; + id = StrategyCDB[id].next; + } + + if (StrategyControl->num_lookup == 0) + { + all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0; + } + else + { + b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 / + StrategyControl->num_lookup); + t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 / + StrategyControl->num_lookup); + t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 / + StrategyControl->num_lookup); + b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 / + StrategyControl->num_lookup); + all_hit = b1_hit + t1_hit + t2_hit + b2_hit; + } + + errcxtold = error_context_stack; + error_context_stack = NULL; + elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d", + T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH); + elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%", + all_hit, b1_hit, t1_hit, t2_hit, b2_hit); + elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d", + t1_clean, t2_clean); + error_context_stack = errcxtold; + + StrategyControl->num_lookup = 0; + StrategyControl->num_hit[STRAT_LIST_B1] = 0; + StrategyControl->num_hit[STRAT_LIST_T1] = 0; + StrategyControl->num_hit[STRAT_LIST_T2] = 0; + StrategyControl->num_hit[STRAT_LIST_B2] = 0; + StrategyControl->stat_report = now; + } +} + /* * StrategyBufferLookup * * Lookup a page request in the cache directory. A buffer is only - * returned for a T1 or T2 cache hit. B1 and B2 hits are only - * remembered here to later affect the behaviour. + * returned for a T1 or T2 cache hit. B1 and B2 hits are just + * remembered here, to possibly affect the behaviour later. + * + * recheck indicates we are rechecking after I/O wait; do not change + * internal status in this case. 
+ * + * *cdb_found_index is set to the index of the found CDB, or -1 if none. + * This is not intended to be used by the caller, except to pass to + * StrategyReplaceBuffer(). */ BufferDesc * -StrategyBufferLookup(BufferTag *tagPtr, bool recheck) +StrategyBufferLookup(BufferTag *tagPtr, bool recheck, + int *cdb_found_index) { BufferStrategyCDB *cdb; - time_t now; + /* Optional stats printout */ if (DebugSharedBuffers > 0) - { - time(&now); - if (StrategyControl->stat_report + DebugSharedBuffers < now) - { - long all_hit, b1_hit, t1_hit, t2_hit, b2_hit; - int id, t1_clean, t2_clean; - ErrorContextCallback *errcxtold; - - id = StrategyControl->listHead[STRAT_LIST_T1]; - t1_clean = 0; - while (id >= 0) - { - if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) - break; - t1_clean++; - id = StrategyCDB[id].next; - } - id = StrategyControl->listHead[STRAT_LIST_T2]; - t2_clean = 0; - while (id >= 0) - { - if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) - break; - t2_clean++; - id = StrategyCDB[id].next; - } - - if (StrategyControl->num_lookup == 0) - { - all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0; - } - else - { - b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 / - StrategyControl->num_lookup); - t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 / - StrategyControl->num_lookup); - t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 / - StrategyControl->num_lookup); - b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 / - StrategyControl->num_lookup); - all_hit = b1_hit + t1_hit + t2_hit + b2_hit; - } - - errcxtold = error_context_stack; - error_context_stack = NULL; - elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d", - T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH); - elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%", - all_hit, b1_hit, t1_hit, t2_hit, b2_hit); - elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d", - t1_clean, t2_clean); - error_context_stack = errcxtold; - - StrategyControl->num_lookup = 0; - StrategyControl->num_hit[STRAT_LIST_B1] = 0; - StrategyControl->num_hit[STRAT_LIST_T1] = 0; - StrategyControl->num_hit[STRAT_LIST_T2] = 0; - StrategyControl->num_hit[STRAT_LIST_B2] = 0; - StrategyControl->stat_report = now; - } - } + StrategyStatsDump(); /* * Count lookups @@ -216,72 +214,34 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) /* * Lookup the block in the shared hash table */ - strategy_cdb_found = BufTableLookup(tagPtr); + *cdb_found_index = BufTableLookup(tagPtr); /* - * Handle CDB lookup miss + * Done if complete CDB lookup miss */ - if (strategy_cdb_found < 0) - { - if (!recheck) - { - /* - * This is an initial lookup and we have a complete - * cache miss (block found nowhere). This means we - * remember according to the current T1 size and the - * target T1 size from where we take a block if we - * need one later. 
- */ - if (T1_LENGTH >= MAX(1, T1_TARGET)) - strategy_get_from = STRAT_LIST_T1; - else - strategy_get_from = STRAT_LIST_T2; - } - - /* - * Do the cost accounting for vacuum - */ - if (VacuumCostActive) - VacuumCostBalance += VacuumCostPageMiss; - - /* report cache miss */ + if (*cdb_found_index < 0) return NULL; - } /* * We found a CDB */ - cdb = &StrategyCDB[strategy_cdb_found]; + cdb = &StrategyCDB[*cdb_found_index]; /* * Count hits */ StrategyControl->num_hit[cdb->list]++; - if (VacuumCostActive) - VacuumCostBalance += VacuumCostPageHit; /* * If this is a T2 hit, we simply move the CDB to the * T2 MRU position and return the found buffer. + * + * A CDB in T2 cannot have t1_vacuum set, so we needn't check. However, + * if the current process is VACUUM then it doesn't promote to MRU. */ if (cdb->list == STRAT_LIST_T2) { - STRAT_LIST_REMOVE(cdb); - STRAT_MRU_INSERT(cdb, STRAT_LIST_T2); - - return &BufferDescriptors[cdb->buf_id]; - } - - /* - * If this is a T1 hit, we move the buffer to the T2 MRU - * only if another transaction had read it into T1. This is - * required because any UPDATE or DELETE in PostgreSQL does - * multiple ReadBuffer(), first during the scan, later during - * the heap_update() or heap_delete(). - */ - if (cdb->list == STRAT_LIST_T1) - { - if (!TransactionIdIsCurrentTransactionId(cdb->t1_xid)) + if (!strategy_hint_vacuum) { STRAT_LIST_REMOVE(cdb); STRAT_MRU_INSERT(cdb, STRAT_LIST_T2); @@ -291,19 +251,59 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) } /* - * In the case of a recheck we don't care about B1 or B2 hits here. - * The bufmgr does this call only to make sure noone faulted in the - * block while we where busy flushing another. Now for this really - * to end up as a B1 or B2 cache hit, we must have been flushing for - * quite some time as the block not only must have been read, but - * also traveled through the queue and evicted from the T cache again - * already. + * If this is a T1 hit, we move the buffer to the T2 MRU only if another + * transaction had read it into T1, *and* neither transaction is a VACUUM. + * This is required because any UPDATE or DELETE in PostgreSQL does + * multiple ReadBuffer(), first during the scan, later during the + * heap_update() or heap_delete(). Otherwise move to T1 MRU. VACUUM + * doesn't even get to make that happen. */ - if (recheck) + if (cdb->list == STRAT_LIST_T1) { - return NULL; + if (!strategy_hint_vacuum) + { + if (!cdb->t1_vacuum && + !TransactionIdIsCurrentTransactionId(cdb->t1_xid)) + { + STRAT_LIST_REMOVE(cdb); + STRAT_MRU_INSERT(cdb, STRAT_LIST_T2); + } + else + { + STRAT_LIST_REMOVE(cdb); + STRAT_MRU_INSERT(cdb, STRAT_LIST_T1); + /* + * If a non-VACUUM process references a page recently loaded + * by VACUUM, clear the stigma; the state will now be the + * same as if this process loaded it originally. + */ + if (cdb->t1_vacuum) + { + cdb->t1_xid = GetCurrentTransactionId(); + cdb->t1_vacuum = false; + } + } + } + + return &BufferDescriptors[cdb->buf_id]; } + /* + * In the case of a recheck we don't care about B1 or B2 hits here. + * The bufmgr does this call only to make sure no-one faulted in the + * block while we where busy flushing another; we don't want to doubly + * adjust the T1target. + * + * Now for this really to end up as a B1 or B2 cache hit, we must have + * been flushing for quite some time as the block not only must have been + * read, but also traveled through the queue and evicted from the T cache + * again already. + * + * VACUUM re-reads shouldn't adjust the target either. 
+ */ + if (recheck || strategy_hint_vacuum) + return NULL; + /* * Adjust the target size of the T1 cache depending on if this is * a B1 or B2 hit. @@ -316,8 +316,8 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) * small. Adjust the T1 target size and continue * below. */ - T1_TARGET = MIN(T1_TARGET + MAX(B2_LENGTH / B1_LENGTH, 1), - Data_Descriptors); + T1_TARGET = Min(T1_TARGET + Max(B2_LENGTH / B1_LENGTH, 1), + NBuffers); break; case STRAT_LIST_B2: @@ -325,26 +325,17 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) * B2 hit means that the T2 cache is probably too * small. Adjust the T1 target size and continue * below. - */ - T1_TARGET = MAX(T1_TARGET - MAX(B1_LENGTH / B2_LENGTH, 1), 0); + */ + T1_TARGET = Max(T1_TARGET - Max(B1_LENGTH / B2_LENGTH, 1), 0); break; default: - elog(ERROR, "Buffer hash table corrupted - CDB on list %d found", - cdb->list); + elog(ERROR, "buffer hash table corrupted: CDB->list = %d", + cdb->list); } /* - * Decide where to take from if we will be out of - * free blocks later in StrategyGetBuffer(). - */ - if (T1_LENGTH >= MAX(1, T1_TARGET)) - strategy_get_from = STRAT_LIST_T1; - else - strategy_get_from = STRAT_LIST_T2; - - /* - * Even if we had seen the block in the past, it's data is + * Even though we had seen the block in the past, its data is * not currently in memory ... cache miss to the bufmgr. */ return NULL; @@ -357,18 +348,25 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) * Called by the bufmgr to get the next candidate buffer to use in * BufferAlloc(). The only hard requirement BufferAlloc() has is that * this buffer must not currently be pinned. + * + * *cdb_replace_index is set to the index of the candidate CDB, or -1 if + * none (meaning we are using a previously free buffer). This is not + * intended to be used by the caller, except to pass to + * StrategyReplaceBuffer(). */ BufferDesc * -StrategyGetBuffer(void) +StrategyGetBuffer(int *cdb_replace_index) { int cdb_id; BufferDesc *buf; if (StrategyControl->listFreeBuffers < 0) { - /* We don't have a free buffer, must take one from T1 or T2 */ - - if (strategy_get_from == STRAT_LIST_T1) + /* + * We don't have a free buffer, must take one from T1 or T2. + * Choose based on trying to converge T1len to T1target. + */ + if (T1_LENGTH >= Max(1, T1_TARGET)) { /* * We should take the first unpinned buffer from T1. @@ -379,7 +377,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1); return buf; } @@ -387,7 +385,7 @@ StrategyGetBuffer(void) } /* - * No unpinned T1 buffer found - pardon T2 cache. + * No unpinned T1 buffer found - try T2 cache. */ cdb_id = StrategyControl->listHead[STRAT_LIST_T2]; while (cdb_id >= 0) @@ -395,7 +393,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2); return buf; } @@ -405,7 +403,7 @@ StrategyGetBuffer(void) /* * No unpinned buffers at all!!! 
*/ - elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers"); + elog(ERROR, "no unpinned buffers available"); } else { @@ -418,7 +416,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2); return buf; } @@ -426,7 +424,7 @@ StrategyGetBuffer(void) } /* - * No unpinned T2 buffer found - pardon T1 cache. + * No unpinned T2 buffer found - try T1 cache. */ cdb_id = StrategyControl->listHead[STRAT_LIST_T1]; while (cdb_id >= 0) @@ -434,7 +432,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1); return buf; } @@ -444,7 +442,7 @@ StrategyGetBuffer(void) /* * No unpinned buffers at all!!! */ - elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers"); + elog(ERROR, "no unpinned buffers available"); } } else @@ -459,13 +457,13 @@ StrategyGetBuffer(void) * that there will never be any reason to recheck. Otherwise * we would leak shared buffers here! */ - strategy_cdb_replace = -1; + *cdb_replace_index = -1; buf = &BufferDescriptors[StrategyControl->listFreeBuffers]; StrategyControl->listFreeBuffers = buf->bufNext; buf->bufNext = -1; - /* Buffer of freelist cannot be pinned */ + /* Buffer in freelist cannot be pinned */ Assert(buf->refcount == 0); Assert(!(buf->flags & BM_DIRTY)); @@ -480,54 +478,59 @@ StrategyGetBuffer(void) /* * StrategyReplaceBuffer * - * Called by the buffer manager to inform us that he possibly flushed - * a buffer and is now about to replace the content. Prior to this call, + * Called by the buffer manager to inform us that he flushed a buffer + * and is now about to replace the content. Prior to this call, * the cache algorithm still reports the buffer as in the cache. After * this call we report the new block, even if IO might still need to - * start. + * be done to bring in the new content. + * + * cdb_found_index and cdb_replace_index must be the auxiliary values + * returned by previous calls to StrategyBufferLookup and StrategyGetBuffer. 
@@ -480,54 +478,59 @@ StrategyGetBuffer(void)
 
 /*
  * StrategyReplaceBuffer
  *
- * Called by the buffer manager to inform us that he possibly flushed
- * a buffer and is now about to replace the content. Prior to this call,
+ * Called by the buffer manager to inform us that it flushed a buffer
+ * and is now about to replace the content. Prior to this call,
  * the cache algorithm still reports the buffer as in the cache. After
  * this call we report the new block, even if IO might still need to
- * start.
+ * be done to bring in the new content.
+ *
+ * cdb_found_index and cdb_replace_index must be the auxiliary values
+ * returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.
  */
 void
-StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
+StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+                      int cdb_found_index, int cdb_replace_index)
 {
     BufferStrategyCDB *cdb_found;
     BufferStrategyCDB *cdb_replace;
 
-    if (strategy_cdb_found >= 0)
+    if (cdb_found_index >= 0)
     {
-        /* This was a ghost buffer cache hit (B1 or B2) */
-        cdb_found = &StrategyCDB[strategy_cdb_found];
+        /* This must have been a ghost buffer cache hit (B1 or B2) */
+        cdb_found = &StrategyCDB[cdb_found_index];
 
         /* Assert that the buffer remembered in cdb_found is the one */
         /* the buffer manager is currently faulting in */
-        Assert(BUFFERTAG_EQUALS(&(cdb_found->buf_tag), rnode, blockNum));
+        Assert(BUFFERTAGS_EQUAL(&(cdb_found->buf_tag), newTag));
 
-        if (strategy_cdb_replace >= 0)
+        if (cdb_replace_index >= 0)
         {
             /* We are satisfying it with an evicted T buffer */
-            cdb_replace = &StrategyCDB[strategy_cdb_replace];
+            cdb_replace = &StrategyCDB[cdb_replace_index];
 
             /* Assert that the buffer remembered in cdb_replace is */
             /* the one the buffer manager has just evicted */
             Assert(cdb_replace->list == STRAT_LIST_T1 ||
-                    cdb_replace->list == STRAT_LIST_T2);
+                   cdb_replace->list == STRAT_LIST_T2);
             Assert(cdb_replace->buf_id == buf->buf_id);
             Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
 
-            /* If this was a T1 buffer faulted in by vacuum, just */
-            /* do not cause the CDB end up in the B1 list, so that */
-            /* the vacuum scan does not affect T1_target adjusting */
-            if (strategy_hint_vacuum)
+            /*
+             * Under normal circumstances we move the evicted T list entry to
+             * the corresponding B list. However, T1 entries that exist only
+             * because of VACUUM are just thrown into the unused list instead.
+             * We don't expect them to be touched again by the VACUUM, and if
+             * we put them into B1 then VACUUM would skew T1_target adjusting.
+             */
+            if (cdb_replace->t1_vacuum)
             {
                 BufTableDelete(&(cdb_replace->buf_tag));
                 STRAT_LIST_REMOVE(cdb_replace);
-                cdb_replace->buf_id = -1;
                 cdb_replace->next = StrategyControl->listUnusedCDB;
-                StrategyControl->listUnusedCDB = strategy_cdb_replace;
+                StrategyControl->listUnusedCDB = cdb_replace_index;
             }
             else
             {
-                /* Under normal circumstances move the evicted */
-                /* T list entry to it's corresponding B list */
                 if (cdb_replace->list == STRAT_LIST_T1)
                 {
                     STRAT_LIST_REMOVE(cdb_replace);
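The same demotion logic runs a second time in the cache-miss branch below, so
it helps to read it in isolation. The helper here is not part of the patch; it
merely restates, under invented names, what happens to the evicted CDB in both
branches:

    static void
    demote_evicted_cdb(BufferStrategyCDB *cdb, int cdb_index)
    {
        if (cdb->t1_vacuum)
        {
            /* page cached only for VACUUM: forget it rather than ghost it */
            BufTableDelete(&(cdb->buf_tag));
            STRAT_LIST_REMOVE(cdb);
            cdb->next = StrategyControl->listUnusedCDB;
            StrategyControl->listUnusedCDB = cdb_index;
        }
        else if (cdb->list == STRAT_LIST_T1)
        {
            /* evicted from T1: keep it as a recency ghost in B1 */
            STRAT_LIST_REMOVE(cdb);
            STRAT_MRU_INSERT(cdb, STRAT_LIST_B1);
        }
        else
        {
            /* evicted from T2: keep it as a frequency ghost in B2 */
            STRAT_LIST_REMOVE(cdb);
            STRAT_MRU_INSERT(cdb, STRAT_LIST_B2);
        }
        /* in every case the CDB stops owning a data buffer */
        cdb->buf_id = -1;
    }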
@@ -539,25 +542,26 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
                     STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
                 }
             }
-            /* And clear it's block reference */
+            /* And clear its block reference */
             cdb_replace->buf_id = -1;
         }
         else
         {
-            /* or we satisfy it with an unused buffer */
+            /* We are satisfying it with an unused buffer */
         }
 
-        /* Now the found B CDB get's the buffer and is moved to T2 */
+        /* Now the found B CDB gets the buffer and is moved to T2 */
         cdb_found->buf_id = buf->buf_id;
         STRAT_LIST_REMOVE(cdb_found);
         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
     }
     else
     {
-        /* This was a complete cache miss, so we need to create */
-        /* a new CDB. The goal is to keep T1len+B1len <= c */
-
-        if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= Data_Descriptors)
+        /*
+         * This was a complete cache miss, so we need to create
+         * a new CDB. The goal is to keep T1len+B1len <= c.
+         */
+        if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= NBuffers)
         {
             /* So if B1 isn't empty and T1len+B1len >= c we take B1-LRU */
             cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
@@ -587,18 +591,20 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
         }
     }
 
-    /* Set the CDB's buf_tag and insert the hash key */
-    INIT_BUFFERTAG(&(cdb_found->buf_tag), rnode, blockNum);
+    /* Set the CDB's buf_tag and insert it into the hash table */
+    cdb_found->buf_tag = *newTag;
     BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));
 
-    if (strategy_cdb_replace >= 0)
+    if (cdb_replace_index >= 0)
     {
-        /* The buffer was formerly in a T list, move it's CDB
-         * to the corresponding B list */
-        cdb_replace = &StrategyCDB[strategy_cdb_replace];
+        /*
+         * The buffer was formerly in a T list, move its CDB
+         * to the corresponding B list
+         */
+        cdb_replace = &StrategyCDB[cdb_replace_index];
 
         Assert(cdb_replace->list == STRAT_LIST_T1 ||
-                cdb_replace->list == STRAT_LIST_T2);
+               cdb_replace->list == STRAT_LIST_T2);
         Assert(cdb_replace->buf_id == buf->buf_id);
         Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
@@ -612,32 +618,32 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
             STRAT_LIST_REMOVE(cdb_replace);
             STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
         }
-        /* And clear it's block reference */
+        /* And clear its block reference */
         cdb_replace->buf_id = -1;
     }
     else
     {
-        /* or we satisfy it with an unused buffer */
+        /* We are satisfying it with an unused buffer */
     }
 
     /* Assign the buffer id to the new CDB */
     cdb_found->buf_id = buf->buf_id;
 
     /*
-     * Specialized VACUUM optimization. If this "complete cache miss"
-     * happened because vacuum needed the page, we want it later on
-     * to be placed at the LRU instead of the MRU position of T1.
+     * Specialized VACUUM optimization. If this complete cache miss
+     * happened because vacuum needed the page, we place it at the LRU
+     * position of T1; normally it goes at the MRU position.
      */
     if (strategy_hint_vacuum)
     {
-        if (strategy_vacuum_xid != GetCurrentTransactionId())
+        if (TransactionIdIsCurrentTransactionId(strategy_vacuum_xid))
+            STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
+        else
         {
+            /* VACUUM must have been aborted by error, reset flag */
             strategy_hint_vacuum = false;
             STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
         }
-        else
-            STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
-    }
     else
         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
@@ -645,8 +651,10 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
     /*
      * Remember the Xid when this buffer went onto T1 to avoid
      * a single UPDATE promoting a newcomer straight into T2.
+     * Also remember if it was loaded for VACUUM.
     */
     cdb_found->t1_xid = GetCurrentTransactionId();
+    cdb_found->t1_vacuum = strategy_hint_vacuum;
     }
 }
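StrategyReplaceBuffer() can trust strategy_hint_vacuum only because of the
transaction-id cross-check above: a VACUUM that dies with an error never gets
to reset the hint, but its xid stops being the current one. From VACUUM's side
the intended usage is simply to bracket the scan; the function below is a
hypothetical illustration, not a call site touched by this patch:

    static void
    vacuum_relation_sketch(Relation onerel)
    {
        StrategyHintVacuum(true);   /* lookups now count as vacuum-driven */

        /* ... scan onerel block by block through the buffer manager ... */

        StrategyHintVacuum(false);  /* restore the normal policy */
    }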
@@ -673,8 +681,7 @@ StrategyInvalidateBuffer(BufferDesc *buf)
      */
     cdb_id = BufTableLookup(&(buf->tag));
     if (cdb_id < 0)
-        elog(ERROR, "StrategyInvalidateBuffer() buffer %d not in directory",
-             buf->buf_id);
+        elog(ERROR, "buffer %d not in buffer hash table", buf->buf_id);
     cdb = &StrategyCDB[cdb_id];
 
     /*
@@ -694,7 +701,7 @@ StrategyInvalidateBuffer(BufferDesc *buf)
     StrategyControl->listUnusedCDB = cdb_id;
 
     /*
-     * Clear out the buffers tag and add it to the list of
+     * Clear out the buffer's tag and add it to the list of
      * currently unused buffers.
      */
     CLEAR_BUFFERTAG(&(buf->tag));
@@ -702,7 +709,9 @@ StrategyInvalidateBuffer(BufferDesc *buf)
     StrategyControl->listFreeBuffers = buf->buf_id;
 }
 
-
+/*
+ * StrategyHintVacuum -- tell us whether VACUUM is active
+ */
 void
 StrategyHintVacuum(bool vacuum_active)
 {
@@ -710,9 +719,24 @@ StrategyHintVacuum(bool vacuum_active)
     strategy_vacuum_xid = GetCurrentTransactionId();
 }
 
-
+/*
+ * StrategyDirtyBufferList
+ *
+ * Returns a list of dirty buffers, in priority order for writing.
+ * Note that the caller may choose not to write them all.
+ *
+ * The caller must beware of the possibility that a buffer is no longer dirty,
+ * or even contains a different page, by the time it is reached. If it no
+ * longer contains the same page it need not be written, even if it is (again)
+ * dirty.
+ *
+ * Buffer pointers are stored into buffers[], and corresponding tags into
+ * buftags[], both of size max_buffers. The function returns the number of
+ * buffer IDs stored.
+ */
 int
-StrategyDirtyBufferList(int *buffer_list, int max_buffers)
+StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
+                        int max_buffers)
 {
     int         num_buffer_dirty = 0;
     int         cdb_id_t1;
@@ -724,13 +748,13 @@ StrategyDirtyBufferList(int *buffer_list, int max_buffers)
      * Traverse the T1 and T2 list LRU to MRU in "parallel"
      * and add all dirty buffers found in that order to the list.
      * The ARC strategy keeps all used buffers including pinned ones
-     * in the T1 or T2 list. So we cannot loose any dirty buffers.
+     * in the T1 or T2 list. So we cannot miss any dirty buffers.
      */
     cdb_id_t1 = StrategyControl->listHead[STRAT_LIST_T1];
     cdb_id_t2 = StrategyControl->listHead[STRAT_LIST_T2];
 
     while ((cdb_id_t1 >= 0 || cdb_id_t2 >= 0) &&
-            num_buffer_dirty < max_buffers)
+           num_buffer_dirty < max_buffers)
     {
         if (cdb_id_t1 >= 0)
         {
@@ -741,7 +765,9 @@ StrategyDirtyBufferList(int *buffer_list, int max_buffers)
             {
                 if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
                 {
-                    buffer_list[num_buffer_dirty++] = buf_id;
+                    buffers[num_buffer_dirty] = buf;
+                    buftags[num_buffer_dirty] = buf->tag;
+                    num_buffer_dirty++;
                 }
             }
 
@@ -757,7 +783,9 @@ StrategyDirtyBufferList(int *buffer_list, int max_buffers)
             {
                 if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
                 {
-                    buffer_list[num_buffer_dirty++] = buf_id;
+                    buffers[num_buffer_dirty] = buf;
+                    buftags[num_buffer_dirty] = buf->tag;
+                    num_buffer_dirty++;
                 }
             }
 
@@ -785,16 +813,16 @@ StrategyInitialize(bool init)
     /*
      * Initialize the shared CDB lookup hashtable
      */
-    InitBufTable(Data_Descriptors * 2);
+    InitBufTable(NBuffers * 2);
 
     /*
      * Get or create the shared strategy control block and the CDB's
     */
     StrategyControl = (BufferStrategyControl *)
-        ShmemInitStruct("Buffer Strategy Status",
-                        sizeof(BufferStrategyControl) +
-                        sizeof(BufferStrategyCDB) * (Data_Descriptors * 2 - 1),
-                        &found);
+        ShmemInitStruct("Buffer Strategy Status",
+                        sizeof(BufferStrategyControl) +
+                        sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1),
+                        &found);
     StrategyCDB = &(StrategyControl->cdb[0]);
 
     if (!found)
@@ -805,8 +833,8 @@ StrategyInitialize(bool init)
         Assert(init);
 
         /*
-         * Grab the whole linked list of free buffers for our
-         * strategy
+         * Grab the whole linked list of free buffers for our strategy.
+         * We assume it was previously set up by InitBufferPool().
          */
         StrategyControl->listFreeBuffers = 0;
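The comment on StrategyDirtyBufferList() above puts the revalidation burden on
the caller, which is exactly why the tags are now returned alongside the
buffer pointers. A sketch of a conforming consumer, with invented names
(flush_some_dirty_buffers, write_buffer) and an arbitrary batch size:

    static void write_buffer(BufferDesc *buf);  /* stand-in for the real flush */

    #define SKETCH_MAX_DIRTY 128                /* arbitrary batch size */

    static void
    flush_some_dirty_buffers(void)
    {
        BufferDesc *bufs[SKETCH_MAX_DIRTY];
        BufferTag   tags[SKETCH_MAX_DIRTY];
        int         n;
        int         i;

        n = StrategyDirtyBufferList(bufs, tags, SKETCH_MAX_DIRTY);
        for (i = 0; i < n; i++)
        {
            BufferDesc *buf = bufs[i];

            /* the buffer may hold a different page by now: skip it */
            if (!BUFFERTAGS_EQUAL(&(buf->tag), &tags[i]))
                continue;
            /* it may also have been cleaned already: skip it too */
            if (!(buf->flags & BM_DIRTY) && !buf->cntxDirty)
                continue;
            write_buffer(buf);
        }
    }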
@@ -814,7 +842,7 @@ StrategyInitialize(bool init)
          * We start off with a target T1 list size of
          * half the available cache blocks.
          */
-        StrategyControl->target_T1_size = Data_Descriptors / 2;
+        StrategyControl->target_T1_size = NBuffers / 2;
 
         /*
          * Initialize B1, T1, T2 and B2 lists to be empty
@@ -832,14 +860,14 @@ StrategyInitialize(bool init)
         /*
          * All CDB's are linked as the listUnusedCDB
          */
-        for (i = 0; i < Data_Descriptors * 2; i++)
+        for (i = 0; i < NBuffers * 2; i++)
         {
             StrategyCDB[i].next = i + 1;
             StrategyCDB[i].list = STRAT_LIST_UNUSED;
             CLEAR_BUFFERTAG(&(StrategyCDB[i].buf_tag));
             StrategyCDB[i].buf_id = -1;
         }
-        StrategyCDB[Data_Descriptors * 2 - 1].next = -1;
+        StrategyCDB[NBuffers * 2 - 1].next = -1;
         StrategyControl->listUnusedCDB = 0;
     }
     else
@@ -847,91 +875,3 @@ StrategyInitialize(bool init)
         Assert(!init);
     }
 }
-
-
-#undef PinBuffer
-
-/*
- * PinBuffer -- make buffer unavailable for replacement.
- *
- * This should be applied only to shared buffers, never local ones.
- * Bufmgr lock must be held by caller.
- */
-void
-PinBuffer(BufferDesc *buf)
-{
-    int         b = BufferDescriptorGetBuffer(buf) - 1;
-
-    if (PrivateRefCount[b] == 0)
-        buf->refcount++;
-    PrivateRefCount[b]++;
-    Assert(PrivateRefCount[b] > 0);
-}
-
-#ifdef NOT_USED
-void
-PinBuffer_Debug(char *file, int line, BufferDesc *buf)
-{
-    PinBuffer(buf);
-    if (ShowPinTrace)
-    {
-        Buffer      buffer = BufferDescriptorGetBuffer(buf);
-
-        fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
-refcount = %ld, file: %s, line: %d\n",
-                buffer, buf->blind.relname, buf->tag.blockNum,
-                PrivateRefCount[buffer - 1], file, line);
-    }
-}
-#endif
-
-#undef UnpinBuffer
-
-/*
- * UnpinBuffer -- make buffer available for replacement.
- *
- * This should be applied only to shared buffers, never local ones.
- * Bufmgr lock must be held by caller.
- */
-void
-UnpinBuffer(BufferDesc *buf)
-{
-    int         b = BufferDescriptorGetBuffer(buf) - 1;
-
-    Assert(buf->refcount > 0);
-    Assert(PrivateRefCount[b] > 0);
-    PrivateRefCount[b]--;
-    if (PrivateRefCount[b] == 0)
-        buf->refcount--;
-
-    if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
-        buf->refcount == 1)
-    {
-        /* we just released the last pin other than the waiter's */
-        buf->flags &= ~BM_PIN_COUNT_WAITER;
-        ProcSendSignal(buf->wait_backend_id);
-    }
-    else
-    {
-        /* do nothing */
-    }
-}
-
-#ifdef NOT_USED
-void
-UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
-{
-    UnpinBuffer(buf);
-    if (ShowPinTrace)
-    {
-        Buffer      buffer = BufferDescriptorGetBuffer(buf);
-
-        fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
-refcount = %ld, file: %s, line: %d\n",
-                buffer, buf->blind.relname, buf->tag.blockNum,
-                PrivateRefCount[buffer - 1], file, line);
-    }
-}
-#endif
-
-
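StrategyInitialize() above allocates one control struct whose trailing cdb[]
array is sized for 2*NBuffers entries, plus a lookup hash table of the same
capacity. Restated as plain arithmetic; strategy_shmem_size() is an invented
name, and the real estimate is presumably folded into BufferShmemSize(), which
the ipci.c hunk below consults:

    static Size
    strategy_shmem_size(void)
    {
        Size        size;

        /* the control struct already embeds one CDB, hence the "- 1" */
        size = sizeof(BufferStrategyControl);
        size += sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1);

        /* the CDB lookup hash table holds 2 * NBuffers entries */
        size += hash_estimate_size(NBuffers * 2, sizeof(BufferLookupEnt));

        return size;
    }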
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ac738d8f77..3e8c2a6c1b 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.65 2004/02/25 19:41:22 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.66 2004/04/19 23:27:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,7 +60,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
      * moderately-accurate estimates for the big hogs, plus 100K for the
      * stuff that's too small to bother with estimating.
      */
-    size = BufferShmemSize();
+    size = hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
+    size += BufferShmemSize();
     size += LockShmemSize(maxBackends);
     size += XLOGShmemSize();
     size += CLOGShmemSize();
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index f401791d93..fc83396ff0 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.68 2004/02/12 15:06:56 wieck Exp $
+ * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.69 2004/04/19 23:27:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,15 +21,6 @@
 #include "storage/lwlock.h"
 
-/* Buf Mgr constants */
-/* in bufmgr.c */
-extern int  Data_Descriptors;
-extern int  Free_List_Descriptor;
-extern int  Lookup_List_Descriptor;
-extern int  Num_Descriptors;
-
-extern int  ShowPinTrace;
-
 /*
  * Flags for buffer descriptors
  */
@@ -51,10 +42,13 @@ typedef bits16 BufFlags;
  * that the backend flushing the buffer doesn't even believe the relation is
  * visible yet (its xact may have started before the xact that created the
  * rel). The storage manager must be able to cope anyway.
+ *
+ * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
+ * to be fixed to zero them, since this struct is used as a hash key.
  */
 typedef struct buftag
 {
-    RelFileNode rnode;
+    RelFileNode rnode;          /* physical relation identifier */
     BlockNumber blockNum;       /* blknum relative to begin of reln */
 } BufferTag;
 
@@ -71,12 +65,6 @@ typedef struct buftag
     (a)->rnode = (xx_reln)->rd_node \
 )
 
-#define BUFFERTAG_EQUALS(a,xx_reln,xx_blockNum) \
-( \
-    (a)->rnode.tblNode == (xx_reln)->rd_node.tblNode && \
-    (a)->rnode.relNode == (xx_reln)->rd_node.relNode && \
-    (a)->blockNum == (xx_blockNum) \
-)
 #define BUFFERTAGS_EQUAL(a,b) \
 ( \
     (a)->rnode.tblNode == (b)->rnode.tblNode && \
@@ -93,7 +81,7 @@ typedef struct sbufdesc
     Buffer      bufNext;        /* link in freelist chain */
     SHMEM_OFFSET data;          /* pointer to data in buf pool */
 
-    /* tag and id must be together for table lookup */
+    /* tag and id must be together for table lookup (still true?) */
     BufferTag   tag;            /* file/block identifier */
     int         buf_id;         /* buffer's index number (from 0) */
 
@@ -108,7 +96,7 @@ typedef struct sbufdesc
     /*
      * We can't physically remove items from a disk page if another
      * backend has the buffer pinned. Hence, a backend may need to wait
-     * for all other pins to go away. This is signaled by setting its own
+     * for all other pins to go away. This is signaled by storing its own
      * backend ID into wait_backend_id and setting flag bit
      * BM_PIN_COUNT_WAITER. At present, there can be only one such waiter
      * per buffer.
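The pad-byte note added to BufferTag above matters because the tag is hashed
and compared as raw bytes. If padding ever appears, a memset-based initializer
is the usual fix; the macro below only illustrates that note and is not a
macro from the tree:

    #define INIT_BUFFERTAG_SAFE(a, xx_reln, xx_blockNum) \
    do { \
        memset((a), 0, sizeof(BufferTag));  /* zero any pad bytes */ \
        (a)->rnode = (xx_reln)->rd_node; \
        (a)->blockNum = (xx_blockNum); \
    } while (0)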
@@ -128,17 +116,17 @@ typedef struct sbufdesc
 #define BL_IO_IN_PROGRESS   (1 << 0)    /* unimplemented */
 #define BL_PIN_COUNT_LOCK   (1 << 1)
 
-/* entry for buffer hashtable */
+/* entry for buffer lookup hashtable */
 typedef struct
 {
-    BufferTag   key;
-    Buffer      id;
+    BufferTag   key;            /* Tag of a disk page */
+    int         id;             /* CDB id of associated CDB */
 } BufferLookupEnt;
 
 /*
  * Definitions for the buffer replacement strategy
  */
-#define STRAT_LIST_UNUSED   -1
+#define STRAT_LIST_UNUSED   (-1)
 #define STRAT_LIST_B1       0
 #define STRAT_LIST_T1       1
 #define STRAT_LIST_T2       2
@@ -150,12 +138,13 @@ typedef struct
  */
 typedef struct
 {
-    int         prev;           /* links in the queue */
+    int         prev;           /* list links */
     int         next;
-    int         list;           /* current list */
-    BufferTag   buf_tag;        /* buffer key */
-    Buffer      buf_id;         /* currently assigned data buffer */
+    short       list;           /* ID of list it is currently in */
+    bool        t1_vacuum;      /* t => present only because of VACUUM */
     TransactionId t1_xid;       /* the xid this entry went onto T1 */
+    BufferTag   buf_tag;        /* page identifier */
+    int         buf_id;         /* currently assigned data buffer, or -1 */
 } BufferStrategyCDB;
 
 /*
@@ -163,7 +152,6 @@ typedef struct
  */
 typedef struct
 {
-    int         target_T1_size; /* What T1 size are we aiming for */
     int         listUnusedCDB;  /* All unused StrategyCDB */
     int         listHead[STRAT_NUM_LISTS]; /* ARC lists B1, T1, T2 and B2 */
@@ -175,8 +163,10 @@ typedef struct
     long        num_hit[STRAT_NUM_LISTS];
     time_t      stat_report;
 
-    BufferStrategyCDB cdb[1];   /* The cache directory */
+    /* Array of CDB's starts here */
+    BufferStrategyCDB cdb[1];   /* VARIABLE SIZE ARRAY */
 } BufferStrategyControl;
+
 
 /* counters in buf_init.c */
 extern long int ReadBufferCount;
@@ -191,24 +181,25 @@ extern long int LocalBufferFlushCount;
  * Bufmgr Interface:
  */
 
-/* Internal routines: only called by buf.c */
+/* Internal routines: only called by bufmgr */
 
-/*freelist.c*/
-extern void PinBuffer(BufferDesc *buf);
-extern void UnpinBuffer(BufferDesc *buf);
-extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck);
-extern BufferDesc *StrategyGetBuffer(void);
-extern void StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum);
+/* freelist.c */
+extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
+                                        int *cdb_found_index);
+extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index);
+extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+                                  int cdb_found_index, int cdb_replace_index);
 extern void StrategyInvalidateBuffer(BufferDesc *buf);
 extern void StrategyHintVacuum(bool vacuum_active);
-extern int  StrategyDirtyBufferList(int *buffer_dirty, int max_buffers);
+extern int  StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
+                                    int max_buffers);
 extern void StrategyInitialize(bool init);
 
 /* buf_table.c */
 extern void InitBufTable(int size);
 extern int  BufTableLookup(BufferTag *tagPtr);
-extern bool BufTableInsert(BufferTag *tagPtr, Buffer buf_id);
-extern bool BufTableDelete(BufferTag *tagPtr);
+extern void BufTableInsert(BufferTag *tagPtr, int cdb_id);
+extern void BufTableDelete(BufferTag *tagPtr);
 
 /* bufmgr.c */
 extern BufferDesc *BufferDescriptors;
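With the revised declarations above, buf_table.c maps buffer tags to CDB
indexes rather than buffer IDs, and BufTableInsert/BufTableDelete no longer
return a success flag. An illustrative round trip, assuming it runs inside
freelist.c with the bufmgr lock held and cdb pointing at some StrategyCDB[]
entry:

    {
        int     cdb_id = (int) (cdb - StrategyCDB);
        int     found;

        BufTableInsert(&(cdb->buf_tag), cdb_id);    /* tag -> CDB index */

        found = BufTableLookup(&(cdb->buf_tag));
        Assert(found == cdb_id);

        BufTableDelete(&(cdb->buf_tag));            /* forget the mapping */
    }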