Replace the BufMgrLock with separate locks on the lookup hashtable and
the freelist, plus per-buffer spinlocks that protect access to individual shared buffer headers. This requires abandoning a global freelist (since the freelist is a global contention point), which shoots down ARC and 2Q as well as plain LRU management. Adopt a clock sweep algorithm instead. Preliminary results show substantial improvement in multi-backend situations.
This commit is contained in:
parent
5592a6cf46
commit
5d5087363d
@@ -1,5 +1,5 @@
|
||||
<!--
|
||||
$PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.306 2005/03/02 19:58:54 tgl Exp $
|
||||
$PostgreSQL: pgsql/doc/src/sgml/runtime.sgml,v 1.307 2005/03/04 20:21:05 tgl Exp $
|
||||
-->
|
||||
|
||||
<chapter Id="runtime">
|
||||
@@ -1379,9 +1379,7 @@ SET ENABLE_SEQSCAN TO OFF;
|
||||
Specifies the delay between activity rounds for the
|
||||
background writer. In each round the writer issues writes
|
||||
for some number of dirty buffers (controllable by the
|
||||
following parameters). The selected buffers will always be
|
||||
the least recently used ones among the currently dirty
|
||||
buffers. It then sleeps for <varname>bgwriter_delay</>
|
||||
following parameters). It then sleeps for <varname>bgwriter_delay</>
|
||||
milliseconds, and repeats. The default value is 200. Note
|
||||
that on many systems, the effective resolution of sleep
|
||||
delays is 10 milliseconds; setting <varname>bgwriter_delay</>
|
||||
@@ -1393,32 +1391,77 @@ SET ENABLE_SEQSCAN TO OFF;
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="guc-bgwriter-percent" xreflabel="bgwriter_percent">
|
||||
<term><varname>bgwriter_percent</varname> (<type>integer</type>)</term>
|
||||
<varlistentry id="guc-bgwriter-lru-percent" xreflabel="bgwriter_lru_percent">
|
||||
<term><varname>bgwriter_lru_percent</varname> (<type>floating point</type>)</term>
|
||||
<indexterm>
|
||||
<primary><varname>bgwriter_percent</> configuration parameter</primary>
|
||||
<primary><varname>bgwriter_lru_percent</> configuration parameter</primary>
|
||||
</indexterm>
|
||||
<listitem>
|
||||
<para>
|
||||
In each round, no more than this percentage of the currently
|
||||
dirty buffers will be written (rounding up any fraction to
|
||||
the next whole number of buffers). The default value is
|
||||
1. This option can only be set at server start or in the
|
||||
To reduce the probability that server processes will need to issue
|
||||
their own writes, the background writer tries to write buffers that
|
||||
are likely to be recycled soon. In each round, it examines up to
|
||||
<varname>bgwriter_lru_percent</> of the buffers that are nearest to
|
||||
being recycled, and writes any that are dirty.
|
||||
The default value is 1.0 (this is a percentage of the total number
|
||||
of shared buffers).
|
||||
This option can only be set at server start or in the
|
||||
<filename>postgresql.conf</filename> file.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="guc-bgwriter-maxpages" xreflabel="bgwriter_maxpages">
|
||||
<term><varname>bgwriter_maxpages</varname> (<type>integer</type>)</term>
|
||||
<varlistentry id="guc-bgwriter-lru-maxpages" xreflabel="bgwriter_lru_maxpages">
|
||||
<term><varname>bgwriter_lru_maxpages</varname> (<type>integer</type>)</term>
|
||||
<indexterm>
|
||||
<primary><varname>bgwriter_maxpages</> configuration parameter</primary>
|
||||
<primary><varname>bgwriter_lru_maxpages</> configuration parameter</primary>
|
||||
</indexterm>
|
||||
<listitem>
|
||||
<para>
|
||||
In each round, no more than this many dirty buffers will be
|
||||
written. The default value is 100. This option can only be
|
||||
set at server start or in the
|
||||
In each round, no more than this many buffers will be written
|
||||
as a result of scanning soon-to-be-recycled buffers.
|
||||
The default value is 5.
|
||||
This option can only be set at server start or in the
|
||||
<filename>postgresql.conf</filename> file.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="guc-bgwriter-all-percent" xreflabel="bgwriter_all_percent">
|
||||
<term><varname>bgwriter_all_percent</varname> (<type>floating point</type>)</term>
|
||||
<indexterm>
|
||||
<primary><varname>bgwriter_all_percent</> configuration parameter</primary>
|
||||
</indexterm>
|
||||
<listitem>
|
||||
<para>
|
||||
To reduce the amount of work that will be needed at checkpoint time,
|
||||
the background writer also does a circular scan through the entire
|
||||
buffer pool, writing buffers that are found to be dirty.
|
||||
In each round, it examines up to
|
||||
<varname>bgwriter_all_percent</> of the buffers for this purpose.
|
||||
The default value is 0.333 (this is a percentage of the total number
|
||||
of shared buffers). With the default <varname>bgwriter_delay</>
|
||||
setting, this will allow the entire shared buffer pool to be scanned
|
||||
about once per minute.
|
||||
This option can only be set at server start or in the
|
||||
<filename>postgresql.conf</filename> file.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="guc-bgwriter-all-maxpages" xreflabel="bgwriter_all_maxpages">
|
||||
<term><varname>bgwriter_all_maxpages</varname> (<type>integer</type>)</term>
|
||||
<indexterm>
|
||||
<primary><varname>bgwriter_all_maxpages</> configuration parameter</primary>
|
||||
</indexterm>
|
||||
<listitem>
|
||||
<para>
|
||||
In each round, no more than this many buffers will be written
|
||||
as a result of the scan of the entire buffer pool. (If this
|
||||
limit is reached, the scan stops, and resumes at the next buffer
|
||||
during the next round.)
|
||||
The default value is 5.
|
||||
This option can only be set at server start or in the
|
||||
<filename>postgresql.conf</filename> file.
|
||||
</para>
|
||||
</listitem>
|
||||
@@ -1426,13 +1469,19 @@ SET ENABLE_SEQSCAN TO OFF;
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
Smaller values of <varname>bgwriter_percent</varname> and
|
||||
<varname>bgwriter_maxpages</varname> reduce the extra I/O load
|
||||
Smaller values of <varname>bgwriter_all_percent</varname> and
|
||||
<varname>bgwriter_all_maxpages</varname> reduce the extra I/O load
|
||||
caused by the background writer, but leave more work to be done
|
||||
at checkpoint time. To reduce load spikes at checkpoints,
|
||||
increase the values. To disable background writing entirely,
|
||||
set <varname>bgwriter_percent</varname> and/or
|
||||
<varname>bgwriter_maxpages</varname> to zero.
|
||||
increase these two values.
|
||||
Similarly, smaller values of <varname>bgwriter_lru_percent</varname> and
|
||||
<varname>bgwriter_lru_maxpages</varname> reduce the extra I/O load
|
||||
caused by the background writer, but make it more likely that server
|
||||
processes will have to issue writes for themselves, delaying interactive
|
||||
queries.
|
||||
To disable background writing entirely,
|
||||
set both <varname>maxpages</varname> values and/or both
|
||||
<varname>percent</varname> values to zero.
|
||||
</para>
|
||||
</sect3>
|
||||
|
||||
@@ -3866,20 +3915,6 @@ plruby.bar = true # generates error, unknown class name
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="guc-debug-shared-buffers" xreflabel="debug_shared_buffers">
|
||||
<term><varname>debug_shared_buffers</varname> (<type>integer</type>)</term>
|
||||
<indexterm>
|
||||
<primary><varname>debug_shared_buffers</> configuration parameter</primary>
|
||||
</indexterm>
|
||||
<listitem>
|
||||
<para>
|
||||
Number of seconds between ARC reports.
|
||||
If set greater than zero, emit ARC statistics to the log every so many
|
||||
seconds. Zero (the default) disables reporting.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="guc-pre-auth-delay" xreflabel="pre_auth_delay">
|
||||
<term><varname>pre_auth_delay</varname> (<type>integer</type>)</term>
|
||||
<indexterm>
|
||||
|
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.244 2005/01/10 20:02:19 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.245 2005/03/04 20:21:05 tgl Exp $
|
||||
*
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
@@ -1060,7 +1060,6 @@ setRelhasindex(Oid relid, bool hasindex, bool isprimary, Oid reltoastidxid)
|
||||
/* Send out shared cache inval if necessary */
|
||||
if (!IsBootstrapProcessingMode())
|
||||
CacheInvalidateHeapTuple(pg_class, tuple);
|
||||
BufferSync(-1, -1);
|
||||
}
|
||||
else if (dirty)
|
||||
{
|
||||
|
@@ -15,7 +15,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.151 2005/02/26 18:43:33 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.152 2005/03/04 20:21:05 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -339,7 +339,7 @@ createdb(const CreatedbStmt *stmt)
|
||||
* up-to-date for the copy. (We really only need to flush buffers for
|
||||
* the source database, but bufmgr.c provides no API for that.)
|
||||
*/
|
||||
BufferSync(-1, -1);
|
||||
BufferSync();
|
||||
|
||||
/*
|
||||
* Close virtual file descriptors so the kernel has more available for
|
||||
@@ -1201,7 +1201,7 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
|
||||
* up-to-date for the copy. (We really only need to flush buffers for
|
||||
* the source database, but bufmgr.c provides no API for that.)
|
||||
*/
|
||||
BufferSync(-1, -1);
|
||||
BufferSync();
|
||||
|
||||
#ifndef WIN32
|
||||
|
||||
|
@@ -13,7 +13,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.302 2005/02/26 18:43:33 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.303 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -36,7 +36,6 @@
|
||||
#include "commands/vacuum.h"
|
||||
#include "executor/executor.h"
|
||||
#include "miscadmin.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/freespace.h"
|
||||
#include "storage/sinval.h"
|
||||
#include "storage/smgr.h"
|
||||
|
@@ -37,7 +37,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.14 2005/02/19 23:16:15 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.15 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -116,9 +116,6 @@ static BgWriterShmemStruct *BgWriterShmem;
|
||||
* GUC parameters
|
||||
*/
|
||||
int BgWriterDelay = 200;
|
||||
int BgWriterPercent = 1;
|
||||
int BgWriterMaxPages = 100;
|
||||
|
||||
int CheckPointTimeout = 300;
|
||||
int CheckPointWarning = 30;
|
||||
|
||||
@@ -274,7 +271,6 @@ BackgroundWriterMain(void)
|
||||
bool force_checkpoint = false;
|
||||
time_t now;
|
||||
int elapsed_secs;
|
||||
int n;
|
||||
long udelay;
|
||||
|
||||
/*
|
||||
@@ -365,16 +361,13 @@ BackgroundWriterMain(void)
|
||||
* checkpoints happen at a predictable spacing.
|
||||
*/
|
||||
last_checkpoint_time = now;
|
||||
|
||||
/* Nap for configured time before rechecking */
|
||||
n = 1;
|
||||
}
|
||||
else
|
||||
n = BufferSync(BgWriterPercent, BgWriterMaxPages);
|
||||
BgBufferSync();
|
||||
|
||||
/*
|
||||
* Nap for the configured time or sleep for 10 seconds if there
|
||||
* was nothing to do at all.
|
||||
* Nap for the configured time, or sleep for 10 seconds if there
|
||||
* is no bgwriter activity configured.
|
||||
*
|
||||
* On some platforms, signals won't interrupt the sleep. To ensure
|
||||
* we respond reasonably promptly when someone signals us, break
|
||||
@@ -383,7 +376,11 @@ BackgroundWriterMain(void)
|
||||
*
|
||||
* We absorb pending requests after each short sleep.
|
||||
*/
|
||||
udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
|
||||
if ((bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0) ||
|
||||
(bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0))
|
||||
udelay = BgWriterDelay * 1000L;
|
||||
else
|
||||
udelay = 10000000L;
|
||||
while (udelay > 1000000L)
|
||||
{
|
||||
if (got_SIGHUP || checkpoint_requested || shutdown_requested)
|
||||
|
@@ -1,12 +1,12 @@
|
||||
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $
|
||||
$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.8 2005/03/04 20:21:06 tgl Exp $
|
||||
|
||||
Notes about shared buffer access rules
|
||||
--------------------------------------
|
||||
|
||||
There are two separate access control mechanisms for shared disk buffers:
|
||||
reference counts (a/k/a pin counts) and buffer locks. (Actually, there's
|
||||
a third level of access control: one must hold the appropriate kind of
|
||||
lock on a relation before one can legally access any page belonging to
|
||||
reference counts (a/k/a pin counts) and buffer content locks. (Actually,
|
||||
there's a third level of access control: one must hold the appropriate kind
|
||||
of lock on a relation before one can legally access any page belonging to
|
||||
the relation. Relation-level locks are not discussed here.)
|
||||
|
||||
Pins: one must "hold a pin on" a buffer (increment its reference count)
|
||||
@@ -26,7 +26,7 @@ handled by waiting to obtain the relation-level lock, which is why you'd
|
||||
better hold one first.) Pins may not be held across transaction
|
||||
boundaries, however.
|
||||
|
||||
Buffer locks: there are two kinds of buffer locks, shared and exclusive,
|
||||
Buffer content locks: there are two kinds of buffer lock, shared and exclusive,
|
||||
which act just as you'd expect: multiple backends can hold shared locks on
|
||||
the same buffer, but an exclusive lock prevents anyone else from holding
|
||||
either shared or exclusive lock. (These can alternatively be called READ
|
||||
@@ -38,12 +38,12 @@ the same buffer.  One must pin a buffer before trying to lock it.
|
||||
Buffer access rules:
|
||||
|
||||
1. To scan a page for tuples, one must hold a pin and either shared or
|
||||
exclusive lock. To examine the commit status (XIDs and status bits) of
|
||||
a tuple in a shared buffer, one must likewise hold a pin and either shared
|
||||
exclusive content lock. To examine the commit status (XIDs and status bits)
|
||||
of a tuple in a shared buffer, one must likewise hold a pin and either shared
|
||||
or exclusive lock.
|
||||
|
||||
2. Once one has determined that a tuple is interesting (visible to the
|
||||
current transaction) one may drop the buffer lock, yet continue to access
|
||||
current transaction) one may drop the content lock, yet continue to access
|
||||
the tuple's data for as long as one holds the buffer pin. This is what is
|
||||
typically done by heap scans, since the tuple returned by heap_fetch
|
||||
contains a pointer to tuple data in the shared buffer. Therefore the
|
||||
@@ -52,9 +52,9 @@ change, but that is assumed not to matter after the initial determination
|
||||
of visibility is made.
|
||||
|
||||
3. To add a tuple or change the xmin/xmax fields of an existing tuple,
|
||||
one must hold a pin and an exclusive lock on the containing buffer.
|
||||
one must hold a pin and an exclusive content lock on the containing buffer.
|
||||
This ensures that no one else might see a partially-updated state of the
|
||||
tuple.
|
||||
tuple while they are doing visibility checks.
|
||||
|
||||
4. It is considered OK to update tuple commit status bits (ie, OR the
|
||||
values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or
|
||||
@@ -76,7 +76,7 @@ no other backend can be holding a reference to an existing tuple that it
|
||||
might expect to examine again. Note that another backend might pin the
|
||||
buffer (increment the refcount) while one is performing the cleanup, but
|
||||
it won't be able to actually examine the page until it acquires shared
|
||||
or exclusive lock.
|
||||
or exclusive content lock.
|
||||
|
||||
|
||||
VACUUM FULL ignores rule #5, because it instead acquires exclusive lock at
|
||||
@@ -97,149 +97,142 @@ for VACUUM's use, since we don't allow multiple VACUUMs concurrently on a
|
||||
single relation anyway.
|
||||
|
||||
|
||||
Buffer replacement strategy interface
|
||||
-------------------------------------
|
||||
Buffer manager's internal locking
|
||||
---------------------------------
|
||||
|
||||
The file freelist.c contains the buffer cache replacement strategy.
|
||||
The interface to the strategy is:
|
||||
Before PostgreSQL 8.1, all operations of the shared buffer manager itself
|
||||
were protected by a single system-wide lock, the BufMgrLock, which
|
||||
unsurprisingly proved to be a source of contention. The new locking scheme
|
||||
avoids grabbing system-wide exclusive locks in common code paths. It works
|
||||
like this:
|
||||
|
||||
BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
|
||||
int *cdb_found_index)
|
||||
* There is a system-wide LWLock, the BufMappingLock, that notionally
|
||||
protects the mapping from buffer tags (page identifiers) to buffers.
|
||||
(Physically, it can be thought of as protecting the hash table maintained
|
||||
by buf_table.c.) To look up whether a buffer exists for a tag, it is
|
||||
sufficient to obtain share lock on the BufMappingLock. Note that one
|
||||
must pin the found buffer, if any, before releasing the BufMappingLock.
|
||||
To alter the page assignment of any buffer, one must hold exclusive lock
|
||||
on the BufMappingLock. This lock must be held across adjusting the buffer's
|
||||
header fields and changing the buf_table hash table. The only common
|
||||
operation that needs exclusive lock is reading in a page that was not
|
||||
in shared buffers already, which will require at least a kernel call
|
||||
and usually a wait for I/O, so it will be slow anyway.
|
||||
|
||||
This is always the first call made by the buffer manager to check if a disk
|
||||
page is in memory. If so, the function returns the buffer descriptor and no
|
||||
further action is required. If the page is not in memory,
|
||||
StrategyBufferLookup() returns NULL.
|
||||
* A separate system-wide LWLock, the BufFreelistLock, provides mutual
|
||||
exclusion for operations that access the buffer free list or select
|
||||
buffers for replacement. This is always taken in exclusive mode since
|
||||
there are no read-only operations on those data structures. The buffer
|
||||
management policy is designed so that BufFreelistLock need not be taken
|
||||
except in paths that will require I/O, and thus will be slow anyway.
|
||||
(Details appear below.) It is never necessary to hold the BufMappingLock
|
||||
and the BufFreelistLock at the same time.
|
||||
|
||||
The flag recheck tells the strategy that this is a second lookup after
|
||||
flushing a dirty block. If the buffer manager has to evict another buffer,
|
||||
it will release the bufmgr lock while doing the write IO. During this time,
|
||||
another backend could possibly fault in the same page this backend is after,
|
||||
so we have to check again after the IO is done if the page is in memory now.
|
||||
* Each buffer header contains a spinlock that must be taken when examining
|
||||
or changing fields of that buffer header. This allows operations such as
|
||||
ReleaseBuffer to make local state changes without taking any system-wide
|
||||
lock. We use a spinlock, not an LWLock, since there are no cases where
|
||||
the lock needs to be held for more than a few instructions.
|
||||
|
||||
*cdb_found_index is set to the index of the found CDB, or -1 if none.
|
||||
This is not intended to be used by the caller, except to pass to
|
||||
StrategyReplaceBuffer().
|
||||
Note that a buffer header's spinlock does not control access to the data
|
||||
held within the buffer. Each buffer header also contains an LWLock, the
|
||||
"buffer content lock", that *does* represent the right to access the data
|
||||
in the buffer. It is used per the rules above.
|
||||
|
||||
BufferDesc *StrategyGetBuffer(int *cdb_replace_index)
|
||||
|
||||
The buffer manager calls this function to get an unpinned cache buffer whose
|
||||
content can be evicted. The returned buffer might be empty, clean or dirty.
|
||||
|
||||
The returned buffer is only a candidate for replacement. It is possible that
|
||||
while the buffer is being written, another backend finds and modifies it, so
|
||||
that it is dirty again. The buffer manager will then have to call
|
||||
StrategyGetBuffer() again to ask for another candidate.
|
||||
|
||||
*cdb_replace_index is set to the index of the candidate CDB, or -1 if none
|
||||
(meaning we are using a previously free buffer). This is not intended to be
|
||||
used by the caller, except to pass to StrategyReplaceBuffer().
|
||||
|
||||
void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
|
||||
int cdb_found_index, int cdb_replace_index)
|
||||
|
||||
Called by the buffer manager at the time it is about to change the association
|
||||
of a buffer with a disk page.
|
||||
|
||||
Before this call, StrategyBufferLookup() still has to find the buffer under
|
||||
its old tag, even if it was returned by StrategyGetBuffer() as a candidate
|
||||
for replacement.
|
||||
|
||||
After this call, this buffer must be returned for a lookup of the new page
|
||||
identified by *newTag.
|
||||
|
||||
cdb_found_index and cdb_replace_index must be the auxiliary values
|
||||
returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.
|
||||
|
||||
void StrategyInvalidateBuffer(BufferDesc *buf)
|
||||
|
||||
Called by the buffer manager to inform the strategy that the content of this
|
||||
buffer is being thrown away. This happens for example in the case of dropping
|
||||
a relation. The buffer must be clean and unpinned on call.
|
||||
|
||||
If the buffer was associated with a disk page, StrategyBufferLookup()
|
||||
must not return it for this page after the call.
|
||||
|
||||
void StrategyHintVacuum(bool vacuum_active)
|
||||
|
||||
Because VACUUM reads all relations of the entire database through the buffer
|
||||
manager, it can greatly disturb the buffer replacement strategy. This function
|
||||
is used by VACUUM to inform the strategy that subsequent buffer lookups are
|
||||
(or are not) caused by VACUUM scanning relations.
|
||||
There is yet another set of per-buffer LWLocks, the io_in_progress locks,
|
||||
that are used to wait for I/O on a buffer to complete. The process doing
|
||||
a read or write takes exclusive lock for the duration, and processes that
|
||||
need to wait for completion try to take shared locks (which they release
|
||||
immediately upon obtaining). XXX on systems where an LWLock represents
|
||||
nontrivial resources, it's fairly annoying to need so many locks. Possibly
|
||||
we could use per-backend LWLocks instead (a buffer header would then contain
|
||||
a field to show which backend is doing its I/O).
|
||||
|
||||
|
||||
Buffer replacement strategy
|
||||
---------------------------
|
||||
|
||||
The buffer replacement strategy actually used in freelist.c is a version of
|
||||
the Adaptive Replacement Cache (ARC) specially tailored for PostgreSQL.
|
||||
There is a "free list" of buffers that are prime candidates for replacement.
|
||||
In particular, buffers that are completely free (contain no valid page) are
|
||||
always in this list. We may also throw buffers into this list if we
|
||||
consider their pages unlikely to be needed soon. The list is singly-linked
|
||||
using fields in the buffer headers; we maintain head and tail pointers in
|
||||
global variables. (Note: although the list links are in the buffer headers,
|
||||
they are considered to be protected by the BufFreelistLock, not the
|
||||
buffer-header spinlocks.) To choose a victim buffer to recycle when there
|
||||
are no free buffers available, we use a simple clock-sweep algorithm, which
|
||||
avoids the need to take system-wide locks during common operations. It
|
||||
works like this:
|
||||
|
||||
The algorithm works as follows:
|
||||
Each buffer header contains a usage counter, which is incremented (up to a
|
||||
small limit value) whenever the buffer is unpinned. (This requires only the
|
||||
buffer header spinlock, which would have to be taken anyway to decrement the
|
||||
buffer reference count, so it's nearly free.)
|
||||
|
||||
C is the size of the cache in number of pages (a/k/a shared_buffers or
|
||||
NBuffers). ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block
|
||||
is always associated with one unique file page. It may point to one shared
|
||||
buffer, or may indicate that the file page is not in a buffer but has been
|
||||
accessed recently.
|
||||
The "clock hand" is a buffer index, NextVictimBuffer, that moves circularly
|
||||
through all the available buffers. NextVictimBuffer is protected by the
|
||||
BufFreelistLock.
|
||||
|
||||
All CDB entries are managed in 4 LRU lists named T1, T2, B1 and B2. The T1 and
|
||||
T2 lists are the "real" cache entries, linking a file page to a memory buffer
|
||||
where the page is currently cached. Consequently T1len+T2len <= C. B1 and B2
|
||||
are ghost cache directories that extend T1 and T2 so that the strategy
|
||||
remembers pages longer. The strategy tries to keep B1len+T1len and B2len+T2len
|
||||
both at C. T1len and T2len vary over the runtime depending on the lookup
|
||||
pattern and its resulting cache hits. The desired size of T1len is called
|
||||
T1target.
|
||||
The algorithm for a process that needs to obtain a victim buffer is:
|
||||
|
||||
Assuming we have a full cache, one of 5 cases happens on a lookup:
|
||||
1. Obtain BufFreelistLock.
|
||||
|
||||
MISS On a cache miss, depending on T1target and the actual T1len
|
||||
the LRU buffer of either T1 or T2 is evicted. Its CDB is removed
|
||||
from the T list and added as MRU of the corresponding B list.
|
||||
The now free buffer is replaced with the requested page
|
||||
and added as MRU of T1.
|
||||
2. If buffer free list is nonempty, remove its head buffer. If the buffer
|
||||
is pinned or has a nonzero usage count, it cannot be used; ignore it and
|
||||
return to the start of step 2. Otherwise, pin the buffer, release
|
||||
BufFreelistLock, and return the buffer.
|
||||
|
||||
T1 hit The T1 CDB is moved to the MRU position of the T2 list.
|
||||
3. Otherwise, select the buffer pointed to by NextVictimBuffer, and
|
||||
circularly advance NextVictimBuffer for next time.
|
||||
|
||||
T2 hit The T2 CDB is moved to the MRU position of the T2 list.
|
||||
4. If the selected buffer is pinned or has a nonzero usage count, it cannot
|
||||
be used. Decrement its usage count (if nonzero) and return to step 3 to
|
||||
examine the next buffer.
|
||||
|
||||
B1 hit This means that a buffer that was evicted from the T1
|
||||
list is now requested again, indicating that T1target is
|
||||
too small (otherwise it would still be in T1 and thus in
|
||||
memory). The strategy raises T1target, evicts a buffer
|
||||
depending on T1target and T1len and places the CDB at
|
||||
MRU of T2.
|
||||
5. Pin the selected buffer, release BufFreelistLock, and return the buffer.
|
||||
|
||||
B2 hit This means the opposite of B1, the T2 list is probably too
|
||||
small. So the strategy lowers T1target, evicts a buffer
|
||||
and places the CDB at MRU of T2.
|
||||
(Note that if the selected buffer is dirty, we will have to write it out
|
||||
before we can recycle it; if someone else pins the buffer meanwhile we will
|
||||
have to give up and try another buffer. This however is not a concern
|
||||
of the basic select-a-victim-buffer algorithm.)
|
||||
|
||||
Thus, every page that is found on lookup in any of the four lists
|
||||
ends up as the MRU of the T2 list. The T2 list therefore is the
|
||||
"frequency" cache, holding frequently requested pages.
|
||||
A special provision is that while running VACUUM, a backend does not
|
||||
increment the usage count on buffers it accesses. In fact, if ReleaseBuffer
|
||||
sees that it is dropping the pin count to zero and the usage count is zero,
|
||||
then it appends the buffer to the tail of the free list. (This implies that
|
||||
VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
|
||||
this shouldn't create much of a contention problem.) This provision
|
||||
encourages VACUUM to work in a relatively small number of buffers rather
|
||||
than blowing out the entire buffer cache. It is reasonable since a page
|
||||
that has been touched only by VACUUM is unlikely to be needed again soon.
|
||||
|
||||
Every page that is seen for the first time ends up as the MRU of the T1
|
||||
list. The T1 list is the "recency" cache, holding recent newcomers.
|
||||
|
||||
The tailoring done for PostgreSQL has to do with the way the query executor
|
||||
works. A typical UPDATE or DELETE first scans the relation, searching for the
|
||||
tuples and then calls heap_update() or heap_delete(). This causes at least 2
|
||||
lookups for the block in the same statement. In the case of multiple matches
|
||||
in one block even more often. As a result, every block touched in an UPDATE or
|
||||
DELETE would directly jump into the T2 cache, which is wrong. To prevent this
|
||||
the strategy remembers which transaction added a buffer to the T1 list and
|
||||
will not promote it from there into the T2 cache during the same transaction.
|
||||
|
||||
Another specialty is the change of the strategy during VACUUM. Lookups during
|
||||
VACUUM do not represent application needs, and do not suggest that the page
|
||||
will be hit again soon, so it would be wrong to change the cache balance
|
||||
T1target due to that or to cause massive cache evictions. Therefore, a page
|
||||
read in to satisfy vacuum is placed at the LRU position of the T1 list, for
|
||||
immediate reuse. Also, if we happen to get a hit on a CDB entry during
|
||||
VACUUM, we do not promote the page above its current position in the list.
|
||||
Since VACUUM usually requests many pages very fast, the effect of this is that
|
||||
it will get back the very buffers it filled and possibly modified on the next
|
||||
call and will therefore do its work in a few shared memory buffers, while
|
||||
being able to use whatever it finds in the cache already. This also implies
|
||||
that most of the write traffic caused by a VACUUM will be done by the VACUUM
|
||||
itself and not pushed off onto other processes.
|
||||
|
||||
|
||||
Background writer's processing
|
||||
------------------------------
|
||||
|
||||
The background writer is designed to write out pages that are likely to be
|
||||
recycled soon, thereby offloading the writing work from active backends.
|
||||
To do this, it scans forward circularly from the current position of
|
||||
NextVictimBuffer (which it does not change!), looking for buffers that are
|
||||
dirty and not pinned nor marked with a positive usage count. It pins,
|
||||
writes, and releases any such buffer.
|
||||
|
||||
If we can assume that reading NextVictimBuffer is an atomic action, then
|
||||
the writer doesn't even need to take the BufFreelistLock in order to look
|
||||
for buffers to write; it needs only to spinlock each buffer header for long
|
||||
enough to check the dirtybit. Even without that assumption, the writer
|
||||
only needs to take the lock long enough to read the variable value, not
|
||||
while scanning the buffers. (This is a very substantial improvement in
|
||||
the contention cost of the writer compared to PG 8.0.)
|
||||
|
||||
During a checkpoint, the writer's strategy must be to write every dirty
|
||||
buffer (pinned or not!). We may as well make it start this scan from
|
||||
NextVictimBuffer, however, so that the first-to-be-written pages are the
|
||||
ones that backends might otherwise have to write for themselves soon.
|
||||
|
@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.71 2005/02/03 23:29:11 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.72 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -22,6 +22,8 @@ BufferDesc *BufferDescriptors;
|
||||
Block *BufferBlockPointers;
|
||||
int32 *PrivateRefCount;
|
||||
|
||||
static char *BufferBlocks;
|
||||
|
||||
/* statistics counters */
|
||||
long int ReadBufferCount;
|
||||
long int ReadLocalBufferCount;
|
||||
@ -50,16 +52,11 @@ long int LocalBufferFlushCount;
|
||||
*
|
||||
* Synchronization/Locking:
|
||||
*
|
||||
* BufMgrLock lock -- must be acquired before manipulating the
|
||||
* buffer search datastructures (lookup/freelist, as well as the
|
||||
* flag bits of any buffer). Must be released
|
||||
* before exit and before doing any IO.
|
||||
*
|
||||
* IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
|
||||
* It must be set when an IO is initiated and cleared at
|
||||
* the end of the IO. It is there to make sure that one
|
||||
* process doesn't start to use a buffer while another is
|
||||
* faulting it in. see IOWait/IOSignal.
|
||||
* faulting it in. see WaitIO and related routines.
|
||||
*
|
||||
* refcount -- Counts the number of processes holding pins on a buffer.
|
||||
* A buffer is pinned during IO and immediately after a BufferAlloc().
|
||||
@ -85,10 +82,8 @@ long int LocalBufferFlushCount;
|
||||
void
|
||||
InitBufferPool(void)
|
||||
{
|
||||
char *BufferBlocks;
|
||||
bool foundBufs,
|
||||
foundDescs;
|
||||
int i;
|
||||
|
||||
BufferDescriptors = (BufferDesc *)
|
||||
ShmemInitStruct("Buffer Descriptors",
|
||||
@ -102,52 +97,42 @@ InitBufferPool(void)
|
||||
{
|
||||
/* both should be present or neither */
|
||||
Assert(foundDescs && foundBufs);
|
||||
/* note: this path is only taken in EXEC_BACKEND case */
|
||||
}
|
||||
else
|
||||
{
|
||||
BufferDesc *buf;
|
||||
char *block;
|
||||
|
||||
/*
|
||||
* It's probably not really necessary to grab the lock --- if
|
||||
* there's anyone else attached to the shmem at this point, we've
|
||||
* got problems.
|
||||
*/
|
||||
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
|
||||
int i;
|
||||
|
||||
buf = BufferDescriptors;
|
||||
block = BufferBlocks;
|
||||
|
||||
/*
|
||||
* Initialize all the buffer headers.
|
||||
*/
|
||||
for (i = 0; i < NBuffers; block += BLCKSZ, buf++, i++)
|
||||
for (i = 0; i < NBuffers; buf++, i++)
|
||||
{
|
||||
Assert(ShmemIsValid((unsigned long) block));
|
||||
|
||||
/*
|
||||
* The bufNext fields link together all totally-unused buffers.
|
||||
* Subsequent management of this list is done by
|
||||
* StrategyGetBuffer().
|
||||
*/
|
||||
buf->bufNext = i + 1;
|
||||
|
||||
CLEAR_BUFFERTAG(buf->tag);
|
||||
buf->flags = 0;
|
||||
buf->usage_count = 0;
|
||||
buf->refcount = 0;
|
||||
buf->wait_backend_id = 0;
|
||||
|
||||
SpinLockInit(&buf->buf_hdr_lock);
|
||||
|
||||
buf->buf_id = i;
|
||||
|
||||
buf->data = MAKE_OFFSET(block);
|
||||
buf->flags = 0;
|
||||
buf->refcount = 0;
|
||||
/*
|
||||
* Initially link all the buffers together as unused.
|
||||
* Subsequent management of this list is done by freelist.c.
|
||||
*/
|
||||
buf->freeNext = i + 1;
|
||||
|
||||
buf->io_in_progress_lock = LWLockAssign();
|
||||
buf->cntx_lock = LWLockAssign();
|
||||
buf->cntxDirty = false;
|
||||
buf->wait_backend_id = 0;
|
||||
buf->content_lock = LWLockAssign();
|
||||
}
|
||||
|
||||
/* Correct last entry of linked list */
|
||||
BufferDescriptors[NBuffers - 1].bufNext = -1;
|
||||
|
||||
LWLockRelease(BufMgrLock);
|
||||
BufferDescriptors[NBuffers - 1].freeNext = FREENEXT_END_OF_LIST;
|
||||
}
|
||||
|
||||
/* Init other shared buffer-management stuff */
|
||||
@ -162,12 +147,13 @@ InitBufferPool(void)
|
||||
* buffer pool.
|
||||
*
|
||||
* NB: this is called before InitProcess(), so we do not have a PGPROC and
|
||||
* cannot do LWLockAcquire; hence we can't actually access the bufmgr's
|
||||
* cannot do LWLockAcquire; hence we can't actually access stuff in
|
||||
* shared memory yet. We are only initializing local data here.
|
||||
*/
|
||||
void
|
||||
InitBufferPoolAccess(void)
|
||||
{
|
||||
char *block;
|
||||
int i;
|
||||
|
||||
/*
|
||||
@ -179,12 +165,18 @@ InitBufferPoolAccess(void)
|
||||
sizeof(*PrivateRefCount));
|
||||
|
||||
/*
|
||||
* Convert shmem offsets into addresses as seen by this process. This
|
||||
* is just to speed up the BufferGetBlock() macro. It is OK to do this
|
||||
* without any lock since the data pointers never change.
|
||||
* Construct addresses for the individual buffer data blocks. We do
|
||||
* this just to speed up the BufferGetBlock() macro. (Since the
|
||||
* addresses should be the same in every backend, we could inherit
|
||||
* this data from the postmaster --- but in the EXEC_BACKEND case
|
||||
* that doesn't work.)
|
||||
*/
|
||||
block = BufferBlocks;
|
||||
for (i = 0; i < NBuffers; i++)
|
||||
BufferBlockPointers[i] = (Block) MAKE_PTR(BufferDescriptors[i].data);
|
||||
{
|
||||
BufferBlockPointers[i] = (Block) block;
|
||||
block += BLCKSZ;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -3,12 +3,9 @@
|
||||
* buf_table.c
|
||||
* routines for mapping BufferTags to buffer indexes.
|
||||
*
|
||||
* NOTE: this module is called only by freelist.c, and the "buffer IDs"
|
||||
* it deals with are whatever freelist.c needs them to be; they may not be
|
||||
* directly equivalent to Buffer numbers.
|
||||
*
|
||||
* Note: all routines in this file assume that the BufMgrLock is held
|
||||
* by the caller, so no synchronization is needed.
|
||||
* Note: the routines in this file do no locking of their own. The caller
|
||||
* must hold a suitable lock on the BufMappingLock, as specified in the
|
||||
* comments.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
@ -16,7 +13,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.39 2005/02/03 23:29:11 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.40 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -74,17 +71,17 @@ InitBufTable(int size)
|
||||
/*
|
||||
* BufTableLookup
|
||||
* Lookup the given BufferTag; return buffer ID, or -1 if not found
|
||||
*
|
||||
* Caller must hold at least share lock on BufMappingLock
|
||||
*/
|
||||
int
|
||||
BufTableLookup(BufferTag *tagPtr)
|
||||
{
|
||||
BufferLookupEnt *result;
|
||||
|
||||
if (tagPtr->blockNum == P_NEW)
|
||||
return -1;
|
||||
|
||||
result = (BufferLookupEnt *)
|
||||
hash_search(SharedBufHash, (void *) tagPtr, HASH_FIND, NULL);
|
||||
|
||||
if (!result)
|
||||
return -1;
|
||||
|
||||
@ -93,14 +90,23 @@ BufTableLookup(BufferTag *tagPtr)
|
||||
|
||||
/*
|
||||
* BufTableInsert
|
||||
* Insert a hashtable entry for given tag and buffer ID
|
||||
* Insert a hashtable entry for given tag and buffer ID,
|
||||
* unless an entry already exists for that tag
|
||||
*
|
||||
* Returns -1 on successful insertion. If a conflicting entry exists
|
||||
* already, returns the buffer ID in that entry.
|
||||
*
|
||||
* Caller must hold write lock on BufMappingLock
|
||||
*/
|
||||
void
|
||||
int
|
||||
BufTableInsert(BufferTag *tagPtr, int buf_id)
|
||||
{
|
||||
BufferLookupEnt *result;
|
||||
bool found;
|
||||
|
||||
Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
|
||||
Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
|
||||
|
||||
result = (BufferLookupEnt *)
|
||||
hash_search(SharedBufHash, (void *) tagPtr, HASH_ENTER, &found);
|
||||
|
||||
@ -109,15 +115,19 @@ BufTableInsert(BufferTag *tagPtr, int buf_id)
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of shared memory")));
|
||||
|
||||
if (found) /* found something already in the table? */
|
||||
elog(ERROR, "shared buffer hash table corrupted");
|
||||
if (found) /* found something already in the table */
|
||||
return result->id;
|
||||
|
||||
result->id = buf_id;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* BufTableDelete
|
||||
* Delete the hashtable entry for given tag (which must exist)
|
||||
*
|
||||
* Caller must hold write lock on BufMappingLock
|
||||
*/
|
||||
void
|
||||
BufTableDelete(BufferTag *tagPtr)
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.62 2005/01/10 20:02:21 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.63 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -24,6 +24,10 @@
|
||||
|
||||
/*#define LBDEBUG*/
|
||||
|
||||
/* Note: this macro only works on local buffers, not shared ones! */
|
||||
#define LocalBufHdrGetBlock(bufHdr) \
|
||||
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
|
||||
|
||||
/* should be a GUC parameter some day */
|
||||
int NLocBuffer = 64;
|
||||
|
||||
@ -39,7 +43,7 @@ static int nextFreeLocalBuf = 0;
|
||||
* allocate a local buffer. We do round robin allocation for now.
|
||||
*
|
||||
* API is similar to bufmgr.c's BufferAlloc, except that we do not need
|
||||
* to have the BufMgrLock since this is all local. Also, IO_IN_PROGRESS
|
||||
* to do any locking since this is all local. Also, IO_IN_PROGRESS
|
||||
* does not get set.
|
||||
*/
|
||||
BufferDesc *
|
||||
@ -47,11 +51,12 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
{
|
||||
BufferTag newTag; /* identity of requested block */
|
||||
int i;
|
||||
int trycounter;
|
||||
BufferDesc *bufHdr;
|
||||
|
||||
INIT_BUFFERTAG(newTag, reln, blockNum);
|
||||
|
||||
/* a low tech search for now -- not optimized for scans */
|
||||
/* a low tech search for now -- should use a hashtable */
|
||||
for (i = 0; i < NLocBuffer; i++)
|
||||
{
|
||||
bufHdr = &LocalBufferDescriptors[i];
|
||||
@ -81,32 +86,44 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
RelationGetRelid(reln), blockNum, -nextFreeLocalBuf - 1);
|
||||
#endif
|
||||
|
||||
/* need to get a new buffer (round robin for now) */
|
||||
bufHdr = NULL;
|
||||
for (i = 0; i < NLocBuffer; i++)
|
||||
/*
|
||||
* Need to get a new buffer. We use a clock sweep algorithm
|
||||
* (essentially the same as what freelist.c does now...)
|
||||
*/
|
||||
trycounter = NLocBuffer;
|
||||
for (;;)
|
||||
{
|
||||
int b = (nextFreeLocalBuf + i) % NLocBuffer;
|
||||
int b = nextFreeLocalBuf;
|
||||
|
||||
if (LocalRefCount[b] == 0)
|
||||
if (++nextFreeLocalBuf >= NLocBuffer)
|
||||
nextFreeLocalBuf = 0;
|
||||
|
||||
bufHdr = &LocalBufferDescriptors[b];
|
||||
|
||||
if (LocalRefCount[b] == 0 && bufHdr->usage_count == 0)
|
||||
{
|
||||
bufHdr = &LocalBufferDescriptors[b];
|
||||
LocalRefCount[b]++;
|
||||
ResourceOwnerRememberBuffer(CurrentResourceOwner,
|
||||
BufferDescriptorGetBuffer(bufHdr));
|
||||
nextFreeLocalBuf = (b + 1) % NLocBuffer;
|
||||
BufferDescriptorGetBuffer(bufHdr));
|
||||
break;
|
||||
}
|
||||
|
||||
if (bufHdr->usage_count > 0)
|
||||
{
|
||||
bufHdr->usage_count--;
|
||||
trycounter = NLocBuffer;
|
||||
}
|
||||
else if (--trycounter == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("no empty local buffer available")));
|
||||
}
|
||||
if (bufHdr == NULL)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("no empty local buffer available")));
|
||||
|
||||
/*
|
||||
* this buffer is not referenced but it might still be dirty. if
|
||||
* that's the case, write it out before reusing it!
|
||||
*/
|
||||
if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
|
||||
if (bufHdr->flags & BM_DIRTY)
|
||||
{
|
||||
SMgrRelation oreln;
|
||||
|
||||
@ -116,7 +133,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
/* And write... */
|
||||
smgrwrite(oreln,
|
||||
bufHdr->tag.blockNum,
|
||||
(char *) MAKE_PTR(bufHdr->data),
|
||||
(char *) LocalBufHdrGetBlock(bufHdr),
|
||||
true);
|
||||
|
||||
LocalBufferFlushCount++;
|
||||
@ -129,7 +146,7 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
* use, so it's okay to do it (and possibly error out) before marking
|
||||
* the buffer as not dirty.
|
||||
*/
|
||||
if (bufHdr->data == (SHMEM_OFFSET) 0)
|
||||
if (LocalBufHdrGetBlock(bufHdr) == NULL)
|
||||
{
|
||||
char *data = (char *) malloc(BLCKSZ);
|
||||
|
||||
@ -138,17 +155,10 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
|
||||
/*
|
||||
* This is a bit of a hack: bufHdr->data needs to be a shmem
|
||||
* offset for consistency with the shared-buffer case, so make it
|
||||
* one even though it's not really a valid shmem offset.
|
||||
*/
|
||||
bufHdr->data = MAKE_OFFSET(data);
|
||||
|
||||
/*
|
||||
* Set pointer for use by BufferGetBlock() macro.
|
||||
*/
|
||||
LocalBufferBlockPointers[-(bufHdr->buf_id + 2)] = (Block) data;
|
||||
LocalBufHdrGetBlock(bufHdr) = (Block) data;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -156,7 +166,8 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
|
||||
*/
|
||||
bufHdr->tag = newTag;
|
||||
bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
|
||||
bufHdr->cntxDirty = false;
|
||||
bufHdr->flags |= BM_TAG_VALID;
|
||||
bufHdr->usage_count = 0;
|
||||
|
||||
*foundPtr = FALSE;
|
||||
return bufHdr;
|
||||
@ -170,6 +181,7 @@ void
|
||||
WriteLocalBuffer(Buffer buffer, bool release)
|
||||
{
|
||||
int bufid;
|
||||
BufferDesc *bufHdr;
|
||||
|
||||
Assert(BufferIsLocal(buffer));
|
||||
|
||||
@ -178,12 +190,18 @@ WriteLocalBuffer(Buffer buffer, bool release)
|
||||
#endif
|
||||
|
||||
bufid = -(buffer + 1);
|
||||
LocalBufferDescriptors[bufid].flags |= BM_DIRTY;
|
||||
|
||||
Assert(LocalRefCount[bufid] > 0);
|
||||
|
||||
bufHdr = &LocalBufferDescriptors[bufid];
|
||||
bufHdr->flags |= BM_DIRTY;
|
||||
|
||||
if (release)
|
||||
{
|
||||
Assert(LocalRefCount[bufid] > 0);
|
||||
LocalRefCount[bufid]--;
|
||||
if (LocalRefCount[bufid] == 0 &&
|
||||
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
|
||||
bufHdr->usage_count++;
|
||||
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
|
||||
}
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Written by Peter Eisentraut <peter_e@gmx.net>.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.253 2005/03/01 20:23:34 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.254 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*--------------------------------------------------------------------
|
||||
*/
|
||||
@ -77,7 +77,6 @@ extern bool Log_disconnections;
|
||||
extern DLLIMPORT bool check_function_bodies;
|
||||
extern int CommitDelay;
|
||||
extern int CommitSiblings;
|
||||
extern int DebugSharedBuffers;
|
||||
extern char *default_tablespace;
|
||||
|
||||
static const char *assign_log_destination(const char *value,
|
||||
@ -1230,15 +1229,6 @@ static struct config_int ConfigureNamesInt[] =
|
||||
-1, -1, INT_MAX / 1000, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
{"debug_shared_buffers", PGC_POSTMASTER, STATS_MONITORING,
|
||||
gettext_noop("Interval to report shared buffer status in seconds"),
|
||||
NULL
|
||||
},
|
||||
&DebugSharedBuffers,
|
||||
0, 0, 600, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
{"bgwriter_delay", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer sleep time between rounds in milliseconds"),
|
||||
@ -1249,21 +1239,21 @@ static struct config_int ConfigureNamesInt[] =
|
||||
},
|
||||
|
||||
{
|
||||
{"bgwriter_percent", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer percentage of dirty buffers to flush per round"),
|
||||
{"bgwriter_lru_maxpages", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer maximum number of all pages to flush per round"),
|
||||
NULL
|
||||
},
|
||||
&BgWriterPercent,
|
||||
1, 0, 100, NULL, NULL
|
||||
&bgwriter_lru_maxpages,
|
||||
5, 0, 1000, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
{"bgwriter_maxpages", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer maximum number of pages to flush per round"),
|
||||
{"bgwriter_all_maxpages", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer maximum number of LRU pages to flush per round"),
|
||||
NULL
|
||||
},
|
||||
&BgWriterMaxPages,
|
||||
100, 0, 1000, NULL, NULL
|
||||
&bgwriter_all_maxpages,
|
||||
5, 0, 1000, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
@ -1394,6 +1384,24 @@ static struct config_real ConfigureNamesReal[] =
|
||||
MAX_GEQO_SELECTION_BIAS, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer percentage of LRU buffers to flush per round"),
|
||||
NULL
|
||||
},
|
||||
&bgwriter_lru_percent,
|
||||
1.0, 0.0, 100.0, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
{"bgwriter_all_percent", PGC_SIGHUP, RESOURCES,
|
||||
gettext_noop("Background writer percentage of all buffers to flush per round"),
|
||||
NULL
|
||||
},
|
||||
&bgwriter_all_percent,
|
||||
0.333, 0.0, 100.0, NULL, NULL
|
||||
},
|
||||
|
||||
{
|
||||
{"seed", PGC_USERSET, UNGROUPED,
|
||||
gettext_noop("Sets the seed for random-number generation."),
|
||||
|
@ -99,8 +99,10 @@
|
||||
# - Background writer -
|
||||
|
||||
#bgwriter_delay = 200 # 10-10000 milliseconds between rounds
|
||||
#bgwriter_percent = 1 # 0-100% of dirty buffers in each round
|
||||
#bgwriter_maxpages = 100 # 0-1000 buffers max per round
|
||||
#bgwriter_lru_percent = 1.0 # 0-100% of LRU buffers scanned in each round
|
||||
#bgwriter_lru_maxpages = 5 # 0-1000 buffers max written per round
|
||||
#bgwriter_all_percent = 0.333 # 0-100% of all buffers scanned in each round
|
||||
#bgwriter_all_maxpages = 5 # 0-1000 buffers max written per round
|
||||
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
|
@ -14,7 +14,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/resowner/resowner.c,v 1.9 2004/12/31 22:02:50 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/resowner/resowner.c,v 1.10 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -200,12 +200,7 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
|
||||
* that would indicate failure to clean up the executor correctly ---
|
||||
* so issue warnings. In the abort case, just clean up quietly.
|
||||
*
|
||||
* XXX this is fairly inefficient due to multiple BufMgrLock
|
||||
* grabs if there are lots of buffers to be released, but we
|
||||
* don't expect many (indeed none in the success case) so it's
|
||||
* probably not worth optimizing.
|
||||
*
|
||||
* We are however careful to release back-to-front, so as to
|
||||
* We are careful to do the releasing back-to-front, so as to
|
||||
* avoid O(N^2) behavior in ResourceOwnerForgetBuffer().
|
||||
*/
|
||||
while (owner->nbuffers > 0)
|
||||
|
@ -5,7 +5,7 @@
|
||||
*
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.4 2004/12/31 22:03:39 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.5 2005/03/04 20:21:06 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -18,8 +18,6 @@
|
||||
|
||||
/* GUC options */
|
||||
extern int BgWriterDelay;
|
||||
extern int BgWriterPercent;
|
||||
extern int BgWriterMaxPages;
|
||||
extern int CheckPointTimeout;
|
||||
extern int CheckPointWarning;
|
||||
|
||||
|
@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.76 2005/02/03 23:29:19 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.77 2005/03/04 20:21:07 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -19,24 +19,39 @@
|
||||
#include "storage/buf.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "storage/spin.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
|
||||
/*
|
||||
* Flags for buffer descriptors
|
||||
*
|
||||
* Note: TAG_VALID essentially means that there is a buffer hashtable
|
||||
* entry associated with the buffer's tag.
|
||||
*/
|
||||
#define BM_DIRTY (1 << 0) /* data needs writing */
|
||||
#define BM_VALID (1 << 1) /* data is valid */
|
||||
#define BM_IO_IN_PROGRESS (1 << 2) /* read or write in
|
||||
#define BM_TAG_VALID (1 << 2) /* tag is assigned */
|
||||
#define BM_IO_IN_PROGRESS (1 << 3) /* read or write in
|
||||
* progress */
|
||||
#define BM_IO_ERROR (1 << 3) /* previous I/O failed */
|
||||
#define BM_JUST_DIRTIED (1 << 4) /* dirtied since write
|
||||
#define BM_IO_ERROR (1 << 4) /* previous I/O failed */
|
||||
#define BM_JUST_DIRTIED (1 << 5) /* dirtied since write
|
||||
* started */
|
||||
#define BM_PIN_COUNT_WAITER (1 << 5) /* have waiter for sole
|
||||
#define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole
|
||||
* pin */
|
||||
|
||||
typedef bits16 BufFlags;
|
||||
|
||||
/*
|
||||
* The maximum allowed value of usage_count represents a tradeoff between
|
||||
* accuracy and speed of the clock-sweep buffer management algorithm. A
|
||||
* large value (comparable to NBuffers) would approximate LRU semantics.
|
||||
* But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
|
||||
* clock sweeps to find a free buffer, so in practice we don't want the
|
||||
* value to be very large.
|
||||
*/
|
||||
#define BM_MAX_USAGE_COUNT 5
|
||||
|
||||
/*
|
||||
* Buffer tag identifies which disk block the buffer contains.
|
||||
*
|
||||
@ -77,45 +92,81 @@ typedef struct buftag
|
||||
|
||||
/*
|
||||
* BufferDesc -- shared descriptor/state data for a single shared buffer.
|
||||
*
|
||||
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
|
||||
* usage_count, refcount, or wait_backend_id fields. buf_id field never
|
||||
* changes after initialization, so does not need locking. freeNext is
|
||||
* protected by the BufFreelistLock not buf_hdr_lock. The LWLocks can take
|
||||
* care of themselves. The buf_hdr_lock is *not* used to control access to
|
||||
* the data in the buffer!
|
||||
*
|
||||
* An exception is that if we have the buffer pinned, its tag can't change
|
||||
* underneath us, so we can examine the tag without locking the spinlock.
|
||||
* Also, in places we do one-time reads of the flags without bothering to
|
||||
* lock the spinlock; this is generally for situations where we don't expect
|
||||
* the flag bit being tested to be changing.
|
||||
*
|
||||
* We can't physically remove items from a disk page if another backend has
|
||||
* the buffer pinned. Hence, a backend may need to wait for all other pins
|
||||
* to go away. This is signaled by storing its own backend ID into
|
||||
* wait_backend_id and setting flag bit BM_PIN_COUNT_WAITER. At present,
|
||||
* there can be only one such waiter per buffer.
|
||||
*
|
||||
* We use this same struct for local buffer headers, but the lock fields
|
||||
* are not used and not all of the flag bits are useful either.
|
||||
*/
|
||||
typedef struct sbufdesc
|
||||
{
|
||||
Buffer bufNext; /* link in freelist chain */
|
||||
SHMEM_OFFSET data; /* pointer to data in buf pool */
|
||||
|
||||
/* tag and id must be together for table lookup (still true?) */
|
||||
BufferTag tag; /* file/block identifier */
|
||||
int buf_id; /* buffer's index number (from 0) */
|
||||
|
||||
BufferTag tag; /* ID of page contained in buffer */
|
||||
BufFlags flags; /* see bit definitions above */
|
||||
uint16 usage_count; /* usage counter for clock sweep code */
|
||||
unsigned refcount; /* # of backends holding pins on buffer */
|
||||
BackendId wait_backend_id; /* backend ID of pin-count waiter */
|
||||
|
||||
slock_t buf_hdr_lock; /* protects the above fields */
|
||||
|
||||
int buf_id; /* buffer's index number (from 0) */
|
||||
int freeNext; /* link in freelist chain */
|
||||
|
||||
LWLockId io_in_progress_lock; /* to wait for I/O to complete */
|
||||
LWLockId cntx_lock; /* to lock access to page context */
|
||||
|
||||
bool cntxDirty; /* new way to mark block as dirty */
|
||||
|
||||
/*
|
||||
* We can't physically remove items from a disk page if another
|
||||
* backend has the buffer pinned. Hence, a backend may need to wait
|
||||
* for all other pins to go away. This is signaled by storing its own
|
||||
* backend ID into wait_backend_id and setting flag bit
|
||||
* BM_PIN_COUNT_WAITER. At present, there can be only one such waiter
|
||||
* per buffer.
|
||||
*/
|
||||
BackendId wait_backend_id; /* backend ID of pin-count waiter */
|
||||
LWLockId content_lock; /* to lock access to buffer contents */
|
||||
} BufferDesc;
|
||||
|
||||
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
|
||||
|
||||
/*
|
||||
* The freeNext field is either the index of the next freelist entry,
|
||||
* or one of these special values:
|
||||
*/
|
||||
#define FREENEXT_END_OF_LIST (-1)
|
||||
#define FREENEXT_NOT_IN_LIST (-2)
|
||||
|
||||
/* in bufmgr.c */
|
||||
/*
|
||||
* Macros for acquiring/releasing a buffer header's spinlock. The
|
||||
* NoHoldoff cases may be used when we know that we hold some LWLock
|
||||
* and therefore interrupts are already held off. Do not apply these
|
||||
* to local buffers!
|
||||
*/
|
||||
#define LockBufHdr(bufHdr) \
|
||||
SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
|
||||
#define UnlockBufHdr(bufHdr) \
|
||||
SpinLockRelease(&(bufHdr)->buf_hdr_lock)
|
||||
#define LockBufHdr_NoHoldoff(bufHdr) \
|
||||
SpinLockAcquire_NoHoldoff(&(bufHdr)->buf_hdr_lock)
|
||||
#define UnlockBufHdr_NoHoldoff(bufHdr) \
|
||||
SpinLockRelease_NoHoldoff(&(bufHdr)->buf_hdr_lock)
|
||||
|
||||
|
||||
/* in buf_init.c */
|
||||
extern BufferDesc *BufferDescriptors;
|
||||
|
||||
/* in localbuf.c */
|
||||
extern BufferDesc *LocalBufferDescriptors;
|
||||
|
||||
/* counters in buf_init.c */
|
||||
/* in freelist.c */
|
||||
extern bool strategy_hint_vacuum;
|
||||
|
||||
/* event counters in buf_init.c */
|
||||
extern long int ReadBufferCount;
|
||||
extern long int ReadLocalBufferCount;
|
||||
extern long int BufferHitCount;
|
||||
@ -129,15 +180,9 @@ extern long int LocalBufferFlushCount;
|
||||
*/
|
||||
|
||||
/* freelist.c */
|
||||
extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
|
||||
int *cdb_found_index);
|
||||
extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index);
|
||||
extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
|
||||
int cdb_found_index, int cdb_replace_index);
|
||||
extern void StrategyInvalidateBuffer(BufferDesc *buf);
|
||||
extern void StrategyHintVacuum(bool vacuum_active);
|
||||
extern int StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
|
||||
int max_buffers);
|
||||
extern BufferDesc *StrategyGetBuffer(void);
|
||||
extern void StrategyFreeBuffer(BufferDesc *buf, bool at_head);
|
||||
extern int StrategySyncStart(void);
|
||||
extern int StrategyShmemSize(void);
|
||||
extern void StrategyInitialize(bool init);
|
||||
|
||||
@ -145,7 +190,7 @@ extern void StrategyInitialize(bool init);
|
||||
extern int BufTableShmemSize(int size);
|
||||
extern void InitBufTable(int size);
|
||||
extern int BufTableLookup(BufferTag *tagPtr);
|
||||
extern void BufTableInsert(BufferTag *tagPtr, int buf_id);
|
||||
extern int BufTableInsert(BufferTag *tagPtr, int buf_id);
|
||||
extern void BufTableDelete(BufferTag *tagPtr);
|
||||
|
||||
/* localbuf.c */
|
||||
|
@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.89 2004/12/31 22:03:42 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.90 2005/03/04 20:21:07 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -27,21 +27,25 @@ extern DLLIMPORT int NBuffers;
|
||||
|
||||
/* in bufmgr.c */
|
||||
extern bool zero_damaged_pages;
|
||||
extern double bgwriter_lru_percent;
|
||||
extern double bgwriter_all_percent;
|
||||
extern int bgwriter_lru_maxpages;
|
||||
extern int bgwriter_all_maxpages;
|
||||
|
||||
/* in buf_init.c */
|
||||
extern DLLIMPORT Block *BufferBlockPointers;
|
||||
extern int32 *PrivateRefCount;
|
||||
extern DLLIMPORT int32 *PrivateRefCount;
|
||||
|
||||
/* in localbuf.c */
|
||||
extern DLLIMPORT int NLocBuffer;
|
||||
extern DLLIMPORT Block *LocalBufferBlockPointers;
|
||||
extern int32 *LocalRefCount;
|
||||
extern DLLIMPORT int32 *LocalRefCount;
|
||||
|
||||
/* special block number for ReadBuffer() */
|
||||
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
|
||||
|
||||
/*
|
||||
* Buffer context lock modes
|
||||
* Buffer content lock modes (mode argument for LockBuffer())
|
||||
*/
|
||||
#define BUFFER_LOCK_UNLOCK 0
|
||||
#define BUFFER_LOCK_SHARE 1
|
||||
@ -150,8 +154,12 @@ extern void LockBufferForCleanup(Buffer buffer);
|
||||
extern void AbortBufferIO(void);
|
||||
|
||||
extern void BufmgrCommit(void);
|
||||
extern int BufferSync(int percent, int maxpages);
|
||||
extern void BufferSync(void);
|
||||
extern void BgBufferSync(void);
|
||||
|
||||
extern void InitLocalBuffer(void);
|
||||
|
||||
/* in freelist.c */
|
||||
extern void StrategyHintVacuum(bool vacuum_active);
|
||||
|
||||
#endif
|
||||
|
@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.16 2004/12/31 22:03:42 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.17 2005/03/04 20:21:07 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -25,7 +25,8 @@
|
||||
*/
|
||||
typedef enum LWLockId
|
||||
{
|
||||
BufMgrLock,
|
||||
BufMappingLock,
|
||||
BufFreelistLock,
|
||||
LockMgrLock,
|
||||
OidGenLock,
|
||||
XidGenLock,
|
||||
|
Loading…
x
Reference in New Issue
Block a user