@@ -19,6 +19,10 @@
  * and pin it so that no one can destroy it while this process
  * is using it.
  *
+ * StartReadBuffer() -- as above, with separate wait step
+ * StartReadBuffers() -- multiple block version
+ * WaitReadBuffers() -- second step of above
+ *
  * ReleaseBuffer() -- unpin a buffer
  *
  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
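For orientation, a minimal sketch (not part of the patch) of how a caller
drives the two-step API listed above. Only functions, fields and macros that
this patch introduces or relies on appear; the helper name is illustrative.

    static void
    read_run_example(Relation rel, ForkNumber forknum, BlockNumber first, int want)
    {
        ReadBuffersOperation operation;
        Buffer      buffers[MAX_IO_COMBINE_LIMIT];
        int         nblocks = Min(want, io_combine_limit);  /* assume want >= 1 */

        /* caller-set members, as ReadBuffer_common() does below */
        operation.rel = rel;
        operation.smgr = RelationGetSmgr(rel);
        operation.smgr_persistence = 0;     /* 0 means: use rel's persistence */
        operation.forknum = forknum;
        operation.strategy = NULL;

        /* step 1 pins buffers; true means an I/O was started... */
        if (StartReadBuffers(&operation, buffers, first, &nblocks, 0))
            WaitReadBuffers(&operation);    /* ...and step 2 completes it */

        /* buffers[0 .. nblocks - 1] are now valid; unpinning is up to us */
    }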
@@ -152,6 +156,13 @@ int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
  */
 int         maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
 
+/*
+ * Limit on how many blocks should be handled in single I/O operations.
+ * StartReadBuffers() callers should respect it, as should other operations
+ * that call smgr APIs directly.
+ */
+int         io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
+
 /*
  * GUC variables about triggering kernel writeback for buffers written; OS
  * dependent defaults are set via the GUC mechanism.
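A hypothetical caller-side clamp matching the io_combine_limit contract above
(the function name is invented for illustration; MAX_IO_COMBINE_LIMIT is the
compile-time ceiling this patch asserts against):

    static int
    clamp_io_batch(int nblocks)
    {
        /* respect the GUC when sizing a multi-block smgr I/O */
        if (nblocks > io_combine_limit)
            nblocks = io_combine_limit;
        Assert(nblocks >= 1 && nblocks <= MAX_IO_COMBINE_LIMIT);
        return nblocks;
    }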
@@ -471,10 +482,10 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
 )
 
 
-static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
+static Buffer ReadBuffer_common(Relation rel,
+                                SMgrRelation smgr, char smgr_persistence,
                                 ForkNumber forkNum, BlockNumber blockNum,
-                                ReadBufferMode mode, BufferAccessStrategy strategy,
-                                bool *hit);
+                                ReadBufferMode mode, BufferAccessStrategy strategy);
 static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
                                            ForkNumber fork,
                                            BufferAccessStrategy strategy,
@@ -500,18 +511,18 @@ static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
 static int  SyncOneBuffer(int buf_id, bool skip_recently_used,
                           WritebackContext *wb_context);
 static void WaitIO(BufferDesc *buf);
-static bool StartBufferIO(BufferDesc *buf, bool forInput);
+static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
                               uint32 set_flag_bits, bool forget_owner);
 static void AbortBufferIO(Buffer buffer);
 static void shared_buffer_write_error_callback(void *arg);
 static void local_buffer_write_error_callback(void *arg);
-static BufferDesc *BufferAlloc(SMgrRelation smgr,
-                               char relpersistence,
-                               ForkNumber forkNum,
-                               BlockNumber blockNum,
-                               BufferAccessStrategy strategy,
-                               bool *foundPtr, IOContext io_context);
+static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
+                                      char relpersistence,
+                                      ForkNumber forkNum,
+                                      BlockNumber blockNum,
+                                      BufferAccessStrategy strategy,
+                                      bool *foundPtr, IOContext io_context);
 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
                         IOObject io_object, IOContext io_context);
@@ -777,11 +788,10 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
-Buffer
+inline Buffer
 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
                    ReadBufferMode mode, BufferAccessStrategy strategy)
 {
-    bool        hit;
     Buffer      buf;
 
     /*
@@ -798,11 +808,9 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
      * Read the buffer, and update pgstat counters to reflect a cache hit or
      * miss.
      */
-    pgstat_count_buffer_read(reln);
-    buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
-                            forkNum, blockNum, mode, strategy, &hit);
-    if (hit)
-        pgstat_count_buffer_hit(reln);
+    buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
+                            forkNum, blockNum, mode, strategy);
+
     return buf;
 }
 
@@ -822,13 +830,12 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
                           BlockNumber blockNum, ReadBufferMode mode,
                           BufferAccessStrategy strategy, bool permanent)
 {
-    bool        hit;
-
     SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
 
-    return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
-                             RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
-                             mode, strategy, &hit);
+    return ReadBuffer_common(NULL, smgr,
+                             permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
+                             forkNum, blockNum,
+                             mode, strategy);
 }
 
 /*
@@ -994,35 +1001,162 @@ ExtendBufferedRelTo(BufferManagerRelation bmr,
      */
     if (buffer == InvalidBuffer)
     {
-        bool        hit;
-
         Assert(extended_by == 0);
-        buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence,
-                                   fork, extend_to - 1, mode, strategy,
-                                   &hit);
+        buffer = ReadBuffer_common(bmr.rel, bmr.smgr, 0,
+                                   fork, extend_to - 1, mode, strategy);
     }
 
     return buffer;
 }
 
 /*
- * ReadBuffer_common -- common logic for all ReadBuffer variants
- *
- * *hit is set to true if the request was satisfied from shared buffer cache.
+ * Zero a buffer and lock it, as part of the implementation of
+ * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
+ * pinned.  It does not have to be valid, but it is valid and locked on
+ * return.
  */
-static Buffer
-ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
-                  BlockNumber blockNum, ReadBufferMode mode,
-                  BufferAccessStrategy strategy, bool *hit)
+static void
+ZeroBuffer(Buffer buffer, ReadBufferMode mode)
 {
     BufferDesc *bufHdr;
+    uint32      buf_state;
+
+    Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
+
+    if (BufferIsLocal(buffer))
+        bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+    else
+    {
+        bufHdr = GetBufferDescriptor(buffer - 1);
+        if (mode == RBM_ZERO_AND_LOCK)
+            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+        else
+            LockBufferForCleanup(buffer);
+    }
+
+    memset(BufferGetPage(buffer), 0, BLCKSZ);
+
+    if (BufferIsLocal(buffer))
+    {
+        buf_state = pg_atomic_read_u32(&bufHdr->state);
+        buf_state |= BM_VALID;
+        pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+    }
+    else
+    {
+        buf_state = LockBufHdr(bufHdr);
+        buf_state |= BM_VALID;
+        UnlockBufHdr(bufHdr, buf_state);
+    }
+}
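ZeroBuffer() pairs with PinBufferForBlock() below; here is a sketch of the
combined call sequence (the helper name is hypothetical), mirroring the
RBM_ZERO_AND_LOCK branch that ReadBuffer_common() takes later in this patch:

    static Buffer
    read_zeroed_locked(Relation rel, ForkNumber forkNum, BlockNumber blockNum)
    {
        bool        found;
        Buffer      buffer;

        /* pin (possibly allocating a victim buffer), but issue no read */
        buffer = PinBufferForBlock(rel, RelationGetSmgr(rel), 0,
                                   forkNum, blockNum, NULL, &found);
        /* zero the page, set BM_VALID, and return with the page locked */
        ZeroBuffer(buffer, RBM_ZERO_AND_LOCK);
        return buffer;
    }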
+
+/*
+ * Pin a buffer for a given block.  *foundPtr is set to true if the block was
+ * already present, or false if more work is required to either read it in or
+ * zero it.
+ */
+static pg_attribute_always_inline Buffer
+PinBufferForBlock(Relation rel,
+                  SMgrRelation smgr,
+                  char smgr_persistence,
+                  ForkNumber forkNum,
+                  BlockNumber blockNum,
+                  BufferAccessStrategy strategy,
+                  bool *foundPtr)
+{
+    BufferDesc *bufHdr;
-    Block       bufBlock;
-    bool        found;
     IOContext   io_context;
     IOObject    io_object;
-    bool        isLocalBuf = SmgrIsTemp(smgr);
+    char        persistence;
 
-    *hit = false;
+    Assert(blockNum != P_NEW);
 
+    /*
+     * If there is no Relation it usually implies recovery and thus permanent,
+     * but we take an argument because CreateAndCopyRelationData can reach us
+     * with only an SMgrRelation for an unlogged relation that we don't want
+     * to flag with BM_PERMANENT.
+     */
+    if (rel)
+        persistence = rel->rd_rel->relpersistence;
+    else if (smgr_persistence == 0)
+        persistence = RELPERSISTENCE_PERMANENT;
+    else
+        persistence = smgr_persistence;
+
+    if (persistence == RELPERSISTENCE_TEMP)
+    {
+        io_context = IOCONTEXT_NORMAL;
+        io_object = IOOBJECT_TEMP_RELATION;
+    }
+    else
+    {
+        io_context = IOContextForStrategy(strategy);
+        io_object = IOOBJECT_RELATION;
+    }
+
+    TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
+                                       smgr->smgr_rlocator.locator.spcOid,
+                                       smgr->smgr_rlocator.locator.dbOid,
+                                       smgr->smgr_rlocator.locator.relNumber,
+                                       smgr->smgr_rlocator.backend);
+
+    if (persistence == RELPERSISTENCE_TEMP)
+    {
+        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
+        if (*foundPtr)
+            pgBufferUsage.local_blks_hit++;
+    }
+    else
+    {
+        bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
+                             strategy, foundPtr, io_context);
+        if (*foundPtr)
+            pgBufferUsage.shared_blks_hit++;
+    }
+    if (rel)
+    {
+        /*
+         * While pgBufferUsage's "read" counter isn't bumped unless we reach
+         * WaitReadBuffers() (so, not for hits, and not for buffers that are
+         * zeroed instead), the per-relation stats always count them.
+         */
+        pgstat_count_buffer_read(rel);
+        if (*foundPtr)
+            pgstat_count_buffer_hit(rel);
+    }
+    if (*foundPtr)
+    {
+        VacuumPageHit++;
+        pgstat_count_io_op(io_object, io_context, IOOP_HIT);
+        if (VacuumCostActive)
+            VacuumCostBalance += VacuumCostPageHit;
+
+        TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+                                          smgr->smgr_rlocator.locator.spcOid,
+                                          smgr->smgr_rlocator.locator.dbOid,
+                                          smgr->smgr_rlocator.locator.relNumber,
+                                          smgr->smgr_rlocator.backend,
+                                          true);
+    }
+
+    return BufferDescriptorGetBuffer(bufHdr);
+}
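The *foundPtr contract above in a nutshell (sketch only; the helper name is
invented): a false result obliges the caller to fill the page before use.

    static Buffer
    pin_then_maybe_read(Relation rel, BlockNumber blkno, bool *need_io)
    {
        bool        found;
        Buffer      buf;

        buf = PinBufferForBlock(rel, RelationGetSmgr(rel), 0,
                                MAIN_FORKNUM, blkno, NULL, &found);
        *need_io = !found;      /* not valid yet: read it in or zero it */
        return buf;
    }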
+
+/*
+ * ReadBuffer_common -- common logic for all ReadBuffer variants
+ *
+ * smgr is required, rel is optional unless using P_NEW.
+ */
+static pg_attribute_always_inline Buffer
+ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
+                  ForkNumber forkNum,
+                  BlockNumber blockNum, ReadBufferMode mode,
+                  BufferAccessStrategy strategy)
+{
+    ReadBuffersOperation operation;
+    Buffer      buffer;
+    int         flags;
+
     /*
      * Backward compatibility path, most code should use ExtendBufferedRel()
@@ -1041,181 +1175,360 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
             flags |= EB_LOCK_FIRST;
 
-        return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence),
-                                 forkNum, strategy, flags);
+        return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
     }
 
-    TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
-                                       smgr->smgr_rlocator.locator.spcOid,
-                                       smgr->smgr_rlocator.locator.dbOid,
-                                       smgr->smgr_rlocator.locator.relNumber,
-                                       smgr->smgr_rlocator.backend);
-
-    if (isLocalBuf)
+    if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
+                 mode == RBM_ZERO_AND_LOCK))
     {
-        /*
-         * We do not use a BufferAccessStrategy for I/O of temporary tables.
-         * However, in some cases, the "strategy" may not be NULL, so we can't
-         * rely on IOContextForStrategy() to set the right IOContext for us.
-         * This may happen in cases like CREATE TEMPORARY TABLE AS...
-         */
-        io_context = IOCONTEXT_NORMAL;
-        io_object = IOOBJECT_TEMP_RELATION;
-        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
-        if (found)
-            pgBufferUsage.local_blks_hit++;
-        else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
-                 mode == RBM_ZERO_ON_ERROR)
-            pgBufferUsage.local_blks_read++;
+        bool        found;
+
+        buffer = PinBufferForBlock(rel, smgr, smgr_persistence,
+                                   forkNum, blockNum, strategy, &found);
+        ZeroBuffer(buffer, mode);
+        return buffer;
     }
 
+    if (mode == RBM_ZERO_ON_ERROR)
+        flags = READ_BUFFERS_ZERO_ON_ERROR;
     else
+        flags = 0;
+    operation.smgr = smgr;
+    operation.rel = rel;
+    operation.smgr_persistence = smgr_persistence;
+    operation.forknum = forkNum;
+    operation.strategy = strategy;
+    if (StartReadBuffer(&operation,
+                        &buffer,
+                        blockNum,
+                        flags))
+        WaitReadBuffers(&operation);
+
+    return buffer;
+}
+
+static pg_attribute_always_inline bool
+StartReadBuffersImpl(ReadBuffersOperation *operation,
+                     Buffer *buffers,
+                     BlockNumber blockNum,
+                     int *nblocks,
+                     int flags)
+{
+    int         actual_nblocks = *nblocks;
+    int         io_buffers_len = 0;
+
+    Assert(*nblocks > 0);
+    Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
+
+    for (int i = 0; i < actual_nblocks; ++i)
     {
-        /*
-         * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
-         * not currently in memory.
-         */
-        io_context = IOContextForStrategy(strategy);
-        io_object = IOOBJECT_RELATION;
-        bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
-                             strategy, &found, io_context);
+        bool        found;
+
+        buffers[i] = PinBufferForBlock(operation->rel,
+                                       operation->smgr,
+                                       operation->smgr_persistence,
+                                       operation->forknum,
+                                       blockNum + i,
+                                       operation->strategy,
+                                       &found);
+
         if (found)
-            pgBufferUsage.shared_blks_hit++;
-        else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
-                 mode == RBM_ZERO_ON_ERROR)
-            pgBufferUsage.shared_blks_read++;
-    }
-
-    /* At this point we do NOT hold any locks. */
-
-    /* if it was already in the buffer pool, we're done */
-    if (found)
-    {
-        /* Just need to update stats before we exit */
-        *hit = true;
-        VacuumPageHit++;
-        pgstat_count_io_op(io_object, io_context, IOOP_HIT);
-
-        if (VacuumCostActive)
-            VacuumCostBalance += VacuumCostPageHit;
-
-        TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
-                                          smgr->smgr_rlocator.locator.spcOid,
-                                          smgr->smgr_rlocator.locator.dbOid,
-                                          smgr->smgr_rlocator.locator.relNumber,
-                                          smgr->smgr_rlocator.backend,
-                                          found);
-
-        /*
-         * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
-         * on return.
-         */
-        if (!isLocalBuf)
-        {
-            if (mode == RBM_ZERO_AND_LOCK)
-                LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
-                              LW_EXCLUSIVE);
-            else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
-                LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+        {
+            /*
+             * Terminate the read as soon as we get a hit.  It could be a
+             * single buffer hit, or it could be a hit that follows a readable
+             * range.  We don't want to create more than one readable range,
+             * so we stop here.
+             */
+            actual_nblocks = i + 1;
+            break;
         }
-
-        return BufferDescriptorGetBuffer(bufHdr);
-    }
-
-    /*
-     * if we have gotten to this point, we have allocated a buffer for the
-     * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
-     * if it's a shared buffer.
-     */
-    Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));  /* spinlock not needed */
-
-    bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
-
-    /*
-     * Read in the page, unless the caller intends to overwrite it and just
-     * wants us to allocate a buffer.
-     */
-    if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
-        MemSet((char *) bufBlock, 0, BLCKSZ);
-    else
-    {
-        instr_time  io_start = pgstat_prepare_io_time(track_io_timing);
-
-        smgrread(smgr, forkNum, blockNum, bufBlock);
-
-        pgstat_count_io_op_time(io_object, io_context,
-                                IOOP_READ, io_start, 1);
-
-        /* check for garbage data */
-        if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
-                                    PIV_LOG_WARNING | PIV_REPORT_STAT))
+        else
         {
-            if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
-            {
-                ereport(WARNING,
-                        (errcode(ERRCODE_DATA_CORRUPTED),
-                         errmsg("invalid page in block %u of relation %s; zeroing out page",
-                                blockNum,
-                                relpath(smgr->smgr_rlocator, forkNum))));
-                MemSet((char *) bufBlock, 0, BLCKSZ);
-            }
-            else
-                ereport(ERROR,
-                        (errcode(ERRCODE_DATA_CORRUPTED),
-                         errmsg("invalid page in block %u of relation %s",
-                                blockNum,
-                                relpath(smgr->smgr_rlocator, forkNum))));
+            /* Extend the readable range to cover this block. */
+            io_buffers_len++;
         }
     }
+    *nblocks = actual_nblocks;
 
-    /*
-     * In RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK mode, grab the buffer
-     * content lock before marking the page as valid, to make sure that no
-     * other backend sees the zeroed page before the caller has had a chance
-     * to initialize it.
-     *
-     * Since no-one else can be looking at the page contents yet, there is no
-     * difference between an exclusive lock and a cleanup-strength lock. (Note
-     * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
-     * they assert that the buffer is already valid.)
-     */
-    if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
-        !isLocalBuf)
+    if (likely(io_buffers_len == 0))
+        return false;
+
+    /* Populate information needed for I/O. */
+    operation->buffers = buffers;
+    operation->blocknum = blockNum;
+    operation->flags = flags;
+    operation->nblocks = actual_nblocks;
+    operation->io_buffers_len = io_buffers_len;
+
+    if (flags & READ_BUFFERS_ISSUE_ADVICE)
     {
-        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+        /*
+         * In theory we should only do this if PinBufferForBlock() had to
+         * allocate new buffers above.  That way, if two calls to
+         * StartReadBuffers() were made for the same blocks before
+         * WaitReadBuffers(), only the first would issue the advice. That'd be
+         * a better simulation of true asynchronous I/O, which would only
+         * start the I/O once, but isn't done here for simplicity.  Note also
+         * that the following call might actually issue two advice calls if we
+         * cross a segment boundary; in a true asynchronous version we might
+         * choose to process only one real I/O at a time in that case.
+         */
+        smgrprefetch(operation->smgr,
+                     operation->forknum,
+                     blockNum,
+                     operation->io_buffers_len);
     }
 
-    if (isLocalBuf)
-    {
-        /* Only need to adjust flags */
-        uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
-
-        buf_state |= BM_VALID;
-        pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-    }
-    else
-    {
-        /* Set BM_VALID, terminate IO, and wake up any waiters */
-        TerminateBufferIO(bufHdr, false, BM_VALID, true);
-    }
-
-    VacuumPageMiss++;
-    if (VacuumCostActive)
-        VacuumCostBalance += VacuumCostPageMiss;
-
-    TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
-                                      smgr->smgr_rlocator.locator.spcOid,
-                                      smgr->smgr_rlocator.locator.dbOid,
-                                      smgr->smgr_rlocator.locator.relNumber,
-                                      smgr->smgr_rlocator.backend,
-                                      found);
-
-    return BufferDescriptorGetBuffer(bufHdr);
+    /* Indicate that WaitReadBuffers() should be called. */
+    return true;
 }
 
 /*
- * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
- *      buffer.  If no buffer exists already, selects a replacement
- *      victim and evicts the old page, but does NOT read in new page.
+ * Begin reading a range of blocks beginning at blockNum and extending for
+ * *nblocks.  On return, up to *nblocks pinned buffers holding those blocks
+ * are written into the buffers array, and *nblocks is updated to contain the
+ * actual number, which may be fewer than requested.  Caller sets some of the
+ * members of operation; see struct definition.
+ *
+ * If false is returned, no I/O is necessary.  If true is returned, one I/O
+ * has been started, and WaitReadBuffers() must be called with the same
+ * operation object before the buffers are accessed.  Along with the operation
+ * object, the caller-supplied array of buffers must remain valid until
+ * WaitReadBuffers() is called.
+ *
+ * Currently the I/O is only started with optional operating system advice if
+ * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
+ * happens synchronously in WaitReadBuffers().  In future work, true I/O could
+ * be initiated here.
+ */
+bool
+StartReadBuffers(ReadBuffersOperation *operation,
+                 Buffer *buffers,
+                 BlockNumber blockNum,
+                 int *nblocks,
+                 int flags)
+{
+    return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
+}
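Because a hit terminates the readable range early, *nblocks can come back
smaller than requested; a sketch (invented helper name, operation assumed
pre-filled as in the earlier example) of reading an arbitrary-length range by
simply advancing and retrying:

    static void
    read_range_example(ReadBuffersOperation *operation, Buffer *buffers,
                       BlockNumber blkno, int remaining)
    {
        int         done = 0;

        while (remaining > 0)
        {
            int         nblocks = Min(remaining, io_combine_limit);

            if (StartReadBuffers(operation, &buffers[done], blkno, &nblocks, 0))
                WaitReadBuffers(operation);

            /* nblocks may have shrunk to one readable run plus one hit */
            blkno += nblocks;
            done += nblocks;
            remaining -= nblocks;
        }
    }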
+
+/*
+ * Single block version of StartReadBuffers().  This might save a few
+ * instructions when called from another translation unit, because it is
+ * specialized for nblocks == 1.
+ */
+bool
+StartReadBuffer(ReadBuffersOperation *operation,
+                Buffer *buffer,
+                BlockNumber blocknum,
+                int flags)
+{
+    int         nblocks = 1;
+    bool        result;
+
+    result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
+    Assert(nblocks == 1);       /* single block can't be short */
+
+    return result;
+}
+
+static inline bool
+WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
+{
+    if (BufferIsLocal(buffer))
+    {
+        BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+
+        return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
+    }
+    else
+        return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
+}
+
+void
+WaitReadBuffers(ReadBuffersOperation *operation)
+{
+    Buffer     *buffers;
+    int         nblocks;
+    BlockNumber blocknum;
+    ForkNumber  forknum;
+    IOContext   io_context;
+    IOObject    io_object;
+    char        persistence;
+
+    /*
+     * Currently operations are only allowed to include a read of some range,
+     * with an optional extra buffer that is already pinned at the end.  So
+     * nblocks can be at most one more than io_buffers_len.
+     */
+    Assert((operation->nblocks == operation->io_buffers_len) ||
+           (operation->nblocks == operation->io_buffers_len + 1));
+
+    /* Find the range of the physical read we need to perform. */
+    nblocks = operation->io_buffers_len;
+    if (nblocks == 0)
+        return;                 /* nothing to do */
+
+    buffers = &operation->buffers[0];
+    blocknum = operation->blocknum;
+    forknum = operation->forknum;
+
+    persistence = operation->rel
+        ? operation->rel->rd_rel->relpersistence
+        : RELPERSISTENCE_PERMANENT;
+    if (persistence == RELPERSISTENCE_TEMP)
+    {
+        io_context = IOCONTEXT_NORMAL;
+        io_object = IOOBJECT_TEMP_RELATION;
+    }
+    else
+    {
+        io_context = IOContextForStrategy(operation->strategy);
+        io_object = IOOBJECT_RELATION;
+    }
+
+    /*
+     * We count all these blocks as read by this backend.  This is traditional
+     * behavior, but might turn out to be not true if we find that someone
+     * else has beaten us and completed the read of some of these blocks.  In
+     * that case the system globally double-counts, but we traditionally don't
+     * count this as a "hit", and we don't have a separate counter for "miss,
+     * but another backend completed the read".
+     */
+    if (persistence == RELPERSISTENCE_TEMP)
+        pgBufferUsage.local_blks_read += nblocks;
+    else
+        pgBufferUsage.shared_blks_read += nblocks;
+
+    for (int i = 0; i < nblocks; ++i)
+    {
+        int         io_buffers_len;
+        Buffer      io_buffers[MAX_IO_COMBINE_LIMIT];
+        void       *io_pages[MAX_IO_COMBINE_LIMIT];
+        instr_time  io_start;
+        BlockNumber io_first_block;
+
+        /*
+         * Skip this block if someone else has already completed it.  If an
+         * I/O is already in progress in another backend, this will wait for
+         * the outcome: either done, or something went wrong and we will
+         * retry.
+         */
+        if (!WaitReadBuffersCanStartIO(buffers[i], false))
+        {
+            /*
+             * Report this as a 'hit' for this backend, even though it must
+             * have started out as a miss in PinBufferForBlock().
+             */
+            TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
+                                              operation->smgr->smgr_rlocator.locator.spcOid,
+                                              operation->smgr->smgr_rlocator.locator.dbOid,
+                                              operation->smgr->smgr_rlocator.locator.relNumber,
+                                              operation->smgr->smgr_rlocator.backend,
+                                              true);
+            continue;
+        }
+
+        /* We found a buffer that we need to read in. */
+        io_buffers[0] = buffers[i];
+        io_pages[0] = BufferGetBlock(buffers[i]);
+        io_first_block = blocknum + i;
+        io_buffers_len = 1;
+
+        /*
+         * How many neighboring-on-disk blocks can we scatter-read into
+         * other buffers at the same time?  In this case we don't wait if we
+         * see an I/O already in progress.  We already hold BM_IO_IN_PROGRESS
+         * for the head block, so we should get on with that I/O as soon as
+         * possible.  We'll come back to this block again, above.
+         */
+        while ((i + 1) < nblocks &&
+               WaitReadBuffersCanStartIO(buffers[i + 1], true))
+        {
+            /* Must be consecutive block numbers. */
+            Assert(BufferGetBlockNumber(buffers[i + 1]) ==
+                   BufferGetBlockNumber(buffers[i]) + 1);
+
+            io_buffers[io_buffers_len] = buffers[++i];
+            io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
+        }
+
+        io_start = pgstat_prepare_io_time(track_io_timing);
+        smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
+        pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
+                                io_buffers_len);
+
+        /* Verify each block we read, and terminate the I/O. */
+        for (int j = 0; j < io_buffers_len; ++j)
+        {
+            BufferDesc *bufHdr;
+            Block       bufBlock;
+
+            if (persistence == RELPERSISTENCE_TEMP)
+            {
+                bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
+                bufBlock = LocalBufHdrGetBlock(bufHdr);
+            }
+            else
+            {
+                bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
+                bufBlock = BufHdrGetBlock(bufHdr);
+            }
+
+            /* check for garbage data */
+            if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
+                                        PIV_LOG_WARNING | PIV_REPORT_STAT))
+            {
+                if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
+                {
+                    ereport(WARNING,
+                            (errcode(ERRCODE_DATA_CORRUPTED),
+                             errmsg("invalid page in block %u of relation %s; zeroing out page",
+                                    io_first_block + j,
+                                    relpath(operation->smgr->smgr_rlocator, forknum))));
+                    memset(bufBlock, 0, BLCKSZ);
+                }
+                else
+                    ereport(ERROR,
+                            (errcode(ERRCODE_DATA_CORRUPTED),
+                             errmsg("invalid page in block %u of relation %s",
+                                    io_first_block + j,
+                                    relpath(operation->smgr->smgr_rlocator, forknum))));
+            }
+
+            /* Terminate I/O and set BM_VALID. */
+            if (persistence == RELPERSISTENCE_TEMP)
+            {
+                uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+                buf_state |= BM_VALID;
+                pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+            }
+            else
+            {
+                /* Set BM_VALID, terminate IO, and wake up any waiters */
+                TerminateBufferIO(bufHdr, false, BM_VALID, true);
+            }
+
+            /* Report I/Os as completing individually. */
+            TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
+                                              operation->smgr->smgr_rlocator.locator.spcOid,
+                                              operation->smgr->smgr_rlocator.locator.dbOid,
+                                              operation->smgr->smgr_rlocator.locator.relNumber,
+                                              operation->smgr->smgr_rlocator.backend,
+                                              false);
+        }
+
+        VacuumPageMiss += io_buffers_len;
+        if (VacuumCostActive)
+            VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
+    }
+}
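The loop above replaces per-block smgrread() calls with one vectored call per
contiguous run.  The shape of that call, as a sketch (invented helper name;
block number 10 and run length 3 are arbitrary):

    static void
    readv_shape_example(SMgrRelation smgr, Buffer *buffers)
    {
        void       *io_pages[MAX_IO_COMBINE_LIMIT];

        /* gather the target pages of three consecutive pinned buffers */
        io_pages[0] = BufferGetBlock(buffers[0]);
        io_pages[1] = BufferGetBlock(buffers[1]);
        io_pages[2] = BufferGetBlock(buffers[2]);

        /* one system-level read replaces three smgrread() calls */
        smgrreadv(smgr, MAIN_FORKNUM, 10, io_pages, 3);
    }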
+
+/*
+ * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
+ *      buffer.  If no buffer exists already, selects a replacement victim and
+ *      evicts the old page, but does NOT read in new page.
 *
 * "strategy" can be a buffer replacement strategy object, or NULL for
 * the default strategy.  The selected buffer's usage_count is advanced when
@@ -1223,11 +1536,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 *
 * The returned buffer is pinned and is already marked as holding the
 * desired page.  If it already did have the desired page, *foundPtr is
-* set true.  Otherwise, *foundPtr is set false and the buffer is marked
-* as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
-*
-* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
-* we keep it for simplicity in ReadBuffer.
+* set true.  Otherwise, *foundPtr is set false.
 *
 * io_context is passed as an output parameter to avoid calling
 * IOContextForStrategy() when there is a shared buffers hit and no IO
@@ -1235,7 +1544,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 *
 * No locks are held either at entry or exit.
 */
-static BufferDesc *
+static pg_attribute_always_inline BufferDesc *
 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
             BlockNumber blockNum,
             BufferAccessStrategy strategy,
@@ -1286,19 +1595,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     {
         /*
          * We can only get here if (a) someone else is still reading in
-         * the page, or (b) a previous read attempt failed.  We have to
-         * wait for any active read attempt to finish, and then set up our
-         * own read attempt if the page is still not BM_VALID.
-         * StartBufferIO does it all.
+         * the page, (b) a previous read attempt failed, or (c) someone
+         * called StartReadBuffers() but not yet WaitReadBuffers().
          */
-        if (StartBufferIO(buf, true))
-        {
-            /*
-             * If we get here, previous attempts to read the buffer must
-             * have failed ... but we shall bravely try again.
-             */
-            *foundPtr = false;
-        }
+        *foundPtr = false;
     }
 
     return buf;
@@ -1363,19 +1663,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         {
             /*
              * We can only get here if (a) someone else is still reading in
-             * the page, or (b) a previous read attempt failed.  We have to
-             * wait for any active read attempt to finish, and then set up our
-             * own read attempt if the page is still not BM_VALID.
-             * StartBufferIO does it all.
+             * the page, (b) a previous read attempt failed, or (c) someone
+             * called StartReadBuffers() but not yet WaitReadBuffers().
              */
-            if (StartBufferIO(existing_buf_hdr, true))
-            {
-                /*
-                 * If we get here, previous attempts to read the buffer must
-                 * have failed ... but we shall bravely try again.
-                 */
-                *foundPtr = false;
-            }
+            *foundPtr = false;
         }
 
         return existing_buf_hdr;
@@ -1407,15 +1698,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     LWLockRelease(newPartitionLock);
 
     /*
-     * Buffer contents are currently invalid.  Try to obtain the right to
-     * start I/O.  If StartBufferIO returns false, then someone else managed
-     * to read it before we did, so there's nothing left for BufferAlloc() to
-     * do.
+     * Buffer contents are currently invalid.
     */
-    if (StartBufferIO(victim_buf_hdr, true))
-        *foundPtr = false;
-    else
-        *foundPtr = true;
+    *foundPtr = false;
 
     return victim_buf_hdr;
 }
@@ -1769,7 +2054,7 @@ again:
 * pessimistic, but outside of toy-sized shared_buffers it should allow
 * sufficient pins.
 */
-static void
+void
 LimitAdditionalPins(uint32 *additional_pins)
 {
     uint32      max_backends;
@@ -2034,7 +2319,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 
             buf_state &= ~BM_VALID;
             UnlockBufHdr(existing_hdr, buf_state);
-        } while (!StartBufferIO(existing_hdr, true));
+        } while (!StartBufferIO(existing_hdr, true, false));
     }
     else
     {
|
|
|
|
|
LWLockRelease(partition_lock);
|
|
|
|
|
|
|
|
|
|
/* XXX: could combine the locked operations in it with the above */
|
|
|
|
|
StartBufferIO(victim_buf_hdr, true);
|
|
|
|
|
StartBufferIO(victim_buf_hdr, true, false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@@ -2372,7 +2657,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
     else
     {
         /*
-         * If we previously pinned the buffer, it must surely be valid.
+         * If we previously pinned the buffer, it is likely to be valid, but
+         * it may not be if StartReadBuffers() was called and
+         * WaitReadBuffers() hasn't been called yet.  We'll check by loading
+         * the flags without locking.  This is racy, but it's OK to return
+         * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
+         * it'll see that it's now valid.
         *
         * Note: We deliberately avoid a Valgrind client request here.
         * Individual access methods can optionally superimpose buffer page
@@ -2381,7 +2671,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
         * that the buffer page is legitimately non-accessible here.  We
         * cannot meddle with that.
         */
-        result = true;
+        result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
     }
 
     ref->refcount++;
@@ -3449,7 +3739,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
      * someone else flushed the buffer before we could, so we need not do
      * anything.
      */
-    if (!StartBufferIO(buf, false))
+    if (!StartBufferIO(buf, false, false))
         return;
 
     /* Setup error traceback support for ereport() */
@@ -5184,9 +5474,15 @@ WaitIO(BufferDesc *buf)
 *
 * Returns true if we successfully marked the buffer as I/O busy,
 * false if someone else already did the work.
+*
+* If nowait is true, then we don't wait for an I/O to be finished by another
+* backend.  In that case, false indicates either that the I/O was already
+* finished, or is still in progress.  This is useful for callers that want to
+* find out if they can perform the I/O as part of a larger operation, without
+* waiting for the answer or distinguishing the reasons why not.
 */
static bool
-StartBufferIO(BufferDesc *buf, bool forInput)
+StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
 {
     uint32      buf_state;
 
@@ -5199,6 +5495,8 @@ StartBufferIO(BufferDesc *buf, bool forInput)
         if (!(buf_state & BM_IO_IN_PROGRESS))
             break;
         UnlockBufHdr(buf, buf_state);
+        if (nowait)
+            return false;
         WaitIO(buf);
     }
 
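The two nowait modes in one sketch (invented helper name); the real in-tree
consumer of nowait == true is WaitReadBuffersCanStartIO() above:

    static bool
    can_start_io_example(BufferDesc *buf, bool opportunistic)
    {
        if (opportunistic)
            return StartBufferIO(buf, true, true);  /* never blocks; false = skip */
        else
            return StartBufferIO(buf, true, false); /* waits; false = already done */
    }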