postgres/src/backend/storage/buffer/bufmgr.c

/*-------------------------------------------------------------------------
 *
 * bufmgr.c--
 *    buffer manager interface routines
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.8 1997/01/16 08:11:41 vadim Exp $
 *
 *-------------------------------------------------------------------------
 */
/*
 *
 * BufferAlloc() -- lookup a buffer in the buffer table.  If
 *	it isn't there add it, but do not read it into memory.
 *	This is used when we are about to reinitialize the
 *	buffer so don't care what the current disk contents are.
 *	BufferAlloc() pins the new buffer in memory.
 *
 * ReadBuffer() -- same as BufferAlloc() but reads the data
 *	on a buffer cache miss.
 *
 * ReleaseBuffer() -- unpin the buffer
 *
 * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
 *	but don't unpin.  The disk IO is delayed until buffer
 *	replacement if WriteMode is BUFFER_LATE_WRITE.
 *
 * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
 *
 * DirtyBufferCopy() -- For a given dbid/relid/blockno, if the buffer is
 *			in the cache and is dirty, mark it clean and copy
 *			it to the requested location.  This is a logical
 *			write, and has been installed to support the cache
 *			management code for write-once storage managers.
 *
 * FlushBuffer() -- as above but never delayed write.
 *
 * BufferSync() -- flush all dirty buffers in the buffer pool.
 *
 * InitBufferPool() -- Init the buffer module.
 *
 * See other files:
 * 	freelist.c -- chooses victim for buffer replacement
 *	buf_table.c -- manages the buffer lookup table
 */
#include <sys/file.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <signal.h>

#include "postgres.h"

/* declarations split between these three files */
#include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"

#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/smgr.h"
#include "storage/lmgr.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
#include "utils/palloc.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "executor/execdebug.h"	/* for NDirectFileRead */
#include "catalog/catalog.h"

extern SPINLOCK BufMgrLock;
extern int ReadBufferCount;
extern int BufferHitCount;
extern int BufferFlushCount;

static int WriteMode = BUFFER_LATE_WRITE;	/* Delayed write is default */

static void WaitIO(BufferDesc *buf, SPINLOCK spinlock);
#ifndef HAS_TEST_AND_SET
static void SignalIO(BufferDesc *buf);
extern long *NWaitIOBackendP; /* defined in buf_init.c */
#endif /* HAS_TEST_AND_SET */

static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum,
				       bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
			       bool *foundPtr, bool bufferLockHeld);
static int FlushBuffer (Buffer buffer, bool release);
static void BufferSync(void);
static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld);

/* ---------------------------------------------------
 * RelationGetBufferWithBuffer
 *	see if the given buffer is what we want
 *	if yes, we don't need to bother the buffer manager
 * ---------------------------------------------------
 */
Buffer
RelationGetBufferWithBuffer(Relation relation,
			    BlockNumber blockNumber,
			    Buffer buffer)
{
    BufferDesc *bufHdr;
    LRelId lrelId;

    if (BufferIsValid(buffer)) {
	if (!BufferIsLocal(buffer)) {
	    bufHdr = &BufferDescriptors[buffer-1];
	    lrelId = RelationGetLRelId(relation);
	    SpinAcquire(BufMgrLock);
	    if (bufHdr->tag.blockNum == blockNumber &&
		bufHdr->tag.relId.relId == lrelId.relId &&
		bufHdr->tag.relId.dbId == lrelId.dbId) {
		SpinRelease(BufMgrLock);
		return(buffer);
	    }
	    return(ReadBufferWithBufferLock(relation, blockNumber, true));
	} else {
	    bufHdr = &LocalBufferDescriptors[-buffer-1];
	    if (bufHdr->tag.relId.relId == relation->rd_id &&
		bufHdr->tag.blockNum == blockNumber) {
		return(buffer);
	    }
	}
    }
    return(ReadBuffer(relation, blockNumber));
}

/*
 * ReadBuffer -- returns a buffer containing the requested
 *	block of the requested relation.  If the blknum
 *	requested is P_NEW, extend the relation file and
 *	allocate a new block.
 *
 * Returns: the buffer number for the buffer containing
 *	the block read or NULL on an error.
 *
 * Assume when this function is called, that reln has been
 *	opened already.
 */

extern int ShowPinTrace;


#undef ReadBuffer	/* conflicts with macro when BUFMGR_DEBUG defined */

/*
 * ReadBuffer --
 *
 */
Buffer
ReadBuffer(Relation reln, BlockNumber blockNum)
{
    return ReadBufferWithBufferLock(reln, blockNum, false);
}

/*
 * is_userbuffer
 *
 * XXX caller must have already acquired BufMgrLock
 */
static bool
is_userbuffer(Buffer buffer)
{
    BufferDesc *buf = &BufferDescriptors[buffer-1];

    if (IsSystemRelationName(buf->sb_relname))
	return false;
    return true;
}

Buffer
ReadBuffer_Debug(char *file,
		 int line,
		 Relation reln,
		 BlockNumber blockNum)
{
    Buffer buffer;

    buffer = ReadBufferWithBufferLock(reln, blockNum, false);
    if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
	BufferDesc *buf = &BufferDescriptors[buffer-1];

	fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
		buffer, buf->sb_relname, buf->tag.blockNum,
		PrivateRefCount[buffer - 1], file, line);
    }
    return buffer;
}

/*
 * ReadBufferWithBufferLock -- does the work of
 *	ReadBuffer() but with the possibility that
 *	the buffer lock has already been held. this
 *	is yet another effort to reduce the number of
 *	semops in the system.
 */
static Buffer
ReadBufferWithBufferLock(Relation reln,
			 BlockNumber blockNum,
			 bool bufferLockHeld)
{
    BufferDesc *bufHdr;
    int		extend;   /* extending the file by one block */
    int		status;
    bool	found;
    bool	isLocalBuf;

    extend = (blockNum == P_NEW);
    isLocalBuf = reln->rd_islocal;

    if (isLocalBuf) {
	bufHdr = LocalBufferAlloc(reln, blockNum, &found);
    } else {
	ReadBufferCount++;

	/* lookup the buffer.  IO_IN_PROGRESS is set if the requested
	 * block is not currently in memory.
	 */
	bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld);
	if (found) BufferHitCount++;
    }

    if (!bufHdr) {
	return(InvalidBuffer);
    }

    /* if its already in the buffer pool, we're done */
    if (found) {
	/*
	 * This happens when a bogus buffer was returned previously and is
	 * floating around in the buffer pool.  A routine calling this would
	 * want this extended.
	 */
	if (extend) {
	    /* new buffers are zero-filled */
	    memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
	    (void) smgrextend(bufHdr->bufsmgr, reln,
			      (char *) MAKE_PTR(bufHdr->data));
	}
	return (BufferDescriptorGetBuffer(bufHdr));

    }

    /*
     * if we have gotten to this point, the reln pointer must be ok
     * and the relation file must be open.
     */
    if (extend) {
	/* new buffers are zero-filled */
	(void) memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
	status = smgrextend(bufHdr->bufsmgr, reln,
			    (char *) MAKE_PTR(bufHdr->data));
    } else {
	status = smgrread(bufHdr->bufsmgr, reln, blockNum,
			  (char *) MAKE_PTR(bufHdr->data));
    }

    if (isLocalBuf)
	return (BufferDescriptorGetBuffer(bufHdr));

    /* lock buffer manager again to update IO IN PROGRESS */
    SpinAcquire(BufMgrLock);

    if (status == SM_FAIL) {
	/* IO Failed.  cleanup the data structures and go home */

	if (! BufTableDelete(bufHdr)) {
	    SpinRelease(BufMgrLock);
	    elog(FATAL,"BufRead: buffer table broken after IO error\n");
	}
	/* remember that BufferAlloc() pinned the buffer */
	UnpinBuffer(bufHdr);

	/*
	 * Have to reset the flag so that anyone waiting for
	 * the buffer can tell that the contents are invalid.
	 */
	bufHdr->flags |= BM_IO_ERROR;
	bufHdr->flags &= ~BM_IO_IN_PROGRESS;
    } else {
	/* IO Succeeded.  clear the flags, finish buffer update */

	bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS);
    }

    /* If anyone was waiting for IO to complete, wake them up now */
#ifdef HAS_TEST_AND_SET
    S_UNLOCK(&(bufHdr->io_in_progress_lock));
#else
    if (bufHdr->refcount > 1)
	SignalIO(bufHdr);
#endif

    SpinRelease(BufMgrLock);

    if (status == SM_FAIL)
	return(InvalidBuffer);

    return(BufferDescriptorGetBuffer(bufHdr));
}

/*
 * BufferAlloc -- Get a buffer from the buffer pool but dont
 *	read it.
 *
 * Returns: descriptor for buffer
 *
 * When this routine returns, the BufMgrLock is guaranteed NOT be held.
 */
static BufferDesc *
BufferAlloc(Relation reln,
	    BlockNumber blockNum,
	    bool	*foundPtr,
	    bool bufferLockHeld)
{
    BufferDesc 		*buf, *buf2;
    BufferTag 		newTag;	 /* identity of requested block */
    bool		inProgress; /* buffer undergoing IO */
    bool		newblock = FALSE;

    /* create a new tag so we can lookup the buffer */
    /* assume that the relation is already open */
    if (blockNum == P_NEW) {
	newblock = TRUE;
	blockNum = smgrnblocks(reln->rd_rel->relsmgr, reln);
    }

    INIT_BUFFERTAG(&newTag,reln,blockNum);

    if (!bufferLockHeld)
	SpinAcquire(BufMgrLock);

    /* see if the block is in the buffer pool already */
    buf = BufTableLookup(&newTag);
    if (buf != NULL) {
	/* Found it.  Now, (a) pin the buffer so no
	 * one steals it from the buffer pool,
	 * (b) check IO_IN_PROGRESS, someone may be
	 * faulting the buffer into the buffer pool.
	 */

	PinBuffer(buf);
	inProgress = (buf->flags & BM_IO_IN_PROGRESS);

	*foundPtr = TRUE;
	if (inProgress) {
	    WaitIO(buf, BufMgrLock);
	    if (buf->flags & BM_IO_ERROR) {
		/* wierd race condition:
		 *
		 * We were waiting for someone else to read the buffer.
		 * While we were waiting, the reader boof'd in some
		 *  way, so the contents of the buffer are still
		 * invalid.  By saying that we didn't find it, we can
		 * make the caller reinitialize the buffer.  If two
		 * processes are waiting for this block, both will
		 * read the block.  The second one to finish may overwrite
		 * any updates made by the first.  (Assume higher level
		 * synchronization prevents this from happening).
		 *
		 * This is never going to happen, don't worry about it.
		 */
		*foundPtr = FALSE;
	    }
	}
#ifdef BMTRACE
	_bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND);
#endif /* BMTRACE */

	SpinRelease(BufMgrLock);

	return(buf);
    }

    *foundPtr = FALSE;

    /*
     * Didn't find it in the buffer pool.  We'll have
     * to initialize a new buffer.  First, grab one from
     * the free list.  If it's dirty, flush it to disk.
     * Remember to unlock BufMgr spinlock while doing the IOs.
     */
    inProgress = FALSE;
    for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL; ) {

	/* GetFreeBuffer will abort if it can't find a free buffer */
	buf = GetFreeBuffer();

	/*
	 * But it can return buf == NULL if we are in aborting
	 * transaction now and so elog(WARN,...) in GetFreeBuffer
	 * will not abort again.
	 */
	if ( buf == NULL )
		return (NULL);

	/*
	 * There should be exactly one pin on the buffer after
	 * it is allocated -- ours.  If it had a pin it wouldn't
	 * have been on the free list.  No one else could have
	 * pinned it between GetFreeBuffer and here because we
	 * have the BufMgrLock.
	 */
	Assert(buf->refcount == 0);
	buf->refcount = 1;
	PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;

	if (buf->flags & BM_DIRTY) {
	    bool smok;
	    /*
	     * Set BM_IO_IN_PROGRESS to keep anyone from doing anything
	     * with the contents of the buffer while we write it out.
	     * We don't really care if they try to read it, but if they
	     * can complete a BufferAlloc on it they can then scribble
	     * into it, and we'd really like to avoid that while we are
	     * flushing the buffer.  Setting this flag should block them
	     * in WaitIO until we're done.
	     */
	    inProgress = TRUE;
	    buf->flags |= BM_IO_IN_PROGRESS;
#ifdef HAS_TEST_AND_SET
	    /*
	     * All code paths that acquire this lock pin the buffer
	     * first; since no one had it pinned (it just came off the
	     * free list), no one else can have this lock.
	     */
	    Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
	    S_LOCK(&(buf->io_in_progress_lock));
#endif /* HAS_TEST_AND_SET */

	    /*
	     * Write the buffer out, being careful to release BufMgrLock
	     * before starting the I/O.
	     *
	     * This #ifndef is here because a few extra semops REALLY kill
	     * you on machines that don't have spinlocks.  If you don't
	     * operate with much concurrency, well...
	     */
	    smok = BufferReplace(buf, true);
#ifndef OPTIMIZE_SINGLE
	    SpinAcquire(BufMgrLock);
#endif /* OPTIMIZE_SINGLE */

	    if ( smok == FALSE )
	    {
		elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s",
			 buf->tag.blockNum, buf->sb_dbname, buf->sb_relname);
		inProgress = FALSE;
		buf->flags |= BM_IO_ERROR;
		buf->flags &= ~BM_IO_IN_PROGRESS;
#ifdef HAS_TEST_AND_SET
		S_UNLOCK(&(buf->io_in_progress_lock));
#else /* !HAS_TEST_AND_SET */
		if (buf->refcount > 1)
		    SignalIO(buf);
#endif /* !HAS_TEST_AND_SET */
		PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
		buf->refcount--;
		if ( buf->refcount == 0 )
		{
		    AddBufferToFreelist(buf);
		    buf->flags |= BM_FREE;
		}
		buf = (BufferDesc *) NULL;
	    }
	    else
	    {
		BufferFlushCount++;
		buf->flags &= ~BM_DIRTY;
	    }

	    /*
	     * Somebody could have pinned the buffer while we were
	     * doing the I/O and had given up the BufMgrLock (though
	     * they would be waiting for us to clear the BM_IO_IN_PROGRESS
	     * flag).  That's why this is a loop -- if so, we need to clear
	     * the I/O flags, remove our pin and start all over again.
	     *
	     * People may be making buffers free at any time, so there's
	     * no reason to think that we have an immediate disaster on
	     * our hands.
	     */
	    if (buf && buf->refcount > 1) {
		inProgress = FALSE;
		buf->flags &= ~BM_IO_IN_PROGRESS;
#ifdef HAS_TEST_AND_SET
		S_UNLOCK(&(buf->io_in_progress_lock));
#else /* !HAS_TEST_AND_SET */
		if (buf->refcount > 1)
		    SignalIO(buf);
#endif /* !HAS_TEST_AND_SET */
		PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
		buf->refcount--;
		buf = (BufferDesc *) NULL;
	    }

	    /*
	     * Somebody could have allocated another buffer for the
	     * same block we are about to read in. (While we flush out
	     * the dirty buffer, we don't hold the lock and someone could
	     * have allocated another buffer for the same block. The problem
	     * is we haven't gotten around to insert the new tag into
	     * the buffer table. So we need to check here.	-ay 3/95
	     */
	    buf2 = BufTableLookup(&newTag);
	    if (buf2 != NULL) {
		/* Found it. Someone has already done what we're about
		 * to do. We'll just handle this as if it were found in
		 * the buffer pool in the first place.
		 */
		if ( buf != NULL )
		{
#ifdef HAS_TEST_AND_SET
			S_UNLOCK(&(buf->io_in_progress_lock));
#else /* !HAS_TEST_AND_SET */
			if (buf->refcount > 1)
			    SignalIO(buf);
#endif /* !HAS_TEST_AND_SET */

		/* give up the buffer since we don't need it any more */
			buf->refcount--;
			PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
			AddBufferToFreelist(buf);
			buf->flags |= BM_FREE;
			buf->flags &= ~BM_IO_IN_PROGRESS;
		}

		PinBuffer(buf2);
		inProgress = (buf2->flags & BM_IO_IN_PROGRESS);

		*foundPtr = TRUE;
		if (inProgress) {
		    WaitIO(buf2, BufMgrLock);
		    if (buf2->flags & BM_IO_ERROR) {
			*foundPtr = FALSE;
		    }
		}

		SpinRelease(BufMgrLock);

		return(buf2);
	    }
	}
    }
    /*
     * At this point we should have the sole pin on a non-dirty
     * buffer and we may or may not already have the BM_IO_IN_PROGRESS
     * flag set.
     */

    /*
     * Change the name of the buffer in the lookup table:
     *
     * Need to update the lookup table before the read starts.
     * If someone comes along looking for the buffer while
     * we are reading it in, we don't want them to allocate
     * a new buffer.  For the same reason, we didn't want
     * to erase the buf table entry for the buffer we were
     * writing back until now, either.
     */

    if (! BufTableDelete(buf)) {
	SpinRelease(BufMgrLock);
	elog(FATAL,"buffer wasn't in the buffer table\n");

    }

    /* record the database name and relation name for this buffer */
    strcpy (buf->sb_relname, reln->rd_rel->relname.data);
    strcpy (buf->sb_dbname, GetDatabaseName());

    /* remember which storage manager is responsible for it */
    buf->bufsmgr = reln->rd_rel->relsmgr;

    INIT_BUFFERTAG(&(buf->tag),reln,blockNum);
    if (! BufTableInsert(buf)) {
	SpinRelease(BufMgrLock);
	elog(FATAL,"Buffer in lookup table twice \n");
    }

    /* Buffer contents are currently invalid.  Have
     * to mark IO IN PROGRESS so no one fiddles with
     * them until the read completes.  If this routine
     * has been called simply to allocate a buffer, no
     * io will be attempted, so the flag isnt set.
     */
    if (!inProgress) {
	buf->flags |= BM_IO_IN_PROGRESS;
#ifdef HAS_TEST_AND_SET
	Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
	S_LOCK(&(buf->io_in_progress_lock));
#endif /* HAS_TEST_AND_SET */
    }

#ifdef BMTRACE
    _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND);
#endif /* BMTRACE */

    SpinRelease(BufMgrLock);

    return (buf);
}

/*
 * WriteBuffer--
 *
 *	Pushes buffer contents to disk if WriteMode is BUFFER_FLUSH_WRITE.
 *	Otherwise, marks contents as dirty.
 *
 * Assume that buffer is pinned.  Assume that reln is
 *	valid.
 *
 * Side Effects:
 *    	Pin count is decremented.
 */

#undef WriteBuffer

int
WriteBuffer(Buffer buffer)
{
    BufferDesc	*bufHdr;

    if (WriteMode == BUFFER_FLUSH_WRITE) {
	return (FlushBuffer (buffer, TRUE));
    } else {

	if (BufferIsLocal(buffer))
	    return WriteLocalBuffer(buffer, TRUE);

	if (BAD_BUFFER_ID(buffer))
	    return(FALSE);

	bufHdr = &BufferDescriptors[buffer-1];

	SpinAcquire(BufMgrLock);
	Assert(bufHdr->refcount > 0);
	bufHdr->flags |= BM_DIRTY;
	UnpinBuffer(bufHdr);
	SpinRelease(BufMgrLock);
    }
    return(TRUE);
}

void
WriteBuffer_Debug(char *file, int line, Buffer buffer)
{
    WriteBuffer(buffer);
    if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) {
	BufferDesc *buf;
	buf = &BufferDescriptors[buffer-1];
	fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
		buffer, buf->sb_relname, buf->tag.blockNum,
		PrivateRefCount[buffer - 1], file, line);
    }
}

/*
 *  DirtyBufferCopy() -- Copy a given dirty buffer to the requested
 *			 destination.
 *
 *	We treat this as a write.  If the requested buffer is in the pool
 *	and is dirty, we copy it to the location requested and mark it
 *	clean.  This routine supports the Sony jukebox storage manager,
 *	which agrees to take responsibility for the data once we mark
 *	it clean.
 *
 *  NOTE: used by sony jukebox code in postgres 4.2   - ay 2/95
 */
void
DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest)
{
    BufferDesc *buf;
    BufferTag btag;

    btag.relId.relId = relid;
    btag.relId.dbId = dbid;
    btag.blockNum = blkno;

    SpinAcquire(BufMgrLock);
    buf = BufTableLookup(&btag);

    if (buf == (BufferDesc *) NULL
	|| !(buf->flags & BM_DIRTY)
	|| !(buf->flags & BM_VALID)) {
	SpinRelease(BufMgrLock);
	return;
    }

    /* hate to do this holding the lock, but release and reacquire is slower */
    memmove(dest, (char *) MAKE_PTR(buf->data), BLCKSZ);

    buf->flags &= ~BM_DIRTY;

    SpinRelease(BufMgrLock);
}

/*
 * FlushBuffer -- like WriteBuffer, but force the page to disk.
 *
 * 'buffer' is known to be dirty/pinned, so there should not be a
 * problem reading the BufferDesc members without the BufMgrLock
 * (nobody should be able to change tags, flags, etc. out from under
 * us).
 */
static int
FlushBuffer(Buffer buffer, bool release)
{
    BufferDesc	*bufHdr;
    Oid bufdb;
    Relation bufrel;
    int	status;

    if (BufferIsLocal(buffer))
	return FlushLocalBuffer(buffer, release);

    if (BAD_BUFFER_ID(buffer))
	return (STATUS_ERROR);

    bufHdr = &BufferDescriptors[buffer-1];
    bufdb = bufHdr->tag.relId.dbId;

    Assert (bufdb == MyDatabaseId || bufdb == (Oid) NULL);
    bufrel = RelationIdCacheGetRelation (bufHdr->tag.relId.relId);
    Assert (bufrel != (Relation) NULL);

    status = smgrflush(bufHdr->bufsmgr, bufrel, bufHdr->tag.blockNum,
			   (char *) MAKE_PTR(bufHdr->data));

    if (status == SM_FAIL)
    {
	elog(WARN, "FlushBuffer: cannot flush block %u of the relation %.*s",
			bufHdr->tag.blockNum,
			NAMEDATALEN, bufrel->rd_rel->relname.data);
	return (STATUS_ERROR);
    }

    SpinAcquire(BufMgrLock);
    bufHdr->flags &= ~BM_DIRTY;
    if ( release )
    	UnpinBuffer(bufHdr);
    SpinRelease(BufMgrLock);

    return(STATUS_OK);
}

/*
 * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
 * 			   when the operation is complete.
 *
 *	We know that the buffer is for a relation in our private cache,
 *	because this routine is called only to write out buffers that
 *	were changed by the executing backend.
 */
int
WriteNoReleaseBuffer(Buffer buffer)
{
    BufferDesc	*bufHdr;

    if (WriteMode == BUFFER_FLUSH_WRITE) {
	return (FlushBuffer (buffer, FALSE));
    } else {

	if (BufferIsLocal(buffer))
	    return WriteLocalBuffer(buffer, FALSE);

	if (BAD_BUFFER_ID(buffer))
	    return (STATUS_ERROR);

	bufHdr = &BufferDescriptors[buffer-1];

	SpinAcquire(BufMgrLock);
	bufHdr->flags |= BM_DIRTY;
	SpinRelease(BufMgrLock);
    }
    return(STATUS_OK);
}


#undef ReleaseAndReadBuffer
/*
 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
 * 	so that only one semop needs to be called.
 *
 */
Buffer
ReleaseAndReadBuffer(Buffer buffer,
		     Relation relation,
		     BlockNumber blockNum)
{
    BufferDesc	*bufHdr;
    Buffer retbuf;

    if (BufferIsLocal(buffer)) {
	Assert(LocalRefCount[-buffer - 1] > 0);
	LocalRefCount[-buffer - 1]--;
    } else {
	if (BufferIsValid(buffer)) {
	    bufHdr = &BufferDescriptors[buffer-1];
	    Assert(PrivateRefCount[buffer - 1] > 0);
	    PrivateRefCount[buffer - 1]--;
	    if (PrivateRefCount[buffer - 1] == 0 &&
		LastRefCount[buffer - 1] == 0) {
		/* only release buffer if it is not pinned in previous ExecMain
		   level */
		SpinAcquire(BufMgrLock);
		bufHdr->refcount--;
		if (bufHdr->refcount == 0) {
		    AddBufferToFreelist(bufHdr);
		    bufHdr->flags |= BM_FREE;
		}
		retbuf = ReadBufferWithBufferLock(relation, blockNum, true);
		return retbuf;
	    }
	}
    }

    return (ReadBuffer(relation, blockNum));
}

/*
 * BufferSync -- Flush all dirty buffers in the pool.
 *
 *	This is called at transaction commit time.  It does the wrong thing,
 *	right now.  We should flush only our own changes to stable storage,
 *	and we should obey the lock protocol on the buffer manager metadata
 *	as we do it.  Also, we need to be sure that no other transaction is
 *	modifying the page as we flush it.  This is only a problem for objects
 *	that use a non-two-phase locking protocol, like btree indices.  For
 *	those objects, we would like to set a write lock for the duration of
 *	our IO.  Another possibility is to code updates to btree pages
 *	carefully, so that writing them out out of order cannot cause
 *	any unrecoverable errors.
 *
 *	I don't want to think hard about this right now, so I will try
 *	to come back to it later.
 */
static void
BufferSync()
{
    int i;
    Oid bufdb;
    Oid bufrel;
    Relation reln;
    BufferDesc *bufHdr;
    int status;

    SpinAcquire(BufMgrLock);
    for (i=0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) {
	if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) {
	    bufdb = bufHdr->tag.relId.dbId;
	    bufrel = bufHdr->tag.relId.relId;
	    if (bufdb == MyDatabaseId || bufdb == (Oid) 0) {
		reln = RelationIdCacheGetRelation(bufrel);

		/*
		 *  We have to pin buffer to keep anyone from stealing it
		 *  from the buffer pool while we are flushing it or
		 *  waiting in WaitIO. It's bad for GetFreeBuffer in
		 *  BufferAlloc, but there is no other way to prevent
		 *  writing into disk block data from some other buffer,
		 *  getting smgr status of some other block and
		 *  clearing BM_DIRTY of ...		- VAdim 09/16/96
		 */
		PinBuffer(bufHdr);
		if (bufHdr->flags & BM_IO_IN_PROGRESS)
		{
		    WaitIO(bufHdr, BufMgrLock);
		    UnpinBuffer(bufHdr);
		    if (bufHdr->flags & BM_IO_ERROR)
		    {
			elog(WARN, "cannot write %u for %s",
				bufHdr->tag.blockNum, bufHdr->sb_relname);
		    }
		    if (reln != (Relation)NULL)
			RelationDecrementReferenceCount(reln);
		    continue;
		}

		/*
		 *  If we didn't have the reldesc in our local cache, flush this
		 *  page out using the 'blind write' storage manager routine.  If
		 *  we did find it, use the standard interface.
		 */

#ifndef OPTIMIZE_SINGLE
		SpinRelease(BufMgrLock);
#endif /* OPTIMIZE_SINGLE */
		if (reln == (Relation) NULL) {
		    status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname,
					  bufHdr->sb_relname, bufdb, bufrel,
					  bufHdr->tag.blockNum,
					  (char *) MAKE_PTR(bufHdr->data));
		} else {
		    status = smgrwrite(bufHdr->bufsmgr, reln,
				       bufHdr->tag.blockNum,
				       (char *) MAKE_PTR(bufHdr->data));
		}
#ifndef OPTIMIZE_SINGLE
		SpinAcquire(BufMgrLock);
#endif /* OPTIMIZE_SINGLE */

		UnpinBuffer(bufHdr);
		if (status == SM_FAIL) {
		    bufHdr->flags |= BM_IO_ERROR;
		    elog(WARN, "cannot write %u for %s",
			 bufHdr->tag.blockNum, bufHdr->sb_relname);
		}
		/*
		 * What if someone has marked this buffer as DIRTY after
		 * smgr[blind]write but before SpinAcquire(BufMgrLock)
		 * ??? - vadim 01/16/97
		 */
		bufHdr->flags &= ~BM_DIRTY;
		if (reln != (Relation)NULL)
		    RelationDecrementReferenceCount(reln);
	    }
	}
    }
    SpinRelease(BufMgrLock);

    LocalBufferSync();
}


/*
 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf'
 * 	is cleared.  Because IO_IN_PROGRESS conflicts are
 *	expected to be rare, there is only one BufferIO
 *	lock in the entire system.  All processes block
 *	on this semaphore when they try to use a buffer
 *	that someone else is faulting in.  Whenever a
 *	process finishes an IO and someone is waiting for
 *	the buffer, BufferIO is signaled (SignalIO).  All
 *	waiting processes then wake up and check to see
 *	if their buffer is now ready.  This implementation
 *	is simple, but efficient enough if WaitIO is
 *	rarely called by multiple processes simultaneously.
 *
 *  ProcSleep atomically releases the spinlock and goes to
 *	sleep.
 *
 *  Note: there is an easy fix if the queue becomes long.
 *	save the id of the buffer we are waiting for in
 *	the queue structure.  That way signal can figure
 *	out which proc to wake up.
 */
#ifdef HAS_TEST_AND_SET
static void
WaitIO(BufferDesc *buf, SPINLOCK spinlock)
{
    SpinRelease(spinlock);
    S_LOCK(&(buf->io_in_progress_lock));
    S_UNLOCK(&(buf->io_in_progress_lock));
    SpinAcquire(spinlock);
}

#else /* HAS_TEST_AND_SET */
IpcSemaphoreId        WaitIOSemId;

static void
WaitIO(BufferDesc *buf, SPINLOCK spinlock)
{
    bool 	inProgress;

    for (;;) {

	/* wait until someone releases IO lock */
	(*NWaitIOBackendP)++;
	SpinRelease(spinlock);
	IpcSemaphoreLock(WaitIOSemId, 0, 1);
	SpinAcquire(spinlock);
	inProgress = (buf->flags & BM_IO_IN_PROGRESS);
	if (!inProgress) break;
    }
}

/*
 * SignalIO --
 */
static void
SignalIO(BufferDesc *buf)
{
    /* somebody better be waiting. */
    Assert( buf->refcount > 1);
    IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP);
    *NWaitIOBackendP = 0;
}
#endif /* HAS_TEST_AND_SET */

long NDirectFileRead;	/* some I/O's are direct file access.  bypass bufmgr */
long NDirectFileWrite;   /* e.g., I/O in psort and hashjoin.		     */

void
PrintBufferUsage(FILE *statfp)
{
    float hitrate;

    if (ReadBufferCount==0)
	hitrate = 0.0;
    else
	hitrate = (float)BufferHitCount * 100.0/ReadBufferCount;

    fprintf(statfp, "!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n",
	    ReadBufferCount - BufferHitCount + NDirectFileRead,
	    BufferFlushCount + NDirectFileWrite,
	    hitrate);
}

void
ResetBufferUsage()
{
    BufferHitCount = 0;
    ReadBufferCount = 0;
    BufferFlushCount = 0;
    NDirectFileRead = 0;
    NDirectFileWrite = 0;
}

/* ----------------------------------------------
 *	ResetBufferPool
 *
 *	this routine is supposed to be called when a transaction aborts.
 *	it will release all the buffer pins held by the transaciton.
 *
 * ----------------------------------------------
 */
void
ResetBufferPool()
{
    register int i;
    for (i=1; i<=NBuffers; i++) {
	if (BufferIsValid(i)) {
	    while(PrivateRefCount[i - 1] > 0) {
		ReleaseBuffer(i);
	    }
	}
	LastRefCount[i - 1] = 0;
    }

    ResetLocalBufferPool();
}

/* -----------------------------------------------
 *	BufferPoolCheckLeak
 *
 *	check if there is buffer leak
 *
 * -----------------------------------------------
 */
int
BufferPoolCheckLeak()
{
    register int i;
    void PrintBufferDescs();

    for (i = 1; i <= NBuffers; i++) {
	if (BufferIsValid(i)) {
	    elog(NOTICE, "buffer leak detected in BufferPoolCheckLeak()");
	    PrintBufferDescs();
	    return(1);
	}
    }
    return(0);
}

/* ------------------------------------------------
 *	FlushBufferPool
 *
 *	flush all dirty blocks in buffer pool to disk
 *
 * ------------------------------------------------
 */
void
FlushBufferPool(int StableMainMemoryFlag)
{
    if (!StableMainMemoryFlag) {
        BufferSync();
	smgrcommit();
    }
}

/*
 * BufferIsValid --
 *	True iff the refcnt of the local buffer is > 0
 * Note:
 *	BufferIsValid(InvalidBuffer) is False.
 *	BufferIsValid(UnknownBuffer) is False.
 */
bool
BufferIsValid(Buffer bufnum)
{
    if (BufferIsLocal(bufnum))
	return (bufnum >= -NLocBuffer && LocalRefCount[-bufnum - 1] > 0);

    if (BAD_BUFFER_ID(bufnum))
        return(false);

    return ((bool)(PrivateRefCount[bufnum - 1] > 0));
}

/*
 * BufferGetBlockNumber --
 *	Returns the block number associated with a buffer.
 *
 * Note:
 *	Assumes that the buffer is valid.
 */
BlockNumber
BufferGetBlockNumber(Buffer buffer)
{
    Assert(BufferIsValid(buffer));

    /* XXX should be a critical section */
    if (BufferIsLocal(buffer))
	return (LocalBufferDescriptors[-buffer-1].tag.blockNum);
    else
	return (BufferDescriptors[buffer-1].tag.blockNum);
}

/*
 * BufferGetRelation --
 *	Returns the relation desciptor associated with a buffer.
 *
 * Note:
 *	Assumes buffer is valid.
 */
Relation
BufferGetRelation(Buffer buffer)
{
    Relation    relation;
    Oid		relid;

    Assert(BufferIsValid(buffer));
    Assert(!BufferIsLocal(buffer));	/* not supported for local buffers */

    /* XXX should be a critical section */
    relid = LRelIdGetRelationId(BufferDescriptors[buffer-1].tag.relId);
    relation = RelationIdGetRelation(relid);

    RelationDecrementReferenceCount(relation);

    if (RelationHasReferenceCountZero(relation)) {
	/*
	  elog(NOTICE, "BufferGetRelation: 0->1");
	  */

        RelationIncrementReferenceCount(relation);
    }

    return (relation);
}

/*
 * BufferReplace
 *
 * Flush the buffer corresponding to 'bufHdr'
 *
 */
static int
BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
{
    Relation 	reln;
    Oid	bufdb, bufrel;
    int		status;

    if (!bufferLockHeld)
	SpinAcquire(BufMgrLock);

    /*
     * first try to find the reldesc in the cache, if no luck,
     * don't bother to build the reldesc from scratch, just do
     * a blind write.
     */

    bufdb = bufHdr->tag.relId.dbId;
    bufrel = bufHdr->tag.relId.relId;

    if (bufdb == MyDatabaseId || bufdb == (Oid) NULL)
	reln = RelationIdCacheGetRelation(bufrel);
    else
	reln = (Relation) NULL;

    SpinRelease(BufMgrLock);

    if (reln != (Relation) NULL) {
	status = smgrflush(bufHdr->bufsmgr, reln, bufHdr->tag.blockNum,
			   (char *) MAKE_PTR(bufHdr->data));
    } else {

	/* blind write always flushes */
	status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname,
			      bufHdr->sb_relname, bufdb, bufrel,
			      bufHdr->tag.blockNum,
			      (char *) MAKE_PTR(bufHdr->data));
    }

    if (status == SM_FAIL)
	return (FALSE);

    return (TRUE);
}

/*
 * RelationGetNumberOfBlocks --
 *	Returns the buffer descriptor associated with a page in a relation.
 *
 * Note:
 *      XXX may fail for huge relations.
 *      XXX should be elsewhere.
 *      XXX maybe should be hidden
 */
BlockNumber
RelationGetNumberOfBlocks(Relation relation)
{
    return
	((relation->rd_islocal) ? relation->rd_nblocks :
	    smgrnblocks(relation->rd_rel->relsmgr, relation));
}

/*
 * BufferGetBlock --
 *	Returns a reference to a disk page image associated with a buffer.
 *
 * Note:
 *	Assumes buffer is valid.
 */
Block
BufferGetBlock(Buffer buffer)
{
    Assert(BufferIsValid(buffer));

    if (BufferIsLocal(buffer))
	return((Block)MAKE_PTR(LocalBufferDescriptors[-buffer-1].data));
    else
	return((Block)MAKE_PTR(BufferDescriptors[buffer-1].data));
}

/* ---------------------------------------------------------------------
 *      ReleaseRelationBuffers
 *
 *      this function unmarks all the dirty pages of a relation
 *	in the buffer pool so that at the end of transaction
 *      these pages will not be flushed.
 *      XXX currently it sequentially searches the buffer pool, should be
 *      changed to more clever ways of searching.
 * --------------------------------------------------------------------
 */
void
ReleaseRelationBuffers (Relation rdesc)
{
    register int i;
    int holding = 0;
    BufferDesc *buf;

    if ( rdesc->rd_islocal )
    {
    	for (i = 0; i < NLocBuffer; i++)
    	{
	    buf = &LocalBufferDescriptors[i];
	    if ((buf->flags & BM_DIRTY) &&
		(buf->tag.relId.relId == rdesc->rd_id))
	    {
		buf->flags &= ~BM_DIRTY;
	    }
    	}
    	return;
    }

    for (i=1; i<=NBuffers; i++) {
	buf = &BufferDescriptors[i-1];
	if (!holding) {
	    SpinAcquire(BufMgrLock);
	    holding = 1;
	}
        if ((buf->flags & BM_DIRTY) &&
            (buf->tag.relId.dbId == MyDatabaseId) &&
            (buf->tag.relId.relId == rdesc->rd_id)) {
            buf->flags &= ~BM_DIRTY;
            if (!(buf->flags & BM_FREE)) {
		SpinRelease(BufMgrLock);
		holding = 0;
		ReleaseBuffer(i);
	    }
	}
    }
    if (holding)
	SpinRelease(BufMgrLock);
}

/* ---------------------------------------------------------------------
 *      DropBuffers
 *
 *	This function marks all the buffers in the buffer cache for a
 *	particular database as clean.  This is used when we destroy a
 *	database, to avoid trying to flush data to disk when the directory
 *	tree no longer exists.
 *
 *	This is an exceedingly non-public interface.
 * --------------------------------------------------------------------
 */
void
DropBuffers(Oid dbid)
{
    register int i;
    BufferDesc *buf;

    SpinAcquire(BufMgrLock);
    for (i=1; i<=NBuffers; i++) {
	buf = &BufferDescriptors[i-1];
        if ((buf->tag.relId.dbId == dbid) && (buf->flags & BM_DIRTY)) {
            buf->flags &= ~BM_DIRTY;
        }
    }
    SpinRelease(BufMgrLock);
}

/* -----------------------------------------------------------------
 *	PrintBufferDescs
 *
 *	this function prints all the buffer descriptors, for debugging
 *	use only.
 * -----------------------------------------------------------------
 */
void
PrintBufferDescs()
{
    int i;
    BufferDesc *buf = BufferDescriptors;

    if (IsUnderPostmaster) {
	SpinAcquire(BufMgrLock);
	for (i = 0; i < NBuffers; ++i, ++buf) {
	    elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \
blockNum=%d, flags=0x%x, refcount=%d %d)",
		 i, buf->freeNext, buf->freePrev, NAMEDATALEN,
		 buf->sb_relname, buf->tag.blockNum, buf->flags,
		 buf->refcount, PrivateRefCount[i]);
	}
	SpinRelease(BufMgrLock);
    } else {
	/* interactive backend */
	for (i = 0; i < NBuffers; ++i, ++buf) {
	    printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n",
		   i, buf->sb_relname, buf->tag.blockNum,
		   buf->flags, buf->refcount, PrivateRefCount[i]);
	}
    }
}

void
PrintPinnedBufs()
{
    int i;
    BufferDesc *buf = BufferDescriptors;

    SpinAcquire(BufMgrLock);
    for (i = 0; i < NBuffers; ++i, ++buf) {
	if (PrivateRefCount[i] > 0)
	    elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \
blockNum=%d, flags=0x%x, refcount=%d %d)\n",
		 i, buf->freeNext, buf->freePrev, NAMEDATALEN, buf->sb_relname,
		 buf->tag.blockNum, buf->flags,
		 buf->refcount, PrivateRefCount[i]);
    }
    SpinRelease(BufMgrLock);
}

/*
 * BufferPoolBlowaway
 *
 * this routine is solely for the purpose of experiments -- sometimes
 * you may want to blowaway whatever is left from the past in buffer
 * pool and start measuring some performance with a clean empty buffer
 * pool.
 */
void
BufferPoolBlowaway()
{
    register int i;

    BufferSync();
    for (i=1; i<=NBuffers; i++) {
        if (BufferIsValid(i)) {
            while(BufferIsValid(i))
                ReleaseBuffer(i);
        }
        BufTableDelete(&BufferDescriptors[i-1]);
    }
}

#undef IncrBufferRefCount
#undef ReleaseBuffer

void
IncrBufferRefCount(Buffer buffer)
{
    if (BufferIsLocal(buffer)) {
	Assert(LocalRefCount[-buffer - 1] >= 0);
	LocalRefCount[-buffer - 1]++;
    } else {
	Assert(!BAD_BUFFER_ID(buffer));
	Assert(PrivateRefCount[buffer - 1] >= 0);
	PrivateRefCount[buffer - 1]++;
    }
}

/*
 * ReleaseBuffer -- remove the pin on a buffer without
 * 	marking it dirty.
 *
 */
int
ReleaseBuffer(Buffer buffer)
{
    BufferDesc	*bufHdr;

    if (BufferIsLocal(buffer)) {
	Assert(LocalRefCount[-buffer - 1] > 0);
	LocalRefCount[-buffer - 1]--;
	return (STATUS_OK);
    }

    if (BAD_BUFFER_ID(buffer))
	return(STATUS_ERROR);

    bufHdr = &BufferDescriptors[buffer-1];

    Assert(PrivateRefCount[buffer - 1] > 0);
    PrivateRefCount[buffer - 1]--;
    if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0) {
	/* only release buffer if it is not pinned in previous ExecMain
	   levels */
	SpinAcquire(BufMgrLock);
	bufHdr->refcount--;
	if (bufHdr->refcount == 0) {
	    AddBufferToFreelist(bufHdr);
	    bufHdr->flags |= BM_FREE;
	}
	SpinRelease(BufMgrLock);
    }

    return(STATUS_OK);
}

void
IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
{
    IncrBufferRefCount(buffer);
    if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
        BufferDesc *buf = &BufferDescriptors[buffer-1];

        fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
		buffer, buf->sb_relname, buf->tag.blockNum,
		PrivateRefCount[buffer - 1], file, line);
    }
}

void
ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
{
    ReleaseBuffer(buffer);
    if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
        BufferDesc *buf = &BufferDescriptors[buffer-1];

        fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
		buffer, buf->sb_relname, buf->tag.blockNum,
		PrivateRefCount[buffer - 1], file, line);
    }
}

int
ReleaseAndReadBuffer_Debug(char *file,
			   int line,
			   Buffer buffer,
			   Relation relation,
			   BlockNumber blockNum)
{
    bool bufferValid;
    Buffer b;

    bufferValid = BufferIsValid(buffer);
    b = ReleaseAndReadBuffer(buffer, relation, blockNum);
    if (ShowPinTrace && bufferValid && BufferIsLocal(buffer)
	&& is_userbuffer(buffer)) {
	BufferDesc *buf = &BufferDescriptors[buffer-1];

        fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
		buffer, buf->sb_relname, buf->tag.blockNum,
		PrivateRefCount[buffer - 1], file, line);
    }
    if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) {
	BufferDesc *buf = &BufferDescriptors[b-1];

        fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
		b, buf->sb_relname, buf->tag.blockNum,
		PrivateRefCount[b - 1], file, line);
    }
    return b;
}

#ifdef BMTRACE

/*
 *  trace allocations and deallocations in a circular buffer in
 *  shared memory.  check the buffer before doing the allocation,
 *  and die if there's anything fishy.
 */

_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType)
{
    static int mypid = 0;
    long start, cur;
    bmtrace *tb;

    if (mypid == 0)
	mypid = getpid();

    start = *CurTraceBuf;

    if (start > 0)
	cur = start - 1;
    else
	cur = BMT_LIMIT - 1;

    for (;;) {
	tb = &TraceBuf[cur];
	if (tb->bmt_op != BMT_NOTUSED) {
	    if (tb->bmt_buf == bufNo) {
		if ((tb->bmt_op == BMT_DEALLOC)
		    || (tb->bmt_dbid == dbId && tb->bmt_relid == relId
			&& tb->bmt_blkno == blkNo))
		    goto okay;

		/* die holding the buffer lock */
		_bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur);
	    }
	}

	if (cur == start)
	    goto okay;

	if (cur == 0)
	    cur = BMT_LIMIT - 1;
	else
	    cur--;
    }

 okay:
    tb = &TraceBuf[start];
    tb->bmt_pid = mypid;
    tb->bmt_buf = bufNo;
    tb->bmt_dbid = dbId;
    tb->bmt_relid = relId;
    tb->bmt_blkno = blkNo;
    tb->bmt_op = allocType;

    *CurTraceBuf = (start + 1) % BMT_LIMIT;
}

_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo,
	int allocType, long start, long cur)
{
    FILE *fp;
    bmtrace *tb;
    int i;

    tb = &TraceBuf[cur];

    if ((fp = fopen("/tmp/death_notice", "w")) == (FILE *) NULL)
	elog(FATAL, "buffer alloc trace error and can't open log file");

    fprintf(fp, "buffer alloc trace detected the following error:\n\n");
    fprintf(fp, "    buffer %d being %s inconsistently with a previous %s\n\n",
	    bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"),
	    (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation"));

    fprintf(fp, "the trace buffer contains:\n");

    i = start;
    for (;;) {
	tb = &TraceBuf[i];
	if (tb->bmt_op != BMT_NOTUSED) {
	    fprintf(fp, "     [%3d]%spid %d buf %2d for <%d,%d,%d> ",
		    i, (i == cur ? " ---> " : "\t"),
		    tb->bmt_pid, tb->bmt_buf,
		    tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno);

	    switch (tb->bmt_op) {
	    case BMT_ALLOCFND:
		fprintf(fp, "allocate (found)\n");
		break;

	    case BMT_ALLOCNOTFND:
		fprintf(fp, "allocate (not found)\n");
		break;

	    case BMT_DEALLOC:
		fprintf(fp, "deallocate\n");
		break;

	    default:
		fprintf(fp, "unknown op type %d\n", tb->bmt_op);
		break;
	    }
	}

	i = (i + 1) % BMT_LIMIT;
	if (i == start)
	    break;
    }

    fprintf(fp, "\noperation causing error:\n");
    fprintf(fp, "\tpid %d buf %d for <%d,%d,%d> ",
	    getpid(), bufNo, dbId, relId, blkNo);

    switch (allocType) {
    case BMT_ALLOCFND:
	fprintf(fp, "allocate (found)\n");
	break;

    case BMT_ALLOCNOTFND:
	fprintf(fp, "allocate (not found)\n");
	break;

    case BMT_DEALLOC:
	fprintf(fp, "deallocate\n");
	break;

    default:
	fprintf(fp, "unknown op type %d\n", allocType);
	break;
    }

    (void) fclose(fp);

    kill(getpid(), SIGILL);
}

#endif /* BMTRACE */

void
BufferRefCountReset(int *refcountsave)
{
    int i;
    for (i=0; i<NBuffers; i++) {
	refcountsave[i] = PrivateRefCount[i];
	LastRefCount[i] += PrivateRefCount[i];
	PrivateRefCount[i] = 0;
    }
}

void
BufferRefCountRestore(int *refcountsave)
{
    int i;
    for (i=0; i<NBuffers; i++) {
	PrivateRefCount[i] = refcountsave[i];
	LastRefCount[i] -= refcountsave[i];
	refcountsave[i] = 0;
    }
}

int SetBufferWriteMode (int mode)
{
    int old;

    old = WriteMode;
    WriteMode = mode;
    return (old);
}