diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 182c1d4234..f475e63a78 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.6 2003/11/29 19:51:56 pgsql Exp $ +$PostgreSQL: pgsql/src/backend/storage/buffer/README,v 1.7 2004/04/19 23:27:17 tgl Exp $ Notes about shared buffer access rules -------------------------------------- @@ -97,153 +97,149 @@ for VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single relation anyway. -Buffer replacement strategy interface: +Buffer replacement strategy interface +------------------------------------- -The two files freelist.c and buf_table.c contain the buffer cache -replacement strategy. The interface to the strategy is: +The file freelist.c contains the buffer cache replacement strategy. +The interface to the strategy is: - BufferDesc * - StrategyBufferLookup(BufferTag *tagPtr, bool recheck) + BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck, + int *cdb_found_index) - This is allways the first call made by the buffer manager - to check if a disk page is in memory. If so, the function - returns the buffer descriptor and no further action is - required. +This is always the first call made by the buffer manager to check if a disk +page is in memory. If so, the function returns the buffer descriptor and no +further action is required. If the page is not in memory, +StrategyBufferLookup() returns NULL. - If the page is not in memory, StrategyBufferLookup() - returns NULL. +The flag recheck tells the strategy that this is a second lookup after +flushing a dirty block. If the buffer manager has to evict another buffer, +it will release the bufmgr lock while doing the write IO. During this time, +another backend could possibly fault in the same page this backend is after, +so we have to check again after the IO is done if the page is in memory now. - The flag recheck tells the strategy that this is a second - lookup after flushing a dirty block. If the buffer manager - has to evict another buffer, he will release the bufmgr lock - while doing the write IO. During this time, another backend - could possibly fault in the same page this backend is after, - so we have to check again after the IO is done if the page - is in memory now. +*cdb_found_index is set to the index of the found CDB, or -1 if none. +This is not intended to be used by the caller, except to pass to +StrategyReplaceBuffer(). - BufferDesc * - StrategyGetBuffer(void) + BufferDesc *StrategyGetBuffer(int *cdb_replace_index) - The buffer manager calls this function to get an unpinned - cache buffer who's content can be evicted. The returned - buffer might be empty, clean or dirty. +The buffer manager calls this function to get an unpinned cache buffer whose +content can be evicted. The returned buffer might be empty, clean or dirty. - The returned buffer is only a cadidate for replacement. - It is possible that while the buffer is written, another - backend finds and modifies it, so that it is dirty again. - The buffer manager will then call StrategyGetBuffer() - again to ask for another candidate. +The returned buffer is only a candidate for replacement. It is possible that +while the buffer is being written, another backend finds and modifies it, so +that it is dirty again. The buffer manager will then have to call +StrategyGetBuffer() again to ask for another candidate. 
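To make the calling protocol concrete, here is a minimal sketch (illustrative only, not part of this patch) of how a caller like BufferAlloc() might chain StrategyBufferLookup(), StrategyGetBuffer(), and StrategyReplaceBuffer() (described next). The helpers flush_candidate() and read_page() are hypothetical placeholders, and all BufMgrLock and pinning choreography is omitted.

    /*
     * Sketch of the replacement loop, under the assumptions above.
     * flush_candidate() and read_page() are hypothetical stand-ins
     * for the real write-out and read-in paths.
     */
    static BufferDesc *
    sketch_alloc(BufferTag *tag)
    {
        int         cdb_found_index;
        int         cdb_replace_index;
        BufferDesc *buf;
        BufferDesc *buf2;

        /* 1. Is the page already cached (T1/T2 hit)? */
        buf = StrategyBufferLookup(tag, false, &cdb_found_index);
        if (buf != NULL)
            return buf;             /* hit: no replacement needed */

        for (;;)
        {
            /* 2. Ask for an unpinned replacement candidate. */
            buf = StrategyGetBuffer(&cdb_replace_index);
            if (!(buf->flags & BM_DIRTY))
                break;              /* clean candidate: use it */

            /* 3. Flushing releases the lock, so recheck afterwards. */
            flush_candidate(buf);
            buf2 = StrategyBufferLookup(tag, true, &cdb_found_index);
            if (buf2 != NULL)
                return buf2;        /* another backend faulted it in */
            if (!(buf->flags & BM_DIRTY))
                break;              /* flush stuck: candidate is usable */
            /* otherwise the candidate was re-dirtied; ask again */
        }

        /* 4. Commit the buffer to its new identity, then read the page. */
        StrategyReplaceBuffer(buf, tag, cdb_found_index, cdb_replace_index);
        buf->tag = *tag;
        read_page(buf);
        return buf;
    }
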
- void - StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, - BlockNumber blockNum) - - Called by the buffer manager at the time it is about to - change the association of a buffer with a disk page. +*cdb_replace_index is set to the index of the candidate CDB, or -1 if none +(meaning we are using a previously free buffer). This is not intended to be +used by the caller, except to pass to StrategyReplaceBuffer(). - Before this call, StrategyBufferLookup() still has to find - the buffer even if it was returned by StrategyGetBuffer() - as a candidate for replacement. + void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag, + int cdb_found_index, int cdb_replace_index) - After this call, this buffer must be returned for a - lookup of the new page identified by rnode and blockNum. +Called by the buffer manager at the time it is about to change the association +of a buffer with a disk page. - void - StrategyInvalidateBuffer(BufferDesc *buf) +Before this call, StrategyBufferLookup() still has to find the buffer under +its old tag, even if it was returned by StrategyGetBuffer() as a candidate +for replacement. - Called from various parts to inform that the content of - this buffer has been thrown away. This happens for example - in the case of dropping a relation. +After this call, this buffer must be returned for a lookup of the new page +identified by *newTag. - The buffer must be clean and unpinned on call. +cdb_found_index and cdb_replace_index must be the auxiliary values +returned by previous calls to StrategyBufferLookup and StrategyGetBuffer. - If the buffer associated with a disk page, StrategyBufferLookup() - must not return it for this page after the call. + void StrategyInvalidateBuffer(BufferDesc *buf) - void - StrategyHintVacuum(bool vacuum_active) +Called by the buffer manager to inform the strategy that the content of this +buffer is being thrown away. This happens for example in the case of dropping +a relation. The buffer must be clean and unpinned on call. - Because vacuum reads all relations of the entire database - through the buffer manager, it can greatly disturb the - buffer replacement strategy. This function is used by vacuum - to inform that all subsequent buffer lookups are caused - by vacuum scanning relations. +If the buffer was associated with a disk page, StrategyBufferLookup() +must not return it for this page after the call. - -Buffer replacement strategy: + void StrategyHintVacuum(bool vacuum_active) -The buffer replacement strategy actually used in freelist.c is a -version of the Adaptive Replacement Cache (ARC) special tailored for -PostgreSQL. +Because VACUUM reads all relations of the entire database through the buffer +manager, it can greatly disturb the buffer replacement strategy. This function +is used by VACUUM to inform the strategy that subsequent buffer lookups are +(or are not) caused by VACUUM scanning relations. + + +Buffer replacement strategy +--------------------------- + +The buffer replacement strategy actually used in freelist.c is a version of +the Adaptive Replacement Cache (ARC) specially tailored for PostgreSQL. The algorithm works as follows: - C is the size of the cache in number of pages (conf: shared_buffers) - ARC uses 2*C Cache Directory Blocks (CDB). A cache directory block - is allwayt associated with one unique file page and "can" point to - one shared buffer. +C is the size of the cache in number of pages (a/k/a shared_buffers or +NBuffers). ARC uses 2*C Cache Directory Blocks (CDB). 
A cache directory block +is always associated with one unique file page. It may point to one shared +buffer, or may indicate that the file page is not in a buffer but has been +accessed recently. - All file pages known in by the directory are managed in 4 LRU lists - named B1, T1, T2 and B2. The T1 and T2 lists are the "real" cache - entries, linking a file page to a memory buffer where the page is - currently cached. Consequently T1len+T2len <= C. B1 and B2 are - ghost cache directories that extend T1 and T2 so that the strategy - remembers pages longer. The strategy tries to keep B1len+T1len and - B2len+T2len both at C. T1len and T2 len vary over the runtime - depending on the lookup pattern and its resulting cache hits. The - desired size of T1len is called T1target. +All CDB entries are managed in 4 LRU lists named T1, T2, B1 and B2. The T1 and +T2 lists are the "real" cache entries, linking a file page to a memory buffer +where the page is currently cached. Consequently T1len+T2len <= C. B1 and B2 +are ghost cache directories that extend T1 and T2 so that the strategy +remembers pages longer. The strategy tries to keep B1len+T1len and B2len+T2len +both at C. T1len and T2len vary over the runtime depending on the lookup +pattern and its resulting cache hits. The desired size of T1len is called +T1target. - Assuming we have a full cache, one of 5 cases happens on a lookup: +Assuming we have a full cache, one of 5 cases happens on a lookup: - MISS On a cache miss, depending on T1target and the actual T1len - the LRU buffer of T1 or T2 is evicted. Its CDB is removed - from the T list and added as MRU of the corresponding B list. - The now free buffer is replaced with the requested page - and added as MRU of T1. +MISS On a cache miss, depending on T1target and the actual T1len + the LRU buffer of either T1 or T2 is evicted. Its CDB is removed + from the T list and added as MRU of the corresponding B list. + The now free buffer is replaced with the requested page + and added as MRU of T1. - T1 hit The T1 CDB is moved to the MRU position of the T2 list. +T1 hit The T1 CDB is moved to the MRU position of the T2 list. - T2 hit The T2 CDB is moved to the MRU position of the T2 list. +T2 hit The T2 CDB is moved to the MRU position of the T2 list. - B1 hit This means that a buffer that was evicted from the T1 - list is now requested again, indicating that T1target is - too small (otherwise it would still be in T1 and thus in - memory). The strategy raises T1target, evicts a buffer - depending on T1target and T1len and places the CDB at - MRU of T2. +B1 hit This means that a buffer that was evicted from the T1 + list is now requested again, indicating that T1target is + too small (otherwise it would still be in T1 and thus in + memory). The strategy raises T1target, evicts a buffer + depending on T1target and T1len and places the CDB at + MRU of T2. - B2 hit This means the opposite of B1, the T2 list is probably too - small. So the strategy lowers T1target, evicts a buffer - and places the CDB at MRU of T2. +B2 hit This means the opposite of B1, the T2 list is probably too + small. So the strategy lowers T1target, evicts a buffer + and places the CDB at MRU of T2. - Thus, every page that is found on lookup in any of the four lists - ends up as the MRU of the T2 list. The T2 list therefore is the - "frequency" cache, holding frequently requested pages. +Thus, every page that is found on lookup in any of the four lists +ends up as the MRU of the T2 list. 
The T2 list therefore is the +"frequency" cache, holding frequently requested pages. - Every page that is seen for the first time ends up as the MRU of - the T1 list. The T1 list is the "recency" cache, holding recent - newcomers. +Every page that is seen for the first time ends up as the MRU of the T1 +list. The T1 list is the "recency" cache, holding recent newcomers. - The tailoring done for PostgreSQL has to do with the way, the - query executor works. A typical UPDATE or DELETE first scans the - relation, searching for the tuples and then calls heap_update() or - heap_delete(). This causes at least 2 lookups for the block in the - same statement. In the case of multiple matches in one block even - more often. As a result, every block touched in an UPDATE or DELETE - would directly jump into the T2 cache, which is wrong. To prevent - this the strategy remembers which transaction added a buffer to the - T1 list and will not promote it from there into the T2 cache during - the same transaction. - - Another specialty is the change of the strategy during VACUUM. - Lookups during VACUUM do not represent application needs, so it - would be wrong to change the cache balance T1target due to that - or to cause massive cache evictions. Therefore, a page read in to - satisfy vacuum (not those that actually cause a hit on any list) - is placed at the LRU position of the T1 list, for immediate - reuse. Since Vacuum usually requests many pages very fast, the - natural side effect of this is that it will get back the very - buffers it filled and possibly modified on the next call and will - therefore do it's work in a few shared memory buffers, while using - whatever it finds in the cache already. +The tailoring done for PostgreSQL has to do with the way the query executor +works. A typical UPDATE or DELETE first scans the relation, searching for the +tuples and then calls heap_update() or heap_delete(). This causes at least 2 +lookups for the block in the same statement. In the case of multiple matches +in one block, even more lookups occur. As a result, every block touched in an UPDATE or +DELETE would directly jump into the T2 cache, which is wrong. To prevent this +the strategy remembers which transaction added a buffer to the T1 list and +will not promote it from there into the T2 cache during the same transaction. +Another specialty is the change of the strategy during VACUUM. Lookups during +VACUUM do not represent application needs, and do not suggest that the page +will be hit again soon, so it would be wrong to change the cache balance +T1target due to that or to cause massive cache evictions. Therefore, a page +read in to satisfy vacuum is placed at the LRU position of the T1 list, for +immediate reuse. Also, if we happen to get a hit on a CDB entry during +VACUUM, we do not promote the page above its current position in the list. +Since VACUUM usually requests many pages very fast, the effect of this is that +it will get back the very buffers it filled and possibly modified on the next +call and will therefore do its work in a few shared memory buffers, while +being able to use whatever it finds in the cache already. This also implies +that most of the write traffic caused by a VACUUM will be done by the VACUUM +itself and not pushed off onto other processes.
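
Before moving on to the code changes, the ghost-hit feedback rule is worth seeing in isolation. The sketch below is illustrative only: the function, the standalone Min/Max macros, and the example numbers are not from the tree, but the arithmetic mirrors the T1_TARGET adjustments freelist.c makes on B1 and B2 hits (visible later in this patch).

    #include <stdio.h>

    #define Max(a,b)  ((a) > (b) ? (a) : (b))
    #define Min(a,b)  ((a) < (b) ? (a) : (b))

    /*
     * ARC T1target adaptation on ghost-list hits: a B1 hit means T1 is
     * too small, so grow T1target (capped at NBuffers); a B2 hit means
     * T2 is too small, so shrink T1target (floored at 0).
     */
    static int
    adapt_t1_target(int t1_target, int b1_len, int b2_len,
                    int nbuffers, int hit_list)     /* 1 = B1, 2 = B2 */
    {
        if (hit_list == 1)
            return Min(t1_target + Max(b2_len / b1_len, 1), nbuffers);
        return Max(t1_target - Max(b1_len / b2_len, 1), 0);
    }

    int
    main(void)
    {
        /* Example: C = 1000 buffers, 50 ghosts in B1, 200 in B2. */
        int     t1_target = 300;

        /* A B1 hit grows T1target by max(200/50, 1) = 4 ... */
        t1_target = adapt_t1_target(t1_target, 50, 200, 1000, 1);
        printf("after B1 hit: %d\n", t1_target);    /* prints 304 */

        /* ... while a B2 hit shrinks it by max(50/200, 1) = 1. */
        t1_target = adapt_t1_target(t1_target, 50, 200, 1000, 2);
        printf("after B2 hit: %d\n", t1_target);    /* prints 303 */
        return 0;
    }
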
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index a671bf9f7f..e0aa0e93e8 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -8,35 +8,15 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.62 2004/02/12 15:06:56 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_init.c,v 1.63 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" -#include -#include -#include - -#include "catalog/catalog.h" -#include "executor/execdebug.h" -#include "miscadmin.h" -#include "storage/buf.h" -#include "storage/buf_internals.h" #include "storage/bufmgr.h" -#include "storage/fd.h" -#include "storage/ipc.h" -#include "storage/lmgr.h" -#include "storage/shmem.h" -#include "storage/smgr.h" -#include "storage/lwlock.h" -#include "utils/builtins.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" +#include "storage/buf_internals.h" -int ShowPinTrace = 0; - -int Data_Descriptors; BufferDesc *BufferDescriptors; Block *BufferBlockPointers; @@ -44,6 +24,14 @@ Block *BufferBlockPointers; long *PrivateRefCount; /* also used in freelist.c */ bits8 *BufferLocks; /* flag bits showing locks I have set */ +/* statistics counters */ +long int ReadBufferCount; +long int ReadLocalBufferCount; +long int BufferHitCount; +long int LocalBufferHitCount; +long int BufferFlushCount; +long int LocalBufferFlushCount; + /* * Data Structures: @@ -61,48 +49,35 @@ bits8 *BufferLocks; /* flag bits showing locks I have set */ * see freelist.c. A buffer cannot be replaced while in * use either by data manager or during IO. * - * WriteBufferBack: - * currently, a buffer is only written back at the time - * it is selected for replacement. It should - * be done sooner if possible to reduce latency of - * BufferAlloc(). Maybe there should be a daemon process. * * Synchronization/Locking: * * BufMgrLock lock -- must be acquired before manipulating the - * buffer queues (lookup/freelist). Must be released + * buffer search datastructures (lookup/freelist, as well as the + * flag bits of any buffer). Must be released * before exit and before doing any IO. * * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. * It must be set when an IO is initiated and cleared at - * the end of the IO. It is there to make sure that one + * the end of the IO. It is there to make sure that one * process doesn't start to use a buffer while another is * faulting it in. see IOWait/IOSignal. * - * refcount -- A buffer is pinned during IO and immediately - * after a BufferAlloc(). A buffer is always either pinned - * or on the freelist but never both. The buffer must be - * released, written, or flushed before the end of - * transaction. + * refcount -- Counts the number of processes holding pins on a buffer. + * A buffer is pinned during IO and immediately after a BufferAlloc(). + * Pins must be released before end of transaction. * - * PrivateRefCount -- Each buffer also has a private refcount the keeps + * PrivateRefCount -- Each buffer also has a private refcount that keeps * track of the number of times the buffer is pinned in the current - * processes. This is used for two purposes, first, if we pin a + * process. 
This is used for two purposes: first, if we pin a * a buffer more than once, we only need to change the shared refcount - * once, thus only lock the buffer pool once, second, when a transaction + * once, thus only lock the shared state once; second, when a transaction * aborts, it should only unpin the buffers exactly the number of times it * has pinned them, so that it will not blow away buffers of another * backend. * */ -long int ReadBufferCount; -long int ReadLocalBufferCount; -long int BufferHitCount; -long int LocalBufferHitCount; -long int BufferFlushCount; -long int LocalBufferFlushCount; - /* * Initialize shared buffer pool @@ -118,8 +93,6 @@ InitBufferPool(void) foundDescs; int i; - Data_Descriptors = NBuffers; - /* * It's probably not really necessary to grab the lock --- if there's * anyone else attached to the shmem at this point, we've got @@ -131,7 +104,7 @@ InitBufferPool(void) BufferDescriptors = (BufferDesc *) ShmemInitStruct("Buffer Descriptors", - Data_Descriptors * sizeof(BufferDesc), &foundDescs); + NBuffers * sizeof(BufferDesc), &foundDescs); BufferBlocks = (char *) ShmemInitStruct("Buffer Blocks", @@ -152,9 +125,9 @@ InitBufferPool(void) /* * link the buffers into a single linked list. This will become the - * LiFo list of unused buffers returned by StragegyGetBuffer(). + * LIFO list of unused buffers returned by StrategyGetBuffer(). */ - for (i = 0; i < Data_Descriptors; block += BLCKSZ, buf++, i++) + for (i = 0; i < NBuffers; block += BLCKSZ, buf++, i++) { Assert(ShmemIsValid((unsigned long) block)); @@ -173,7 +146,7 @@ InitBufferPool(void) } /* Correct last entry */ - BufferDescriptors[Data_Descriptors - 1].bufNext = -1; + BufferDescriptors[NBuffers - 1].bufNext = -1; } /* Init other shared buffer-management stuff */ @@ -215,35 +188,31 @@ InitBufferPoolAccess(void) BufferBlockPointers[i] = (Block) MAKE_PTR(BufferDescriptors[i].data); } -/* ----------------------------------------------------- +/* * BufferShmemSize * * compute the size of shared memory for the buffer pool including * data pages, buffer descriptors, hash tables, etc. - * ---------------------------------------------------- */ int BufferShmemSize(void) { int size = 0; - /* size of shmem index hash table */ - size += hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt)); - /* size of buffer descriptors */ size += MAXALIGN(NBuffers * sizeof(BufferDesc)); - /* size of the shared replacement strategy control block */ - size += MAXALIGN(sizeof(BufferStrategyControl)); - - /* size of the ARC directory blocks */ - size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB)); - /* size of data pages */ size += NBuffers * MAXALIGN(BLCKSZ); /* size of buffer hash table */ size += hash_estimate_size(NBuffers * 2, sizeof(BufferLookupEnt)); + /* size of the shared replacement strategy control block */ + size += MAXALIGN(sizeof(BufferStrategyControl)); + + /* size of the ARC directory blocks */ + size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB)); + return size; } diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 33590b65fd..3829444195 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -3,46 +3,42 @@ * buf_table.c * routines for finding buffers in the buffer pool. * + * NOTE: these days, what this table actually provides is a mapping from + * BufferTags to CDB indexes, not directly to buffers. The function names + * are thus slight misnomers. 
+ * + * Note: all routines in this file assume that the BufMgrLock is held + * by the caller, so no synchronization is needed. + * + * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.34 2003/12/14 00:34:47 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/buf_table.c,v 1.35 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ -/* - * OLD COMMENTS - * - * Data Structures: - * - * Buffers are identified by their BufferTag (buf.h). This - * file contains routines for allocating a shmem hash table to - * map buffer tags to buffer descriptors. - * - * Synchronization: - * - * All routines in this file assume BufMgrLock is held by their caller. - */ - #include "postgres.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" + static HTAB *SharedBufHash; /* * Initialize shmem hash table for mapping buffers + * size is the desired hash table size (2*NBuffers for ARC algorithm) */ void InitBufTable(int size) { HASHCTL info; - /* assume lock is held */ + /* assume no locking is needed yet */ /* BufferTag maps to Buffer */ info.keysize = sizeof(BufferTag); @@ -60,6 +56,7 @@ InitBufTable(int size) /* * BufTableLookup + * Lookup the given BufferTag; return CDB index, or -1 if not found */ int BufTableLookup(BufferTag *tagPtr) @@ -78,10 +75,11 @@ BufTableLookup(BufferTag *tagPtr) } /* - * BufTableDelete + * BufTableInsert + * Insert a hashtable entry for given tag and CDB index */ -bool -BufTableInsert(BufferTag *tagPtr, Buffer buf_id) +void +BufTableInsert(BufferTag *tagPtr, int cdb_id) { BufferLookupEnt *result; bool found; @@ -97,14 +95,14 @@ BufTableInsert(BufferTag *tagPtr, Buffer buf_id) if (found) /* found something else in the table? 
*/ elog(ERROR, "shared buffer hash table corrupted"); - result->id = buf_id; - return TRUE; + result->id = cdb_id; } /* * BufTableDelete + * Delete the hashtable entry for given tag */ -bool +void BufTableDelete(BufferTag *tagPtr) { BufferLookupEnt *result; @@ -114,6 +112,4 @@ BufTableDelete(BufferTag *tagPtr) if (!result) /* shouldn't happen */ elog(ERROR, "shared buffer hash table corrupted"); - - return TRUE; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d515a7a259..a80435b7ec 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.160 2004/02/12 20:07:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -54,9 +54,9 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "utils/relcache.h" - #include "pgstat.h" + #define BufferGetLSN(bufHdr) \ (*((XLogRecPtr*) MAKE_PTR((bufHdr)->data))) @@ -64,15 +64,17 @@ /* GUC variable */ bool zero_damaged_pages = false; +#ifdef NOT_USED +int ShowPinTrace = 0; +#endif + int BgWriterDelay = 200; int BgWriterPercent = 1; int BgWriterMaxpages = 100; -static void WaitIO(BufferDesc *buf); -static void StartBufferIO(BufferDesc *buf, bool forInput); -static void TerminateBufferIO(BufferDesc *buf); -static void ContinueBufferIO(BufferDesc *buf, bool forInput); -static void buffer_write_error_callback(void *arg); +long NDirectFileRead; /* some I/O's are direct file access. + * bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ /* * Macro : BUFFER_IS_BROKEN @@ -80,18 +82,22 @@ static void buffer_write_error_callback(void *arg); */ #define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) + +static void PinBuffer(BufferDesc *buf); +static void UnpinBuffer(BufferDesc *buf); +static void WaitIO(BufferDesc *buf); +static void StartBufferIO(BufferDesc *buf, bool forInput); +static void TerminateBufferIO(BufferDesc *buf); +static void ContinueBufferIO(BufferDesc *buf, bool forInput); +static void buffer_write_error_callback(void *arg); static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr); static void BufferReplace(BufferDesc *bufHdr); - -#ifdef NOT_USED -void PrintBufferDescs(void); -#endif - static void write_buffer(Buffer buffer, bool unpin); + /* * ReadBuffer -- returns a buffer containing the requested * block of the requested relation. 
If the blknum @@ -282,14 +288,15 @@ BufferAlloc(Relation reln, BufferDesc *buf, *buf2; BufferTag newTag; /* identity of requested block */ + int cdb_found_index, + cdb_replace_index; bool inProgress; /* buffer undergoing IO */ - /* create a new tag so we can lookup the buffer */ - /* assume that the relation is already open */ + /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(&newTag, reln, blockNum); /* see if the block is in the buffer pool already */ - buf = StrategyBufferLookup(&newTag, false); + buf = StrategyBufferLookup(&newTag, false, &cdb_found_index); if (buf != NULL) { /* @@ -332,6 +339,13 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf; } @@ -345,16 +359,16 @@ BufferAlloc(Relation reln, inProgress = FALSE; for (buf = NULL; buf == NULL;) { - buf = StrategyGetBuffer(); + buf = StrategyGetBuffer(&cdb_replace_index); - /* GetFreeBuffer will abort if it can't find a free buffer */ + /* StrategyGetBuffer will elog if it can't find a free buffer */ Assert(buf); /* * There should be exactly one pin on the buffer after it is * allocated -- ours. If it had a pin it wouldn't have been on * the free list. No one else could have pinned it between - * GetFreeBuffer and here because we have the BufMgrLock. + * StrategyGetBuffer and here because we have the BufMgrLock. */ Assert(buf->refcount == 0); buf->refcount = 1; @@ -438,7 +452,7 @@ BufferAlloc(Relation reln, * we haven't gotten around to insert the new tag into the * buffer table. So we need to check here. -ay 3/95 */ - buf2 = StrategyBufferLookup(&newTag, true); + buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index); if (buf2 != NULL) { /* @@ -471,6 +485,15 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum. (XXX perhaps better + * to consider this a miss? We didn't have to do the read, + * but we did have to write ...) + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf2; } } @@ -485,8 +508,8 @@ BufferAlloc(Relation reln, * Tell the buffer replacement strategy that we are replacing the * buffer content. Then rename the buffer. */ - StrategyReplaceBuffer(buf, reln, blockNum); - INIT_BUFFERTAG(&(buf->tag), reln, blockNum); + StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index); + buf->tag = newTag; /* * Buffer contents are currently invalid. Have to mark IO IN PROGRESS @@ -501,6 +524,12 @@ BufferAlloc(Relation reln, LWLockRelease(BufMgrLock); + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageMiss; + return buf; } @@ -624,20 +653,93 @@ ReleaseAndReadBuffer(Buffer buffer, } /* - * BufferSync -- Write all dirty buffers in the pool. + * PinBuffer -- make buffer unavailable for replacement. * - * This is called at checkpoint time and writes out all dirty shared buffers, + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. + */ +static void +PinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + if (PrivateRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; + Assert(PrivateRefCount[b] > 0); +} + +/* + * UnpinBuffer -- make buffer available for replacement. + * + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. 
+ */ +static void +UnpinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount > 0); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0) + buf->refcount--; + + if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 && + buf->refcount == 1) + { + /* we just released the last pin other than the waiter's */ + buf->flags &= ~BM_PIN_COUNT_WAITER; + ProcSendSignal(buf->wait_backend_id); + } + else + { + /* do nothing */ + } +} + +/* + * BufferSync -- Write out dirty buffers in the pool. + * + * This is called at checkpoint time to write out all dirty shared buffers, * and by the background writer process to write out some of the dirty blocks. + * percent/maxpages should be zero in the former case, and nonzero limit + * values in the latter. */ int BufferSync(int percent, int maxpages) { + BufferDesc **dirty_buffers; + BufferTag *buftags; + int num_buffer_dirty; int i; - BufferDesc *bufHdr; ErrorContextCallback errcontext; - int num_buffer_dirty; - int *buffer_dirty; + /* + * Get a list of all currently dirty buffers and how many there are. + * We do not flush buffers that get dirtied after we started. They + * have to wait until the next checkpoint. + */ + dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *)); + buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag)); + + LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); + num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags, + NBuffers); + + /* + * If called by the background writer, we are usually asked to + * only write out some portion of dirty buffers now, to prevent + * the IO storm at checkpoint time. + */ + if (percent > 0) + { + Assert(percent <= 100); + num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100; + } + if (maxpages > 0 && num_buffer_dirty > maxpages) + num_buffer_dirty = maxpages; /* Setup error traceback support for ereport() */ errcontext.callback = buffer_write_error_callback; @@ -646,47 +748,22 @@ BufferSync(int percent, int maxpages) error_context_stack = &errcontext; /* - * Get a list of all currently dirty buffers and how many there are. - * We do not flush buffers that get dirtied after we started. They - * have to wait until the next checkpoint. + * Loop over buffers to be written. Note the BufMgrLock is held at + * loop top, but is released and reacquired intraloop, so we aren't + * holding it long. */ - buffer_dirty = (int *)palloc(NBuffers * sizeof(int)); - - LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - num_buffer_dirty = StrategyDirtyBufferList(buffer_dirty, NBuffers); - LWLockRelease(BufMgrLock); - - /* - * If called by the background writer, we are usually asked to - * only write out some percentage of dirty buffers now, to prevent - * the IO storm at checkpoint time. - */ - if (percent > 0 && num_buffer_dirty > 10) - { - Assert(percent <= 100); - num_buffer_dirty = (num_buffer_dirty * percent) / 100; - if (maxpages > 0 && num_buffer_dirty > maxpages) - num_buffer_dirty = maxpages; - } - for (i = 0; i < num_buffer_dirty; i++) { + BufferDesc *bufHdr = dirty_buffers[i]; Buffer buffer; XLogRecPtr recptr; SMgrRelation reln; - LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - - bufHdr = &BufferDescriptors[buffer_dirty[i]]; errcontext.arg = bufHdr; - if (!(bufHdr->flags & BM_VALID)) - { - LWLockRelease(BufMgrLock); - continue; - } - /* + * Check it is still the same page and still needs writing. 
+ * * We can check bufHdr->cntxDirty here *without* holding any lock * on buffer context as long as we set this flag in access methods * *before* logging changes with XLogInsert(): if someone will set @@ -694,11 +771,12 @@ BufferSync(int percent, int maxpages) * checkpoint.redo points before log record for upcoming changes * and so we are not required to write such dirty buffer. */ - if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) - { - LWLockRelease(BufMgrLock); + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; + if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) continue; - } /* * IO synchronization. Note that we do it with unpinned buffer to @@ -707,12 +785,13 @@ BufferSync(int percent, int maxpages) if (bufHdr->flags & BM_IO_IN_PROGRESS) { WaitIO(bufHdr); - if (!(bufHdr->flags & BM_VALID) || - (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))) - { - LWLockRelease(BufMgrLock); + /* Still need writing? */ + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; + if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) continue; - } } /* @@ -723,10 +802,11 @@ BufferSync(int percent, int maxpages) PinBuffer(bufHdr); StartBufferIO(bufHdr, false); /* output IO start */ - buffer = BufferDescriptorGetBuffer(bufHdr); - + /* Release BufMgrLock while doing xlog work */ LWLockRelease(BufMgrLock); + buffer = BufferDescriptorGetBuffer(bufHdr); + /* * Protect buffer content against concurrent update */ @@ -740,8 +820,12 @@ BufferSync(int percent, int maxpages) /* * Now it's safe to write buffer to disk. Note that no one else - * should not be able to write it while we were busy with locking - * and log flushing because of we setted IO flag. + * should have been able to write it while we were busy with + * locking and log flushing because we set the IO flag. + * + * Before we issue the actual write command, clear the just-dirtied + * flag. This lets us recognize concurrent changes (note that only + * hint-bit changes are possible since we hold the buffer shlock). */ LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty); @@ -767,12 +851,12 @@ BufferSync(int percent, int maxpages) * Release the per-buffer readlock, reacquire BufMgrLock. */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferFlushCount++; LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ TerminateBufferIO(bufHdr); /* Sync IO finished */ + BufferFlushCount++; /* * If this buffer was marked by someone as DIRTY while we were @@ -781,14 +865,16 @@ BufferSync(int percent, int maxpages) if (!(bufHdr->flags & BM_JUST_DIRTIED)) bufHdr->flags &= ~BM_DIRTY; UnpinBuffer(bufHdr); - LWLockRelease(BufMgrLock); } - pfree(buffer_dirty); + LWLockRelease(BufMgrLock); /* Pop the error context stack */ error_context_stack = errcontext.previous; + pfree(dirty_buffers); + pfree(buftags); + return num_buffer_dirty; } @@ -818,11 +904,6 @@ WaitIO(BufferDesc *buf) } -long NDirectFileRead; /* some I/O's are direct file access. - * bypass bufmgr */ -long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ - - /* * Return a palloc'd string containing buffer usage statistics. 
*/ @@ -892,9 +973,9 @@ AtEOXact_Buffers(bool isCommit) if (isCommit) elog(WARNING, - "buffer refcount leak: [%03d] (bufNext=%d, " - "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", - i, buf->bufNext, + "buffer refcount leak: [%03d] " + "(rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", + i, buf->tag.rnode.tblNode, buf->tag.rnode.relNode, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); @@ -1021,6 +1102,26 @@ BufferGetBlockNumber(Buffer buffer) return BufferDescriptors[buffer - 1].tag.blockNum; } +/* + * BufferGetFileNode + * Returns the relation ID (RelFileNode) associated with a buffer. + * + * This should make the same checks as BufferGetBlockNumber, but since the + * two are generally called together, we don't bother. + */ +RelFileNode +BufferGetFileNode(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + bufHdr = &(LocalBufferDescriptors[-buffer - 1]); + else + bufHdr = &BufferDescriptors[buffer - 1]; + + return (bufHdr->tag.rnode); +} + /* * BufferReplace * @@ -1663,7 +1764,11 @@ refcount = %ld, file: %s, line: %d\n", * * This routine might get called many times on the same page, if we are making * the first scan after commit of an xact that added/deleted many tuples. - * So, be as quick as we can if the buffer is already dirty. + * So, be as quick as we can if the buffer is already dirty. We do this by + * not acquiring BufMgrLock if it looks like the status bits are already OK. + * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after + * we look, because the buffer content update is already done and will be + * reflected in the I/O.) */ void SetBufferCommitInfoNeedsSave(Buffer buffer) @@ -2008,19 +2113,6 @@ AbortBufferIO(void) } } -RelFileNode -BufferGetFileNode(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - bufHdr = &(LocalBufferDescriptors[-buffer - 1]); - else - bufHdr = &BufferDescriptors[buffer - 1]; - - return (bufHdr->tag.rnode); -} - /* * Error context callback for errors occurring during buffer writes. */ diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 595e4905a8..c14d446497 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -3,210 +3,208 @@ * freelist.c * routines for manipulating the buffer pool's replacement strategy. * + * Note: all routines in this file assume that the BufMgrLock is held + * by the caller, so no synchronization is needed. + * + * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.41 2004/02/12 15:06:56 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/freelist.c,v 1.42 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ -/* - * OLD COMMENTS - * - * Data Structures: - * SharedFreeList is a circular queue. Notice that this - * is a shared memory queue so the next/prev "ptrs" are - * buffer ids, not addresses. - * - * Sync: all routines in this file assume that the buffer - * semaphore has been acquired by the caller. - */ - #include "postgres.h" +#include "access/xact.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" -#include "storage/ipc.h" -#include "storage/proc.h" -#include "access/xact.h" -#include "miscadmin.h" -#ifndef MAX -#define MAX(a,b) (((a) > (b)) ? 
(a) : (b)) -#endif -#ifndef MIN -#define MIN(a,b) (((a) < (b)) ? (a) : (b)) -#endif +/* GUC variable: time in seconds between statistics reports */ +int DebugSharedBuffers = 0; + +/* Pointers to shared state */ static BufferStrategyControl *StrategyControl = NULL; static BufferStrategyCDB *StrategyCDB = NULL; -static int strategy_cdb_found; -static int strategy_cdb_replace; -static int strategy_get_from; - -int DebugSharedBuffers = 0; - -static bool strategy_hint_vacuum; +/* Backend-local state about whether currently vacuuming */ +static bool strategy_hint_vacuum = false; static TransactionId strategy_vacuum_xid; -#define T1_TARGET StrategyControl->target_T1_size -#define B1_LENGTH StrategyControl->listSize[STRAT_LIST_B1] -#define T1_LENGTH StrategyControl->listSize[STRAT_LIST_T1] -#define T2_LENGTH StrategyControl->listSize[STRAT_LIST_T2] -#define B2_LENGTH StrategyControl->listSize[STRAT_LIST_B2] +#define T1_TARGET (StrategyControl->target_T1_size) +#define B1_LENGTH (StrategyControl->listSize[STRAT_LIST_B1]) +#define T1_LENGTH (StrategyControl->listSize[STRAT_LIST_T1]) +#define T2_LENGTH (StrategyControl->listSize[STRAT_LIST_T2]) +#define B2_LENGTH (StrategyControl->listSize[STRAT_LIST_B2]) /* * Macro to remove a CDB from whichever list it currently is on */ #define STRAT_LIST_REMOVE(cdb) \ -{ \ - AssertMacro((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \ - if ((cdb)->prev < 0) \ - StrategyControl->listHead[(cdb)->list] = (cdb)->next; \ - else \ - StrategyCDB[(cdb)->prev].next = (cdb)->next; \ - if ((cdb)->next < 0) \ - StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \ - else \ - StrategyCDB[(cdb)->next].prev = (cdb)->prev; \ - StrategyControl->listSize[(cdb)->list]--; \ - (cdb)->list = STRAT_LIST_UNUSED; \ -} +do { \ + Assert((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS); \ + if ((cdb)->prev < 0) \ + StrategyControl->listHead[(cdb)->list] = (cdb)->next; \ + else \ + StrategyCDB[(cdb)->prev].next = (cdb)->next; \ + if ((cdb)->next < 0) \ + StrategyControl->listTail[(cdb)->list] = (cdb)->prev; \ + else \ + StrategyCDB[(cdb)->next].prev = (cdb)->prev; \ + StrategyControl->listSize[(cdb)->list]--; \ + (cdb)->list = STRAT_LIST_UNUSED; \ +} while(0) /* * Macro to add a CDB to the tail of a list (MRU position) */ #define STRAT_MRU_INSERT(cdb,l) \ -{ \ - AssertMacro((cdb)->list == STRAT_LIST_UNUSED); \ - if (StrategyControl->listTail[(l)] < 0) \ - { \ - (cdb)->prev = (cdb)->next = -1; \ - StrategyControl->listHead[(l)] = \ - StrategyControl->listTail[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - else \ - { \ - (cdb)->next = -1; \ - (cdb)->prev = StrategyControl->listTail[(l)]; \ - StrategyCDB[StrategyControl->listTail[(l)]].next = \ - ((cdb) - StrategyCDB); \ - StrategyControl->listTail[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - StrategyControl->listSize[(l)]++; \ - (cdb)->list = (l); \ -} +do { \ + Assert((cdb)->list == STRAT_LIST_UNUSED); \ + if (StrategyControl->listTail[(l)] < 0) \ + { \ + (cdb)->prev = (cdb)->next = -1; \ + StrategyControl->listHead[(l)] = \ + StrategyControl->listTail[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + else \ + { \ + (cdb)->next = -1; \ + (cdb)->prev = StrategyControl->listTail[(l)]; \ + StrategyCDB[StrategyControl->listTail[(l)]].next = \ + ((cdb) - StrategyCDB); \ + StrategyControl->listTail[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + StrategyControl->listSize[(l)]++; \ + (cdb)->list = (l); \ +} while(0) /* * Macro to add a CDB to the head of a list (LRU position) */ #define STRAT_LRU_INSERT(cdb,l) \ -{ \ - AssertMacro((cdb)->list == 
STRAT_LIST_UNUSED); \ - if (StrategyControl->listHead[(l)] < 0) \ - { \ - (cdb)->prev = (cdb)->next = -1; \ - StrategyControl->listHead[(l)] = \ - StrategyControl->listTail[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - else \ - { \ - (cdb)->prev = -1; \ - (cdb)->next = StrategyControl->listHead[(l)]; \ - StrategyCDB[StrategyControl->listHead[(l)]].prev = \ - ((cdb) - StrategyCDB); \ - StrategyControl->listHead[(l)] = \ - ((cdb) - StrategyCDB); \ - } \ - StrategyControl->listSize[(l)]++; \ - (cdb)->list = (l); \ -} +do { \ + Assert((cdb)->list == STRAT_LIST_UNUSED); \ + if (StrategyControl->listHead[(l)] < 0) \ + { \ + (cdb)->prev = (cdb)->next = -1; \ + StrategyControl->listHead[(l)] = \ + StrategyControl->listTail[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + else \ + { \ + (cdb)->prev = -1; \ + (cdb)->next = StrategyControl->listHead[(l)]; \ + StrategyCDB[StrategyControl->listHead[(l)]].prev = \ + ((cdb) - StrategyCDB); \ + StrategyControl->listHead[(l)] = \ + ((cdb) - StrategyCDB); \ + } \ + StrategyControl->listSize[(l)]++; \ + (cdb)->list = (l); \ +} while(0) +/* + * Printout for use when DebugSharedBuffers is enabled + */ +static void +StrategyStatsDump(void) +{ + time_t now = time(NULL); + + if (StrategyControl->stat_report + DebugSharedBuffers < now) + { + long all_hit, b1_hit, t1_hit, t2_hit, b2_hit; + int id, t1_clean, t2_clean; + ErrorContextCallback *errcxtold; + + id = StrategyControl->listHead[STRAT_LIST_T1]; + t1_clean = 0; + while (id >= 0) + { + if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) + break; + t1_clean++; + id = StrategyCDB[id].next; + } + id = StrategyControl->listHead[STRAT_LIST_T2]; + t2_clean = 0; + while (id >= 0) + { + if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) + break; + t2_clean++; + id = StrategyCDB[id].next; + } + + if (StrategyControl->num_lookup == 0) + { + all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0; + } + else + { + b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 / + StrategyControl->num_lookup); + t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 / + StrategyControl->num_lookup); + t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 / + StrategyControl->num_lookup); + b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 / + StrategyControl->num_lookup); + all_hit = b1_hit + t1_hit + t2_hit + b2_hit; + } + + errcxtold = error_context_stack; + error_context_stack = NULL; + elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d", + T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH); + elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%", + all_hit, b1_hit, t1_hit, t2_hit, b2_hit); + elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d", + t1_clean, t2_clean); + error_context_stack = errcxtold; + + StrategyControl->num_lookup = 0; + StrategyControl->num_hit[STRAT_LIST_B1] = 0; + StrategyControl->num_hit[STRAT_LIST_T1] = 0; + StrategyControl->num_hit[STRAT_LIST_T2] = 0; + StrategyControl->num_hit[STRAT_LIST_B2] = 0; + StrategyControl->stat_report = now; + } +} + /* * StrategyBufferLookup * * Lookup a page request in the cache directory. A buffer is only - * returned for a T1 or T2 cache hit. B1 and B2 hits are only - * remembered here to later affect the behaviour. + * returned for a T1 or T2 cache hit. B1 and B2 hits are just + * remembered here, to possibly affect the behaviour later. + * + * recheck indicates we are rechecking after I/O wait; do not change + * internal status in this case. 
+ * + * *cdb_found_index is set to the index of the found CDB, or -1 if none. + * This is not intended to be used by the caller, except to pass to + * StrategyReplaceBuffer(). */ BufferDesc * -StrategyBufferLookup(BufferTag *tagPtr, bool recheck) +StrategyBufferLookup(BufferTag *tagPtr, bool recheck, + int *cdb_found_index) { BufferStrategyCDB *cdb; - time_t now; + /* Optional stats printout */ if (DebugSharedBuffers > 0) - { - time(&now); - if (StrategyControl->stat_report + DebugSharedBuffers < now) - { - long all_hit, b1_hit, t1_hit, t2_hit, b2_hit; - int id, t1_clean, t2_clean; - ErrorContextCallback *errcxtold; - - id = StrategyControl->listHead[STRAT_LIST_T1]; - t1_clean = 0; - while (id >= 0) - { - if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) - break; - t1_clean++; - id = StrategyCDB[id].next; - } - id = StrategyControl->listHead[STRAT_LIST_T2]; - t2_clean = 0; - while (id >= 0) - { - if (BufferDescriptors[StrategyCDB[id].buf_id].flags & BM_DIRTY) - break; - t2_clean++; - id = StrategyCDB[id].next; - } - - if (StrategyControl->num_lookup == 0) - { - all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0; - } - else - { - b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 / - StrategyControl->num_lookup); - t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 / - StrategyControl->num_lookup); - t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 / - StrategyControl->num_lookup); - b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 / - StrategyControl->num_lookup); - all_hit = b1_hit + t1_hit + t2_hit + b2_hit; - } - - errcxtold = error_context_stack; - error_context_stack = NULL; - elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d", - T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH); - elog(DEBUG1, "ARC total =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%", - all_hit, b1_hit, t1_hit, t2_hit, b2_hit); - elog(DEBUG1, "ARC clean buffers at LRU T1= %5d T2= %5d", - t1_clean, t2_clean); - error_context_stack = errcxtold; - - StrategyControl->num_lookup = 0; - StrategyControl->num_hit[STRAT_LIST_B1] = 0; - StrategyControl->num_hit[STRAT_LIST_T1] = 0; - StrategyControl->num_hit[STRAT_LIST_T2] = 0; - StrategyControl->num_hit[STRAT_LIST_B2] = 0; - StrategyControl->stat_report = now; - } - } + StrategyStatsDump(); /* * Count lookups @@ -216,72 +214,34 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) /* * Lookup the block in the shared hash table */ - strategy_cdb_found = BufTableLookup(tagPtr); + *cdb_found_index = BufTableLookup(tagPtr); /* - * Handle CDB lookup miss + * Done if complete CDB lookup miss */ - if (strategy_cdb_found < 0) - { - if (!recheck) - { - /* - * This is an initial lookup and we have a complete - * cache miss (block found nowhere). This means we - * remember according to the current T1 size and the - * target T1 size from where we take a block if we - * need one later. 
- */ - if (T1_LENGTH >= MAX(1, T1_TARGET)) - strategy_get_from = STRAT_LIST_T1; - else - strategy_get_from = STRAT_LIST_T2; - } - - /* - * Do the cost accounting for vacuum - */ - if (VacuumCostActive) - VacuumCostBalance += VacuumCostPageMiss; - - /* report cache miss */ + if (*cdb_found_index < 0) return NULL; - } /* * We found a CDB */ - cdb = &StrategyCDB[strategy_cdb_found]; + cdb = &StrategyCDB[*cdb_found_index]; /* * Count hits */ StrategyControl->num_hit[cdb->list]++; - if (VacuumCostActive) - VacuumCostBalance += VacuumCostPageHit; /* * If this is a T2 hit, we simply move the CDB to the * T2 MRU position and return the found buffer. + * + * A CDB in T2 cannot have t1_vacuum set, so we needn't check. However, + * if the current process is VACUUM then it doesn't promote to MRU. */ if (cdb->list == STRAT_LIST_T2) { - STRAT_LIST_REMOVE(cdb); - STRAT_MRU_INSERT(cdb, STRAT_LIST_T2); - - return &BufferDescriptors[cdb->buf_id]; - } - - /* - * If this is a T1 hit, we move the buffer to the T2 MRU - * only if another transaction had read it into T1. This is - * required because any UPDATE or DELETE in PostgreSQL does - * multiple ReadBuffer(), first during the scan, later during - * the heap_update() or heap_delete(). - */ - if (cdb->list == STRAT_LIST_T1) - { - if (!TransactionIdIsCurrentTransactionId(cdb->t1_xid)) + if (!strategy_hint_vacuum) { STRAT_LIST_REMOVE(cdb); STRAT_MRU_INSERT(cdb, STRAT_LIST_T2); @@ -291,19 +251,59 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) } /* - * In the case of a recheck we don't care about B1 or B2 hits here. - * The bufmgr does this call only to make sure noone faulted in the - * block while we where busy flushing another. Now for this really - * to end up as a B1 or B2 cache hit, we must have been flushing for - * quite some time as the block not only must have been read, but - * also traveled through the queue and evicted from the T cache again - * already. + * If this is a T1 hit, we move the buffer to the T2 MRU only if another + * transaction had read it into T1, *and* neither transaction is a VACUUM. + * This is required because any UPDATE or DELETE in PostgreSQL does + * multiple ReadBuffer(), first during the scan, later during the + * heap_update() or heap_delete(). Otherwise move to T1 MRU. VACUUM + * doesn't even get to make that happen. */ - if (recheck) + if (cdb->list == STRAT_LIST_T1) { - return NULL; + if (!strategy_hint_vacuum) + { + if (!cdb->t1_vacuum && + !TransactionIdIsCurrentTransactionId(cdb->t1_xid)) + { + STRAT_LIST_REMOVE(cdb); + STRAT_MRU_INSERT(cdb, STRAT_LIST_T2); + } + else + { + STRAT_LIST_REMOVE(cdb); + STRAT_MRU_INSERT(cdb, STRAT_LIST_T1); + /* + * If a non-VACUUM process references a page recently loaded + * by VACUUM, clear the stigma; the state will now be the + * same as if this process loaded it originally. + */ + if (cdb->t1_vacuum) + { + cdb->t1_xid = GetCurrentTransactionId(); + cdb->t1_vacuum = false; + } + } + } + + return &BufferDescriptors[cdb->buf_id]; } + /* + * In the case of a recheck we don't care about B1 or B2 hits here. + * The bufmgr does this call only to make sure no-one faulted in the + * block while we where busy flushing another; we don't want to doubly + * adjust the T1target. + * + * Now for this really to end up as a B1 or B2 cache hit, we must have + * been flushing for quite some time as the block not only must have been + * read, but also traveled through the queue and evicted from the T cache + * again already. + * + * VACUUM re-reads shouldn't adjust the target either. 
+ */ + if (recheck || strategy_hint_vacuum) + return NULL; + /* * Adjust the target size of the T1 cache depending on if this is * a B1 or B2 hit. @@ -316,8 +316,8 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) * small. Adjust the T1 target size and continue * below. */ - T1_TARGET = MIN(T1_TARGET + MAX(B2_LENGTH / B1_LENGTH, 1), - Data_Descriptors); + T1_TARGET = Min(T1_TARGET + Max(B2_LENGTH / B1_LENGTH, 1), + NBuffers); break; case STRAT_LIST_B2: @@ -325,26 +325,17 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) * B2 hit means that the T2 cache is probably too * small. Adjust the T1 target size and continue * below. - */ - T1_TARGET = MAX(T1_TARGET - MAX(B1_LENGTH / B2_LENGTH, 1), 0); + */ + T1_TARGET = Max(T1_TARGET - Max(B1_LENGTH / B2_LENGTH, 1), 0); break; default: - elog(ERROR, "Buffer hash table corrupted - CDB on list %d found", - cdb->list); + elog(ERROR, "buffer hash table corrupted: CDB->list = %d", + cdb->list); } /* - * Decide where to take from if we will be out of - * free blocks later in StrategyGetBuffer(). - */ - if (T1_LENGTH >= MAX(1, T1_TARGET)) - strategy_get_from = STRAT_LIST_T1; - else - strategy_get_from = STRAT_LIST_T2; - - /* - * Even if we had seen the block in the past, it's data is + * Even though we had seen the block in the past, its data is * not currently in memory ... cache miss to the bufmgr. */ return NULL; @@ -357,18 +348,25 @@ StrategyBufferLookup(BufferTag *tagPtr, bool recheck) * Called by the bufmgr to get the next candidate buffer to use in * BufferAlloc(). The only hard requirement BufferAlloc() has is that * this buffer must not currently be pinned. + * + * *cdb_replace_index is set to the index of the candidate CDB, or -1 if + * none (meaning we are using a previously free buffer). This is not + * intended to be used by the caller, except to pass to + * StrategyReplaceBuffer(). */ BufferDesc * -StrategyGetBuffer(void) +StrategyGetBuffer(int *cdb_replace_index) { int cdb_id; BufferDesc *buf; if (StrategyControl->listFreeBuffers < 0) { - /* We don't have a free buffer, must take one from T1 or T2 */ - - if (strategy_get_from == STRAT_LIST_T1) + /* + * We don't have a free buffer, must take one from T1 or T2. + * Choose based on trying to converge T1len to T1target. + */ + if (T1_LENGTH >= Max(1, T1_TARGET)) { /* * We should take the first unpinned buffer from T1. @@ -379,7 +377,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1); return buf; } @@ -387,7 +385,7 @@ StrategyGetBuffer(void) } /* - * No unpinned T1 buffer found - pardon T2 cache. + * No unpinned T1 buffer found - try T2 cache. */ cdb_id = StrategyControl->listHead[STRAT_LIST_T2]; while (cdb_id >= 0) @@ -395,7 +393,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2); return buf; } @@ -405,7 +403,7 @@ StrategyGetBuffer(void) /* * No unpinned buffers at all!!! 
*/ - elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers"); + elog(ERROR, "no unpinned buffers available"); } else { @@ -418,7 +416,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2); return buf; } @@ -426,7 +424,7 @@ StrategyGetBuffer(void) } /* - * No unpinned T2 buffer found - pardon T1 cache. + * No unpinned T2 buffer found - try T1 cache. */ cdb_id = StrategyControl->listHead[STRAT_LIST_T1]; while (cdb_id >= 0) @@ -434,7 +432,7 @@ StrategyGetBuffer(void) buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id]; if (buf->refcount == 0) { - strategy_cdb_replace = cdb_id; + *cdb_replace_index = cdb_id; Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1); return buf; } @@ -444,7 +442,7 @@ StrategyGetBuffer(void) /* * No unpinned buffers at all!!! */ - elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers"); + elog(ERROR, "no unpinned buffers available"); } } else @@ -459,13 +457,13 @@ StrategyGetBuffer(void) * that there will never be any reason to recheck. Otherwise * we would leak shared buffers here! */ - strategy_cdb_replace = -1; + *cdb_replace_index = -1; buf = &BufferDescriptors[StrategyControl->listFreeBuffers]; StrategyControl->listFreeBuffers = buf->bufNext; buf->bufNext = -1; - /* Buffer of freelist cannot be pinned */ + /* Buffer in freelist cannot be pinned */ Assert(buf->refcount == 0); Assert(!(buf->flags & BM_DIRTY)); @@ -480,54 +478,59 @@ StrategyGetBuffer(void) /* * StrategyReplaceBuffer * - * Called by the buffer manager to inform us that he possibly flushed - * a buffer and is now about to replace the content. Prior to this call, + * Called by the buffer manager to inform us that he flushed a buffer + * and is now about to replace the content. Prior to this call, * the cache algorithm still reports the buffer as in the cache. After * this call we report the new block, even if IO might still need to - * start. + * be done to bring in the new content. + * + * cdb_found_index and cdb_replace_index must be the auxiliary values + * returned by previous calls to StrategyBufferLookup and StrategyGetBuffer. 
@@ -480,54 +478,59 @@ StrategyGetBuffer(void)
 
 /*
  * StrategyReplaceBuffer
  *
- * Called by the buffer manager to inform us that he possibly flushed
- * a buffer and is now about to replace the content. Prior to this call,
+ * Called by the buffer manager to inform us that it flushed a buffer
+ * and is now about to replace the content. Prior to this call,
  * the cache algorithm still reports the buffer as in the cache. After
  * this call we report the new block, even if IO might still need to
- * start.
+ * be done to bring in the new content.
+ *
+ * cdb_found_index and cdb_replace_index must be the auxiliary values
+ * returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.
  */
 void
-StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
+StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+                      int cdb_found_index, int cdb_replace_index)
 {
     BufferStrategyCDB *cdb_found;
     BufferStrategyCDB *cdb_replace;
 
-    if (strategy_cdb_found >= 0)
+    if (cdb_found_index >= 0)
     {
-        /* This was a ghost buffer cache hit (B1 or B2) */
-        cdb_found = &StrategyCDB[strategy_cdb_found];
+        /* This must have been a ghost buffer cache hit (B1 or B2) */
+        cdb_found = &StrategyCDB[cdb_found_index];
 
         /* Assert that the buffer remembered in cdb_found is the one */
         /* the buffer manager is currently faulting in */
-        Assert(BUFFERTAG_EQUALS(&(cdb_found->buf_tag), rnode, blockNum));
+        Assert(BUFFERTAGS_EQUAL(&(cdb_found->buf_tag), newTag));
 
-        if (strategy_cdb_replace >= 0)
+        if (cdb_replace_index >= 0)
         {
             /* We are satisfying it with an evicted T buffer */
-            cdb_replace = &StrategyCDB[strategy_cdb_replace];
+            cdb_replace = &StrategyCDB[cdb_replace_index];
 
             /* Assert that the buffer remembered in cdb_replace is */
             /* the one the buffer manager has just evicted */
             Assert(cdb_replace->list == STRAT_LIST_T1 ||
-                    cdb_replace->list == STRAT_LIST_T2);
+                   cdb_replace->list == STRAT_LIST_T2);
             Assert(cdb_replace->buf_id == buf->buf_id);
             Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
 
-            /* If this was a T1 buffer faulted in by vacuum, just */
-            /* do not cause the CDB end up in the B1 list, so that */
-            /* the vacuum scan does not affect T1_target adjusting */
-            if (strategy_hint_vacuum)
+            /*
+             * Under normal circumstances we move the evicted T list entry to
+             * the corresponding B list. However, T1 entries that exist only
+             * because of VACUUM are just thrown into the unused list instead.
+             * We don't expect them to be touched again by the VACUUM, and if
+             * we put them into B1 then VACUUM would skew T1_target adjusting.
+             */
+            if (cdb_replace->t1_vacuum)
             {
                 BufTableDelete(&(cdb_replace->buf_tag));
                 STRAT_LIST_REMOVE(cdb_replace);
-                cdb_replace->buf_id = -1;
                 cdb_replace->next = StrategyControl->listUnusedCDB;
-                StrategyControl->listUnusedCDB = strategy_cdb_replace;
+                StrategyControl->listUnusedCDB = cdb_replace_index;
             }
             else
             {
-                /* Under normal circumstances move the evicted */
-                /* T list entry to it's corresponding B list */
                 if (cdb_replace->list == STRAT_LIST_T1)
                 {
                     STRAT_LIST_REMOVE(cdb_replace);
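The same demotion logic runs a second time in the cache-miss branch below, so
it helps to read it in isolation. The helper here is not part of the patch; it
merely restates, under invented names, what happens to the evicted CDB in both
branches:

    static void
    demote_evicted_cdb(BufferStrategyCDB *cdb, int cdb_index)
    {
        if (cdb->t1_vacuum)
        {
            /* page cached only for VACUUM: forget it rather than ghost it */
            BufTableDelete(&(cdb->buf_tag));
            STRAT_LIST_REMOVE(cdb);
            cdb->next = StrategyControl->listUnusedCDB;
            StrategyControl->listUnusedCDB = cdb_index;
        }
        else if (cdb->list == STRAT_LIST_T1)
        {
            /* evicted from T1: keep it as a recency ghost in B1 */
            STRAT_LIST_REMOVE(cdb);
            STRAT_MRU_INSERT(cdb, STRAT_LIST_B1);
        }
        else
        {
            /* evicted from T2: keep it as a frequency ghost in B2 */
            STRAT_LIST_REMOVE(cdb);
            STRAT_MRU_INSERT(cdb, STRAT_LIST_B2);
        }
        /* in every case the CDB stops owning a data buffer */
        cdb->buf_id = -1;
    }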
@@ -539,25 +542,26 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
                     STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
                 }
             }
-            /* And clear it's block reference */
+            /* And clear its block reference */
             cdb_replace->buf_id = -1;
         }
         else
         {
-            /* or we satisfy it with an unused buffer */
+            /* We are satisfying it with an unused buffer */
         }
 
-        /* Now the found B CDB get's the buffer and is moved to T2 */
+        /* Now the found B CDB gets the buffer and is moved to T2 */
         cdb_found->buf_id = buf->buf_id;
         STRAT_LIST_REMOVE(cdb_found);
         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
     }
     else
     {
-        /* This was a complete cache miss, so we need to create */
-        /* a new CDB. The goal is to keep T1len+B1len <= c */
-
-        if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= Data_Descriptors)
+        /*
+         * This was a complete cache miss, so we need to create
+         * a new CDB. The goal is to keep T1len+B1len <= c.
+         */
+        if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= NBuffers)
         {
             /* So if B1 isn't empty and T1len+B1len >= c we take B1-LRU */
             cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
@@ -587,18 +591,20 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
         }
     }
 
-    /* Set the CDB's buf_tag and insert the hash key */
-    INIT_BUFFERTAG(&(cdb_found->buf_tag), rnode, blockNum);
+    /* Set the CDB's buf_tag and insert it into the hash table */
+    cdb_found->buf_tag = *newTag;
     BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));
 
-    if (strategy_cdb_replace >= 0)
+    if (cdb_replace_index >= 0)
     {
-        /* The buffer was formerly in a T list, move it's CDB
-         * to the corresponding B list */
-        cdb_replace = &StrategyCDB[strategy_cdb_replace];
+        /*
+         * The buffer was formerly in a T list, move its CDB
+         * to the corresponding B list
+         */
+        cdb_replace = &StrategyCDB[cdb_replace_index];
 
         Assert(cdb_replace->list == STRAT_LIST_T1 ||
-                cdb_replace->list == STRAT_LIST_T2);
+               cdb_replace->list == STRAT_LIST_T2);
         Assert(cdb_replace->buf_id == buf->buf_id);
         Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
@@ -612,32 +618,32 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
             STRAT_LIST_REMOVE(cdb_replace);
             STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
         }
-        /* And clear it's block reference */
+        /* And clear its block reference */
         cdb_replace->buf_id = -1;
     }
     else
     {
-        /* or we satisfy it with an unused buffer */
+        /* We are satisfying it with an unused buffer */
     }
 
     /* Assign the buffer id to the new CDB */
     cdb_found->buf_id = buf->buf_id;
 
     /*
-     * Specialized VACUUM optimization. If this "complete cache miss"
-     * happened because vacuum needed the page, we want it later on
-     * to be placed at the LRU instead of the MRU position of T1.
+     * Specialized VACUUM optimization. If this complete cache miss
+     * happened because vacuum needed the page, we place it at the LRU
+     * position of T1; normally it goes at the MRU position.
      */
     if (strategy_hint_vacuum)
     {
-        if (strategy_vacuum_xid != GetCurrentTransactionId())
+        if (TransactionIdIsCurrentTransactionId(strategy_vacuum_xid))
+            STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
+        else
         {
+            /* VACUUM must have been aborted by error, reset flag */
             strategy_hint_vacuum = false;
             STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
         }
-        else
-            STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
-    }
     else
         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
@@ -645,8 +651,10 @@ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
     /*
      * Remember the Xid when this buffer went onto T1 to avoid
      * a single UPDATE promoting a newcomer straight into T2.
+     * Also remember if it was loaded for VACUUM.
     */
     cdb_found->t1_xid = GetCurrentTransactionId();
+    cdb_found->t1_vacuum = strategy_hint_vacuum;
     }
 }
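StrategyReplaceBuffer() can trust strategy_hint_vacuum only because of the
transaction-id cross-check above: a VACUUM that dies with an error never gets
to reset the hint, but its xid stops being the current one. From VACUUM's side
the intended usage is simply to bracket the scan; the function below is a
hypothetical illustration, not a call site touched by this patch:

    static void
    vacuum_relation_sketch(Relation onerel)
    {
        StrategyHintVacuum(true);   /* lookups now count as vacuum-driven */

        /* ... scan onerel block by block through the buffer manager ... */

        StrategyHintVacuum(false);  /* restore the normal policy */
    }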
@@ -673,8 +681,7 @@ StrategyInvalidateBuffer(BufferDesc *buf)
      */
     cdb_id = BufTableLookup(&(buf->tag));
     if (cdb_id < 0)
-        elog(ERROR, "StrategyInvalidateBuffer() buffer %d not in directory",
-             buf->buf_id);
+        elog(ERROR, "buffer %d not in buffer hash table", buf->buf_id);
     cdb = &StrategyCDB[cdb_id];
 
     /*
@@ -694,7 +701,7 @@ StrategyInvalidateBuffer(BufferDesc *buf)
     StrategyControl->listUnusedCDB = cdb_id;
 
     /*
-     * Clear out the buffers tag and add it to the list of
+     * Clear out the buffer's tag and add it to the list of
      * currently unused buffers.
      */
     CLEAR_BUFFERTAG(&(buf->tag));
@@ -702,7 +709,9 @@ StrategyInvalidateBuffer(BufferDesc *buf)
     StrategyControl->listFreeBuffers = buf->buf_id;
 }
 
-
+/*
+ * StrategyHintVacuum -- tell us whether VACUUM is active
+ */
 void
 StrategyHintVacuum(bool vacuum_active)
 {
@@ -710,9 +719,24 @@ StrategyHintVacuum(bool vacuum_active)
     strategy_vacuum_xid = GetCurrentTransactionId();
 }
 
-
+/*
+ * StrategyDirtyBufferList
+ *
+ * Returns a list of dirty buffers, in priority order for writing.
+ * Note that the caller may choose not to write them all.
+ *
+ * The caller must beware of the possibility that a buffer is no longer dirty,
+ * or even contains a different page, by the time it is reached. If it no
+ * longer contains the same page it need not be written, even if it is (again)
+ * dirty.
+ *
+ * Buffer pointers are stored into buffers[], and corresponding tags into
+ * buftags[], both of size max_buffers. The function returns the number of
+ * buffer IDs stored.
+ */
 int
-StrategyDirtyBufferList(int *buffer_list, int max_buffers)
+StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
+                        int max_buffers)
 {
     int         num_buffer_dirty = 0;
     int         cdb_id_t1;
@@ -724,13 +748,13 @@ StrategyDirtyBufferList(int *buffer_list, int max_buffers)
      * Traverse the T1 and T2 list LRU to MRU in "parallel"
      * and add all dirty buffers found in that order to the list.
      * The ARC strategy keeps all used buffers including pinned ones
-     * in the T1 or T2 list. So we cannot loose any dirty buffers.
+     * in the T1 or T2 list. So we cannot miss any dirty buffers.
      */
     cdb_id_t1 = StrategyControl->listHead[STRAT_LIST_T1];
     cdb_id_t2 = StrategyControl->listHead[STRAT_LIST_T2];
 
     while ((cdb_id_t1 >= 0 || cdb_id_t2 >= 0) &&
-            num_buffer_dirty < max_buffers)
+           num_buffer_dirty < max_buffers)
     {
         if (cdb_id_t1 >= 0)
         {
@@ -741,7 +765,9 @@ StrategyDirtyBufferList(int *buffer_list, int max_buffers)
             {
                 if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
                 {
-                    buffer_list[num_buffer_dirty++] = buf_id;
+                    buffers[num_buffer_dirty] = buf;
+                    buftags[num_buffer_dirty] = buf->tag;
+                    num_buffer_dirty++;
                 }
             }
 
@@ -757,7 +783,9 @@ StrategyDirtyBufferList(int *buffer_list, int max_buffers)
             {
                 if ((buf->flags & BM_DIRTY) || (buf->cntxDirty))
                 {
-                    buffer_list[num_buffer_dirty++] = buf_id;
+                    buffers[num_buffer_dirty] = buf;
+                    buftags[num_buffer_dirty] = buf->tag;
+                    num_buffer_dirty++;
                 }
             }
 
@@ -785,16 +813,16 @@ StrategyInitialize(bool init)
     /*
      * Initialize the shared CDB lookup hashtable
      */
-    InitBufTable(Data_Descriptors * 2);
+    InitBufTable(NBuffers * 2);
 
     /*
      * Get or create the shared strategy control block and the CDB's
     */
     StrategyControl = (BufferStrategyControl *)
-        ShmemInitStruct("Buffer Strategy Status",
-                        sizeof(BufferStrategyControl) +
-                        sizeof(BufferStrategyCDB) * (Data_Descriptors * 2 - 1),
-                        &found);
+        ShmemInitStruct("Buffer Strategy Status",
+                        sizeof(BufferStrategyControl) +
+                        sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1),
+                        &found);
     StrategyCDB = &(StrategyControl->cdb[0]);
 
     if (!found)
@@ -805,8 +833,8 @@ StrategyInitialize(bool init)
         Assert(init);
 
         /*
-         * Grab the whole linked list of free buffers for our
-         * strategy
+         * Grab the whole linked list of free buffers for our strategy.
+         * We assume it was previously set up by InitBufferPool().
          */
         StrategyControl->listFreeBuffers = 0;
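The comment on StrategyDirtyBufferList() above puts the revalidation burden on
the caller, which is exactly why the tags are now returned alongside the
buffer pointers. A sketch of a conforming consumer, with invented names
(flush_some_dirty_buffers, write_buffer) and an arbitrary batch size:

    static void write_buffer(BufferDesc *buf);  /* stand-in for the real flush */

    #define SKETCH_MAX_DIRTY 128                /* arbitrary batch size */

    static void
    flush_some_dirty_buffers(void)
    {
        BufferDesc *bufs[SKETCH_MAX_DIRTY];
        BufferTag   tags[SKETCH_MAX_DIRTY];
        int         n;
        int         i;

        n = StrategyDirtyBufferList(bufs, tags, SKETCH_MAX_DIRTY);
        for (i = 0; i < n; i++)
        {
            BufferDesc *buf = bufs[i];

            /* the buffer may hold a different page by now: skip it */
            if (!BUFFERTAGS_EQUAL(&(buf->tag), &tags[i]))
                continue;
            /* it may also have been cleaned already: skip it too */
            if (!(buf->flags & BM_DIRTY) && !buf->cntxDirty)
                continue;
            write_buffer(buf);
        }
    }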
@@ -814,7 +842,7 @@ StrategyInitialize(bool init)
          * We start off with a target T1 list size of
          * half the available cache blocks.
          */
-        StrategyControl->target_T1_size = Data_Descriptors / 2;
+        StrategyControl->target_T1_size = NBuffers / 2;
 
         /*
          * Initialize B1, T1, T2 and B2 lists to be empty
@@ -832,14 +860,14 @@ StrategyInitialize(bool init)
         /*
          * All CDB's are linked as the listUnusedCDB
          */
-        for (i = 0; i < Data_Descriptors * 2; i++)
+        for (i = 0; i < NBuffers * 2; i++)
         {
             StrategyCDB[i].next = i + 1;
             StrategyCDB[i].list = STRAT_LIST_UNUSED;
             CLEAR_BUFFERTAG(&(StrategyCDB[i].buf_tag));
             StrategyCDB[i].buf_id = -1;
         }
-        StrategyCDB[Data_Descriptors * 2 - 1].next = -1;
+        StrategyCDB[NBuffers * 2 - 1].next = -1;
         StrategyControl->listUnusedCDB = 0;
     }
     else
@@ -847,91 +875,3 @@ StrategyInitialize(bool init)
         Assert(!init);
     }
 }
-
-
-#undef PinBuffer
-
-/*
- * PinBuffer -- make buffer unavailable for replacement.
- *
- * This should be applied only to shared buffers, never local ones.
- * Bufmgr lock must be held by caller.
- */
-void
-PinBuffer(BufferDesc *buf)
-{
-    int         b = BufferDescriptorGetBuffer(buf) - 1;
-
-    if (PrivateRefCount[b] == 0)
-        buf->refcount++;
-    PrivateRefCount[b]++;
-    Assert(PrivateRefCount[b] > 0);
-}
-
-#ifdef NOT_USED
-void
-PinBuffer_Debug(char *file, int line, BufferDesc *buf)
-{
-    PinBuffer(buf);
-    if (ShowPinTrace)
-    {
-        Buffer      buffer = BufferDescriptorGetBuffer(buf);
-
-        fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
-refcount = %ld, file: %s, line: %d\n",
-                buffer, buf->blind.relname, buf->tag.blockNum,
-                PrivateRefCount[buffer - 1], file, line);
-    }
-}
-#endif
-
-#undef UnpinBuffer
-
-/*
- * UnpinBuffer -- make buffer available for replacement.
- *
- * This should be applied only to shared buffers, never local ones.
- * Bufmgr lock must be held by caller.
- */
-void
-UnpinBuffer(BufferDesc *buf)
-{
-    int         b = BufferDescriptorGetBuffer(buf) - 1;
-
-    Assert(buf->refcount > 0);
-    Assert(PrivateRefCount[b] > 0);
-    PrivateRefCount[b]--;
-    if (PrivateRefCount[b] == 0)
-        buf->refcount--;
-
-    if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
-        buf->refcount == 1)
-    {
-        /* we just released the last pin other than the waiter's */
-        buf->flags &= ~BM_PIN_COUNT_WAITER;
-        ProcSendSignal(buf->wait_backend_id);
-    }
-    else
-    {
-        /* do nothing */
-    }
-}
-
-#ifdef NOT_USED
-void
-UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
-{
-    UnpinBuffer(buf);
-    if (ShowPinTrace)
-    {
-        Buffer      buffer = BufferDescriptorGetBuffer(buf);
-
-        fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
-refcount = %ld, file: %s, line: %d\n",
-                buffer, buf->blind.relname, buf->tag.blockNum,
-                PrivateRefCount[buffer - 1], file, line);
-    }
-}
-#endif
-
-
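StrategyInitialize() above allocates one control struct whose trailing cdb[]
array is sized for 2*NBuffers entries, plus a lookup hash table of the same
capacity. Restated as plain arithmetic; strategy_shmem_size() is an invented
name, and the real estimate is presumably folded into BufferShmemSize(), which
the ipci.c hunk below consults:

    static Size
    strategy_shmem_size(void)
    {
        Size        size;

        /* the control struct already embeds one CDB, hence the "- 1" */
        size = sizeof(BufferStrategyControl);
        size += sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1);

        /* the CDB lookup hash table holds 2 * NBuffers entries */
        size += hash_estimate_size(NBuffers * 2, sizeof(BufferLookupEnt));

        return size;
    }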
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ac738d8f77..3e8c2a6c1b 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.65 2004/02/25 19:41:22 momjian Exp $
+ *    $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.66 2004/04/19 23:27:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,7 +60,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
      * moderately-accurate estimates for the big hogs, plus 100K for the
      * stuff that's too small to bother with estimating.
      */
-    size = BufferShmemSize();
+    size = hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
+    size += BufferShmemSize();
     size += LockShmemSize(maxBackends);
     size += XLOGShmemSize();
     size += CLOGShmemSize();
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index f401791d93..fc83396ff0 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.68 2004/02/12 15:06:56 wieck Exp $
+ * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.69 2004/04/19 23:27:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,15 +21,6 @@
 #include "storage/lwlock.h"
 
-/* Buf Mgr constants */
-/* in bufmgr.c */
-extern int  Data_Descriptors;
-extern int  Free_List_Descriptor;
-extern int  Lookup_List_Descriptor;
-extern int  Num_Descriptors;
-
-extern int  ShowPinTrace;
-
 /*
  * Flags for buffer descriptors
  */
@@ -51,10 +42,13 @@ typedef bits16 BufFlags;
  * that the backend flushing the buffer doesn't even believe the relation is
  * visible yet (its xact may have started before the xact that created the
  * rel). The storage manager must be able to cope anyway.
+ *
+ * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
+ * to be fixed to zero them, since this struct is used as a hash key.
  */
 typedef struct buftag
 {
-    RelFileNode rnode;
+    RelFileNode rnode;          /* physical relation identifier */
     BlockNumber blockNum;       /* blknum relative to begin of reln */
 } BufferTag;
 
@@ -71,12 +65,6 @@ typedef struct buftag
     (a)->rnode = (xx_reln)->rd_node \
 )
 
-#define BUFFERTAG_EQUALS(a,xx_reln,xx_blockNum) \
-( \
-    (a)->rnode.tblNode == (xx_reln)->rd_node.tblNode && \
-    (a)->rnode.relNode == (xx_reln)->rd_node.relNode && \
-    (a)->blockNum == (xx_blockNum) \
-)
 #define BUFFERTAGS_EQUAL(a,b) \
 ( \
     (a)->rnode.tblNode == (b)->rnode.tblNode && \
@@ -93,7 +81,7 @@ typedef struct sbufdesc
     Buffer      bufNext;        /* link in freelist chain */
     SHMEM_OFFSET data;          /* pointer to data in buf pool */
 
-    /* tag and id must be together for table lookup */
+    /* tag and id must be together for table lookup (still true?) */
     BufferTag   tag;            /* file/block identifier */
     int         buf_id;         /* buffer's index number (from 0) */
 
@@ -108,7 +96,7 @@ typedef struct sbufdesc
     /*
      * We can't physically remove items from a disk page if another
      * backend has the buffer pinned. Hence, a backend may need to wait
-     * for all other pins to go away. This is signaled by setting its own
+     * for all other pins to go away. This is signaled by storing its own
      * backend ID into wait_backend_id and setting flag bit
      * BM_PIN_COUNT_WAITER. At present, there can be only one such waiter
      * per buffer.
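The pad-byte note added to BufferTag above matters because the tag is hashed
and compared as raw bytes. If padding ever appears, a memset-based initializer
is the usual fix; the macro below only illustrates that note and is not a
macro from the tree:

    #define INIT_BUFFERTAG_SAFE(a, xx_reln, xx_blockNum) \
    do { \
        memset((a), 0, sizeof(BufferTag));  /* zero any pad bytes */ \
        (a)->rnode = (xx_reln)->rd_node; \
        (a)->blockNum = (xx_blockNum); \
    } while (0)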
@@ -128,17 +116,17 @@ typedef struct sbufdesc
 #define BL_IO_IN_PROGRESS   (1 << 0)    /* unimplemented */
 #define BL_PIN_COUNT_LOCK   (1 << 1)
 
-/* entry for buffer hashtable */
+/* entry for buffer lookup hashtable */
 typedef struct
 {
-    BufferTag   key;
-    Buffer      id;
+    BufferTag   key;            /* Tag of a disk page */
+    int         id;             /* CDB id of associated CDB */
 } BufferLookupEnt;
 
 /*
  * Definitions for the buffer replacement strategy
  */
-#define STRAT_LIST_UNUSED   -1
+#define STRAT_LIST_UNUSED   (-1)
 #define STRAT_LIST_B1       0
 #define STRAT_LIST_T1       1
 #define STRAT_LIST_T2       2
@@ -150,12 +138,13 @@ typedef struct
  */
 typedef struct
 {
-    int         prev;           /* links in the queue */
+    int         prev;           /* list links */
     int         next;
-    int         list;           /* current list */
-    BufferTag   buf_tag;        /* buffer key */
-    Buffer      buf_id;         /* currently assigned data buffer */
+    short       list;           /* ID of list it is currently in */
+    bool        t1_vacuum;      /* t => present only because of VACUUM */
     TransactionId t1_xid;       /* the xid this entry went onto T1 */
+    BufferTag   buf_tag;        /* page identifier */
+    int         buf_id;         /* currently assigned data buffer, or -1 */
 } BufferStrategyCDB;
 
 /*
@@ -163,7 +152,6 @@ typedef struct
  */
 typedef struct
 {
-    int         target_T1_size; /* What T1 size are we aiming for */
     int         listUnusedCDB;  /* All unused StrategyCDB */
     int         listHead[STRAT_NUM_LISTS]; /* ARC lists B1, T1, T2 and B2 */
@@ -175,8 +163,10 @@ typedef struct
     long        num_hit[STRAT_NUM_LISTS];
     time_t      stat_report;
 
-    BufferStrategyCDB cdb[1];   /* The cache directory */
+    /* Array of CDB's starts here */
+    BufferStrategyCDB cdb[1];   /* VARIABLE SIZE ARRAY */
 } BufferStrategyControl;
+
 
 /* counters in buf_init.c */
 extern long int ReadBufferCount;
@@ -191,24 +181,25 @@ extern long int LocalBufferFlushCount;
  * Bufmgr Interface:
  */
 
-/* Internal routines: only called by buf.c */
+/* Internal routines: only called by bufmgr */
 
-/*freelist.c*/
-extern void PinBuffer(BufferDesc *buf);
-extern void UnpinBuffer(BufferDesc *buf);
-extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck);
-extern BufferDesc *StrategyGetBuffer(void);
-extern void StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum);
+/* freelist.c */
+extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck,
+                                        int *cdb_found_index);
+extern BufferDesc *StrategyGetBuffer(int *cdb_replace_index);
+extern void StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
+                                  int cdb_found_index, int cdb_replace_index);
 extern void StrategyInvalidateBuffer(BufferDesc *buf);
 extern void StrategyHintVacuum(bool vacuum_active);
-extern int  StrategyDirtyBufferList(int *buffer_dirty, int max_buffers);
+extern int  StrategyDirtyBufferList(BufferDesc **buffers, BufferTag *buftags,
+                                    int max_buffers);
 extern void StrategyInitialize(bool init);
 
 /* buf_table.c */
 extern void InitBufTable(int size);
 extern int  BufTableLookup(BufferTag *tagPtr);
-extern bool BufTableInsert(BufferTag *tagPtr, Buffer buf_id);
-extern bool BufTableDelete(BufferTag *tagPtr);
+extern void BufTableInsert(BufferTag *tagPtr, int cdb_id);
+extern void BufTableDelete(BufferTag *tagPtr);
 
 /* bufmgr.c */
 extern BufferDesc *BufferDescriptors;
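With the revised declarations above, buf_table.c maps buffer tags to CDB
indexes rather than buffer IDs, and BufTableInsert/BufTableDelete no longer
return a success flag. An illustrative round trip, assuming it runs inside
freelist.c with the bufmgr lock held and cdb pointing at some StrategyCDB[]
entry:

    {
        int     cdb_id = (int) (cdb - StrategyCDB);
        int     found;

        BufTableInsert(&(cdb->buf_tag), cdb_id);    /* tag -> CDB index */

        found = BufTableLookup(&(cdb->buf_tag));
        Assert(found == cdb_id);

        BufTableDelete(&(cdb->buf_tag));            /* forget the mapping */
    }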