commit 282d2a03dd (parent bbf4fdc253)

HOT updates.  When we update a tuple without changing any of its indexed
columns, and the new version can be stored on the same heap page, we no
longer generate extra index entries for the new version.  Instead, index
searches follow the HOT-chain links to ensure they find the correct tuple
version.

In addition, this patch introduces the ability to "prune" dead tuples on a
per-page basis, without having to do a complete VACUUM pass to recover
space.  VACUUM is still needed to clean up dead index entries, however.

Pavan Deolasee, with help from a bunch of other people.
contrib/pgstattuple/pgstattuple.c
@@ -1,5 +1,5 @@
 /*
- * $PostgreSQL: pgsql/contrib/pgstattuple/pgstattuple.c,v 1.29 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/contrib/pgstattuple/pgstattuple.c,v 1.30 2007/09/20 17:56:30 tgl Exp $
  *
  * Copyright (c) 2001,2002 Tatsuo Ishii
  *
@@ -290,7 +290,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
 	{
 		buffer = ReadBuffer(rel, block);
 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
-		stat.free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
+		stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 		ReleaseBuffer(buffer);
 		block++;
@@ -301,7 +301,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
 	while (block < nblocks)
 	{
 		buffer = ReadBuffer(rel, block);
-		stat.free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
+		stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
 		ReleaseBuffer(buffer);
 		block++;
 	}
doc/src/sgml/catalogs.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.157 2007/09/05 18:10:47 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.158 2007/09/20 17:56:30 tgl Exp $ -->
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
 -->
@@ -2565,6 +2565,29 @@
      </entry>
     </row>
 
+    <row>
+     <entry><structfield>indcheckxmin</structfield></entry>
+     <entry><type>bool</type></entry>
+     <entry></entry>
+     <entry>
+      If true, queries must not use the index until the <structfield>xmin</>
+      of this <structname>pg_index</> row is below their TransactionXmin
+      event horizon, because the table may contain broken HOT chains with
+      incompatible rows that they can see
+     </entry>
+    </row>
+
+    <row>
+     <entry><structfield>indisready</structfield></entry>
+     <entry><type>bool</type></entry>
+     <entry></entry>
+     <entry>
+      If true, the index is currently ready for inserts. False means the
+      index must be ignored by <command>INSERT</>/<command>UPDATE</>
+      operations
+     </entry>
+    </row>
+
     <row>
      <entry><structfield>indkey</structfield></entry>
      <entry><type>int2vector</type></entry>
@ -1,4 +1,4 @@
|
|||
<!-- $PostgreSQL: pgsql/doc/src/sgml/monitoring.sgml,v 1.51 2007/06/28 00:02:37 tgl Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/monitoring.sgml,v 1.52 2007/09/20 17:56:30 tgl Exp $ -->
|
||||
|
||||
<chapter id="monitoring">
|
||||
<title>Monitoring Database Activity</title>
|
||||
|
@ -276,6 +276,8 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
|
|||
scans, number of index scans initiated (over all indexes
|
||||
belonging to the table), number of live rows fetched by index
|
||||
scans, numbers of row insertions, updates, and deletions,
|
||||
number of row updates that were HOT (i.e., no separate index update),
|
||||
numbers of live and dead rows,
|
||||
the last time the table was vacuumed manually,
|
||||
the last time it was vacuumed by the autovacuum daemon,
|
||||
the last time it was analyzed manually,
|
||||
|
@ -580,7 +582,7 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
|
|||
<entry><literal><function>pg_stat_get_tuples_updated</function>(<type>oid</type>)</literal></entry>
|
||||
<entry><type>bigint</type></entry>
|
||||
<entry>
|
||||
Number of rows updated in table
|
||||
Number of rows updated in table (includes HOT updates)
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
|
@ -592,6 +594,30 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
|
|||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><literal><function>pg_stat_get_tuples_hot_updated</function>(<type>oid</type>)</literal></entry>
|
||||
<entry><type>bigint</type></entry>
|
||||
<entry>
|
||||
Number of rows HOT-updated in table
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><literal><function>pg_stat_get_live_tuples</function>(<type>oid</type>)</literal></entry>
|
||||
<entry><type>bigint</type></entry>
|
||||
<entry>
|
||||
Number of live rows in table
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><literal><function>pg_stat_get_dead_tuples</function>(<type>oid</type>)</literal></entry>
|
||||
<entry><type>bigint</type></entry>
|
||||
<entry>
|
||||
Number of dead rows in table
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><literal><function>pg_stat_get_blocks_fetched</function>(<type>oid</type>)</literal></entry>
|
||||
<entry><type>bigint</type></entry>
|
||||
|
@ -716,6 +742,18 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
|
|||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><literal><function>pg_stat_get_backend_xact_start</function>(<type>integer</type>)</literal></entry>
|
||||
<entry><type>timestamp with time zone</type></entry>
|
||||
<entry>
|
||||
The time at which the given server process' currently
|
||||
executing transaction was started, but only if the
|
||||
current user is a superuser or the same user as that of
|
||||
the session being queried (and
|
||||
<varname>stats_command_string</varname> is on)
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><literal><function>pg_stat_get_backend_start</function>(<type>integer</type>)</literal></entry>
|
||||
<entry><type>timestamp with time zone</type></entry>
|
||||
|
|
|
doc/src/sgml/ref/create_index.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/create_index.sgml,v 1.64 2007/09/07 00:58:56 tgl Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/create_index.sgml,v 1.65 2007/09/20 17:56:30 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -329,7 +329,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] <replaceable class="parameter">name</re
    </para>
 
    <para>
-    If a problem arises during the second scan of the table, such as a
+    In a concurrent index build, the index is actually entered into the
+    system catalogs in one transaction, then the two table scans occur in a
+    second and third transaction.
+    If a problem arises while scanning the table, such as a
     uniqueness violation in a unique index, the <command>CREATE INDEX</>
     command will fail but leave behind an <quote>invalid</> index. This index
     will be ignored for querying purposes because it might be incomplete;
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.8 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.9 2007/09/20 17:56:30 tgl Exp $
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
@ -359,7 +359,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd
|
|||
*prdata = rdata;
|
||||
data.updateBlkno = entryPreparePage(btree, page, off);
|
||||
|
||||
placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false);
|
||||
placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
|
||||
if (placed != off)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"",
|
||||
RelationGetRelationName(btree->index));
|
||||
|
@ -488,7 +488,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
|
|||
lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
|
||||
}
|
||||
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"",
|
||||
RelationGetRelationName(btree->index));
|
||||
ptr += MAXALIGN(IndexTupleSize(itup));
|
||||
|
@ -563,11 +563,11 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
|
|||
page = BufferGetPage(root);
|
||||
|
||||
itup = ginPageGetLinkItup(lbuf);
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index root page");
|
||||
|
||||
itup = ginPageGetLinkItup(rbuf);
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index root page");
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.16 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.17 2007/09/20 17:56:30 tgl Exp $
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
@ -544,7 +544,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
|
|||
itup = GinFormTuple(&gvs->ginstate, value, GinGetPosting(itup), newN);
|
||||
PageIndexTupleDelete(tmppage, i);
|
||||
|
||||
if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false) != i)
|
||||
if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"",
|
||||
RelationGetRelationName(gvs->index));
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.8 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.9 2007/09/20 17:56:30 tgl Exp $
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
@ -199,7 +199,7 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
|
|||
|
||||
itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
|
||||
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in %u/%u/%u",
|
||||
data->node.spcNode, data->node.dbNode, data->node.relNode);
|
||||
|
||||
|
@ -281,7 +281,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
|
|||
|
||||
for (i = 0; i < data->separator; i++)
|
||||
{
|
||||
if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in %u/%u/%u",
|
||||
data->node.spcNode, data->node.dbNode, data->node.relNode);
|
||||
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
|
||||
|
@ -289,7 +289,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
|
|||
|
||||
for (i = data->separator; i < data->nitem; i++)
|
||||
{
|
||||
if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in %u/%u/%u",
|
||||
data->node.spcNode, data->node.dbNode, data->node.relNode);
|
||||
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
|
||||
|
@ -375,7 +375,7 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
|
|||
|
||||
for (i = 0; i < data->nitem; i++)
|
||||
{
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in %u/%u/%u",
|
||||
data->node.spcNode, data->node.dbNode, data->node.relNode);
|
||||
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.146 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.147 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -366,7 +366,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
|
|||
data = (char *) (ptr->list);
|
||||
for (i = 0; i < ptr->block.num; i++)
|
||||
{
|
||||
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
|
||||
data += IndexTupleSize((IndexTuple) data);
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.23 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.24 2007/09/20 17:56:30 tgl Exp $
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
@ -42,7 +42,7 @@ gistfillbuffer(Relation r, Page page, IndexTuple *itup,
|
|||
for (i = 0; i < len; i++)
|
||||
{
|
||||
l = PageAddItem(page, (Item) itup[i], IndexTupleSize(itup[i]),
|
||||
off, false);
|
||||
off, false, false);
|
||||
if (l == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"",
|
||||
RelationGetRelationName(r));
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.31 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.32 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -201,7 +201,7 @@ vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon,
|
|||
data = (char *) (ptr->list);
|
||||
for (i = 0; i < ptr->block.num; i++)
|
||||
{
|
||||
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index));
|
||||
data += IndexTupleSize((IndexTuple) data);
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.46 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.47 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -200,7 +200,7 @@ _hash_pgaddtup(Relation rel,
|
|||
page = BufferGetPage(buf);
|
||||
|
||||
itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
|
||||
if (PageAddItem(page, (Item) itup, itemsize, itup_off, false)
|
||||
if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
|
||||
== InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add index item to \"%s\"",
|
||||
RelationGetRelationName(rel));
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.59 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.60 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Overflow pages look like ordinary relation pages.
|
||||
|
@ -684,7 +684,7 @@ _hash_squeezebucket(Relation rel,
|
|||
* we have found room so insert on the "write" page.
|
||||
*/
|
||||
woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
|
||||
if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false)
|
||||
if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false)
|
||||
== InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add index item to \"%s\"",
|
||||
RelationGetRelationName(rel));
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.69 2007/09/12 22:10:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.70 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Postgres hash pages look like ordinary relation pages. The opaque
|
||||
|
@ -830,7 +830,7 @@ _hash_splitbucket(Relation rel,
|
|||
}
|
||||
|
||||
noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
|
||||
if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false)
|
||||
if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false)
|
||||
== InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add index item to \"%s\"",
|
||||
RelationGetRelationName(rel));
|
||||
|
|
|
src/backend/access/heap/Makefile
@@ -4,7 +4,7 @@
 # Makefile for access/heap
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.16 2007/06/08 18:23:52 tgl Exp $
+#    $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.17 2007/09/20 17:56:30 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,7 +12,7 @@ subdir = src/backend/access/heap
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o
+OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o
 
 all: SUBSYS.o

src/backend/access/heap/README.HOT (new file)
@@ -0,0 +1,489 @@
$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.1 2007/09/20 17:56:30 tgl Exp $
|
||||
|
||||
Heap Only Tuples (HOT)
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
The Heap Only Tuple (HOT) feature eliminates redundant index entries and
|
||||
allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples
|
||||
without performing a table-wide vacuum. It does this by allowing
|
||||
single-page vacuuming, also called "defragmentation".
|
||||
|
||||
Note: there is a Glossary at the end of this document that may be helpful
|
||||
for first-time readers.
|
||||
|
||||
|
||||
Technical Challenges
|
||||
--------------------
|
||||
|
||||
Page-at-a-time vacuuming is normally impractical because of the costs of
|
||||
finding and removing the index entries that link to the tuples to be
|
||||
reclaimed. Standard vacuuming scans the indexes to ensure all such index
|
||||
entries are removed, amortizing the index scan cost across as many dead
|
||||
tuples as possible; this approach does not scale down well to the case of
|
||||
reclaiming just a few tuples. In principle one could recompute the index
|
||||
keys and do standard index searches to find the index entries, but this is
|
||||
risky in the presence of possibly-buggy user-defined functions in
|
||||
functional indexes. An allegedly immutable function that in fact is not
|
||||
immutable might prevent us from re-finding an index entry (and we cannot
|
||||
throw an error for not finding it, in view of the fact that dead index
|
||||
entries are sometimes reclaimed early). That would lead to a seriously
|
||||
corrupt index, in the form of entries pointing to tuple slots that by now
|
||||
contain some unrelated content. In any case we would prefer to be able
|
||||
to do vacuuming without invoking any user-written code.
|
||||
|
||||
HOT solves this problem for a restricted but useful special case:
|
||||
where a tuple is repeatedly updated in ways that do not change its
|
||||
indexed columns. (Here, "indexed column" means any column referenced
|
||||
at all in an index definition, including for example columns that are
|
||||
tested in a partial-index predicate but are not stored in the index.)
|
||||
|
||||
An additional property of HOT is that it reduces index size by avoiding
|
||||
the creation of identically-keyed index entries. This improves search
|
||||
speeds.
|
||||
|
||||
|
||||
Update Chains With a Single Index Entry
|
||||
---------------------------------------
|
||||
|
||||
Without HOT, every version of a row in an update chain has its own index
|
||||
entries, even if all indexed columns are the same. With HOT, a new tuple
|
||||
placed on the same page and with all indexed columns the same as its
|
||||
parent row version does not get new index entries. This means there is
|
||||
only one index entry for the entire update chain on the heap page.
|
||||
An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag.
|
||||
The prior row version is marked HEAP_HOT_UPDATED, and (as always in an
|
||||
update chain) its t_ctid field links forward to the newer version.
|
||||
|
||||
For example:
|
||||
|
||||
Index points to 1
|
||||
lp [1] [2]
|
||||
|
||||
[111111111]->[2222222222]
|
||||
|
||||
In the above diagram, the index points to line pointer 1, and tuple 1 is
|
||||
marked as HEAP_HOT_UPDATED. Tuple 2 is a HOT tuple, meaning it has
|
||||
no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE.
|
||||
Although tuple 2 is not directly referenced by the index, it can still be
|
||||
found by an index search: after traversing from the index to tuple 1,
|
||||
the index search proceeds forward to child tuples as long as it sees the
|
||||
HEAP_HOT_UPDATED flag set. Since we restrict the HOT chain to lie within
|
||||
a single page, this requires no additional page fetches and doesn't
|
||||
introduce much performance penalty.
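
(Illustrative sketch only: the real traversal is heap_hot_search_buffer()
in heapam.c, added later in this patch.  Here visible_to_snapshot() stands
in for HeapTupleSatisfiesVisibility(), and the xmin/xmax continuity check
and all_dead bookkeeping of the real code are omitted.)

	offnum = ItemPointerGetOffsetNumber(tid);	/* from the index entry */
	for (;;)
	{
		ItemId		lp = PageGetItemId(dp, offnum);

		if (ItemIdIsRedirected(lp))
		{
			offnum = ItemIdGetRedirect(lp);		/* follow root redirect */
			continue;
		}
		if (!ItemIdIsNormal(lp))
			break;						/* unused/dead: nothing to see */
		htup = (HeapTupleHeader) PageGetItem(dp, lp);
		if (visible_to_snapshot(htup, snapshot))
			return offnum;				/* this is the version to use */
		if (!HeapTupleHeaderIsHotUpdated(htup))
			break;						/* end of HOT chain */
		offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
	}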
|
||||
|
||||
Eventually, tuple 1 will no longer be visible to any transaction.
|
||||
At that point its space could be reclaimed, but its line pointer cannot,
|
||||
since the index still links to that line pointer and we still need to
|
||||
be able to find tuple 2 in an index search. HOT handles this by turning
|
||||
line pointer 1 into a "redirecting line pointer", which links to tuple 2
|
||||
but has no actual tuple attached. This state of affairs looks like
|
||||
|
||||
Index points to 1
|
||||
lp [1]->[2]
|
||||
|
||||
[2222222222]
|
||||
|
||||
If now the row is updated again, to version 3, the page looks like this:
|
||||
|
||||
Index points to 1
|
||||
lp [1]->[2] [3]
|
||||
|
||||
[2222222222]->[3333333333]
|
||||
|
||||
At some later time when no transaction can see tuple 2 in its snapshot,
|
||||
tuple 2 and its line pointer can be pruned entirely:
|
||||
|
||||
Index points to 1
|
||||
lp [1]------>[3]
|
||||
|
||||
[3333333333]
|
||||
|
||||
This is safe because no index entry points to line pointer 2. Subsequent
|
||||
insertions into the page can now recycle both line pointer 2 and the
|
||||
space formerly used by tuple 2.
|
||||
|
||||
If an update changes any indexed column, or there is not room on the
|
||||
same page for the new tuple, then the HOT chain ends: the last member
|
||||
has a regular t_ctid link to the next version and is not marked
|
||||
HEAP_HOT_UPDATED. (In principle we could continue a HOT chain across
|
||||
pages, but this would destroy the desired property of being able to
|
||||
reclaim space with just page-local manipulations. Anyway, we don't
|
||||
want to have to chase through multiple heap pages to get from an index
|
||||
entry to the desired tuple, so it seems better to create a new index
|
||||
entry for the new tuple.) If further updates occur, the next version
|
||||
could become the root of a new HOT chain.
|
||||
|
||||
Line pointer 1 has to remain as long as there is any non-dead member of
|
||||
the chain on the page. When there is not, it is marked "dead".
|
||||
This lets us reclaim the last child line pointer and associated tuple
|
||||
immediately. The next regular VACUUM pass can reclaim the index entries
|
||||
pointing at the line pointer and then the line pointer itself. Since a
|
||||
line pointer is small compared to a tuple, this does not represent an
|
||||
undue space cost.
|
||||
|
||||
Note: we can use a "dead" line pointer for any DELETEd tuple,
|
||||
whether it was part of a HOT chain or not. This allows space reclamation
|
||||
in advance of running VACUUM for plain DELETEs as well as HOT updates.
|
||||
|
||||
The requirement for doing a HOT update is that none of the indexed
|
||||
columns are changed. This is checked at execution time by comparing the
|
||||
binary representation of the old and new values. We insist on bitwise
|
||||
equality rather than using datatype-specific equality routines. The
|
||||
main reason to avoid the latter is that there might be multiple notions
|
||||
of equality for a datatype, and we don't know exactly which one is
|
||||
relevant for the indexes at hand. We assume that bitwise equality
|
||||
guarantees equality for all purposes.
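
As a minimal sketch of the per-column test this implies (the committed
code does this inside HeapSatisfiesHOTUpdate(); attnum, tupdesc, oldtup
and newtup are assumed to be in scope, and TOAST details are ignored):

	Form_pg_attribute att = tupdesc->attrs[attnum - 1];
	bool		isnull1, isnull2;
	Datum		val1, val2;

	val1 = heap_getattr(oldtup, attnum, tupdesc, &isnull1);
	val2 = heap_getattr(newtup, attnum, tupdesc, &isnull2);

	if (isnull1 != isnull2)
		return false;			/* not HOT-safe */
	if (!isnull1 &&
		!datumIsEqual(val1, val2, att->attbyval, att->attlen))
		return false;			/* binary mismatch => cold update */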
|
||||
|
||||
|
||||
Abort Cases
|
||||
-----------
|
||||
|
||||
If a heap-only tuple's xmin is aborted, then it can be removed immediately:
|
||||
it was never visible to any other transaction, and all descendant row
|
||||
versions must be aborted as well. Therefore we need not consider it part
|
||||
of a HOT chain. By the same token, if a HOT-updated tuple's xmax is
|
||||
aborted, there is no need to follow the chain link. However, there is a
|
||||
race condition here: the transaction that did the HOT update might abort
|
||||
between the time we inspect the HOT-updated tuple and the time we reach
|
||||
the descendant heap-only tuple. It is conceivable that someone prunes
|
||||
the heap-only tuple before that, and even conceivable that the line pointer
|
||||
is re-used for another purpose. Therefore, when following a HOT chain,
|
||||
it is always necessary to be prepared for the possibility that the
|
||||
linked-to item pointer is unused, dead, or redirected; and if it is a
|
||||
normal item pointer, we still have to check that XMIN of the tuple matches
|
||||
the XMAX of the tuple we left. Otherwise we should assume that we have
|
||||
come to the end of the HOT chain. Note that this sort of XMIN/XMAX
|
||||
matching is required when following ordinary update chains anyway.
|
||||
|
||||
(Early versions of the HOT code assumed that holding pin on the page
|
||||
buffer while following a HOT link would prevent this type of problem,
|
||||
but checking XMIN/XMAX matching is a much more robust solution.)
|
||||
|
||||
|
||||
Index/Sequential Scans
|
||||
----------------------
|
||||
|
||||
When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose
|
||||
xmax is not aborted, we need to follow its t_ctid link and check that
|
||||
entry as well; possibly repeatedly until we reach the end of the HOT
|
||||
chain. (When using an MVCC snapshot it is possible to optimize this a
|
||||
bit: there can be at most one visible tuple in the chain, so we can stop
|
||||
when we find it. This rule does not work for non-MVCC snapshots, though.)
|
||||
|
||||
Sequential scans do not need to pay attention to the HOT links because
|
||||
they scan every item pointer on the page anyway. The same goes for a
|
||||
bitmap heap scan with a lossy bitmap.
|
||||
|
||||
|
||||
Pruning
|
||||
-------
|
||||
|
||||
HOT pruning means updating item pointers so that HOT chains are
|
||||
reduced in length, by collapsing out line pointers for intermediate dead
|
||||
tuples. Although this makes those line pointers available for re-use,
|
||||
it does not immediately make the space occupied by their tuples available.
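
In this patch the opportunistic entry point for that is
heap_page_prune_opt(); for example, the sequential-scan path added to
heapgetpage() (see the heapam.c hunk later in this commit) simply does:

	/* Prune and repair fragmentation for the whole page, if possible. */
	heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);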
|
||||
|
||||
|
||||
Defragmentation
|
||||
---------------
|
||||
|
||||
Defragmentation centralizes unused space. After we have converted root
|
||||
line pointers to redirected line pointers and pruned away any dead
|
||||
intermediate line pointers, the tuples they linked to are free space.
|
||||
But unless that space is adjacent to the central "hole" on the page
|
||||
(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion.
|
||||
Defragmentation moves the surviving tuples to coalesce all the free
|
||||
space into one "hole". This is done with the same PageRepairFragmentation
|
||||
function that regular VACUUM uses.
|
||||
|
||||
|
||||
When can/should we prune or defragment?
|
||||
---------------------------------------
|
||||
|
||||
This is the most interesting question in HOT implementation, since there
|
||||
is no simple right answer: we must use heuristics to determine when it's
|
||||
most efficient to perform pruning and/or defragmenting.
|
||||
|
||||
We cannot prune or defragment unless we can get a "buffer cleanup lock"
|
||||
on the target page; otherwise, pruning might destroy line pointers that
|
||||
other backends have live references to, and defragmenting might move
|
||||
tuples that other backends have live pointers to. Thus the general
|
||||
approach must be to heuristically decide if we should try to prune
|
||||
or defragment, and if so try to acquire the buffer cleanup lock without
|
||||
blocking. If we succeed we can proceed with our housekeeping work.
|
||||
If we cannot get the lock (which should not happen often, except under
|
||||
very heavy contention) then the housekeeping has to be postponed till
|
||||
some other time. The worst-case consequence of this is only that an
|
||||
UPDATE cannot be made HOT but has to link to a new tuple version placed on
|
||||
some other page, for lack of centralized space on the original page.
|
||||
|
||||
Ideally we would do defragmenting only when we are about to attempt
|
||||
heap_update on a HOT-safe tuple. The difficulty with this approach
|
||||
is that the update query has certainly got a pin on the old tuple, and
|
||||
therefore our attempt to acquire a buffer cleanup lock will always fail.
|
||||
(This corresponds to the idea that we don't want to move the old tuple
|
||||
out from under where the query's HeapTuple pointer points. It might
|
||||
be possible to finesse that, but it seems fragile.)
|
||||
|
||||
Pruning, however, is potentially useful even when we are not about to
|
||||
insert a new tuple, since shortening a HOT chain reduces the cost of
|
||||
subsequent index searches. However it is unclear that this gain is
|
||||
large enough to accept any extra maintenance burden for.
|
||||
|
||||
The currently planned heuristic is to prune and defrag when first accessing
|
||||
a page that potentially has prunable tuples (flagged by the PD_PRUNABLE
|
||||
page hint bit) and that either has free space less than MAX(fillfactor
|
||||
target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to
|
||||
find enough free space to store an updated tuple version. (These rules
|
||||
are subject to change.)
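
A sketch of that decision, using this patch's page hint-bit macros (the
"minfree" variable here is a stand-in for the fillfactor-derived target,
and the pruning call itself is elided):

	if (PageIsPrunable(page) &&
		(PageIsFull(page) ||
		 PageGetHeapFreeSpace(page) < Max(minfree, BLCKSZ / 10)))
	{
		if (ConditionalLockBufferForCleanup(buffer))
		{
			/* prune and defragment, then drop the cleanup lock */
		}
		/* else skip housekeeping; a later UPDATE may have to go cold */
	}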
|
||||
|
||||
We have effectively implemented the "truncate dead tuples to just line
|
||||
pointer" idea that has been proposed and rejected before because of fear
|
||||
of line pointer bloat: we might end up with huge numbers of line pointers
|
||||
and just a few actual tuples on a page. To limit the damage in the worst
|
||||
case, and to keep various work arrays as well as the bitmaps in bitmap
|
||||
scans reasonably sized, the maximum number of line pointers per page
|
||||
is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that
|
||||
could fit without HOT pruning).
|
||||
|
||||
|
||||
VACUUM
|
||||
------
|
||||
|
||||
There is little change to regular vacuum. It performs pruning to remove
|
||||
dead heap-only tuples, and cleans up any dead line pointers as if they were
|
||||
regular dead tuples.
|
||||
|
||||
|
||||
VACUUM FULL
|
||||
-----------
|
||||
|
||||
VACUUM FULL performs an extra operation of collapsing out redirecting line
|
||||
pointers, by moving the first non-DEAD tuple of each HOT chain to the root
|
||||
position and clearing its heap-only-tuple flag. This effectively changes
|
||||
the user-visible CTID of that tuple. This would be completely unsafe
|
||||
during normal concurrent operation, but since VACUUM FULL takes full
|
||||
exclusive lock on the table, it should be OK. (Note that VACUUM FULL has
|
||||
always felt free to change tuples' CTIDs by moving them across pages.)
|
||||
Eliminating redirection links means that the main body of VACUUM FULL
|
||||
doesn't have to deal with them, which seems a good thing since VACUUM FULL
|
||||
is horrendously complex already.
|
||||
|
||||
When VACUUM FULL tries to move tuple chains, it does not distinguish regular
|
||||
and heap-only tuples, but just moves both types the same. This is OK because
|
||||
it will move the entire non-DEAD tail of an update chain and remove index
|
||||
entries for each item moved. At worst, we'll uselessly search for index
|
||||
entries matching the heap-only tuples included in the move.
|
||||
|
||||
|
||||
Statistics
|
||||
----------
|
||||
|
||||
Currently, we count HOT updates the same as cold updates for statistics
|
||||
purposes, though there is an additional per-table counter that counts
|
||||
only HOT updates. When a page pruning operation is able to remove a
|
||||
physical tuple by eliminating an intermediate heap-only tuple or
|
||||
replacing a physical root tuple by a redirect pointer, a decrement in
|
||||
the table's number of dead tuples is reported to pgstats, which may
|
||||
postpone autovacuuming. Note that we do not count replacing a root tuple
|
||||
by a DEAD item pointer as decrementing n_dead_tuples; we still want
|
||||
autovacuum to run to clean up the index entries and DEAD item.
|
||||
|
||||
This area probably needs further work ...
|
||||
|
||||
|
||||
CREATE INDEX
|
||||
------------
|
||||
|
||||
CREATE INDEX presents a problem for HOT updates. While the existing HOT
|
||||
chains all have the same index values for existing indexes, the columns
|
||||
in the new index might change within a pre-existing HOT chain, creating
|
||||
a "broken" chain that can't be indexed properly.
|
||||
|
||||
To address this issue, regular (non-concurrent) CREATE INDEX makes the
|
||||
new index usable only by transactions newer than the CREATE INDEX
|
||||
command. This prevents transactions that can see the inconsistent HOT
|
||||
chains from trying to use the new index and getting incorrect results.
|
||||
New transactions can only see the rows visible after the index was
|
||||
created, hence the HOT chains are consistent for them.
|
||||
|
||||
Entries in the new index point to root tuples (tuples with current index
|
||||
pointers) so that our index uses the same index pointers as all other
|
||||
indexes on the table. However the row we want to index is actually at
|
||||
the *end* of the chain, ie, the most recent live tuple on the HOT chain.
|
||||
That is the one we compute the index entry values for, but the TID
|
||||
we put into the index is that of the root tuple. Since transactions that
|
||||
will be allowed to use the new index cannot see any of the older tuple
|
||||
versions in the chain, the fact that they might not match the index entry
|
||||
isn't a problem. (Such transactions will check the tuple visibility
|
||||
information of the older versions and ignore them, without ever looking at
|
||||
their contents, so the content inconsistency is OK.) Subsequent updates
|
||||
to the live tuple will be allowed to extend the HOT chain only if they are
|
||||
HOT-safe for all the indexes.
|
||||
|
||||
Because we have ShareLock on the table, any DELETE_IN_PROGRESS or
|
||||
INSERT_IN_PROGRESS tuples should have come from our own transaction.
|
||||
Therefore we can consider them committed since if the CREATE INDEX
|
||||
commits, they will be committed, and if it aborts the index is discarded.
|
||||
An exception to this is that early lock release is customary for system
|
||||
catalog updates, and so we might find such tuples when reindexing a system
|
||||
catalog. In that case we deal with it by waiting for the source
|
||||
transaction to commit or roll back. (We could do that for user tables
|
||||
too, but since the case is unexpected we prefer to throw an error.)
|
||||
|
||||
Practically, we prevent old transactions from using the new index by
|
||||
setting pg_index.indcheckxmin to TRUE. Queries are allowed to use such an
|
||||
index only after pg_index.xmin is below their TransactionXmin horizon,
|
||||
thereby ensuring that any incompatible rows in HOT chains are dead to them.
|
||||
(pg_index.xmin will be the XID of the CREATE INDEX transaction. The reason
|
||||
for using xmin rather than a normal column is that the regular vacuum
|
||||
freezing mechanism will take care of converting xmin to FrozenTransactionId
|
||||
before it can wrap around.)
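
Roughly, the check a query must make before using such an index looks
like this sketch (indexForm and indexTuple are illustrative names for the
index's pg_index row data):

	if (indexForm->indcheckxmin &&
		!TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexTuple->t_data),
							   TransactionXmin))
		continue;			/* too new: ignore this index for now */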
|
||||
|
||||
This means in particular that the transaction creating the index will be
|
||||
unable to use the index. We alleviate that problem somewhat by not setting
|
||||
indcheckxmin unless the table actually contains HOT chains with
|
||||
RECENTLY_DEAD members. (In 8.4 we may be able to improve the situation,
|
||||
at least for non-serializable transactions, because we expect to be able to
|
||||
advance TransactionXmin intratransaction.)
|
||||
|
||||
Another unpleasant consequence is that it is now risky to use SnapshotAny
|
||||
in an index scan: if the index was created more recently than the last
|
||||
vacuum, it's possible that some of the visited tuples do not match the
|
||||
index entry they are linked to. This does not seem to be a fatal
|
||||
objection, since there are few users of SnapshotAny and most use seqscans.
|
||||
The only exception at this writing is CLUSTER, which is okay because it
|
||||
does not require perfect ordering of the indexscan readout (and especially
|
||||
so because CLUSTER tends to write recently-dead tuples out of order anyway).
|
||||
|
||||
|
||||
CREATE INDEX CONCURRENTLY
|
||||
-------------------------
|
||||
|
||||
In the concurrent case we must take a different approach. We create the
|
||||
pg_index entry immediately, before we scan the table. The pg_index entry
|
||||
is marked as "not ready for inserts". Then we commit and wait for any
|
||||
transactions which have the table open to finish. This ensures that no
|
||||
new HOT updates will change the key value for our new index, because all
|
||||
transactions will see the existence of the index and will respect its
|
||||
constraint on which updates can be HOT. Other transactions must include
|
||||
such an index when determining HOT-safety of updates, even though they
|
||||
must ignore it for both insertion and searching purposes.
|
||||
|
||||
We must do this to avoid making incorrect index entries. For example,
|
||||
suppose we are building an index on column X and we make an index entry for
|
||||
a non-HOT tuple with X=1. Then some other backend, unaware that X is an
|
||||
indexed column, HOT-updates the row to have X=2, and commits. We now have
|
||||
an index entry for X=1 pointing at a HOT chain whose live row has X=2.
|
||||
We could make an index entry with X=2 during the validation pass, but
|
||||
there is no nice way to get rid of the wrong entry with X=1. So we must
|
||||
have the HOT-safety property enforced before we start to build the new
|
||||
index.
|
||||
|
||||
After waiting for transactions which had the table open, we build the index
|
||||
for all rows that are valid in a fresh snapshot. Any tuples visible in the
|
||||
snapshot will have only valid forward-growing HOT chains. (They might have
|
||||
older HOT updates behind them which are broken, but this is OK for the same
|
||||
reason it's OK in a regular index build.) As above, we point the index
|
||||
entry at the root of the HOT-update chain but we use the key value from the
|
||||
live tuple.
|
||||
|
||||
We mark the index open for inserts (but still not ready for reads) then
|
||||
we again wait for transactions which have the table open. Then we take
|
||||
a second reference snapshot and validate the index. This searches for
|
||||
tuples missing from the index, and inserts any missing ones. Again,
|
||||
the index entries have to have TIDs equal to HOT-chain root TIDs, but
|
||||
the value to be inserted is the one from the live tuple.
|
||||
|
||||
Then we wait until every transaction that could have a snapshot older than
|
||||
the second reference snapshot is finished. This ensures that nobody is
|
||||
alive any longer who could need to see any tuples that might be missing
|
||||
from the index, as well as ensuring that no one can see any inconsistent
|
||||
rows in a broken HOT chain (the first condition is stronger than the
|
||||
second). Finally, we can mark the index valid for searches.
|
||||
|
||||
|
||||
Limitations and Restrictions
|
||||
----------------------------
|
||||
|
||||
It is worth noting that HOT forever forecloses alternative approaches
|
||||
to vacuuming, specifically the recompute-the-index-keys approach alluded
|
||||
to in Technical Challenges above. It'll be tough to recompute the index
|
||||
keys for a root line pointer you don't have data for anymore ...
|
||||
|
||||
|
||||
Glossary
|
||||
--------
|
||||
|
||||
Broken HOT Chain
|
||||
|
||||
A HOT chain in which the key value for an index has changed.
|
||||
|
||||
This is not allowed to occur normally but if a new index is created
|
||||
it can happen. In that case various strategies are used to ensure
|
||||
that no transaction for which the older tuples are visible can
|
||||
use the index.
|
||||
|
||||
Cold update
|
||||
|
||||
A normal, non-HOT update, in which index entries are made for
|
||||
the new version of the tuple.
|
||||
|
||||
Dead line pointer
|
||||
|
||||
A stub line pointer that does not point to anything, but cannot
be removed or reused yet because there are index pointers to it.
Semantically the same as a dead tuple.  It has state LP_DEAD.
|
||||
|
||||
Heap-only tuple
|
||||
|
||||
A heap tuple with no index pointers, which can only be reached
|
||||
from indexes indirectly through its ancestral root tuple.
|
||||
Marked with HEAP_ONLY_TUPLE flag.
|
||||
|
||||
HOT-safe
|
||||
|
||||
A proposed tuple update is said to be HOT-safe if it changes
|
||||
none of the tuple's indexed columns. It will only become an
|
||||
actual HOT update if we can find room on the same page for
|
||||
the new tuple version.
|
||||
|
||||
HOT update
|
||||
|
||||
An UPDATE where the new tuple becomes a heap-only tuple, and no
|
||||
new index entries are made.
|
||||
|
||||
HOT-updated tuple
|
||||
|
||||
An updated tuple, for which the next tuple in the chain is a
|
||||
heap-only tuple. Marked with HEAP_HOT_UPDATED flag.
|
||||
|
||||
Indexed column
|
||||
|
||||
A column used in an index definition. The column might not
|
||||
actually be stored in the index --- it could be used in a
|
||||
functional index's expression, or used in a partial index
|
||||
predicate. HOT treats all these cases alike.
|
||||
|
||||
Redirecting line pointer
|
||||
|
||||
A line pointer that points to another line pointer and has no
|
||||
associated tuple. It has the special lp_flags state LP_REDIRECT,
|
||||
and lp_off is the OffsetNumber of the line pointer it links to.
|
||||
This is used when a root tuple becomes dead but we cannot prune
|
||||
the line pointer because there are non-dead heap-only tuples
|
||||
further down the chain.
|
||||
|
||||
Root tuple
|
||||
|
||||
The first tuple in a HOT update chain; the one that indexes point to.
|
||||
|
||||
Update chain
|
||||
|
||||
A chain of updated tuples, in which each tuple's ctid points to
|
||||
the next tuple in the chain. A HOT update chain is an update chain
|
||||
(or portion of an update chain) that consists of a root tuple and
|
||||
one or more heap-only tuples. A complete update chain can contain
|
||||
both HOT and non-HOT (cold) updated tuples.
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.240 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.241 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
|
@ -52,6 +52,7 @@
|
|||
#include "pgstat.h"
|
||||
#include "storage/procarray.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "utils/datum.h"
|
||||
#include "utils/inval.h"
|
||||
#include "utils/lsyscache.h"
|
||||
#include "utils/relcache.h"
|
||||
|
@ -64,6 +65,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
|
|||
bool is_bitmapscan);
|
||||
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
|
||||
ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
|
||||
static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
|
||||
HeapTuple oldtup, HeapTuple newtup);
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
|
@ -183,6 +186,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
|
|||
buffer = scan->rs_cbuf;
|
||||
snapshot = scan->rs_snapshot;
|
||||
|
||||
/*
|
||||
* Prune and repair fragmentation for the whole page, if possible.
|
||||
*/
|
||||
heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
|
||||
|
||||
/*
|
||||
* We must hold share lock on the buffer content while examining tuple
|
||||
* visibility. Afterwards, however, the tuples we have found to be
|
||||
|
@ -316,7 +324,7 @@ heapgettup(HeapScanDesc scan,
|
|||
* forward scanners.
|
||||
*/
|
||||
scan->rs_syncscan = false;
|
||||
/* start from last page of the scan */
|
||||
/* start from last page of the scan */
|
||||
if (scan->rs_startblock > 0)
|
||||
page = scan->rs_startblock - 1;
|
||||
else
|
||||
|
@ -368,6 +376,7 @@ heapgettup(HeapScanDesc scan,
|
|||
dp = (Page) BufferGetPage(scan->rs_cbuf);
|
||||
lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
|
||||
lpp = PageGetItemId(dp, lineoff);
|
||||
Assert(ItemIdIsNormal(lpp));
|
||||
|
||||
tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
|
||||
tuple->t_len = ItemIdGetLength(lpp);
|
||||
|
@ -583,7 +592,7 @@ heapgettup_pagemode(HeapScanDesc scan,
|
|||
* forward scanners.
|
||||
*/
|
||||
scan->rs_syncscan = false;
|
||||
/* start from last page of the scan */
|
||||
/* start from last page of the scan */
|
||||
if (scan->rs_startblock > 0)
|
||||
page = scan->rs_startblock - 1;
|
||||
else
|
||||
|
@ -632,6 +641,7 @@ heapgettup_pagemode(HeapScanDesc scan,
|
|||
dp = (Page) BufferGetPage(scan->rs_cbuf);
|
||||
lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
|
||||
lpp = PageGetItemId(dp, lineoff);
|
||||
Assert(ItemIdIsNormal(lpp));
|
||||
|
||||
tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
|
||||
tuple->t_len = ItemIdGetLength(lpp);
|
||||
|
@ -1246,6 +1256,9 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
|
|||
* for statistical purposes. (This could be the heap rel itself, an
|
||||
* associated index, or NULL to not count the fetch at all.)
|
||||
*
|
||||
* heap_fetch does not follow HOT chains: only the exact TID requested will
|
||||
* be fetched.
|
||||
*
|
||||
* It is somewhat inconsistent that we ereport() on invalid block number but
|
||||
* return false on invalid item number. There are a couple of reasons though.
|
||||
* One is that the caller can relatively easily check the block number for
|
||||
|
@ -1389,6 +1402,143 @@ heap_release_fetch(Relation relation,
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
|
||||
*
|
||||
* On entry, *tid is the TID of a tuple (either a simple tuple, or the root
|
||||
* of a HOT chain), and buffer is the buffer holding this tuple. We search
|
||||
* for the first chain member satisfying the given snapshot. If one is
|
||||
* found, we update *tid to reference that tuple's offset number, and
|
||||
* return TRUE. If no match, return FALSE without modifying *tid.
|
||||
*
|
||||
* If all_dead is not NULL, we check non-visible tuples to see if they are
|
||||
* globally dead; *all_dead is set TRUE if all members of the HOT chain
|
||||
* are vacuumable, FALSE if not.
|
||||
*
|
||||
* Unlike heap_fetch, the caller must already have pin and (at least) share
|
||||
* lock on the buffer; it is still pinned/locked at exit. Also unlike
|
||||
* heap_fetch, we do not report any pgstats count; caller may do so if wanted.
|
||||
*/
|
||||
bool
|
||||
heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
|
||||
bool *all_dead)
|
||||
{
|
||||
Page dp = (Page) BufferGetPage(buffer);
|
||||
TransactionId prev_xmax = InvalidTransactionId;
|
||||
OffsetNumber offnum;
|
||||
bool at_chain_start;
|
||||
|
||||
if (all_dead)
|
||||
*all_dead = true;
|
||||
|
||||
Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
|
||||
offnum = ItemPointerGetOffsetNumber(tid);
|
||||
at_chain_start = true;
|
||||
|
||||
/* Scan through possible multiple members of HOT-chain */
|
||||
for (;;)
|
||||
{
|
||||
ItemId lp;
|
||||
HeapTupleData heapTuple;
|
||||
|
||||
/* check for bogus TID */
|
||||
if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
|
||||
break;
|
||||
|
||||
lp = PageGetItemId(dp, offnum);
|
||||
|
||||
/* check for unused, dead, or redirected items */
|
||||
if (!ItemIdIsNormal(lp))
|
||||
{
|
||||
/* We should only see a redirect at start of chain */
|
||||
if (ItemIdIsRedirected(lp) && at_chain_start)
|
||||
{
|
||||
/* Follow the redirect */
|
||||
offnum = ItemIdGetRedirect(lp);
|
||||
at_chain_start = false;
|
||||
continue;
|
||||
}
|
||||
/* else must be end of chain */
|
||||
break;
|
||||
}
|
||||
|
||||
heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
|
||||
heapTuple.t_len = ItemIdGetLength(lp);
|
||||
|
||||
/*
|
||||
* Shouldn't see a HEAP_ONLY tuple at chain start.
|
||||
*/
|
||||
if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple))
|
||||
break;
|
||||
|
||||
/*
|
||||
* The xmin should match the previous xmax value, else chain is broken.
|
||||
*/
|
||||
if (TransactionIdIsValid(prev_xmax) &&
|
||||
!TransactionIdEquals(prev_xmax,
|
||||
HeapTupleHeaderGetXmin(heapTuple.t_data)))
|
||||
break;
|
||||
|
||||
/* If it's visible per the snapshot, we must return it */
|
||||
if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
|
||||
{
|
||||
ItemPointerSetOffsetNumber(tid, offnum);
|
||||
if (all_dead)
|
||||
*all_dead = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we can't see it, maybe no one else can either. At caller
|
||||
* request, check whether all chain members are dead to all
|
||||
* transactions.
|
||||
*/
|
||||
if (all_dead && *all_dead &&
|
||||
HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin,
|
||||
buffer) != HEAPTUPLE_DEAD)
|
||||
*all_dead = false;
|
||||
|
||||
/*
|
||||
* Check to see if HOT chain continues past this tuple; if so
|
||||
* fetch the next offnum and loop around.
|
||||
*/
|
||||
if (HeapTupleIsHotUpdated(&heapTuple))
|
||||
{
|
||||
Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) ==
|
||||
ItemPointerGetBlockNumber(tid));
|
||||
offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
|
||||
at_chain_start = false;
|
||||
prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data);
|
||||
}
|
||||
else
|
||||
break; /* end of chain */
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* heap_hot_search - search HOT chain for tuple satisfying snapshot
|
||||
*
|
||||
* This has the same API as heap_hot_search_buffer, except that the caller
|
||||
* does not provide the buffer containing the page, rather we access it
|
||||
* locally.
|
||||
*/
|
||||
bool
|
||||
heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
|
||||
bool *all_dead)
|
||||
{
|
||||
bool result;
|
||||
Buffer buffer;
|
||||
|
||||
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
|
||||
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
||||
result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||
ReleaseBuffer(buffer);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* heap_get_latest_tid - get the latest tid of a specified tuple
|
||||
*
|
||||
|
@ -1594,6 +1744,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
|
|||
}
|
||||
|
||||
tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
|
||||
tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
|
||||
tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
|
||||
HeapTupleHeaderSetXmin(tup->t_data, xid);
|
||||
HeapTupleHeaderSetCmin(tup->t_data, cid);
|
||||
|
@ -1628,6 +1779,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
|
|||
|
||||
RelationPutHeapTuple(relation, buffer, heaptup);
|
||||
|
||||
/*
|
||||
* XXX Should we set PageSetPrunable on this page ?
|
||||
*
|
||||
* The inserting transaction may eventually abort thus making this tuple
|
||||
* DEAD and hence available for pruning. Though we don't want to optimize
|
||||
* for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
|
||||
* aborted tuple will never be pruned until next vacuum is triggered.
|
||||
*
|
||||
* If you do add PageSetPrunable here, add it in heap_xlog_insert too.
|
||||
*/
|
||||
|
||||
MarkBufferDirty(buffer);
|
||||
|
||||
/* XLOG stuff */
|
||||
|
@ -1904,12 +2066,21 @@ l1:
|
|||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
/*
|
||||
* If this transaction commits, the tuple will become DEAD sooner or
|
||||
* later. Set hint bit that this page is a candidate for pruning. If
|
||||
* the transaction finally aborts, the subsequent page pruning will be
|
||||
* a no-op and the hint will be cleared.
|
||||
*/
|
||||
PageSetPrunable((Page) dp);
|
||||
|
||||
/* store transaction information of xact deleting the tuple */
|
||||
tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
|
||||
HEAP_XMAX_INVALID |
|
||||
HEAP_XMAX_IS_MULTI |
|
||||
HEAP_IS_LOCKED |
|
||||
HEAP_MOVED);
|
||||
HeapTupleHeaderClearHotUpdated(tp.t_data);
|
||||
HeapTupleHeaderSetXmax(tp.t_data, xid);
|
||||
HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
|
||||
/* Make sure there is no forward chain link in t_ctid */
|
||||
|
@ -2045,7 +2216,8 @@ simple_heap_delete(Relation relation, ItemPointer tid)
|
|||
*
|
||||
* On success, the header fields of *newtup are updated to match the new
|
||||
* stored tuple; in particular, newtup->t_self is set to the TID where the
|
||||
* new tuple was inserted. However, any TOAST changes in the new tuple's
|
||||
* new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
|
||||
* update was done. However, any TOAST changes in the new tuple's
|
||||
* data are not reflected into *newtup.
|
||||
*
|
||||
* In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
|
||||
|
@ -2060,6 +2232,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
|
|||
{
|
||||
HTSU_Result result;
|
||||
TransactionId xid = GetCurrentTransactionId();
|
||||
Bitmapset *hot_attrs;
|
||||
ItemId lp;
|
||||
HeapTupleData oldtup;
|
||||
HeapTuple heaptup;
|
||||
|
@ -2072,9 +2245,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
|
|||
pagefree;
|
||||
bool have_tuple_lock = false;
|
||||
bool iscombo;
|
||||
bool use_hot_update = false;
|
||||
|
||||
Assert(ItemPointerIsValid(otid));
|
||||
|
||||
    /*
     * Fetch the list of attributes to be checked for HOT update. This is
     * wasted effort if we fail to update or have to put the new tuple on
     * a different page. But we must compute the list before obtaining
     * buffer lock --- in the worst case, if we are doing an update on one
     * of the relevant system catalogs, we could deadlock if we try to
     * fetch the list later. In any case, the relcache caches the data
     * so this is usually pretty cheap.
     *
     * Note that we get a copy here, so we need not worry about relcache
     * flush happening midway through.
     */
    hot_attrs = RelationGetIndexAttrBitmap(relation);
|
||||
|
||||
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
|
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
|
@ -2208,6 +2396,7 @@ l2:
|
|||
UnlockReleaseBuffer(buffer);
|
||||
if (have_tuple_lock)
|
||||
UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
|
||||
bms_free(hot_attrs);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -2227,6 +2416,7 @@ l2:
|
|||
}
|
||||
|
||||
newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
|
||||
newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
|
||||
newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
|
||||
HeapTupleHeaderSetXmin(newtup->t_data, xid);
|
||||
HeapTupleHeaderSetCmin(newtup->t_data, cid);
|
||||
|
@ -2261,17 +2451,20 @@ l2:
|
|||
HeapTupleHasExternal(newtup) ||
|
||||
newtup->t_len > TOAST_TUPLE_THRESHOLD);
|
||||
|
||||
pagefree = PageGetFreeSpace((Page) dp);
|
||||
pagefree = PageGetHeapFreeSpace((Page) dp);
|
||||
|
||||
newtupsize = MAXALIGN(newtup->t_len);
|
||||
|
||||
if (need_toast || newtupsize > pagefree)
|
||||
{
|
||||
/* Clear obsolete visibility flags ... */
|
||||
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
|
||||
HEAP_XMAX_INVALID |
|
||||
HEAP_XMAX_IS_MULTI |
|
||||
HEAP_IS_LOCKED |
|
||||
HEAP_MOVED);
|
||||
HeapTupleClearHotUpdated(&oldtup);
|
||||
/* ... and store info about transaction updating this tuple */
|
||||
HeapTupleHeaderSetXmax(oldtup.t_data, xid);
|
||||
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
|
||||
/* temporarily make it look not-updated */
|
||||
|
@ -2324,7 +2517,7 @@ l2:
|
|||
/* Re-acquire the lock on the old tuple's page. */
|
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
||||
/* Re-check using the up-to-date free space */
|
||||
pagefree = PageGetFreeSpace((Page) dp);
|
||||
pagefree = PageGetHeapFreeSpace((Page) dp);
|
||||
if (newtupsize > pagefree)
|
||||
{
|
||||
/*
|
||||
|
@ -2357,18 +2550,66 @@ l2:
|
|||
* one pin is held.
|
||||
*/
|
||||
|
||||
    if (newbuf == buffer)
    {
        /*
         * Since the new tuple is going into the same page, we might be able
         * to do a HOT update. Check if any of the index columns have been
         * changed. If not, then HOT update is possible.
         */
        if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
            use_hot_update = true;
    }
    else
    {
        /* Set a hint that the old page could use prune/defrag */
        PageSetFull(dp);
    }
|
||||
|
||||
/* NO EREPORT(ERROR) from here till changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
    /*
     * If this transaction commits, the old tuple will become DEAD sooner or
     * later. Set hint bit that this page is a candidate for pruning. If
     * the transaction finally aborts, the subsequent page pruning will be
     * a no-op and the hint will be cleared.
     *
     * XXX Should we set hint on newbuf as well? If the transaction
     * aborts, there would be a prunable tuple in the newbuf; but for now
     * we choose not to optimize for aborts. Note that heap_xlog_update
     * must be kept in sync if this changes.
     */
    PageSetPrunable(dp);
|
||||
|
||||
    if (use_hot_update)
    {
        /* Mark the old tuple as HOT-updated */
        HeapTupleSetHotUpdated(&oldtup);
        /* And mark the new tuple as heap-only */
        HeapTupleSetHeapOnly(heaptup);
        /* Mark the caller's copy too, in case different from heaptup */
        HeapTupleSetHeapOnly(newtup);
    }
    else
    {
        /* Make sure tuples are correctly marked as not-HOT */
        HeapTupleClearHotUpdated(&oldtup);
        HeapTupleClearHeapOnly(heaptup);
        HeapTupleClearHeapOnly(newtup);
    }
|
||||
|
||||
RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */
|
||||
|
||||
if (!already_marked)
|
||||
{
|
||||
/* Clear obsolete visibility flags ... */
|
||||
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
|
||||
HEAP_XMAX_INVALID |
|
||||
HEAP_XMAX_IS_MULTI |
|
||||
HEAP_IS_LOCKED |
|
||||
HEAP_MOVED);
|
||||
/* ... and store info about transaction updating this tuple */
|
||||
HeapTupleHeaderSetXmax(oldtup.t_data, xid);
|
||||
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
|
||||
}
|
||||
|
@ -2427,7 +2668,7 @@ l2:
|
|||
if (have_tuple_lock)
|
||||
UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
|
||||
|
||||
pgstat_count_heap_update(relation);
|
||||
pgstat_count_heap_update(relation, use_hot_update);
|
||||
|
||||
/*
|
||||
* If heaptup is a private copy, release it. Don't forget to copy t_self
|
||||
|
@ -2439,9 +2680,119 @@ l2:
|
|||
heap_freetuple(heaptup);
|
||||
}
|
||||
|
||||
bms_free(hot_attrs);
|
||||
|
||||
return HeapTupleMayBeUpdated;
|
||||
}
|
||||
|
||||
/*
 * Check if the specified attribute's value is same in both given tuples.
 * Subroutine for HeapSatisfiesHOTUpdate.
 */
static bool
heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
                       HeapTuple tup1, HeapTuple tup2)
{
    Datum       value1, value2;
    bool        isnull1, isnull2;
    Form_pg_attribute att;

    /*
     * If it's a whole-tuple reference, say "not equal". It's not really
     * worth supporting this case, since it could only succeed after a
     * no-op update, which is hardly a case worth optimizing for.
     */
    if (attrnum == 0)
        return false;

    /*
     * Likewise, automatically say "not equal" for any system attribute
     * other than OID and tableOID; we cannot expect these to be consistent
     * in a HOT chain, or even to be set correctly yet in the new tuple.
     */
    if (attrnum < 0)
    {
        if (attrnum != ObjectIdAttributeNumber &&
            attrnum != TableOidAttributeNumber)
            return false;
    }

    /*
     * Extract the corresponding values. XXX this is pretty inefficient
     * if there are many indexed columns. Should HeapSatisfiesHOTUpdate
     * do a single heap_deform_tuple call on each tuple, instead? But
     * that doesn't work for system columns ...
     */
    value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
    value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);

    /*
     * If one value is NULL and other is not, then they are certainly
     * not equal
     */
    if (isnull1 != isnull2)
        return false;

    /*
     * If both are NULL, they can be considered equal.
     */
    if (isnull1)
        return true;

    /*
     * We do simple binary comparison of the two datums. This may be overly
     * strict because there can be multiple binary representations for the
     * same logical value. But we should be OK as long as there are no false
     * positives. Using a type-specific equality operator is messy because
     * there could be multiple notions of equality in different operator
     * classes; furthermore, we cannot safely invoke user-defined functions
     * while holding exclusive buffer lock.
     */
    if (attrnum <= 0)
    {
        /* The only allowed system columns are OIDs, so do this */
        return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
    }
    else
    {
        Assert(attrnum <= tupdesc->natts);
        att = tupdesc->attrs[attrnum - 1];
        return datumIsEqual(value1, value2, att->attbyval, att->attlen);
    }
}
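
For illustration only (hypothetical, not from the patch): the raw-datum comparison above behaves like this for a by-value type. A by-reference value that merely has a different physical representation would compare unequal, which only costs a HOT opportunity; that is the safe direction, since false positives are what must be avoided.

    Datum       d1 = Int32GetDatum(42);
    Datum       d2 = Int32GetDatum(42);

    /* typByVal = true, typLen = sizeof(int32): compares the Datum values directly */
    Assert(datumIsEqual(d1, d2, true, sizeof(int32)));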
|
||||
|
||||
/*
 * Check if the old and new tuples represent a HOT-safe update. To be able
 * to do a HOT update, we must not have changed any columns used in index
 * definitions.
 *
 * The set of attributes to be checked is passed in (we dare not try to
 * compute it while holding exclusive buffer lock...) NOTE that hot_attrs
 * is destructively modified! That is OK since this is invoked at most once
 * by heap_update().
 *
 * Returns true if safe to do HOT update.
 */
static bool
HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
                       HeapTuple oldtup, HeapTuple newtup)
{
    int         attrnum;

    while ((attrnum = bms_first_member(hot_attrs)) >= 0)
    {
        /* Adjust for system attributes */
        attrnum += FirstLowInvalidHeapAttributeNumber;

        /* If the attribute value has changed, we can't do HOT update */
        if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
                                    oldtup, newtup))
            return false;
    }

    return true;
}
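
The attribute numbers popped from hot_attrs are offset by FirstLowInvalidHeapAttributeNumber so that system columns (which have negative attnums) fit in a Bitmapset. A small hypothetical sketch of that convention, with all variable names invented for illustration:

    Bitmapset  *attrs = NULL;
    int         attno;

    /* record that user column 3 is referenced by some index */
    attrs = bms_add_member(attrs, 3 - FirstLowInvalidHeapAttributeNumber);

    /* consume members the same way HeapSatisfiesHOTUpdate does */
    while ((attno = bms_first_member(attrs)) >= 0)
    {
        attno += FirstLowInvalidHeapAttributeNumber;    /* back to a real attnum */
        /* ... compare old and new values of column "attno" here ... */
    }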
|
||||
|
||||
/*
|
||||
* simple_heap_update - replace a tuple
|
||||
*
|
||||
|
@ -2865,6 +3216,7 @@ l3:
|
|||
* avoids possibly generating a useless combo CID.
|
||||
*/
|
||||
tuple->t_data->t_infomask = new_infomask;
|
||||
HeapTupleHeaderClearHotUpdated(tuple->t_data);
|
||||
HeapTupleHeaderSetXmax(tuple->t_data, xid);
|
||||
/* Make sure there is no forward chain link in t_ctid */
|
||||
tuple->t_data->t_ctid = *tid;
|
||||
|
@ -3110,6 +3462,7 @@ recheck_xmax:
|
|||
*/
|
||||
tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
|
||||
tuple->t_infomask |= HEAP_XMAX_INVALID;
|
||||
HeapTupleHeaderClearHotUpdated(tuple);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
@ -3245,21 +3598,29 @@ heap_restrpos(HeapScanDesc scan)
|
|||
* Perform XLogInsert for a heap-clean operation. Caller must already
|
||||
* have modified the buffer and marked it dirty.
|
||||
*
|
||||
* Note: for historical reasons, the entries in the unused[] array should
|
||||
* be zero-based tuple indexes, not one-based.
|
||||
* Note: prior to Postgres 8.3, the entries in the nowunused[] array were
|
||||
* zero-based tuple indexes. Now they are one-based like other uses
|
||||
* of OffsetNumber.
|
||||
*/
|
||||
XLogRecPtr
|
||||
log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
|
||||
log_heap_clean(Relation reln, Buffer buffer,
|
||||
OffsetNumber *redirected, int nredirected,
|
||||
OffsetNumber *nowdead, int ndead,
|
||||
OffsetNumber *nowunused, int nunused,
|
||||
bool redirect_move)
|
||||
{
|
||||
xl_heap_clean xlrec;
|
||||
uint8 info;
|
||||
XLogRecPtr recptr;
|
||||
XLogRecData rdata[2];
|
||||
XLogRecData rdata[4];
|
||||
|
||||
/* Caller should not call me on a temp relation */
|
||||
Assert(!reln->rd_istemp);
|
||||
|
||||
xlrec.node = reln->rd_node;
|
||||
xlrec.block = BufferGetBlockNumber(buffer);
|
||||
xlrec.nredirected = nredirected;
|
||||
xlrec.ndead = ndead;
|
||||
|
||||
rdata[0].data = (char *) &xlrec;
|
||||
rdata[0].len = SizeOfHeapClean;
|
||||
|
@ -3267,14 +3628,17 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
|
|||
rdata[0].next = &(rdata[1]);
|
||||
|
||||
/*
|
||||
* The unused-offsets array is not actually in the buffer, but pretend
|
||||
* that it is. When XLogInsert stores the whole buffer, the offsets array
|
||||
* need not be stored too.
|
||||
* The OffsetNumber arrays are not actually in the buffer, but we pretend
|
||||
* that they are. When XLogInsert stores the whole buffer, the offset
|
||||
* arrays need not be stored too. Note that even if all three arrays
|
||||
* are empty, we want to expose the buffer as a candidate for whole-page
|
||||
* storage, since this record type implies a defragmentation operation
|
||||
* even if no item pointers changed state.
|
||||
*/
|
||||
if (uncnt > 0)
|
||||
if (nredirected > 0)
|
||||
{
|
||||
rdata[1].data = (char *) unused;
|
||||
rdata[1].len = uncnt * sizeof(OffsetNumber);
|
||||
rdata[1].data = (char *) redirected;
|
||||
rdata[1].len = nredirected * sizeof(OffsetNumber) * 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -3283,9 +3647,38 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
|
|||
}
|
||||
rdata[1].buffer = buffer;
|
||||
rdata[1].buffer_std = true;
|
||||
rdata[1].next = NULL;
|
||||
rdata[1].next = &(rdata[2]);
|
||||
|
||||
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata);
|
||||
if (ndead > 0)
|
||||
{
|
||||
rdata[2].data = (char *) nowdead;
|
||||
rdata[2].len = ndead * sizeof(OffsetNumber);
|
||||
}
|
||||
else
|
||||
{
|
||||
rdata[2].data = NULL;
|
||||
rdata[2].len = 0;
|
||||
}
|
||||
rdata[2].buffer = buffer;
|
||||
rdata[2].buffer_std = true;
|
||||
rdata[2].next = &(rdata[3]);
|
||||
|
||||
if (nunused > 0)
|
||||
{
|
||||
rdata[3].data = (char *) nowunused;
|
||||
rdata[3].len = nunused * sizeof(OffsetNumber);
|
||||
}
|
||||
else
|
||||
{
|
||||
rdata[3].data = NULL;
|
||||
rdata[3].len = 0;
|
||||
}
|
||||
rdata[3].buffer = buffer;
|
||||
rdata[3].buffer_std = true;
|
||||
rdata[3].next = NULL;
|
||||
|
||||
info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN;
|
||||
recptr = XLogInsert(RM_HEAP2_ID, info, rdata);
|
||||
|
||||
return recptr;
|
||||
}
|
||||
|
@ -3293,8 +3686,6 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
|
|||
/*
|
||||
* Perform XLogInsert for a heap-freeze operation. Caller must already
|
||||
* have modified the buffer and marked it dirty.
|
||||
*
|
||||
* Unlike log_heap_clean(), the offsets[] entries are one-based.
|
||||
*/
|
||||
XLogRecPtr
|
||||
log_heap_freeze(Relation reln, Buffer buffer,
|
||||
|
@ -3363,17 +3754,28 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
|
|||
} xlhdr;
|
||||
int hsize = SizeOfHeapHeader;
|
||||
xl_heap_update xlrec;
|
||||
uint8 info;
|
||||
XLogRecPtr recptr;
|
||||
XLogRecData rdata[4];
|
||||
Page page = BufferGetPage(newbuf);
|
||||
uint8 info = (move) ? XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE;
|
||||
|
||||
/* Caller should not call me on a temp relation */
|
||||
Assert(!reln->rd_istemp);
|
||||
|
||||
if (move)
|
||||
{
|
||||
Assert(!HeapTupleIsHeapOnly(newtup));
|
||||
info = XLOG_HEAP_MOVE;
|
||||
}
|
||||
else if (HeapTupleIsHeapOnly(newtup))
|
||||
info = XLOG_HEAP_HOT_UPDATE;
|
||||
else
|
||||
info = XLOG_HEAP_UPDATE;
|
||||
|
||||
xlrec.target.node = reln->rd_node;
|
||||
xlrec.target.tid = from;
|
||||
xlrec.newtid = newtup->t_self;
|
||||
|
||||
rdata[0].data = (char *) &xlrec;
|
||||
rdata[0].len = SizeOfHeapUpdate;
|
||||
rdata[0].buffer = InvalidBuffer;
|
||||
|
@ -3489,13 +3891,21 @@ log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page)
|
|||
return recptr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handles CLEAN and CLEAN_MOVE record types
|
||||
*/
|
||||
static void
|
||||
heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
|
||||
heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
|
||||
{
|
||||
xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
|
||||
Relation reln;
|
||||
Buffer buffer;
|
||||
Page page;
|
||||
OffsetNumber *offnum;
|
||||
OffsetNumber *end;
|
||||
int nredirected;
|
||||
int ndead;
|
||||
int i;
|
||||
|
||||
if (record->xl_info & XLR_BKP_BLOCK_1)
|
||||
return;
|
||||
|
@ -3512,25 +3922,63 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
|
|||
return;
|
||||
}
|
||||
|
||||
if (record->xl_len > SizeOfHeapClean)
|
||||
nredirected = xlrec->nredirected;
|
||||
ndead = xlrec->ndead;
|
||||
offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
|
||||
end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
|
||||
|
||||
/* Update all redirected or moved line pointers */
|
||||
for (i = 0; i < nredirected; i++)
|
||||
{
|
||||
OffsetNumber *unused;
|
||||
OffsetNumber *unend;
|
||||
ItemId lp;
|
||||
OffsetNumber fromoff = *offnum++;
|
||||
OffsetNumber tooff = *offnum++;
|
||||
ItemId fromlp = PageGetItemId(page, fromoff);
|
||||
|
||||
unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
|
||||
unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
|
||||
|
||||
while (unused < unend)
|
||||
if (clean_move)
|
||||
{
|
||||
/* unused[] entries are zero-based */
|
||||
lp = PageGetItemId(page, *unused + 1);
|
||||
ItemIdSetUnused(lp);
|
||||
unused++;
|
||||
/* Physically move the "to" item to the "from" slot */
|
||||
ItemId tolp = PageGetItemId(page, tooff);
|
||||
HeapTupleHeader htup;
|
||||
|
||||
*fromlp = *tolp;
|
||||
ItemIdSetUnused(tolp);
|
||||
|
||||
/* We also have to clear the tuple's heap-only bit */
|
||||
Assert(ItemIdIsNormal(fromlp));
|
||||
htup = (HeapTupleHeader) PageGetItem(page, fromlp);
|
||||
Assert(HeapTupleHeaderIsHeapOnly(htup));
|
||||
HeapTupleHeaderClearHeapOnly(htup);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Just insert a REDIRECT link at fromoff */
|
||||
ItemIdSetRedirect(fromlp, tooff);
|
||||
}
|
||||
}
|
||||
|
||||
PageRepairFragmentation(page, NULL);
|
||||
/* Update all now-dead line pointers */
|
||||
for (i = 0; i < ndead; i++)
|
||||
{
|
||||
OffsetNumber off = *offnum++;
|
||||
ItemId lp = PageGetItemId(page, off);
|
||||
|
||||
ItemIdSetDead(lp);
|
||||
}
|
||||
|
||||
/* Update all now-unused line pointers */
|
||||
while (offnum < end)
|
||||
{
|
||||
OffsetNumber off = *offnum++;
|
||||
ItemId lp = PageGetItemId(page, off);
|
||||
|
||||
ItemIdSetUnused(lp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Finally, repair any fragmentation, and update the page's hint bit
|
||||
* about whether it has free pointers.
|
||||
*/
|
||||
PageRepairFragmentation(page);
|
||||
|
||||
PageSetLSN(page, lsn);
|
||||
PageSetTLI(page, ThisTimeLineID);
|
||||
|
@ -3655,8 +4103,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
|
|||
HEAP_XMAX_IS_MULTI |
|
||||
HEAP_IS_LOCKED |
|
||||
HEAP_MOVED);
|
||||
HeapTupleHeaderClearHotUpdated(htup);
|
||||
HeapTupleHeaderSetXmax(htup, record->xl_xid);
|
||||
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
|
||||
|
||||
/* Mark the page as a candidate for pruning */
|
||||
PageSetPrunable(page);
|
||||
|
||||
/* Make sure there is no forward chain link in t_ctid */
|
||||
htup->t_ctid = xlrec->target.tid;
|
||||
PageSetLSN(page, lsn);
|
||||
|
@ -3736,7 +4189,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
|
|||
HeapTupleHeaderSetCmin(htup, FirstCommandId);
|
||||
htup->t_ctid = xlrec->target.tid;
|
||||
|
||||
offnum = PageAddItem(page, (Item) htup, newlen, offnum, true);
|
||||
offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
|
||||
if (offnum == InvalidOffsetNumber)
|
||||
elog(PANIC, "heap_insert_redo: failed to add tuple");
|
||||
PageSetLSN(page, lsn);
|
||||
|
@ -3746,10 +4199,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
|
|||
}
|
||||
|
||||
/*
|
||||
* Handles UPDATE & MOVE
|
||||
* Handles UPDATE, HOT_UPDATE & MOVE
|
||||
*/
|
||||
static void
|
||||
heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
|
||||
heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
|
||||
{
|
||||
xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
|
||||
Relation reln = XLogOpenRelation(xlrec->target.node);
|
||||
|
@ -3808,6 +4261,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
|
|||
HEAP_XMIN_INVALID |
|
||||
HEAP_MOVED_IN);
|
||||
htup->t_infomask |= HEAP_MOVED_OFF;
|
||||
HeapTupleHeaderClearHotUpdated(htup);
|
||||
HeapTupleHeaderSetXvac(htup, record->xl_xid);
|
||||
/* Make sure there is no forward chain link in t_ctid */
|
||||
htup->t_ctid = xlrec->target.tid;
|
||||
|
@ -3819,12 +4273,19 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
|
|||
HEAP_XMAX_IS_MULTI |
|
||||
HEAP_IS_LOCKED |
|
||||
HEAP_MOVED);
|
||||
if (hot_update)
|
||||
HeapTupleHeaderSetHotUpdated(htup);
|
||||
else
|
||||
HeapTupleHeaderClearHotUpdated(htup);
|
||||
HeapTupleHeaderSetXmax(htup, record->xl_xid);
|
||||
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
|
||||
/* Set forward chain link in t_ctid */
|
||||
htup->t_ctid = xlrec->newtid;
|
||||
}
|
||||
|
||||
/* Mark the page as a candidate for pruning */
|
||||
PageSetPrunable(page);
|
||||
|
||||
/*
|
||||
* this test is ugly, but necessary to avoid thinking that insert change
|
||||
* is already applied
|
||||
|
@ -3914,7 +4375,7 @@ newsame:;
|
|||
/* Make sure there is no forward chain link in t_ctid */
|
||||
htup->t_ctid = xlrec->newtid;
|
||||
|
||||
offnum = PageAddItem(page, (Item) htup, newlen, offnum, true);
|
||||
offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
|
||||
if (offnum == InvalidOffsetNumber)
|
||||
elog(PANIC, "heap_update_redo: failed to add tuple");
|
||||
PageSetLSN(page, lsn);
|
||||
|
@ -3971,6 +4432,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
|
|||
htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
|
||||
else
|
||||
htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
|
||||
HeapTupleHeaderClearHotUpdated(htup);
|
||||
HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
|
||||
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
|
||||
/* Make sure there is no forward chain link in t_ctid */
|
||||
|
@ -4039,25 +4501,35 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
|
|||
{
|
||||
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
||||
|
||||
info &= XLOG_HEAP_OPMASK;
|
||||
if (info == XLOG_HEAP_INSERT)
|
||||
heap_xlog_insert(lsn, record);
|
||||
else if (info == XLOG_HEAP_DELETE)
|
||||
heap_xlog_delete(lsn, record);
|
||||
else if (info == XLOG_HEAP_UPDATE)
|
||||
heap_xlog_update(lsn, record, false);
|
||||
else if (info == XLOG_HEAP_MOVE)
|
||||
heap_xlog_update(lsn, record, true);
|
||||
else if (info == XLOG_HEAP_CLEAN)
|
||||
heap_xlog_clean(lsn, record);
|
||||
else if (info == XLOG_HEAP_NEWPAGE)
|
||||
heap_xlog_newpage(lsn, record);
|
||||
else if (info == XLOG_HEAP_LOCK)
|
||||
heap_xlog_lock(lsn, record);
|
||||
else if (info == XLOG_HEAP_INPLACE)
|
||||
heap_xlog_inplace(lsn, record);
|
||||
else
|
||||
elog(PANIC, "heap_redo: unknown op code %u", info);
|
||||
switch (info & XLOG_HEAP_OPMASK)
|
||||
{
|
||||
case XLOG_HEAP_INSERT:
|
||||
heap_xlog_insert(lsn, record);
|
||||
break;
|
||||
case XLOG_HEAP_DELETE:
|
||||
heap_xlog_delete(lsn, record);
|
||||
break;
|
||||
case XLOG_HEAP_UPDATE:
|
||||
heap_xlog_update(lsn, record, false, false);
|
||||
break;
|
||||
case XLOG_HEAP_MOVE:
|
||||
heap_xlog_update(lsn, record, true, false);
|
||||
break;
|
||||
case XLOG_HEAP_HOT_UPDATE:
|
||||
heap_xlog_update(lsn, record, false, true);
|
||||
break;
|
||||
case XLOG_HEAP_NEWPAGE:
|
||||
heap_xlog_newpage(lsn, record);
|
||||
break;
|
||||
case XLOG_HEAP_LOCK:
|
||||
heap_xlog_lock(lsn, record);
|
||||
break;
|
||||
case XLOG_HEAP_INPLACE:
|
||||
heap_xlog_inplace(lsn, record);
|
||||
break;
|
||||
default:
|
||||
elog(PANIC, "heap_redo: unknown op code %u", info);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -4065,11 +4537,20 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
|
|||
{
|
||||
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
||||
|
||||
info &= XLOG_HEAP_OPMASK;
|
||||
if (info == XLOG_HEAP2_FREEZE)
|
||||
heap_xlog_freeze(lsn, record);
|
||||
else
|
||||
elog(PANIC, "heap2_redo: unknown op code %u", info);
|
||||
switch (info & XLOG_HEAP_OPMASK)
|
||||
{
|
||||
case XLOG_HEAP2_FREEZE:
|
||||
heap_xlog_freeze(lsn, record);
|
||||
break;
|
||||
case XLOG_HEAP2_CLEAN:
|
||||
heap_xlog_clean(lsn, record, false);
|
||||
break;
|
||||
case XLOG_HEAP2_CLEAN_MOVE:
|
||||
heap_xlog_clean(lsn, record, true);
|
||||
break;
|
||||
default:
|
||||
elog(PANIC, "heap2_redo: unknown op code %u", info);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -4130,13 +4611,18 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
|
|||
ItemPointerGetBlockNumber(&(xlrec->newtid)),
|
||||
ItemPointerGetOffsetNumber(&(xlrec->newtid)));
|
||||
}
|
||||
else if (info == XLOG_HEAP_CLEAN)
|
||||
else if (info == XLOG_HEAP_HOT_UPDATE)
|
||||
{
|
||||
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
|
||||
xl_heap_update *xlrec = (xl_heap_update *) rec;
|
||||
|
||||
appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
|
||||
xlrec->node.spcNode, xlrec->node.dbNode,
|
||||
xlrec->node.relNode, xlrec->block);
|
||||
if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? */
|
||||
appendStringInfo(buf, "hot_update(init): ");
|
||||
else
|
||||
appendStringInfo(buf, "hot_update: ");
|
||||
out_target(buf, &(xlrec->target));
|
||||
appendStringInfo(buf, "; new %u/%u",
|
||||
ItemPointerGetBlockNumber(&(xlrec->newtid)),
|
||||
ItemPointerGetOffsetNumber(&(xlrec->newtid)));
|
||||
}
|
||||
else if (info == XLOG_HEAP_NEWPAGE)
|
||||
{
|
||||
|
@ -4187,6 +4673,22 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
|
|||
xlrec->node.relNode, xlrec->block,
|
||||
xlrec->cutoff_xid);
|
||||
}
|
||||
else if (info == XLOG_HEAP2_CLEAN)
|
||||
{
|
||||
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
|
||||
|
||||
appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
|
||||
xlrec->node.spcNode, xlrec->node.dbNode,
|
||||
xlrec->node.relNode, xlrec->block);
|
||||
}
|
||||
else if (info == XLOG_HEAP2_CLEAN_MOVE)
|
||||
{
|
||||
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
|
||||
|
||||
appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
|
||||
xlrec->node.spcNode, xlrec->node.dbNode,
|
||||
xlrec->node.relNode, xlrec->block);
|
||||
}
|
||||
else
|
||||
appendStringInfo(buf, "UNKNOWN");
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.66 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.67 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -41,7 +41,7 @@ RelationPutHeapTuple(Relation relation,
|
|||
pageHeader = BufferGetPage(buffer);
|
||||
|
||||
offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
|
||||
tuple->t_len, InvalidOffsetNumber, false);
|
||||
tuple->t_len, InvalidOffsetNumber, false, true);
|
||||
|
||||
if (offnum == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add tuple to page");
|
||||
|
@ -218,7 +218,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
|||
* we're done.
|
||||
*/
|
||||
pageHeader = (Page) BufferGetPage(buffer);
|
||||
pageFreeSpace = PageGetFreeSpace(pageHeader);
|
||||
pageFreeSpace = PageGetHeapFreeSpace(pageHeader);
|
||||
if (len + saveFreeSpace <= pageFreeSpace)
|
||||
{
|
||||
/* use this page as future insert target, too */
|
||||
|
@ -311,7 +311,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
|||
|
||||
PageInit(pageHeader, BufferGetPageSize(buffer), 0);
|
||||
|
||||
if (len > PageGetFreeSpace(pageHeader))
|
||||
if (len > PageGetHeapFreeSpace(pageHeader))
|
||||
{
|
||||
/* We should not get here given the test at the top */
|
||||
elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
|
||||
|
|
|
@ -0,0 +1,702 @@
|
|||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* pruneheap.c
|
||||
* heap page pruning and HOT-chain management code
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.1 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/transam.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
#include "utils/inval.h"
|
||||
|
||||
|
||||
/* Local functions */
|
||||
static int heap_prune_chain(Relation relation, Buffer buffer,
|
||||
OffsetNumber rootoffnum,
|
||||
TransactionId OldestXmin,
|
||||
OffsetNumber *redirected, int *nredirected,
|
||||
OffsetNumber *nowdead, int *ndead,
|
||||
OffsetNumber *nowunused, int *nunused,
|
||||
bool redirect_move);
|
||||
static void heap_prune_record_redirect(OffsetNumber *redirected,
|
||||
int *nredirected,
|
||||
OffsetNumber offnum,
|
||||
OffsetNumber rdoffnum);
|
||||
static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
|
||||
OffsetNumber offnum);
|
||||
static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
|
||||
OffsetNumber offnum);
|
||||
|
||||
|
||||
/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function. It will perform housekeeping
 * only if the page heuristically looks like a candidate for pruning and we
 * can acquire buffer cleanup lock without blocking.
 *
 * Note: this is called quite often. It's important that it fall out quickly
 * if there's not any use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
{
    PageHeader  dp = (PageHeader) BufferGetPage(buffer);
    Size        minfree;

    /*
     * Let's see if we really need pruning.
     *
     * Forget it if page is not hinted to contain something prunable
     */
    if (!PageIsPrunable(dp))
        return;

    /*
     * We prune when a previous UPDATE failed to find enough space on the
     * page for a new tuple version, or when free space falls below the
     * relation's fill-factor target (but not less than 10%).
     *
     * Checking free space here is questionable since we aren't holding
     * any lock on the buffer; in the worst case we could get a bogus
     * answer. It's unlikely to be *seriously* wrong, though, since
     * reading either pd_lower or pd_upper is probably atomic. Avoiding
     * taking a lock seems better than sometimes getting a wrong answer
     * in what is after all just a heuristic estimate.
     */
    minfree = RelationGetTargetPageFreeSpace(relation,
                                             HEAP_DEFAULT_FILLFACTOR);
    minfree = Max(minfree, BLCKSZ / 10);

    if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
    {
        /* OK, try to get exclusive buffer lock */
        if (!ConditionalLockBufferForCleanup(buffer))
            return;

        /*
         * Now that we have buffer lock, get accurate information about the
         * page's free space, and recheck the heuristic about whether to prune.
         */
        if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
        {
            /* OK to prune (though not to remove redirects) */
            (void) heap_page_prune(relation, buffer, OldestXmin, false, true);
        }

        /* And release buffer lock */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    }
}
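
A sketch of the intended call pattern (the surrounding variables relation, blkno, and buf are assumed from the caller's context; the index-scan changes later in this commit use essentially this shape): pin the page, let the opportunistic pruner run before taking a content lock, then read as usual.

    Buffer      buf = ReadBuffer(relation, blkno);

    /* may prune and defragment the page, or may do nothing at all */
    heap_page_prune_opt(relation, buf, RecentGlobalXmin);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... examine tuples on the page ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);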
|
||||
|
||||
|
||||
/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 *
 * If redirect_move is set, we remove redirecting line pointers by
 * updating the root line pointer to point directly to the first non-dead
 * tuple in the chain. NOTE: eliminating the redirect changes the first
 * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL.
 * The only reason we support this capability at all is that by using it,
 * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a
 * good thing since VACUUM FULL is overly complicated already.
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * Returns the number of tuples deleted from the page.
 */
|
||||
int
|
||||
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
bool redirect_move, bool report_stats)
|
||||
{
|
||||
int ndeleted = 0;
|
||||
Page page = BufferGetPage(buffer);
|
||||
OffsetNumber offnum,
|
||||
maxoff;
|
||||
OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
|
||||
OffsetNumber nowdead[MaxHeapTuplesPerPage];
|
||||
OffsetNumber nowunused[MaxHeapTuplesPerPage];
|
||||
int nredirected = 0;
|
||||
int ndead = 0;
|
||||
int nunused = 0;
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
/*
|
||||
* Mark the page as clear of prunable tuples. If we find a tuple which
|
||||
* may soon become prunable, we shall set the hint again. Also clear
|
||||
* the "page is full" flag, since there's no point in repeating the
|
||||
* prune/defrag process until something else happens to the page.
|
||||
*/
|
||||
PageClearPrunable(page);
|
||||
PageClearFull(page);
|
||||
|
||||
/* Scan the page */
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
for (offnum = FirstOffsetNumber;
|
||||
offnum <= maxoff;
|
||||
offnum = OffsetNumberNext(offnum))
|
||||
{
|
||||
ItemId itemid = PageGetItemId(page, offnum);
|
||||
|
||||
/* Nothing to do if slot is empty or already dead */
|
||||
if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
|
||||
continue;
|
||||
|
||||
/* Process this item or chain of items */
|
||||
ndeleted += heap_prune_chain(relation, buffer, offnum,
|
||||
OldestXmin,
|
||||
redirected, &nredirected,
|
||||
nowdead, &ndead,
|
||||
nowunused, &nunused,
|
||||
redirect_move);
|
||||
}
|
||||
|
||||
/* Have we pruned any items? */
|
||||
if (nredirected > 0 || ndead > 0 || nunused > 0)
|
||||
{
|
||||
/*
|
||||
* Repair page fragmentation, and update the page's hint bit about
|
||||
* whether it has free line pointers.
|
||||
*/
|
||||
PageRepairFragmentation((Page) page);
|
||||
|
||||
MarkBufferDirty(buffer);
|
||||
|
||||
/*
|
||||
* Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did
|
||||
*/
|
||||
if (!relation->rd_istemp)
|
||||
{
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_heap_clean(relation, buffer,
|
||||
redirected, nredirected,
|
||||
nowdead, ndead,
|
||||
nowunused, nunused,
|
||||
redirect_move);
|
||||
PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
|
||||
PageSetLSN(BufferGetPage(buffer), recptr);
|
||||
}
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
/*
|
||||
* If requested, report the number of tuples reclaimed to pgstats.
|
||||
* This is ndeleted minus ndead, because we don't want to count a now-DEAD
|
||||
* root item as a deletion for this purpose.
|
||||
*/
|
||||
if (report_stats && ndeleted > ndead)
|
||||
pgstat_update_heap_dead_tuples(relation, ndeleted - ndead);
|
||||
|
||||
/*
|
||||
* XXX Should we update the FSM information of this page ?
|
||||
*
|
||||
* There are two schools of thought here. We may not want to update
|
||||
* FSM information so that the page is not used for unrelated
|
||||
* UPDATEs/INSERTs and any free space in this page will remain
|
||||
* available for further UPDATEs in *this* page, thus improving
|
||||
* chances for doing HOT updates.
|
||||
*
|
||||
* But for a large table and where a page does not receive further
|
||||
* UPDATEs for a long time, we might waste this space by not
|
||||
* updating the FSM information. The relation may get extended and
|
||||
* fragmented further.
|
||||
*
|
||||
* One possibility is to leave "fillfactor" worth of space in this
|
||||
* page and update FSM with the remaining space.
|
||||
*
|
||||
* In any case, the current FSM implementation doesn't accept
|
||||
* one-page-at-a-time updates, so this is all academic for now.
|
||||
*/
|
||||
|
||||
return ndeleted;
|
||||
}
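
As a hedged sketch (variable names onerel, buf, OldestXmin, and pruned_total are invented), a caller that already holds a buffer cleanup lock, as VACUUM does, would invoke the pruner directly; per the header comment above it passes report_stats = false and leaves redirect_move to VACUUM FULL only.

    int         ndeleted;

    /* cleanup lock already held on "buf" for relation "onerel" */
    ndeleted = heap_page_prune(onerel, buf, OldestXmin,
                               false,       /* redirect_move: VACUUM FULL only */
                               false);      /* report_stats: vacuum reports later */
    pruned_total += ndeleted;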
|
||||
|
||||
|
||||
/*
 * Prune specified item pointer or a HOT chain originating at that item.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
 * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD, the OldestXmin test is just too coarse to detect it.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * OldestXmin is the cutoff XID used to identify dead tuples.
 *
 * Redirected items are added to the redirected[] array (two entries per
 * redirection); items set to LP_DEAD state are added to nowdead[]; and
 * items set to LP_UNUSED state are added to nowunused[]. (These arrays
 * will be used to generate a WAL record after all chains are pruned.)
 *
 * If redirect_move is true, we get rid of redirecting line pointers.
 *
 * Returns the number of tuples deleted from the page.
 */
|
||||
static int
|
||||
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
TransactionId OldestXmin,
|
||||
OffsetNumber *redirected, int *nredirected,
|
||||
OffsetNumber *nowdead, int *ndead,
|
||||
OffsetNumber *nowunused, int *nunused,
|
||||
bool redirect_move)
|
||||
{
|
||||
int ndeleted = 0;
|
||||
Page dp = (Page) BufferGetPage(buffer);
|
||||
TransactionId priorXmax = InvalidTransactionId;
|
||||
ItemId rootlp;
|
||||
HeapTupleHeader htup;
|
||||
OffsetNumber latestdead = InvalidOffsetNumber,
|
||||
maxoff = PageGetMaxOffsetNumber(dp),
|
||||
offnum;
|
||||
OffsetNumber chainitems[MaxHeapTuplesPerPage];
|
||||
int nchain = 0,
|
||||
i;
|
||||
|
||||
rootlp = PageGetItemId(dp, rootoffnum);
|
||||
|
||||
/*
|
||||
* If it's a heap-only tuple, then it is not the start of a HOT chain.
|
||||
*/
|
||||
if (ItemIdIsNormal(rootlp))
|
||||
{
|
||||
htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
|
||||
if (HeapTupleHeaderIsHeapOnly(htup))
|
||||
{
|
||||
/*
|
||||
* If the tuple is DEAD and doesn't chain to anything else, mark it
|
||||
* unused immediately. (If it does chain, we can only remove it as
|
||||
* part of pruning its chain.)
|
||||
*
|
||||
* We need this primarily to handle aborted HOT updates, that is,
|
||||
* XMIN_INVALID heap-only tuples. Those might not be linked to
|
||||
* by any chain, since the parent tuple might be re-updated before
|
||||
* any pruning occurs. So we have to be able to reap them
|
||||
* separately from chain-pruning.
|
||||
*
|
||||
* Note that we might first arrive at a dead heap-only tuple
|
||||
* either here or while following a chain below. Whichever path
|
||||
* gets there first will mark the tuple unused.
|
||||
*/
|
||||
if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)
|
||||
== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
|
||||
{
|
||||
ItemIdSetUnused(rootlp);
|
||||
heap_prune_record_unused(nowunused, nunused, rootoffnum);
|
||||
ndeleted++;
|
||||
}
|
||||
|
||||
/* Nothing more to do */
|
||||
return ndeleted;
|
||||
}
|
||||
}
|
||||
|
||||
/* Start from the root tuple */
|
||||
offnum = rootoffnum;
|
||||
|
||||
/* while not end of the chain */
|
||||
for (;;)
|
||||
{
|
||||
ItemId lp;
|
||||
bool tupdead,
|
||||
recent_dead;
|
||||
|
||||
/* Some sanity checks */
|
||||
if (offnum < FirstOffsetNumber || offnum > maxoff)
|
||||
break;
|
||||
|
||||
lp = PageGetItemId(dp, offnum);
|
||||
|
||||
if (!ItemIdIsUsed(lp))
|
||||
break;
|
||||
|
||||
/*
|
||||
* If we are looking at the redirected root line pointer,
|
||||
* jump to the first normal tuple in the chain. If we find
|
||||
* a redirect somewhere else, stop --- it must not be same chain.
|
||||
*/
|
||||
if (ItemIdIsRedirected(lp))
|
||||
{
|
||||
if (nchain > 0)
|
||||
break; /* not at start of chain */
|
||||
chainitems[nchain++] = offnum;
|
||||
offnum = ItemIdGetRedirect(rootlp);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Likewise, a dead item pointer can't be part of the chain.
|
||||
* (We already eliminated the case of dead root tuple outside
|
||||
* this function.)
|
||||
*/
|
||||
if (ItemIdIsDead(lp))
|
||||
break;
|
||||
|
||||
Assert(ItemIdIsNormal(lp));
|
||||
htup = (HeapTupleHeader) PageGetItem(dp, lp);
|
||||
|
||||
/*
|
||||
* Check the tuple XMIN against prior XMAX, if any
|
||||
*/
|
||||
if (TransactionIdIsValid(priorXmax) &&
|
||||
!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
|
||||
break;
|
||||
|
||||
/*
|
||||
* OK, this tuple is indeed a member of the chain.
|
||||
*/
|
||||
chainitems[nchain++] = offnum;
|
||||
|
||||
/*
|
||||
* Check tuple's visibility status.
|
||||
*/
|
||||
tupdead = recent_dead = false;
|
||||
|
||||
switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer))
|
||||
{
|
||||
case HEAPTUPLE_DEAD:
|
||||
tupdead = true;
|
||||
break;
|
||||
|
||||
case HEAPTUPLE_RECENTLY_DEAD:
|
||||
recent_dead = true;
|
||||
/*
|
||||
* This tuple may soon become DEAD. Re-set the hint bit so
|
||||
* that the page is reconsidered for pruning in future.
|
||||
*/
|
||||
PageSetPrunable(dp);
|
||||
break;
|
||||
|
||||
case HEAPTUPLE_DELETE_IN_PROGRESS:
|
||||
/*
|
||||
* This tuple may soon become DEAD. Re-set the hint bit so
|
||||
* that the page is reconsidered for pruning in future.
|
||||
*/
|
||||
PageSetPrunable(dp);
|
||||
break;
|
||||
|
||||
case HEAPTUPLE_LIVE:
|
||||
case HEAPTUPLE_INSERT_IN_PROGRESS:
|
||||
/*
|
||||
* If we wanted to optimize for aborts, we might consider
|
||||
* marking the page prunable when we see INSERT_IN_PROGRESS.
|
||||
* But we don't. See related decisions about when to mark
|
||||
* the page prunable in heapam.c.
|
||||
*/
|
||||
break;
|
||||
|
||||
default:
|
||||
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remember the last DEAD tuple seen. We will advance past
|
||||
* RECENTLY_DEAD tuples just in case there's a DEAD one after them;
|
||||
* but we can't advance past anything else. (XXX is it really worth
|
||||
* continuing to scan beyond RECENTLY_DEAD? The case where we will
|
||||
* find another DEAD tuple is a fairly unusual corner case.)
|
||||
*/
|
||||
if (tupdead)
|
||||
latestdead = offnum;
|
||||
else if (!recent_dead)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the tuple is not HOT-updated, then we are at the end of this
|
||||
* HOT-update chain.
|
||||
*/
|
||||
if (!HeapTupleHeaderIsHotUpdated(htup))
|
||||
break;
|
||||
|
||||
/*
|
||||
* Advance to next chain member.
|
||||
*/
|
||||
Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
|
||||
BufferGetBlockNumber(buffer));
|
||||
offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
|
||||
priorXmax = HeapTupleHeaderGetXmax(htup);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we found a DEAD tuple in the chain, adjust the HOT chain so that all
|
||||
* the DEAD tuples at the start of the chain are removed and the root line
|
||||
* pointer is appropriately redirected.
|
||||
*/
|
||||
if (OffsetNumberIsValid(latestdead))
|
||||
{
|
||||
/*
|
||||
* Mark as unused each intermediate item that we are able to remove
|
||||
* from the chain.
|
||||
*
|
||||
* When the previous item is the last dead tuple seen, we are at
|
||||
* the right candidate for redirection.
|
||||
*/
|
||||
for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
|
||||
{
|
||||
ItemId lp = PageGetItemId(dp, chainitems[i]);
|
||||
|
||||
ItemIdSetUnused(lp);
|
||||
heap_prune_record_unused(nowunused, nunused, chainitems[i]);
|
||||
ndeleted++;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the root entry had been a normal tuple, we are deleting it,
|
||||
* so count it in the result. But changing a redirect (even to
|
||||
* DEAD state) doesn't count.
|
||||
*/
|
||||
if (ItemIdIsNormal(rootlp))
|
||||
ndeleted++;
|
||||
|
||||
/*
|
||||
* If the DEAD tuple is at the end of the chain, the entire chain is
|
||||
* dead and the root line pointer can be marked dead. Otherwise
|
||||
* just redirect the root to the correct chain member.
|
||||
*/
|
||||
if (i >= nchain)
|
||||
{
|
||||
ItemIdSetDead(rootlp);
|
||||
heap_prune_record_dead(nowdead, ndead, rootoffnum);
|
||||
}
|
||||
else
|
||||
{
|
||||
ItemIdSetRedirect(rootlp, chainitems[i]);
|
||||
heap_prune_record_redirect(redirected, nredirected,
|
||||
rootoffnum,
|
||||
chainitems[i]);
|
||||
}
|
||||
}
|
||||
else if (nchain < 2 && ItemIdIsRedirected(rootlp))
|
||||
{
|
||||
/*
|
||||
* We found a redirect item that doesn't point to a valid follow-on
|
||||
* item. This can happen if the loop in heap_page_prune caused us
|
||||
* to visit the dead successor of a redirect item before visiting
|
||||
* the redirect item. We can clean up by setting the redirect item
|
||||
* to DEAD state.
|
||||
*/
|
||||
ItemIdSetDead(rootlp);
|
||||
heap_prune_record_dead(nowdead, ndead, rootoffnum);
|
||||
}
|
||||
|
||||
/*
|
||||
* If requested, eliminate LP_REDIRECT items by moving tuples. Note that
|
||||
* if the root item is LP_REDIRECT and doesn't point to a valid follow-on
|
||||
* item, we already killed it above.
|
||||
*/
|
||||
if (redirect_move && ItemIdIsRedirected(rootlp))
|
||||
{
|
||||
OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp);
|
||||
ItemId firstlp = PageGetItemId(dp, firstoffnum);
|
||||
HeapTupleData firsttup;
|
||||
|
||||
Assert(ItemIdIsNormal(firstlp));
|
||||
/* Set up firsttup to reference the tuple at its existing CTID */
|
||||
firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp);
|
||||
firsttup.t_len = ItemIdGetLength(firstlp);
|
||||
ItemPointerSet(&firsttup.t_self,
|
||||
BufferGetBlockNumber(buffer),
|
||||
firstoffnum);
|
||||
firsttup.t_tableOid = RelationGetRelid(relation);
|
||||
|
||||
/*
|
||||
* Mark the tuple for invalidation. Needed because we're changing
|
||||
* its CTID.
|
||||
*/
|
||||
CacheInvalidateHeapTuple(relation, &firsttup);
|
||||
|
||||
/*
|
||||
* Change heap-only status of the tuple because after the line
|
||||
* pointer manipulation, it's no longer a heap-only tuple, but is
|
||||
* directly pointed to by index entries.
|
||||
*/
|
||||
Assert(HeapTupleIsHeapOnly(&firsttup));
|
||||
HeapTupleClearHeapOnly(&firsttup);
|
||||
|
||||
/* Now move the item pointer */
|
||||
*rootlp = *firstlp;
|
||||
ItemIdSetUnused(firstlp);
|
||||
|
||||
/*
|
||||
* If latestdead is valid, we have already recorded the redirection
|
||||
* above. Otherwise, do it now.
|
||||
*
|
||||
* We don't record firstlp in the nowunused[] array, since the
|
||||
* redirection entry is enough to tell heap_xlog_clean what to do.
|
||||
*/
|
||||
if (!OffsetNumberIsValid(latestdead))
|
||||
heap_prune_record_redirect(redirected, nredirected, rootoffnum,
|
||||
firstoffnum);
|
||||
}
|
||||
|
||||
return ndeleted;
|
||||
}
|
||||
|
||||
|
||||
/* Record newly-redirected item pointer */
|
||||
static void
|
||||
heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected,
|
||||
OffsetNumber offnum, OffsetNumber rdoffnum)
|
||||
{
|
||||
Assert(*nredirected < MaxHeapTuplesPerPage);
|
||||
redirected[*nredirected * 2] = offnum;
|
||||
redirected[*nredirected * 2 + 1] = rdoffnum;
|
||||
(*nredirected)++;
|
||||
}
|
||||
|
||||
/* Record newly-dead item pointer */
|
||||
static void
|
||||
heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
|
||||
OffsetNumber offnum)
|
||||
{
|
||||
Assert(*ndead < MaxHeapTuplesPerPage);
|
||||
nowdead[*ndead] = offnum;
|
||||
(*ndead)++;
|
||||
}
|
||||
|
||||
/* Record newly-unused item pointer */
|
||||
static void
|
||||
heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
|
||||
OffsetNumber offnum)
|
||||
{
|
||||
Assert(*nunused < MaxHeapTuplesPerPage);
|
||||
nowunused[*nunused] = offnum;
|
||||
(*nunused)++;
|
||||
}
|
||||
|
||||
|
||||
/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * We zero out all unused entries.
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
|
||||
void
|
||||
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
|
||||
{
|
||||
OffsetNumber offnum, maxoff;
|
||||
|
||||
MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
|
||||
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
|
||||
{
|
||||
ItemId lp = PageGetItemId(page, offnum);
|
||||
HeapTupleHeader htup;
|
||||
OffsetNumber nextoffnum;
|
||||
TransactionId priorXmax;
|
||||
|
||||
/* skip unused and dead items */
|
||||
if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
|
||||
continue;
|
||||
|
||||
if (ItemIdIsNormal(lp))
|
||||
{
|
||||
htup = (HeapTupleHeader) PageGetItem(page, lp);
|
||||
|
||||
/*
|
||||
* Check if this tuple is part of a HOT-chain rooted at some other
|
||||
* tuple. If so, skip it for now; we'll process it when we find
|
||||
* its root.
|
||||
*/
|
||||
if (HeapTupleHeaderIsHeapOnly(htup))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* This is either a plain tuple or the root of a HOT-chain.
|
||||
* Remember it in the mapping.
|
||||
*/
|
||||
root_offsets[offnum - 1] = offnum;
|
||||
|
||||
/* If it's not the start of a HOT-chain, we're done with it */
|
||||
if (!HeapTupleHeaderIsHotUpdated(htup))
|
||||
continue;
|
||||
|
||||
/* Set up to scan the HOT-chain */
|
||||
nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
|
||||
priorXmax = HeapTupleHeaderGetXmax(htup);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Must be a redirect item. We do not set its root_offsets entry */
|
||||
Assert(ItemIdIsRedirected(lp));
|
||||
/* Set up to scan the HOT-chain */
|
||||
nextoffnum = ItemIdGetRedirect(lp);
|
||||
priorXmax = InvalidTransactionId;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now follow the HOT-chain and collect other tuples in the chain.
|
||||
*
|
||||
* Note: Even though this is a nested loop, the complexity of the
|
||||
* function is O(N) because a tuple in the page should be visited not
|
||||
* more than twice, once in the outer loop and once in HOT-chain
|
||||
* chases.
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
lp = PageGetItemId(page, nextoffnum);
|
||||
|
||||
/* Check for broken chains */
|
||||
if (!ItemIdIsNormal(lp))
|
||||
break;
|
||||
|
||||
htup = (HeapTupleHeader) PageGetItem(page, lp);
|
||||
|
||||
if (TransactionIdIsValid(priorXmax) &&
|
||||
!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
|
||||
break;
|
||||
|
||||
/* Remember the root line pointer for this item */
|
||||
root_offsets[nextoffnum - 1] = offnum;
|
||||
|
||||
/* Advance to next chain member, if any */
|
||||
if (!HeapTupleHeaderIsHotUpdated(htup))
|
||||
break;
|
||||
|
||||
nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
|
||||
priorXmax = HeapTupleHeaderGetXmax(htup);
|
||||
}
|
||||
}
|
||||
}
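
A hypothetical consumer sketch, not part of this file: code that must index a heap-only tuple has to point the index entry at the chain's root line pointer instead, which is exactly the mapping this function provides. The variables page, tuphdr, offnum, blkno, and root_tid are assumed from the caller's context; at least share lock is held on the buffer.

    OffsetNumber root_offsets[MaxHeapTuplesPerPage];
    ItemPointerData root_tid;

    heap_get_root_tuples(page, root_offsets);

    if (HeapTupleHeaderIsHeapOnly(tuphdr))
    {
        /* heap-only tuple: index entries must reference its chain root */
        Assert(root_offsets[offnum - 1] != InvalidOffsetNumber);
        ItemPointerSet(&root_tid, blkno, root_offsets[offnum - 1]);
    }
    else
        ItemPointerSet(&root_tid, blkno, offnum);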
|
|
@ -96,7 +96,7 @@
|
|||
* Portions Copyright (c) 1994-5, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.6 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.7 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -320,12 +320,14 @@ rewrite_heap_tuple(RewriteState state,
|
|||
* Copy the original tuple's visibility information into new_tuple.
|
||||
*
|
||||
* XXX we might later need to copy some t_infomask2 bits, too?
|
||||
* Right now, we intentionally clear the HOT status bits.
|
||||
*/
|
||||
memcpy(&new_tuple->t_data->t_choice.t_heap,
|
||||
&old_tuple->t_data->t_choice.t_heap,
|
||||
sizeof(HeapTupleFields));
|
||||
|
||||
new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK;
|
||||
new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK;
|
||||
new_tuple->t_data->t_infomask |=
|
||||
old_tuple->t_data->t_infomask & HEAP_XACT_MASK;
|
||||
|
||||
|
@ -593,7 +595,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
|
|||
/* Now we can check to see if there's enough free space already. */
|
||||
if (state->rs_buffer_valid)
|
||||
{
|
||||
pageFreeSpace = PageGetFreeSpace(page);
|
||||
pageFreeSpace = PageGetHeapFreeSpace(page);
|
||||
|
||||
if (len + saveFreeSpace > pageFreeSpace)
|
||||
{
|
||||
|
@ -628,7 +630,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
|
|||
|
||||
/* And now we can insert the tuple into the page */
|
||||
newoff = PageAddItem(page, (Item) heaptup->t_data, len,
|
||||
InvalidOffsetNumber, false);
|
||||
InvalidOffsetNumber, false, true);
|
||||
if (newoff == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add tuple");
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.62 2007/05/27 03:50:38 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.63 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* many of the old access method routines have been turned into
|
||||
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "access/genam.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/transam.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
|
||||
|
@ -95,6 +96,9 @@ RelationGetIndexScan(Relation indexRelation,
|
|||
ItemPointerSetInvalid(&scan->xs_ctup.t_self);
|
||||
scan->xs_ctup.t_data = NULL;
|
||||
scan->xs_cbuf = InvalidBuffer;
|
||||
scan->xs_prev_xmax = InvalidTransactionId;
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
scan->xs_hot_dead = false;
|
||||
|
||||
/*
|
||||
* Let the AM fill in the key and any opaque data it wants.
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.98 2007/05/27 03:50:38 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.99 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
* index_open - open an index relation by relation OID
|
||||
|
@ -64,6 +64,7 @@
|
|||
|
||||
#include "access/genam.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/transam.h"
|
||||
#include "pgstat.h"
|
||||
#include "utils/relcache.h"
|
||||
|
||||
|
@ -313,6 +314,8 @@ index_rescan(IndexScanDesc scan, ScanKey key)
|
|||
scan->xs_cbuf = InvalidBuffer;
|
||||
}
|
||||
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
|
||||
scan->kill_prior_tuple = false; /* for safety */
|
||||
|
||||
FunctionCall2(procedure,
|
||||
|
@ -370,6 +373,14 @@ index_markpos(IndexScanDesc scan)
|
|||
* NOTE: this only restores the internal scan state of the index AM.
|
||||
* The current result tuple (scan->xs_ctup) doesn't change. See comments
|
||||
* for ExecRestrPos().
|
||||
 *
 * NOTE: in the presence of HOT chains, mark/restore only works correctly
 * if the scan's snapshot is MVCC-safe; that ensures that there's at most one
 * returnable tuple in each HOT chain, and so restoring the prior state at the
 * granularity of the index AM is sufficient. Since the only current user
 * of mark/restore functionality is nodeMergejoin.c, this effectively means
 * that merge-join plans only work for MVCC snapshots. This could be fixed
 * if necessary, but for now it seems unimportant.
 * ----------------
|
||||
*/
|
||||
void
|
||||
|
@ -377,9 +388,13 @@ index_restrpos(IndexScanDesc scan)
|
|||
{
|
||||
FmgrInfo *procedure;
|
||||
|
||||
Assert(IsMVCCSnapshot(scan->xs_snapshot));
|
||||
|
||||
SCAN_CHECKS;
|
||||
GET_SCAN_PROCEDURE(amrestrpos);
|
||||
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
|
||||
scan->kill_prior_tuple = false; /* for safety */
|
||||
|
||||
FunctionCall1(procedure, PointerGetDatum(scan));
|
||||
|
@ -398,72 +413,224 @@ HeapTuple
|
|||
index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
{
|
||||
HeapTuple heapTuple = &scan->xs_ctup;
|
||||
ItemPointer tid = &heapTuple->t_self;
|
||||
FmgrInfo *procedure;
|
||||
|
||||
SCAN_CHECKS;
|
||||
GET_SCAN_PROCEDURE(amgettuple);
|
||||
|
||||
/* just make sure this is false... */
|
||||
scan->kill_prior_tuple = false;
|
||||
/*
|
||||
* We always reset xs_hot_dead; if we are here then either we are just
|
||||
* starting the scan, or we previously returned a visible tuple, and in
|
||||
* either case it's inappropriate to kill the prior index entry.
|
||||
*/
|
||||
scan->xs_hot_dead = false;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
bool found;
|
||||
OffsetNumber offnum;
|
||||
bool at_chain_start;
|
||||
Page dp;
|
||||
|
||||
/*
|
||||
* The AM's gettuple proc finds the next tuple matching the scan keys.
|
||||
*/
|
||||
found = DatumGetBool(FunctionCall2(procedure,
|
||||
PointerGetDatum(scan),
|
||||
Int32GetDatum(direction)));
|
||||
|
||||
/* Reset kill flag immediately for safety */
|
||||
scan->kill_prior_tuple = false;
|
||||
|
||||
if (!found)
|
||||
if (scan->xs_next_hot != InvalidOffsetNumber)
|
||||
{
|
||||
/* Release any held pin on a heap page */
|
||||
if (BufferIsValid(scan->xs_cbuf))
|
||||
{
|
||||
ReleaseBuffer(scan->xs_cbuf);
|
||||
scan->xs_cbuf = InvalidBuffer;
|
||||
}
|
||||
return NULL; /* failure exit */
|
||||
/*
|
||||
* We are resuming scan of a HOT chain after having returned
|
||||
* an earlier member. Must still hold pin on current heap page.
|
||||
*/
|
||||
Assert(BufferIsValid(scan->xs_cbuf));
|
||||
Assert(ItemPointerGetBlockNumber(tid) ==
|
||||
BufferGetBlockNumber(scan->xs_cbuf));
|
||||
Assert(TransactionIdIsValid(scan->xs_prev_xmax));
|
||||
offnum = scan->xs_next_hot;
|
||||
at_chain_start = false;
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
}
|
||||
else
|
||||
{
|
||||
bool found;
|
||||
Buffer prev_buf;
|
||||
|
||||
/*
|
||||
* If we scanned a whole HOT chain and found only dead tuples,
|
||||
* tell index AM to kill its entry for that TID.
|
||||
*/
|
||||
scan->kill_prior_tuple = scan->xs_hot_dead;
|
||||
|
||||
/*
|
||||
* The AM's gettuple proc finds the next index entry matching the
|
||||
* scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).
|
||||
*/
|
||||
found = DatumGetBool(FunctionCall2(procedure,
|
||||
PointerGetDatum(scan),
|
||||
Int32GetDatum(direction)));
|
||||
|
||||
/* Reset kill flag immediately for safety */
|
||||
scan->kill_prior_tuple = false;
|
||||
|
||||
/* If we're out of index entries, break out of outer loop */
|
||||
if (!found)
|
||||
break;
|
||||
|
||||
pgstat_count_index_tuples(scan->indexRelation, 1);
|
||||
|
||||
/* Switch to correct buffer if we don't have it already */
|
||||
prev_buf = scan->xs_cbuf;
|
||||
scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
|
||||
scan->heapRelation,
|
||||
ItemPointerGetBlockNumber(tid));
|
||||
|
||||
/*
|
||||
* Prune page, but only if we weren't already on this page
|
||||
*/
|
||||
if (prev_buf != scan->xs_cbuf)
|
||||
heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
|
||||
RecentGlobalXmin);
|
||||
|
||||
/* Prepare to scan HOT chain starting at index-referenced offnum */
|
||||
offnum = ItemPointerGetOffsetNumber(tid);
|
||||
at_chain_start = true;
|
||||
|
||||
/* We don't know what the first tuple's xmin should be */
|
||||
scan->xs_prev_xmax = InvalidTransactionId;
|
||||
|
||||
/* Initialize flag to detect if all entries are dead */
|
||||
scan->xs_hot_dead = true;
|
||||
}
|
||||
|
||||
pgstat_count_index_tuples(scan->indexRelation, 1);
|
||||
|
||||
/*
|
||||
* Fetch the heap tuple and see if it matches the snapshot.
|
||||
*/
|
||||
if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot,
|
||||
heapTuple, &scan->xs_cbuf, true,
|
||||
scan->indexRelation))
|
||||
break;
|
||||
|
||||
/* Skip if no undeleted tuple at this location */
|
||||
if (heapTuple->t_data == NULL)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If we can't see it, maybe no one else can either. Check to see if
|
||||
* the tuple is dead to all transactions. If so, signal the index AM
|
||||
* to not return it on future indexscans.
|
||||
*
|
||||
* We told heap_release_fetch to keep a pin on the buffer, so we can
|
||||
* re-access the tuple here. But we must re-lock the buffer first.
|
||||
*/
|
||||
/* Obtain share-lock on the buffer so we can examine visibility */
|
||||
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
|
||||
|
||||
if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
|
||||
scan->xs_cbuf) == HEAPTUPLE_DEAD)
|
||||
scan->kill_prior_tuple = true;
|
||||
dp = (Page) BufferGetPage(scan->xs_cbuf);
|
||||
|
||||
/* Scan through possible multiple members of HOT-chain */
|
||||
for (;;)
|
||||
{
|
||||
ItemId lp;
|
||||
ItemPointer ctid;
|
||||
|
||||
/* check for bogus TID */
|
||||
if (offnum < FirstOffsetNumber ||
|
||||
offnum > PageGetMaxOffsetNumber(dp))
|
||||
break;
|
||||
|
||||
lp = PageGetItemId(dp, offnum);
|
||||
|
||||
/* check for unused, dead, or redirected items */
|
||||
if (!ItemIdIsNormal(lp))
|
||||
{
|
||||
/* We should only see a redirect at start of chain */
|
||||
if (ItemIdIsRedirected(lp) && at_chain_start)
|
||||
{
|
||||
/* Follow the redirect */
|
||||
offnum = ItemIdGetRedirect(lp);
|
||||
at_chain_start = false;
|
||||
continue;
|
||||
}
|
||||
/* else must be end of chain */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* We must initialize all of *heapTuple (ie, scan->xs_ctup)
|
||||
* since it is returned to the executor on success.
|
||||
*/
|
||||
heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
|
||||
heapTuple->t_len = ItemIdGetLength(lp);
|
||||
ItemPointerSetOffsetNumber(tid, offnum);
|
||||
heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
|
||||
ctid = &heapTuple->t_data->t_ctid;
|
||||
|
||||
/*
|
||||
* Shouldn't see a HEAP_ONLY tuple at chain start. (This test
|
||||
* should be unnecessary, since the chain root can't be removed
|
||||
* while we have pin on the index entry, but let's make it anyway.)
|
||||
*/
|
||||
if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
|
||||
break;
|
||||
|
||||
/*
|
||||
* The xmin should match the previous xmax value, else chain is
|
||||
* broken. (Note: this test is not optional because it protects
|
||||
* us against the case where the prior chain member's xmax
|
||||
* aborted since we looked at it.)
|
||||
*/
|
||||
if (TransactionIdIsValid(scan->xs_prev_xmax) &&
|
||||
!TransactionIdEquals(scan->xs_prev_xmax,
|
||||
HeapTupleHeaderGetXmin(heapTuple->t_data)))
|
||||
break;
|
||||
|
||||
/* If it's visible per the snapshot, we must return it */
|
||||
if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
|
||||
scan->xs_cbuf))
|
||||
{
|
||||
/*
|
||||
* If the snapshot is MVCC, we know that it could accept
|
||||
* at most one member of the HOT chain, so we can skip
|
||||
* examining any more members. Otherwise, check for
|
||||
* continuation of the HOT-chain, and set state for next time.
|
||||
*/
|
||||
if (IsMVCCSnapshot(scan->xs_snapshot))
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
else if (HeapTupleIsHotUpdated(heapTuple))
|
||||
{
|
||||
Assert(ItemPointerGetBlockNumber(ctid) ==
|
||||
ItemPointerGetBlockNumber(tid));
|
||||
scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
|
||||
scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
|
||||
}
|
||||
else
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
|
||||
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
pgstat_count_heap_fetch(scan->indexRelation);
|
||||
|
||||
return heapTuple;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we can't see it, maybe no one else can either. Check to see
|
||||
* if the tuple is dead to all transactions. If we find that all
|
||||
* the tuples in the HOT chain are dead, we'll signal the index AM
|
||||
* to not return that TID on future indexscans.
|
||||
*/
|
||||
if (scan->xs_hot_dead &&
|
||||
HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
|
||||
scan->xs_cbuf) != HEAPTUPLE_DEAD)
|
||||
scan->xs_hot_dead = false;
|
||||
|
||||
/*
|
||||
* Check to see if HOT chain continues past this tuple; if so
|
||||
* fetch the next offnum (we don't bother storing it into
|
||||
* xs_next_hot, but must store xs_prev_xmax), and loop around.
|
||||
*/
|
||||
if (HeapTupleIsHotUpdated(heapTuple))
|
||||
{
|
||||
Assert(ItemPointerGetBlockNumber(ctid) ==
|
||||
ItemPointerGetBlockNumber(tid));
|
||||
offnum = ItemPointerGetOffsetNumber(ctid);
|
||||
at_chain_start = false;
|
||||
scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
|
||||
}
|
||||
else
|
||||
break; /* end of chain */
|
||||
} /* loop over a single HOT chain */
|
||||
|
||||
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
/* Loop around to ask index AM for another TID */
|
||||
scan->xs_next_hot = InvalidOffsetNumber;
|
||||
}
|
||||
|
||||
/* Success exit */
|
||||
return heapTuple;
|
||||
/* Release any held pin on a heap page */
|
||||
if (BufferIsValid(scan->xs_cbuf))
|
||||
{
|
||||
ReleaseBuffer(scan->xs_cbuf);
|
||||
scan->xs_cbuf = InvalidBuffer;
|
||||
}
|
||||
|
||||
return NULL; /* failure exit */
|
||||
}
|
||||
|
||||
/* ----------------
|
||||
|
|
|
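Editor's note: the index_getnext() hunks above interleave the old and new logic, which makes the chain-following loop hard to read in diff form. Below is a stripped-down sketch of that loop only, as a minimal standalone function; hot_chain_walk_sketch and its visit callback are invented names, and snapshot checks, buffer locking, kill_prior_tuple bookkeeping and the xs_next_hot resume logic are deliberately left out.

#include "postgres.h"
#include "access/htup.h"
#include "access/transam.h"
#include "storage/bufpage.h"
#include "storage/itemid.h"

/* Walk one HOT chain on a heap page, starting at the given line pointer. */
static void
hot_chain_walk_sketch(Page dp, OffsetNumber offnum,
					  void (*visit) (HeapTuple tuple))
{
	bool			at_chain_start = true;
	TransactionId	prev_xmax = InvalidTransactionId;
	HeapTupleData	tup;

	for (;;)
	{
		ItemId		lp;

		/* check for a bogus TID */
		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
			break;

		lp = PageGetItemId(dp, offnum);

		if (!ItemIdIsNormal(lp))
		{
			/* a redirect line pointer is only expected at the chain start */
			if (ItemIdIsRedirected(lp) && at_chain_start)
			{
				offnum = ItemIdGetRedirect(lp);
				at_chain_start = false;
				continue;
			}
			break;				/* unused or dead: end of chain */
		}

		tup.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
		tup.t_len = ItemIdGetLength(lp);

		/*
		 * Chain continuity: this member's xmin must equal the previous
		 * member's xmax, else the chain is broken (e.g. the prior updater
		 * aborted since we looked at it).
		 */
		if (TransactionIdIsValid(prev_xmax) &&
			!TransactionIdEquals(prev_xmax,
								 HeapTupleHeaderGetXmin(tup.t_data)))
			break;

		visit(&tup);			/* caller applies its visibility test here */

		if (!HeapTupleIsHotUpdated(&tup))
			break;				/* end of chain */

		/* follow t_ctid to the next member, which is on the same page */
		prev_xmax = HeapTupleHeaderGetXmax(tup.t_data);
		offnum = ItemPointerGetOffsetNumber(&tup.t_data->t_ctid);
		at_chain_start = false;
	}
}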
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.160 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -193,8 +193,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
|||
*/
|
||||
for (;;)
|
||||
{
|
||||
HeapTupleData htup;
|
||||
Buffer hbuffer;
|
||||
ItemId curitemid;
|
||||
IndexTuple curitup;
|
||||
BlockNumber nblkno;
|
||||
|
@ -223,6 +221,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
|||
*/
|
||||
if (!ItemIdIsDead(curitemid))
|
||||
{
|
||||
ItemPointerData htid;
|
||||
bool all_dead;
|
||||
|
||||
/*
|
||||
* _bt_compare returns 0 for (1,NULL) and (1,NULL) - that's
|
||||
* how we handle NULLs - and so we must not use _bt_compare
|
||||
|
@ -234,17 +235,20 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
|||
|
||||
/* okay, we gotta fetch the heap tuple ... */
|
||||
curitup = (IndexTuple) PageGetItem(page, curitemid);
|
||||
htup.t_self = curitup->t_tid;
|
||||
if (heap_fetch(heapRel, &SnapshotDirty, &htup, &hbuffer,
|
||||
true, NULL))
|
||||
htid = curitup->t_tid;
|
||||
|
||||
/*
|
||||
* We check the whole HOT-chain to see if there is any tuple
|
||||
* that satisfies SnapshotDirty. This is necessary because
|
||||
* we have just a single index entry for the entire chain.
|
||||
*/
|
||||
if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead))
|
||||
{
|
||||
/* it is a duplicate */
|
||||
TransactionId xwait =
|
||||
(TransactionIdIsValid(SnapshotDirty.xmin)) ?
|
||||
SnapshotDirty.xmin : SnapshotDirty.xmax;
|
||||
|
||||
ReleaseBuffer(hbuffer);
|
||||
|
||||
/*
|
||||
* If this tuple is being updated by other transaction
|
||||
* then we have to wait for its commit/abort.
|
||||
|
@ -263,15 +267,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
|||
* is itself now committed dead --- if so, don't complain.
|
||||
* This is a waste of time in normal scenarios but we must
|
||||
* do it to support CREATE INDEX CONCURRENTLY.
|
||||
*
|
||||
* We must follow HOT-chains here because during
|
||||
* concurrent index build, we insert the root TID though
|
||||
* the actual tuple may be somewhere in the HOT-chain.
|
||||
* While following the chain we might not stop at the exact
|
||||
* tuple which triggered the insert, but that's OK because
|
||||
* if we find a live tuple anywhere in this chain, we have
|
||||
* a unique key conflict. The other live tuple is not part
|
||||
* of this chain because it had a different index entry.
|
||||
*/
|
||||
htup.t_self = itup->t_tid;
|
||||
if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer,
|
||||
false, NULL))
|
||||
htid = itup->t_tid;
|
||||
if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
|
||||
{
|
||||
/* Normal case --- it's still live */
|
||||
ReleaseBuffer(hbuffer);
|
||||
}
|
||||
else if (htup.t_data != NULL)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* It's been deleted, so no error, and no need to
|
||||
|
@ -279,39 +290,27 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
|||
*/
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* couldn't find the tuple?? */
|
||||
elog(ERROR, "failed to fetch tuple being inserted");
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_UNIQUE_VIOLATION),
|
||||
errmsg("duplicate key value violates unique constraint \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
}
|
||||
else if (htup.t_data != NULL)
|
||||
else if (all_dead)
|
||||
{
|
||||
/*
|
||||
* Hmm, if we can't see the tuple, maybe it can be marked
|
||||
* killed. This logic should match index_getnext and
|
||||
* btgettuple.
|
||||
* The conflicting tuple (or whole HOT chain) is dead to
|
||||
* everyone, so we may as well mark the index entry
|
||||
* killed.
|
||||
*/
|
||||
LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
|
||||
if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
|
||||
hbuffer) == HEAPTUPLE_DEAD)
|
||||
{
|
||||
ItemIdMarkDead(curitemid);
|
||||
opaque->btpo_flags |= BTP_HAS_GARBAGE;
|
||||
/* be sure to mark the proper buffer dirty... */
|
||||
if (nbuf != InvalidBuffer)
|
||||
SetBufferCommitInfoNeedsSave(nbuf);
|
||||
else
|
||||
SetBufferCommitInfoNeedsSave(buf);
|
||||
}
|
||||
LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK);
|
||||
ItemIdMarkDead(curitemid);
|
||||
opaque->btpo_flags |= BTP_HAS_GARBAGE;
|
||||
/* be sure to mark the proper buffer dirty... */
|
||||
if (nbuf != InvalidBuffer)
|
||||
SetBufferCommitInfoNeedsSave(nbuf);
|
||||
else
|
||||
SetBufferCommitInfoNeedsSave(buf);
|
||||
}
|
||||
ReleaseBuffer(hbuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
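Editor's note: the _bt_check_unique() hunks above are easier to follow when the new control flow is collected in one place. The fragment below is a condensed recap, not independently compilable: the identifiers (curitup, curitemid, opaque, rel, heapRel, SnapshotDirty) are those of _bt_check_unique() and are not redeclared here, and the wait-for-transaction and buffer-dirtying details are omitted.

/* One index entry now stands for a whole HOT chain, so the heap-side
 * probe must consider every chain member, not just the TID's own tuple. */
ItemPointerData htid = curitup->t_tid;
bool		all_dead;

if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead))
{
	/* some chain member is live (or in flux): report the duplicate */
	ereport(ERROR,
			(errcode(ERRCODE_UNIQUE_VIOLATION),
			 errmsg("duplicate key value violates unique constraint \"%s\"",
					RelationGetRelationName(rel))));
}
else if (all_dead)
{
	/* the whole chain is dead to everyone: kill the index entry */
	ItemIdMarkDead(curitemid);
	opaque->btpo_flags |= BTP_HAS_GARBAGE;
}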
@ -840,7 +839,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
|||
itemsz = ItemIdGetLength(itemid);
|
||||
item = (IndexTuple) PageGetItem(origpage, itemid);
|
||||
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add hikey to the right sibling");
|
||||
rightoff = OffsetNumberNext(rightoff);
|
||||
}
|
||||
|
@ -865,7 +864,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
|||
item = (IndexTuple) PageGetItem(origpage, itemid);
|
||||
}
|
||||
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add hikey to the left sibling");
|
||||
leftoff = OffsetNumberNext(leftoff);
|
||||
|
||||
|
@ -1700,7 +1699,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
|||
* benefit of _bt_restore_page().
|
||||
*/
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add leftkey to new root page");
|
||||
pfree(new_item);
|
||||
|
||||
|
@ -1718,7 +1717,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
|||
* insert the right page pointer into the new root page.
|
||||
*/
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add rightkey to new root page");
|
||||
pfree(new_item);
|
||||
|
||||
|
@ -1805,7 +1804,7 @@ _bt_pgaddtup(Relation rel,
|
|||
}
|
||||
|
||||
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add item to the %s for \"%s\"",
|
||||
where, RelationGetRelationName(rel));
|
||||
}
|
||||
|
|
|
@ -57,7 +57,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.113 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -400,7 +400,7 @@ _bt_sortaddtup(Page page,
|
|||
}
|
||||
|
||||
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add item to the index page");
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.46 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -141,8 +141,8 @@ _bt_restore_page(Page page, char *from, int len)
|
|||
memcpy(&itupdata, from, sizeof(IndexTupleData));
|
||||
itemsz = IndexTupleDSize(itupdata);
|
||||
itemsz = MAXALIGN(itemsz);
|
||||
if (PageAddItem(page, (Item) from, itemsz,
|
||||
FirstOffsetNumber, false) == InvalidOffsetNumber)
|
||||
if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber,
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "_bt_restore_page: cannot add item to page");
|
||||
from += itemsz;
|
||||
}
|
||||
|
@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
|
|||
{
|
||||
if (PageAddItem(page, (Item) datapos, datalen,
|
||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "btree_insert_redo: failed to add item");
|
||||
|
||||
PageSetLSN(page, lsn);
|
||||
|
@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot,
|
|||
if (onleft)
|
||||
{
|
||||
if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
|
||||
false) == InvalidOffsetNumber)
|
||||
false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add new item to left page after split");
|
||||
}
|
||||
|
||||
|
@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot,
|
|||
hiItem = PageGetItem(rpage, hiItemId);
|
||||
|
||||
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
|
||||
P_HIKEY, false) == InvalidOffsetNumber)
|
||||
P_HIKEY, false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add high key to left page after split");
|
||||
|
||||
/* Fix opaque fields */
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.285 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
|
@ -410,6 +410,9 @@ UpdateIndexRelation(Oid indexoid,
|
|||
values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
|
||||
values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
|
||||
values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid);
|
||||
values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false);
|
||||
/* we set isvalid and isready the same way */
|
||||
values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid);
|
||||
values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
|
||||
values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
|
||||
values[Anum_pg_index_indoption - 1] = PointerGetDatum(indoption);
|
||||
|
@ -944,7 +947,11 @@ BuildIndexInfo(Relation index)
|
|||
|
||||
/* other info */
|
||||
ii->ii_Unique = indexStruct->indisunique;
|
||||
ii->ii_Concurrent = false; /* assume normal case */
|
||||
ii->ii_ReadyForInserts = indexStruct->indisready;
|
||||
|
||||
/* initialize index-build state to default */
|
||||
ii->ii_Concurrent = false;
|
||||
ii->ii_BrokenHotChain = false;
|
||||
|
||||
return ii;
|
||||
}
|
||||
|
@ -1308,6 +1315,35 @@ index_build(Relation heapRelation,
|
|||
PointerGetDatum(indexInfo)));
|
||||
Assert(PointerIsValid(stats));
|
||||
|
||||
/*
|
||||
* If we found any potentially broken HOT chains, mark the index as
|
||||
* not being usable until the current transaction is below the event
|
||||
* horizon. See src/backend/access/heap/README.HOT for discussion.
|
||||
*/
|
||||
if (indexInfo->ii_BrokenHotChain)
|
||||
{
|
||||
Oid indexId = RelationGetRelid(indexRelation);
|
||||
Relation pg_index;
|
||||
HeapTuple indexTuple;
|
||||
Form_pg_index indexForm;
|
||||
|
||||
pg_index = heap_open(IndexRelationId, RowExclusiveLock);
|
||||
|
||||
indexTuple = SearchSysCacheCopy(INDEXRELID,
|
||||
ObjectIdGetDatum(indexId),
|
||||
0, 0, 0);
|
||||
if (!HeapTupleIsValid(indexTuple))
|
||||
elog(ERROR, "cache lookup failed for index %u", indexId);
|
||||
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
|
||||
|
||||
indexForm->indcheckxmin = true;
|
||||
simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
|
||||
CatalogUpdateIndexes(pg_index, indexTuple);
|
||||
|
||||
heap_freetuple(indexTuple);
|
||||
heap_close(pg_index, RowExclusiveLock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update heap and index pg_class rows
|
||||
*/
|
||||
|
@ -1346,6 +1382,11 @@ index_build(Relation heapRelation,
|
|||
* must keep track of the number of index tuples; we don't do so here because
|
||||
* the AM might reject some of the tuples for its own reasons, such as being
|
||||
* unable to store NULLs.
|
||||
*
|
||||
* A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
|
||||
* any potentially broken HOT chains. Currently, we set this if there are
|
||||
* any RECENTLY_DEAD entries in a HOT chain, without trying very hard to
|
||||
* detect whether they're really incompatible with the chain tip.
|
||||
*/
|
||||
double
|
||||
IndexBuildHeapScan(Relation heapRelation,
|
||||
|
@ -1365,6 +1406,8 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
ExprContext *econtext;
|
||||
Snapshot snapshot;
|
||||
TransactionId OldestXmin;
|
||||
BlockNumber root_blkno = InvalidBlockNumber;
|
||||
OffsetNumber root_offsets[MaxHeapTuplesPerPage];
|
||||
|
||||
/*
|
||||
* sanity checks
|
||||
|
@ -1427,15 +1470,47 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/*
|
||||
* When dealing with a HOT-chain of updated tuples, we want to
|
||||
* index the values of the live tuple (if any), but index it
|
||||
* under the TID of the chain's root tuple. This approach is
|
||||
* necessary to preserve the HOT-chain structure in the heap.
|
||||
* So we need to be able to find the root item offset for every
|
||||
* tuple that's in a HOT-chain. When first reaching a new page
|
||||
* of the relation, call heap_get_root_tuples() to build a map
|
||||
* of root item offsets on the page.
|
||||
*
|
||||
* It might look unsafe to use this information across buffer
|
||||
* lock/unlock. However, we hold ShareLock on the table so no
|
||||
* ordinary insert/update/delete should occur; and we hold pin on
|
||||
* the buffer continuously while visiting the page, so no pruning
|
||||
* operation can occur either.
|
||||
*
|
||||
* Note the implied assumption that there is no more than one live
|
||||
* tuple per HOT-chain ...
|
||||
*/
|
||||
if (scan->rs_cblock != root_blkno)
|
||||
{
|
||||
Page page = BufferGetPage(scan->rs_cbuf);
|
||||
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
|
||||
heap_get_root_tuples(page, root_offsets);
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
root_blkno = scan->rs_cblock;
|
||||
}
|
||||
|
||||
if (snapshot == SnapshotAny)
|
||||
{
|
||||
/* do our own time qual check */
|
||||
bool indexIt;
|
||||
|
||||
recheck:
|
||||
/*
|
||||
* We could possibly get away with not locking the buffer here,
|
||||
* since caller should hold ShareLock on the relation, but let's
|
||||
* be conservative about it.
|
||||
* be conservative about it. (This remark is still correct
|
||||
* even with HOT-pruning: our pin on the buffer prevents pruning.)
|
||||
*/
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
|
||||
|
||||
|
@ -1458,10 +1533,29 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* If tuple is recently deleted then we must index it
|
||||
* anyway to preserve MVCC semantics. (Pre-existing
|
||||
* transactions could try to use the index after we finish
|
||||
* building it, and may need to see such tuples.) Exclude
|
||||
* it from unique-checking, however.
|
||||
* building it, and may need to see such tuples.)
|
||||
*
|
||||
* However, if it was HOT-updated then we must only index
|
||||
* the live tuple at the end of the HOT-chain. Since this
|
||||
* breaks semantics for pre-existing snapshots, mark
|
||||
* the index as unusable for them.
|
||||
*
|
||||
* If we've already decided that the index will be unsafe
|
||||
* for old snapshots, we may as well stop indexing
|
||||
* recently-dead tuples, since there's no longer any
|
||||
* point.
|
||||
*/
|
||||
indexIt = true;
|
||||
if (HeapTupleIsHotUpdated(heapTuple))
|
||||
{
|
||||
indexIt = false;
|
||||
/* mark the index as unsafe for old snapshots */
|
||||
indexInfo->ii_BrokenHotChain = true;
|
||||
}
|
||||
else if (indexInfo->ii_BrokenHotChain)
|
||||
indexIt = false;
|
||||
else
|
||||
indexIt = true;
|
||||
/* In any case, exclude the tuple from unique-checking */
|
||||
tupleIsAlive = false;
|
||||
break;
|
||||
case HEAPTUPLE_INSERT_IN_PROGRESS:
|
||||
|
@ -1473,12 +1567,31 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* followed by CREATE INDEX within a transaction.) An
|
||||
* exception occurs when reindexing a system catalog,
|
||||
* because we often release lock on system catalogs before
|
||||
* committing.
|
||||
* committing. In that case we wait for the inserting
|
||||
* transaction to finish and check again. (We could do
|
||||
* that on user tables too, but since the case is not
|
||||
* expected it seems better to throw an error.)
|
||||
*/
|
||||
if (!TransactionIdIsCurrentTransactionId(
|
||||
HeapTupleHeaderGetXmin(heapTuple->t_data))
|
||||
&& !IsSystemRelation(heapRelation))
|
||||
elog(ERROR, "concurrent insert in progress");
|
||||
HeapTupleHeaderGetXmin(heapTuple->t_data)))
|
||||
{
|
||||
if (!IsSystemRelation(heapRelation))
|
||||
elog(ERROR, "concurrent insert in progress");
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Must drop the lock on the buffer before we wait
|
||||
*/
|
||||
TransactionId xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
XactLockTableWait(xwait);
|
||||
goto recheck;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* We must index such tuples, since if the index build
|
||||
* commits then they're good.
|
||||
*/
|
||||
indexIt = true;
|
||||
tupleIsAlive = true;
|
||||
break;
|
||||
|
@ -1491,19 +1604,48 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* followed by CREATE INDEX within a transaction.) An
|
||||
* exception occurs when reindexing a system catalog,
|
||||
* because we often release lock on system catalogs before
|
||||
* committing.
|
||||
* committing. In that case we wait for the deleting
|
||||
* transaction to finish and check again. (We could do
|
||||
* that on user tables too, but since the case is not
|
||||
* expected it seems better to throw an error.)
|
||||
*/
|
||||
Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
|
||||
if (!TransactionIdIsCurrentTransactionId(
|
||||
HeapTupleHeaderGetXmax(heapTuple->t_data))
|
||||
&& !IsSystemRelation(heapRelation))
|
||||
elog(ERROR, "concurrent delete in progress");
|
||||
indexIt = true;
|
||||
HeapTupleHeaderGetXmax(heapTuple->t_data)))
|
||||
{
|
||||
if (!IsSystemRelation(heapRelation))
|
||||
elog(ERROR, "concurrent delete in progress");
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Must drop the lock on the buffer before we wait
|
||||
*/
|
||||
TransactionId xwait = HeapTupleHeaderGetXmax(heapTuple->t_data);
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
XactLockTableWait(xwait);
|
||||
goto recheck;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Otherwise, we have to treat these tuples just like
|
||||
* RECENTLY_DELETED ones.
|
||||
*/
|
||||
if (HeapTupleIsHotUpdated(heapTuple))
|
||||
{
|
||||
indexIt = false;
|
||||
/* mark the index as unsafe for old snapshots */
|
||||
indexInfo->ii_BrokenHotChain = true;
|
||||
}
|
||||
else if (indexInfo->ii_BrokenHotChain)
|
||||
indexIt = false;
|
||||
else
|
||||
indexIt = true;
|
||||
/* In any case, exclude the tuple from unique-checking */
|
||||
tupleIsAlive = false;
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
|
||||
indexIt = tupleIsAlive = false; /* keep compiler quiet */
|
||||
indexIt = tupleIsAlive = false; /* keep compiler quiet */
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1552,9 +1694,33 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* pass the values[] and isnull[] arrays, instead.
|
||||
*/
|
||||
|
||||
/* Call the AM's callback routine to process the tuple */
|
||||
callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
|
||||
callback_state);
|
||||
if (HeapTupleIsHeapOnly(heapTuple))
|
||||
{
|
||||
/*
|
||||
* For a heap-only tuple, pretend its TID is that of the root.
|
||||
* See src/backend/access/heap/README.HOT for discussion.
|
||||
*/
|
||||
HeapTupleData rootTuple;
|
||||
OffsetNumber offnum;
|
||||
|
||||
rootTuple = *heapTuple;
|
||||
offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
|
||||
|
||||
Assert(OffsetNumberIsValid(root_offsets[offnum - 1]));
|
||||
|
||||
ItemPointerSetOffsetNumber(&rootTuple.t_self,
|
||||
root_offsets[offnum - 1]);
|
||||
|
||||
/* Call the AM's callback routine to process the tuple */
|
||||
callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
|
||||
callback_state);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Call the AM's callback routine to process the tuple */
|
||||
callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
|
||||
callback_state);
|
||||
}
|
||||
}
|
||||
|
||||
heap_endscan(scan);
|
||||
|
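Editor's note: the IndexBuildHeapScan() hunks above describe indexing heap-only tuples under the TID of their chain root. The fragment below condenses that root-TID substitution; it borrows IndexBuildHeapScan()'s own identifiers (buf, heapTuple, callback, callback_state, values, isnull, tupleIsAlive) without redeclaring them, and drops the buffer locking and the per-page caching keyed on root_blkno.

/* Build the root-offset map for the current page (new helper added by
 * this commit); root_offsets[] is indexed by (offnum - 1). */
OffsetNumber root_offsets[MaxHeapTuplesPerPage];

heap_get_root_tuples(BufferGetPage(buf), root_offsets);

if (HeapTupleIsHeapOnly(heapTuple))
{
	/* heap-only tuple: pretend its TID is that of its chain root */
	HeapTupleData rootTuple = *heapTuple;
	OffsetNumber offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);

	ItemPointerSetOffsetNumber(&rootTuple.t_self, root_offsets[offnum - 1]);
	callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
			 callback_state);
}
else
	callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
			 callback_state);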
@ -1574,8 +1740,15 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
/*
|
||||
* validate_index - support code for concurrent index builds
|
||||
*
|
||||
* We do a concurrent index build by first building the index normally via
|
||||
* index_create(), while holding a weak lock that allows concurrent
|
||||
* We do a concurrent index build by first inserting the catalog entry for the
|
||||
* index via index_create(), marking it not indisready and not indisvalid.
|
||||
* Then we commit our transaction and start a new one, then we wait for all
|
||||
* transactions that could have been modifying the table to terminate. Now
|
||||
* we know that any subsequently-started transactions will see the index and
|
||||
* honor its constraints on HOT updates; so while existing HOT-chains might
|
||||
* be broken with respect to the index, no currently live tuple will have an
|
||||
* incompatible HOT update done to it. We now build the index normally via
|
||||
* index_build(), while holding a weak lock that allows concurrent
|
||||
* insert/update/delete. Also, we index only tuples that are valid
|
||||
* as of the start of the scan (see IndexBuildHeapScan), whereas a normal
|
||||
* build takes care to include recently-dead tuples. This is OK because
|
||||
|
@ -1586,11 +1759,10 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* if we used HeapTupleSatisfiesVacuum). This leaves us with an index that
|
||||
* does not contain any tuples added to the table while we built the index.
|
||||
*
|
||||
* Next, we commit the transaction so that the index becomes visible to other
|
||||
* backends, but it is marked not "indisvalid" to prevent the planner from
|
||||
* relying on it for indexscans. Then we wait for all transactions that
|
||||
* could have been modifying the table to terminate. At this point we
|
||||
* know that any subsequently-started transactions will see the index and
|
||||
* Next, we mark the index "indisready" (but still not "indisvalid") and
|
||||
* commit the second transaction and start a third. Again we wait for all
|
||||
* transactions that could have been modifying the table to terminate. Now
|
||||
* we know that any subsequently-started transactions will see the index and
|
||||
* insert their new tuples into it. We then take a new reference snapshot
|
||||
* which is passed to validate_index(). Any tuples that are valid according
|
||||
* to this snap, but are not in the index, must be added to the index.
|
||||
|
@ -1610,7 +1782,7 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* Building a unique index this way is tricky: we might try to insert a
|
||||
* tuple that is already dead or is in process of being deleted, and we
|
||||
* mustn't have a uniqueness failure against an updated version of the same
|
||||
* row. We can check the tuple to see if it's already dead and tell
|
||||
* row. We could try to check the tuple to see if it's already dead and tell
|
||||
* index_insert() not to do the uniqueness check, but that still leaves us
|
||||
* with a race condition against an in-progress update. To handle that,
|
||||
* we expect the index AM to recheck liveness of the to-be-inserted tuple
|
||||
|
@ -1620,7 +1792,8 @@ IndexBuildHeapScan(Relation heapRelation,
|
|||
* were alive at the time of the reference snapshot are gone; this is
|
||||
* necessary to be sure there are none left with a serializable snapshot
|
||||
* older than the reference (and hence possibly able to see tuples we did
|
||||
* not index). Then we mark the index valid and commit.
|
||||
* not index). Then we mark the index "indisvalid" and commit. Subsequent
|
||||
* transactions will be able to use it for queries.
|
||||
*
|
||||
* Doing two full table scans is a brute-force strategy. We could try to be
|
||||
* cleverer, eg storing new tuples in a special area of the table (perhaps
|
||||
|
@ -1727,6 +1900,9 @@ validate_index_heapscan(Relation heapRelation,
|
|||
TupleTableSlot *slot;
|
||||
EState *estate;
|
||||
ExprContext *econtext;
|
||||
BlockNumber root_blkno = InvalidBlockNumber;
|
||||
OffsetNumber root_offsets[MaxHeapTuplesPerPage];
|
||||
bool in_index[MaxHeapTuplesPerPage];
|
||||
|
||||
/* state variables for the merge */
|
||||
ItemPointer indexcursor = NULL;
|
||||
|
@ -1768,39 +1944,86 @@ validate_index_heapscan(Relation heapRelation,
|
|||
while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
|
||||
{
|
||||
ItemPointer heapcursor = &heapTuple->t_self;
|
||||
ItemPointerData rootTuple;
|
||||
OffsetNumber root_offnum;
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
state->htups += 1;
|
||||
|
||||
/*
|
||||
* As commented in IndexBuildHeapScan, we should index heap-only tuples
|
||||
* under the TIDs of their root tuples; so when we advance onto a new
|
||||
* heap page, build a map of root item offsets on the page.
|
||||
*
|
||||
* This complicates merging against the tuplesort output: we will
|
||||
* visit the live tuples in order by their offsets, but the root
|
||||
* offsets that we need to compare against the index contents might
|
||||
* be ordered differently. So we might have to "look back" within
|
||||
* the tuplesort output, but only within the current page. We handle
|
||||
* that by keeping a bool array in_index[] showing all the
|
||||
* already-passed-over tuplesort output TIDs of the current page.
|
||||
* We clear that array here, when advancing onto a new heap page.
|
||||
*/
|
||||
if (scan->rs_cblock != root_blkno)
|
||||
{
|
||||
Page page = BufferGetPage(scan->rs_cbuf);
|
||||
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
|
||||
heap_get_root_tuples(page, root_offsets);
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
memset(in_index, 0, sizeof(in_index));
|
||||
|
||||
root_blkno = scan->rs_cblock;
|
||||
}
|
||||
|
||||
/* Convert actual tuple TID to root TID */
|
||||
rootTuple = *heapcursor;
|
||||
root_offnum = ItemPointerGetOffsetNumber(heapcursor);
|
||||
|
||||
if (HeapTupleIsHeapOnly(heapTuple))
|
||||
{
|
||||
root_offnum = root_offsets[root_offnum - 1];
|
||||
Assert(OffsetNumberIsValid(root_offnum));
|
||||
ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
|
||||
}
|
||||
|
||||
/*
|
||||
* "merge" by skipping through the index tuples until we find or pass
|
||||
* the current heap tuple.
|
||||
* the current root tuple.
|
||||
*/
|
||||
while (!tuplesort_empty &&
|
||||
(!indexcursor ||
|
||||
ItemPointerCompare(indexcursor, heapcursor) < 0))
|
||||
ItemPointerCompare(indexcursor, &rootTuple) < 0))
|
||||
{
|
||||
Datum ts_val;
|
||||
bool ts_isnull;
|
||||
|
||||
if (indexcursor)
|
||||
{
|
||||
/*
|
||||
* Remember index items seen earlier on the current heap page
|
||||
*/
|
||||
if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
|
||||
in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
|
||||
pfree(indexcursor);
|
||||
}
|
||||
|
||||
tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
|
||||
&ts_val, &ts_isnull);
|
||||
Assert(tuplesort_empty || !ts_isnull);
|
||||
indexcursor = (ItemPointer) DatumGetPointer(ts_val);
|
||||
}
|
||||
|
||||
if (tuplesort_empty ||
|
||||
ItemPointerCompare(indexcursor, heapcursor) > 0)
|
||||
/*
|
||||
* If the tuplesort has overshot *and* we didn't see a match earlier,
|
||||
* then this tuple is missing from the index, so insert it.
|
||||
*/
|
||||
if ((tuplesort_empty ||
|
||||
ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
|
||||
!in_index[root_offnum - 1])
|
||||
{
|
||||
/*
|
||||
* We've overshot which means this heap tuple is missing from the
|
||||
* index, so insert it.
|
||||
*/
|
||||
bool check_unique;
|
||||
|
||||
MemoryContextReset(econtext->ecxt_per_tuple_memory);
|
||||
|
||||
/* Set up for predicate or expression evaluation */
|
||||
|
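Editor's note: the validate_index_heapscan() hunk above merges two ordered streams — the sorted index TIDs coming out of the tuplesort and the heap tuples translated to root TIDs — and uses in_index[] to handle root offsets that arrive out of order within a page. The fragment below condenses that merge step using the function's own identifiers (state, tuplesort_empty, indexcursor, rootTuple, root_blkno, root_offnum, in_index); memory management and the actual index_insert() call are omitted.

/* Advance the tuplesort output until it reaches or passes the current
 * root TID, remembering index TIDs already seen on this heap page. */
while (!tuplesort_empty &&
	   (!indexcursor ||
		ItemPointerCompare(indexcursor, &rootTuple) < 0))
{
	Datum		ts_val;
	bool		ts_isnull;

	if (indexcursor &&
		ItemPointerGetBlockNumber(indexcursor) == root_blkno)
		in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;

	tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
										  &ts_val, &ts_isnull);
	indexcursor = (ItemPointer) DatumGetPointer(ts_val);
}

/* Insert only if the index overshot and we never saw this root TID. */
if ((tuplesort_empty ||
	 ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
	!in_index[root_offnum - 1])
{
	/* ... index_insert() the values under &rootTuple, as in the hunk ... */
}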
@ -1827,40 +2050,30 @@ validate_index_heapscan(Relation heapRelation,
|
|||
values,
|
||||
isnull);
|
||||
|
||||
/*
|
||||
* If the tuple is already committed dead, we still have to put it
|
||||
* in the index (because some xacts might be able to see it), but
|
||||
* we might as well suppress uniqueness checking. This is just an
|
||||
* optimization because the index AM is not supposed to raise a
|
||||
* uniqueness failure anyway.
|
||||
*/
|
||||
if (indexInfo->ii_Unique)
|
||||
{
|
||||
/* must lock buffer to call HeapTupleSatisfiesVisibility */
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
|
||||
|
||||
if (HeapTupleSatisfiesVisibility(heapTuple, SnapshotNow,
|
||||
scan->rs_cbuf))
|
||||
check_unique = true;
|
||||
else
|
||||
check_unique = false;
|
||||
|
||||
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
else
|
||||
check_unique = false;
|
||||
|
||||
/*
|
||||
* You'd think we should go ahead and build the index tuple here,
|
||||
* but some index AMs want to do further processing on the data
|
||||
* first. So pass the values[] and isnull[] arrays, instead.
|
||||
*/
|
||||
|
||||
/*
|
||||
* If the tuple is already committed dead, you might think we
|
||||
* could suppress uniqueness checking, but this is no longer
|
||||
* true in the presence of HOT, because the insert is actually
|
||||
* a proxy for a uniqueness check on the whole HOT-chain. That
|
||||
* is, the tuple we have here could be dead because it was already
|
||||
* HOT-updated, and if so the updating transaction will not have
|
||||
* thought it should insert index entries. The index AM will
|
||||
* check the whole HOT-chain and correctly detect a conflict
|
||||
* if there is one.
|
||||
*/
|
||||
|
||||
index_insert(indexRelation,
|
||||
values,
|
||||
isnull,
|
||||
heapcursor,
|
||||
&rootTuple,
|
||||
heapRelation,
|
||||
check_unique);
|
||||
indexInfo->ii_Unique);
|
||||
|
||||
state->tups_inserted += 1;
|
||||
}
|
||||
|
@ -1983,9 +2196,9 @@ reindex_index(Oid indexId)
|
|||
ResetReindexProcessing();
|
||||
|
||||
/*
|
||||
* If the index is marked invalid (ie, it's from a failed CREATE INDEX
|
||||
* CONCURRENTLY), we can now mark it valid. This allows REINDEX to be
|
||||
* used to clean up in such cases.
|
||||
* If the index is marked invalid or not ready (ie, it's from a failed
|
||||
* CREATE INDEX CONCURRENTLY), we can now mark it valid. This allows
|
||||
* REINDEX to be used to clean up in such cases.
|
||||
*/
|
||||
pg_index = heap_open(IndexRelationId, RowExclusiveLock);
|
||||
|
||||
|
@ -1996,9 +2209,10 @@ reindex_index(Oid indexId)
|
|||
elog(ERROR, "cache lookup failed for index %u", indexId);
|
||||
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
|
||||
|
||||
if (!indexForm->indisvalid)
|
||||
if (!indexForm->indisvalid || !indexForm->indisready)
|
||||
{
|
||||
indexForm->indisvalid = true;
|
||||
indexForm->indisready = true;
|
||||
simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
|
||||
CatalogUpdateIndexes(pg_index, indexTuple);
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/indexing.c,v 1.114 2007/01/05 22:19:24 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/indexing.c,v 1.115 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -78,6 +78,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple)
|
|||
Datum values[INDEX_MAX_KEYS];
|
||||
bool isnull[INDEX_MAX_KEYS];
|
||||
|
||||
/* HOT update does not require index inserts */
|
||||
if (HeapTupleIsHeapOnly(heapTuple))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Get information from the state structure. Fall out if nothing to do.
|
||||
*/
|
||||
|
@ -101,6 +105,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple)
|
|||
|
||||
indexInfo = indexInfoArray[i];
|
||||
|
||||
/* If the index is marked as read-only, ignore it */
|
||||
if (!indexInfo->ii_ReadyForInserts)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Expressional and partial indexes on system catalogs are not
|
||||
* supported
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
*
|
||||
* Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.44 2007/09/11 08:51:22 teodor Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.45 2007/09/20 17:56:30 tgl Exp $
|
||||
*/
|
||||
|
||||
CREATE VIEW pg_roles AS
|
||||
|
@ -207,6 +207,7 @@ CREATE VIEW pg_stat_all_tables AS
|
|||
pg_stat_get_tuples_inserted(C.oid) AS n_tup_ins,
|
||||
pg_stat_get_tuples_updated(C.oid) AS n_tup_upd,
|
||||
pg_stat_get_tuples_deleted(C.oid) AS n_tup_del,
|
||||
pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd,
|
||||
pg_stat_get_live_tuples(C.oid) AS n_live_tup,
|
||||
pg_stat_get_dead_tuples(C.oid) AS n_dead_tup,
|
||||
pg_stat_get_last_vacuum_time(C.oid) as last_vacuum,
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.7 2007/07/25 22:16:18 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.8 2007/09/20 17:56:30 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -225,7 +225,9 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid)
|
|||
indexInfo->ii_Predicate = NIL;
|
||||
indexInfo->ii_PredicateState = NIL;
|
||||
indexInfo->ii_Unique = true;
|
||||
indexInfo->ii_ReadyForInserts = true;
|
||||
indexInfo->ii_Concurrent = false;
|
||||
indexInfo->ii_BrokenHotChain = false;
|
||||
|
||||
classObjectId[0] = OID_BTREE_OPS_OID;
|
||||
classObjectId[1] = INT4_BTREE_OPS_OID;
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.165 2007/09/10 21:59:37 alvherre Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.166 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -119,6 +119,7 @@ DefineIndex(RangeVar *heapRelation,
|
|||
Oid namespaceId;
|
||||
Oid tablespaceId;
|
||||
Relation rel;
|
||||
Relation indexRelation;
|
||||
HeapTuple tuple;
|
||||
Form_pg_am accessMethodForm;
|
||||
bool amcanorder;
|
||||
|
@ -420,7 +421,10 @@ DefineIndex(RangeVar *heapRelation,
|
|||
indexInfo->ii_Predicate = make_ands_implicit(predicate);
|
||||
indexInfo->ii_PredicateState = NIL;
|
||||
indexInfo->ii_Unique = unique;
|
||||
/* In a concurrent build, mark it not-ready-for-inserts */
|
||||
indexInfo->ii_ReadyForInserts = !concurrent;
|
||||
indexInfo->ii_Concurrent = concurrent;
|
||||
indexInfo->ii_BrokenHotChain = false;
|
||||
|
||||
classObjectId = (Oid *) palloc(numberOfAttributes * sizeof(Oid));
|
||||
coloptions = (int16 *) palloc(numberOfAttributes * sizeof(int16));
|
||||
|
@ -439,23 +443,38 @@ DefineIndex(RangeVar *heapRelation,
|
|||
primary ? "PRIMARY KEY" : "UNIQUE",
|
||||
indexRelationName, RelationGetRelationName(rel))));
|
||||
|
||||
/* save lockrelid for below, then close rel */
|
||||
/* save lockrelid and locktag for below, then close rel */
|
||||
heaprelid = rel->rd_lockInfo.lockRelId;
|
||||
SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
|
||||
heap_close(rel, NoLock);
|
||||
|
||||
if (!concurrent)
|
||||
{
|
||||
indexRelationId =
|
||||
index_create(relationId, indexRelationName, indexRelationId,
|
||||
indexInfo, accessMethodId, tablespaceId, classObjectId,
|
||||
coloptions, reloptions, primary, isconstraint,
|
||||
allowSystemTableMods, skip_build, concurrent);
|
||||
|
||||
return; /* We're done, in the standard case */
|
||||
}
|
||||
|
||||
/*
|
||||
* For a concurrent build, we next insert the catalog entry and add
|
||||
* constraints. We don't build the index just yet; we must first make
|
||||
* the catalog entry so that the new index is visible to updating
|
||||
* transactions. That will prevent them from making incompatible HOT
|
||||
* updates. The new index will be marked not indisready and not
|
||||
* indisvalid, so that no one else tries to either insert into it or use
|
||||
* it for queries. We pass skip_build = true to prevent the build.
|
||||
*/
|
||||
indexRelationId =
|
||||
index_create(relationId, indexRelationName, indexRelationId,
|
||||
indexInfo, accessMethodId, tablespaceId, classObjectId,
|
||||
coloptions, reloptions, primary, isconstraint,
|
||||
allowSystemTableMods, skip_build, concurrent);
|
||||
|
||||
if (!concurrent)
|
||||
return; /* We're done, in the standard case */
|
||||
allowSystemTableMods, true, concurrent);
|
||||
|
||||
/*
|
||||
* Phase 2 of concurrent index build (see comments for validate_index()
|
||||
* for an overview of how this works)
|
||||
*
|
||||
* We must commit our current transaction so that the index becomes
|
||||
* visible; then start another. Note that all the data structures we just
|
||||
* built are lost in the commit. The only data we keep past here are the
|
||||
|
@ -476,6 +495,9 @@ DefineIndex(RangeVar *heapRelation,
|
|||
StartTransactionCommand();
|
||||
|
||||
/*
|
||||
* Phase 2 of concurrent index build (see comments for validate_index()
|
||||
* for an overview of how this works)
|
||||
*
|
||||
* Now we must wait until no running transaction could have the table open
|
||||
* with the old list of indexes. To do this, inquire which xacts
|
||||
* currently would conflict with ShareLock on the table -- ie, which ones
|
||||
|
@ -494,7 +516,91 @@ DefineIndex(RangeVar *heapRelation,
|
|||
* check for that. Also, prepared xacts are not reported, which is
|
||||
* fine since they certainly aren't going to do anything more.
|
||||
*/
|
||||
SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
|
||||
old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
|
||||
|
||||
while (VirtualTransactionIdIsValid(*old_lockholders))
|
||||
{
|
||||
VirtualXactLockTableWait(*old_lockholders);
|
||||
old_lockholders++;
|
||||
}
|
||||
|
||||
/*
|
||||
* At this moment we are sure that there are no transactions with the
|
||||
* table open for write that don't have this new index in their list of
|
||||
* indexes. We have waited out all the existing transactions and any new
|
||||
* transaction will have the new index in its list, but the index is still
|
||||
* marked as "not-ready-for-inserts". The index is consulted while
|
||||
* deciding HOT-safety though. This arrangement ensures that no new HOT
|
||||
* chains can be created where the new tuple and the old tuple in the
|
||||
* chain have different index keys.
|
||||
*
|
||||
* We now take a new snapshot, and build the index using all tuples that
|
||||
* are visible in this snapshot. We can be sure that any HOT updates
|
||||
* to these tuples will be compatible with the index, since any updates
|
||||
* made by transactions that didn't know about the index are now committed
|
||||
* or rolled back. Thus, each visible tuple is either the end of its
|
||||
* HOT-chain or the extension of the chain is HOT-safe for this index.
|
||||
*/
|
||||
|
||||
/* Open and lock the parent heap relation */
|
||||
rel = heap_openrv(heapRelation, ShareUpdateExclusiveLock);
|
||||
|
||||
/* And the target index relation */
|
||||
indexRelation = index_open(indexRelationId, RowExclusiveLock);
|
||||
|
||||
/* Set ActiveSnapshot since functions in the indexes may need it */
|
||||
ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
|
||||
|
||||
/* We have to re-build the IndexInfo struct, since it was lost in commit */
|
||||
indexInfo = BuildIndexInfo(indexRelation);
|
||||
Assert(!indexInfo->ii_ReadyForInserts);
|
||||
indexInfo->ii_Concurrent = true;
|
||||
indexInfo->ii_BrokenHotChain = false;
|
||||
|
||||
/* Now build the index */
|
||||
index_build(rel, indexRelation, indexInfo, primary);
|
||||
|
||||
/* Close both the relations, but keep the locks */
|
||||
heap_close(rel, NoLock);
|
||||
index_close(indexRelation, NoLock);
|
||||
|
||||
/*
|
||||
* Update the pg_index row to mark the index as ready for inserts.
|
||||
* Once we commit this transaction, any new transactions that
|
||||
* open the table must insert new entries into the index for insertions
|
||||
* and non-HOT updates.
|
||||
*/
|
||||
pg_index = heap_open(IndexRelationId, RowExclusiveLock);
|
||||
|
||||
indexTuple = SearchSysCacheCopy(INDEXRELID,
|
||||
ObjectIdGetDatum(indexRelationId),
|
||||
0, 0, 0);
|
||||
if (!HeapTupleIsValid(indexTuple))
|
||||
elog(ERROR, "cache lookup failed for index %u", indexRelationId);
|
||||
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
|
||||
|
||||
Assert(!indexForm->indisready);
|
||||
Assert(!indexForm->indisvalid);
|
||||
|
||||
indexForm->indisready = true;
|
||||
|
||||
simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
|
||||
CatalogUpdateIndexes(pg_index, indexTuple);
|
||||
|
||||
heap_close(pg_index, RowExclusiveLock);
|
||||
|
||||
/*
|
||||
* Commit this transaction to make the indisready update visible.
|
||||
*/
|
||||
CommitTransactionCommand();
|
||||
StartTransactionCommand();
|
||||
|
||||
/*
|
||||
* Phase 3 of concurrent index build
|
||||
*
|
||||
* We once again wait until no transaction can have the table open with
|
||||
* the index marked as read-only for updates.
|
||||
*/
|
||||
old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
|
||||
|
||||
while (VirtualTransactionIdIsValid(*old_lockholders))
|
||||
|
@ -505,7 +611,7 @@ DefineIndex(RangeVar *heapRelation,
|
|||
|
||||
/*
|
||||
* Now take the "reference snapshot" that will be used by validate_index()
|
||||
* to filter candidate tuples. Beware! There might be still snapshots
|
||||
* to filter candidate tuples. Beware! There might still be snapshots
|
||||
* in use that treat some transaction as in-progress that our reference
|
||||
* snapshot treats as committed. If such a recently-committed transaction
|
||||
* deleted tuples in the table, we will not include them in the index; yet
|
||||
|
@ -560,7 +666,7 @@ DefineIndex(RangeVar *heapRelation,
|
|||
elog(ERROR, "cache lookup failed for index %u", indexRelationId);
|
||||
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
|
||||
|
||||
Assert(indexForm->indexrelid == indexRelationId);
|
||||
Assert(indexForm->indisready);
|
||||
Assert(!indexForm->indisvalid);
|
||||
|
||||
indexForm->indisvalid = true;
|
||||
|
@ -575,7 +681,8 @@ DefineIndex(RangeVar *heapRelation,
|
|||
* relcache entries for the index itself, but we should also send a
|
||||
* relcache inval on the parent table to force replanning of cached plans.
|
||||
* Otherwise existing sessions might fail to use the new index where it
|
||||
* would be useful.
|
||||
* would be useful. (Note that our earlier commits did not create
|
||||
* reasons to replan; relcache flush on the index itself was sufficient.)
|
||||
*/
|
||||
CacheInvalidateRelcacheByRelid(heaprelid.relId);
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.145 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.146 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -1281,7 +1281,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record)
|
|||
itemsz = record->xl_len - sizeof(xl_seq_rec);
|
||||
itemsz = MAXALIGN(itemsz);
|
||||
if (PageAddItem(page, (Item) item, itemsz,
|
||||
FirstOffsetNumber, false) == InvalidOffsetNumber)
|
||||
FirstOffsetNumber, false, false) == InvalidOffsetNumber)
|
||||
elog(PANIC, "seq_redo: failed to add item to page");
|
||||
|
||||
PageSetLSN(page, lsn);
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.358 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.359 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -124,10 +124,11 @@ typedef VTupleMoveData *VTupleMove;
|
|||
typedef struct VRelStats
|
||||
{
|
||||
/* miscellaneous statistics */
|
||||
BlockNumber rel_pages;
|
||||
double rel_tuples;
|
||||
Size min_tlen;
|
||||
Size max_tlen;
|
||||
BlockNumber rel_pages; /* pages in relation */
|
||||
double rel_tuples; /* tuples that remain after vacuuming */
|
||||
double rel_indexed_tuples; /* indexed tuples that remain */
|
||||
Size min_tlen; /* min surviving tuple size */
|
||||
Size max_tlen; /* max surviving tuple size */
|
||||
bool hasindex;
|
||||
/* vtlinks array for tuple chain following - sorted by new_tid */
|
||||
int num_vtlinks;
|
||||
|
@ -1177,6 +1178,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
|
|||
vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
|
||||
vacrelstats->rel_pages = 0;
|
||||
vacrelstats->rel_tuples = 0;
|
||||
vacrelstats->rel_indexed_tuples = 0;
|
||||
vacrelstats->hasindex = false;
|
||||
|
||||
/* scan the heap */
|
||||
|
@ -1195,13 +1197,13 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
|
|||
{
|
||||
for (i = 0; i < nindexes; i++)
|
||||
vacuum_index(&vacuum_pages, Irel[i],
|
||||
vacrelstats->rel_tuples, 0);
|
||||
vacrelstats->rel_indexed_tuples, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* just scan indexes to update statistic */
|
||||
for (i = 0; i < nindexes; i++)
|
||||
scan_index(Irel[i], vacrelstats->rel_tuples);
|
||||
scan_index(Irel[i], vacrelstats->rel_indexed_tuples);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1256,6 +1258,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
BlockNumber empty_pages,
|
||||
empty_end_pages;
|
||||
double num_tuples,
|
||||
num_indexed_tuples,
|
||||
tups_vacuumed,
|
||||
nkeep,
|
||||
nunused;
|
||||
|
@ -1278,7 +1281,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
relname)));
|
||||
|
||||
empty_pages = empty_end_pages = 0;
|
||||
num_tuples = tups_vacuumed = nkeep = nunused = 0;
|
||||
num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0;
|
||||
free_space = 0;
|
||||
|
||||
nblocks = RelationGetNumberOfBlocks(onerel);
|
||||
|
@ -1313,9 +1316,13 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
* background writer will try to write the page if it's already marked
|
||||
* dirty. To ensure that invalid data doesn't get written to disk, we
|
||||
* must take exclusive buffer lock wherever we potentially modify
|
||||
* pages.
|
||||
* pages. In fact, we insist on cleanup lock so that we can safely
|
||||
* call heap_page_prune(). (This might be overkill, since the bgwriter
|
||||
* pays no attention to individual tuples, but on the other hand it's
|
||||
* unlikely that the bgwriter has this particular page pinned at this
|
||||
* instant. So violating the coding rule would buy us little anyway.)
|
||||
*/
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
LockBufferForCleanup(buf);
|
||||
|
||||
vacpage->blkno = blkno;
|
||||
vacpage->offsets_used = 0;
|
||||
|
@ -1356,6 +1363,21 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Prune all HOT-update chains in this page.
|
||||
*
|
||||
* We use the redirect_move option so that redirecting line pointers
|
||||
* get collapsed out; this allows us to not worry about them below.
|
||||
*
|
||||
* We count tuples removed by the pruning step as removed by VACUUM.
|
||||
*/
|
||||
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
|
||||
true, false);
|
||||
|
||||
/*
|
||||
* Now scan the page to collect vacuumable items and check for
|
||||
* tuples requiring freezing.
|
||||
*/
|
||||
nfrozen = 0;
|
||||
notup = true;
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
@ -1369,7 +1391,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
|
||||
/*
|
||||
* Collect un-used items too - it's possible to have indexes
|
||||
* pointing here after crash.
|
||||
* pointing here after crash. (That's an ancient comment and
|
||||
* is likely obsolete with WAL, but we might as well continue
|
||||
* to check for such problems.)
|
||||
*/
|
||||
if (!ItemIdIsUsed(itemid))
|
||||
{
|
||||
|
@ -1378,6 +1402,23 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* DEAD item pointers are to be vacuumed normally; but we don't
|
||||
* count them in tups_vacuumed, else we'd be double-counting
|
||||
* (at least in the common case where heap_page_prune() just
|
||||
* freed up a non-HOT tuple).
|
||||
*/
|
||||
if (ItemIdIsDead(itemid))
|
||||
{
|
||||
vacpage->offsets[vacpage->offsets_free++] = offnum;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Shouldn't have any redirected items anymore */
|
||||
if (!ItemIdIsNormal(itemid))
|
||||
elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item",
|
||||
relname, blkno, offnum);
|
||||
|
||||
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
|
||||
tuple.t_len = ItemIdGetLength(itemid);
|
||||
ItemPointerSet(&(tuple.t_self), blkno, offnum);
|
||||
|
@ -1410,12 +1451,45 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
}
|
||||
break;
|
||||
case HEAPTUPLE_DEAD:
|
||||
tupgone = true; /* we can delete the tuple */
|
||||
/*
|
||||
* We need not require XMIN_COMMITTED or XMAX_COMMITTED to
|
||||
* be set, since we will remove the tuple without any
|
||||
* further examination of its hint bits.
|
||||
* Ordinarily, DEAD tuples would have been removed by
|
||||
* heap_page_prune(), but it's possible that the tuple
|
||||
* state changed since heap_page_prune() looked. In
|
||||
* particular an INSERT_IN_PROGRESS tuple could have
|
||||
* changed to DEAD if the inserter aborted. So this
|
||||
* cannot be considered an error condition, though it
|
||||
* does suggest that someone released a lock early.
|
||||
*
|
||||
* If the tuple is HOT-updated then it must only be
|
||||
* removed by a prune operation; so we keep it as if it
|
||||
* were RECENTLY_DEAD, and abandon shrinking. (XXX is it
|
||||
* worth trying to make the shrinking code smart enough
|
||||
* to handle this? It's an unusual corner case.)
|
||||
*
|
||||
* DEAD heap-only tuples can safely be removed if they
|
||||
* aren't themselves HOT-updated, although this is a bit
|
||||
* inefficient since we'll uselessly try to remove
|
||||
* index entries for them.
|
||||
*/
|
||||
if (HeapTupleIsHotUpdated(&tuple))
|
||||
{
|
||||
nkeep += 1;
|
||||
if (do_shrinking)
|
||||
ereport(LOG,
|
||||
(errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation",
|
||||
relname, blkno, offnum)));
|
||||
do_shrinking = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
tupgone = true; /* we can delete the tuple */
|
||||
/*
|
||||
* We need not require XMIN_COMMITTED or
|
||||
* XMAX_COMMITTED to be set, since we will remove the
|
||||
* tuple without any further examination of its hint
|
||||
* bits.
|
||||
*/
|
||||
}
|
||||
break;
|
||||
case HEAPTUPLE_RECENTLY_DEAD:
|
||||
|
||||
|
@ -1530,6 +1604,8 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
else
|
||||
{
|
||||
num_tuples += 1;
|
||||
if (!HeapTupleIsHeapOnly(&tuple))
|
||||
num_indexed_tuples += 1;
|
||||
notup = false;
|
||||
if (tuple.t_len < min_tlen)
|
||||
min_tlen = tuple.t_len;
|
||||
|
@ -1549,7 +1625,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
if (tempPage != NULL)
|
||||
{
|
||||
/* Some tuples are removable; figure free space after removal */
|
||||
PageRepairFragmentation(tempPage, NULL);
|
||||
PageRepairFragmentation(tempPage);
|
||||
vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage);
|
||||
pfree(tempPage);
|
||||
do_reap = true;
|
||||
|
@ -1558,7 +1634,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
{
|
||||
/* Just use current available space */
|
||||
vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
|
||||
/* Need to reap the page if it has LP_UNUSED line pointers */
|
||||
/* Need to reap the page if it has UNUSED or DEAD line pointers */
|
||||
do_reap = (vacpage->offsets_free > 0);
|
||||
}
|
||||
|
||||
|
@ -1621,6 +1697,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
|
|||
|
||||
/* save stats in the rel list for use later */
|
||||
vacrelstats->rel_tuples = num_tuples;
|
||||
vacrelstats->rel_indexed_tuples = num_indexed_tuples;
|
||||
vacrelstats->rel_pages = nblocks;
|
||||
if (num_tuples == 0)
|
||||
min_tlen = max_tlen = 0;
|
||||
|
@ -1720,6 +1797,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
num_fraged_pages,
|
||||
vacuumed_pages;
|
||||
int keep_tuples = 0;
|
||||
int keep_indexed_tuples = 0;
|
||||
PGRUsage ru0;
|
||||
|
||||
pg_rusage_init(&ru0);
|
||||
|
@ -1845,6 +1923,16 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
if (!ItemIdIsUsed(itemid))
|
||||
continue;
|
||||
|
||||
if (ItemIdIsDead(itemid))
|
||||
{
|
||||
/* just remember it for vacuum_page() */
|
||||
vacpage->offsets[vacpage->offsets_free++] = offnum;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Shouldn't have any redirected items now */
|
||||
Assert(ItemIdIsNormal(itemid));
|
||||
|
||||
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
|
||||
tuple_len = tuple.t_len = ItemIdGetLength(itemid);
|
||||
ItemPointerSet(&(tuple.t_self), blkno, offnum);
|
||||
|
@ -1906,12 +1994,28 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
if (i >= vacpage->offsets_free) /* not found */
|
||||
{
|
||||
vacpage->offsets[vacpage->offsets_free++] = offnum;
|
||||
/*
|
||||
* If this is not a heap-only tuple, there must be an
|
||||
* index entry for this item which will be removed in
|
||||
* the index cleanup. Decrement the keep_indexed_tuples
|
||||
* count to remember this.
|
||||
*/
|
||||
if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
|
||||
keep_indexed_tuples--;
|
||||
keep_tuples--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vacpage->offsets[vacpage->offsets_free++] = offnum;
|
||||
/*
|
||||
* If this is not a heap-only tuple, there must be an
|
||||
* index entry for this item which will be removed in
|
||||
* the index cleanup. Decrement the keep_indexed_tuples
|
||||
* count to remember this.
|
||||
*/
|
||||
if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
|
||||
keep_indexed_tuples--;
|
||||
keep_tuples--;
|
||||
}
|
||||
continue;
|
||||
|
@ -2028,7 +2132,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
break;
|
||||
}
|
||||
nextItemid = PageGetItemId(nextPage, nextOffnum);
|
||||
if (!ItemIdIsUsed(nextItemid))
|
||||
if (!ItemIdIsNormal(nextItemid))
|
||||
{
|
||||
ReleaseBuffer(nextBuf);
|
||||
break;
|
||||
|
@ -2166,7 +2270,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
Pitemid = PageGetItemId(Ppage,
|
||||
ItemPointerGetOffsetNumber(&(tp.t_self)));
|
||||
/* this can't happen since we saw tuple earlier: */
|
||||
if (!ItemIdIsUsed(Pitemid))
|
||||
if (!ItemIdIsNormal(Pitemid))
|
||||
elog(ERROR, "parent itemid marked as unused");
|
||||
PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
|
||||
|
||||
|
@ -2268,6 +2372,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
dst_buffer, dst_page, destvacpage,
|
||||
&ec, &Ctid, vtmove[ti].cleanVpd);
|
||||
|
||||
/*
|
||||
* If the tuple we are moving is a heap-only tuple,
|
||||
* this move will generate an additional index entry,
|
||||
* so increment the rel_indexed_tuples count.
|
||||
*/
|
||||
if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
|
||||
vacrelstats->rel_indexed_tuples++;
|
||||
|
||||
num_moved++;
|
||||
if (destvacpage->blkno > last_move_dest_block)
|
||||
last_move_dest_block = destvacpage->blkno;
|
||||
|
@ -2280,7 +2392,31 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
vacpage->offsets[vacpage->offsets_free++] =
|
||||
ItemPointerGetOffsetNumber(&(tuple.t_self));
|
||||
else
|
||||
{
|
||||
/*
|
||||
* When we move tuple chains, we may need to move
|
||||
* tuples from a block that we haven't yet scanned in
|
||||
* the outer walk-along-the-relation loop. Note that we
|
||||
* can't be moving a tuple from a block that we have
|
||||
* already scanned because if such a tuple exists, then
|
||||
* we must have moved the chain along with that tuple
|
||||
* when we scanned that block. IOW the test of
|
||||
* (Cbuf != buf) guarantees that the tuple we are
|
||||
* looking at right now is in a block which is yet to
|
||||
* be scanned.
|
||||
*
|
||||
* We maintain two counters to correctly count the
|
||||
* moved-off tuples from blocks that are not yet
|
||||
* scanned (keep_tuples) and how many of them have
|
||||
* index pointers (keep_indexed_tuples). The main
|
||||
* reason to track the latter is to help verify
|
||||
* that indexes have the expected number of entries
|
||||
* when all the dust settles.
|
||||
*/
|
||||
if (!HeapTupleHeaderIsHeapOnly(tuple.t_data))
|
||||
keep_indexed_tuples++;
|
||||
keep_tuples++;
|
||||
}
|
||||
|
||||
ReleaseBuffer(dst_buffer);
|
||||
ReleaseBuffer(Cbuf);
|
||||
|
@ -2328,6 +2464,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
move_plain_tuple(onerel, buf, page, &tuple,
|
||||
dst_buffer, dst_page, dst_vacpage, &ec);
|
||||
|
||||
/*
|
||||
* If the tuple we are moving is a heap-only tuple,
|
||||
* this move will generate an additional index entry,
|
||||
* so increment the rel_indexed_tuples count.
|
||||
*/
|
||||
if (HeapTupleHeaderIsHeapOnly(tuple.t_data))
|
||||
vacrelstats->rel_indexed_tuples++;
|
||||
|
||||
num_moved++;
|
||||
if (dst_vacpage->blkno > last_move_dest_block)
|
||||
last_move_dest_block = dst_vacpage->blkno;
|
||||
|
@ -2361,6 +2505,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
|
||||
if (!ItemIdIsUsed(itemid))
|
||||
continue;
|
||||
/* Shouldn't be any DEAD or REDIRECT items anymore */
|
||||
Assert(ItemIdIsNormal(itemid));
|
||||
|
||||
htup = (HeapTupleHeader) PageGetItem(page, itemid);
|
||||
if (htup->t_infomask & HEAP_XMIN_COMMITTED)
|
||||
continue;
|
||||
|
@ -2389,6 +2536,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
{
|
||||
vacpage->offsets[vacpage->offsets_free++] = off;
|
||||
Assert(keep_tuples > 0);
|
||||
/*
|
||||
* If this is not a heap-only tuple, there must be an
|
||||
* index entry for this item which will be removed in
|
||||
* the index cleanup. Decrement the keep_indexed_tuples
|
||||
* count to remember this.
|
||||
*/
|
||||
if (!HeapTupleHeaderIsHeapOnly(htup))
|
||||
keep_indexed_tuples--;
|
||||
keep_tuples--;
|
||||
}
|
||||
}
|
||||
|
@ -2396,6 +2551,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
{
|
||||
vacpage->offsets[vacpage->offsets_free++] = off;
|
||||
Assert(keep_tuples > 0);
|
||||
if (!HeapTupleHeaderIsHeapOnly(htup))
|
||||
keep_indexed_tuples--;
|
||||
keep_tuples--;
|
||||
}
|
||||
}
|
||||
|
@ -2529,11 +2686,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
* page during chain moves but not been scanned over subsequently.
|
||||
* The tuple ids of these tuples are not recorded as free offsets
|
||||
* for any VacPage, so they will not be cleared from the indexes.
|
||||
* keep_indexed_tuples is the portion of these that are expected
|
||||
* to have index entries.
|
||||
*/
|
||||
Assert(keep_tuples >= 0);
|
||||
for (i = 0; i < nindexes; i++)
|
||||
vacuum_index(&Nvacpagelist, Irel[i],
|
||||
vacrelstats->rel_tuples, keep_tuples);
|
||||
vacrelstats->rel_indexed_tuples,
|
||||
keep_indexed_tuples);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2551,7 +2711,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
OffsetNumber unused[MaxOffsetNumber];
|
||||
OffsetNumber offnum,
|
||||
maxoff;
|
||||
int uncnt;
|
||||
int uncnt = 0;
|
||||
int num_tuples = 0;
|
||||
|
||||
buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy);
|
||||
|
@ -2567,6 +2727,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
|
||||
if (!ItemIdIsUsed(itemid))
|
||||
continue;
|
||||
/* Shouldn't be any DEAD or REDIRECT items anymore */
|
||||
Assert(ItemIdIsNormal(itemid));
|
||||
|
||||
htup = (HeapTupleHeader) PageGetItem(page, itemid);
|
||||
if (htup->t_infomask & HEAP_XMIN_COMMITTED)
|
||||
continue;
|
||||
|
@ -2584,12 +2747,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
|
||||
ItemIdSetUnused(itemid);
|
||||
num_tuples++;
|
||||
|
||||
unused[uncnt++] = offnum;
|
||||
}
|
||||
Assert(vacpage->offsets_free == num_tuples);
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
uncnt = PageRepairFragmentation(page, unused);
|
||||
PageRepairFragmentation(page);
|
||||
|
||||
MarkBufferDirty(buf);
|
||||
|
||||
|
@ -2598,7 +2763,10 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
|
|||
{
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_heap_clean(onerel, buf, unused, uncnt);
|
||||
recptr = log_heap_clean(onerel, buf,
|
||||
NULL, 0, NULL, 0,
|
||||
unused, uncnt,
|
||||
false);
|
||||
PageSetLSN(page, recptr);
|
||||
PageSetTLI(page, ThisTimeLineID);
|
||||
}
|
||||
|
@ -2706,15 +2874,17 @@ move_chain_tuple(Relation rel,
|
|||
|
||||
/*
|
||||
* Update the state of the copied tuple, and store it on the destination
|
||||
* page.
|
||||
* page. The copied tuple is never part of a HOT chain.
|
||||
*/
|
||||
newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
|
||||
HEAP_XMIN_INVALID |
|
||||
HEAP_MOVED_OFF);
|
||||
newtup.t_data->t_infomask |= HEAP_MOVED_IN;
|
||||
HeapTupleHeaderClearHotUpdated(newtup.t_data);
|
||||
HeapTupleHeaderClearHeapOnly(newtup.t_data);
|
||||
HeapTupleHeaderSetXvac(newtup.t_data, myXID);
|
||||
newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
|
||||
InvalidOffsetNumber, false);
|
||||
InvalidOffsetNumber, false, true);
|
||||
if (newoff == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
|
||||
(unsigned long) tuple_len, dst_vacpage->blkno);
|
||||
|
@ -2809,17 +2979,19 @@ move_plain_tuple(Relation rel,
|
|||
START_CRIT_SECTION();
|
||||
|
||||
/*
|
||||
* Mark new tuple as MOVED_IN by me.
|
||||
* Mark new tuple as MOVED_IN by me; also mark it not HOT.
|
||||
*/
|
||||
newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
|
||||
HEAP_XMIN_INVALID |
|
||||
HEAP_MOVED_OFF);
|
||||
newtup.t_data->t_infomask |= HEAP_MOVED_IN;
|
||||
HeapTupleHeaderClearHotUpdated(newtup.t_data);
|
||||
HeapTupleHeaderClearHeapOnly(newtup.t_data);
|
||||
HeapTupleHeaderSetXvac(newtup.t_data, myXID);
|
||||
|
||||
/* add tuple to the page */
|
||||
newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
|
||||
InvalidOffsetNumber, false);
|
||||
InvalidOffsetNumber, false, true);
|
||||
if (newoff == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
|
||||
(unsigned long) tuple_len,
|
||||
|
@ -2934,6 +3106,9 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
|
|||
|
||||
if (!ItemIdIsUsed(itemid))
|
||||
continue;
|
||||
/* Shouldn't be any DEAD or REDIRECT items anymore */
|
||||
Assert(ItemIdIsNormal(itemid));
|
||||
|
||||
htup = (HeapTupleHeader) PageGetItem(page, itemid);
|
||||
if (htup->t_infomask & HEAP_XMIN_COMMITTED)
|
||||
continue;
|
||||
|
@ -3019,10 +3194,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
|
|||
static void
|
||||
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
|
||||
{
|
||||
OffsetNumber unused[MaxOffsetNumber];
|
||||
int uncnt;
|
||||
Page page = BufferGetPage(buffer);
|
||||
ItemId itemid;
|
||||
int i;
|
||||
|
||||
/* There shouldn't be any tuples moved onto the page yet! */
|
||||
|
@ -3032,11 +3204,12 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
|
|||
|
||||
for (i = 0; i < vacpage->offsets_free; i++)
|
||||
{
|
||||
itemid = PageGetItemId(page, vacpage->offsets[i]);
|
||||
ItemId itemid = PageGetItemId(page, vacpage->offsets[i]);
|
||||
|
||||
ItemIdSetUnused(itemid);
|
||||
}
|
||||
|
||||
uncnt = PageRepairFragmentation(page, unused);
|
||||
PageRepairFragmentation(page);
|
||||
|
||||
MarkBufferDirty(buffer);
|
||||
|
||||
|
@ -3045,7 +3218,10 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
|
|||
{
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_heap_clean(onerel, buffer, unused, uncnt);
|
||||
recptr = log_heap_clean(onerel, buffer,
|
||||
NULL, 0, NULL, 0,
|
||||
vacpage->offsets, vacpage->offsets_free,
|
||||
false);
|
||||
PageSetLSN(page, recptr);
|
||||
PageSetTLI(page, ThisTimeLineID);
|
||||
}
|
||||
|
@ -3527,8 +3703,7 @@ enough_space(VacPage vacpage, Size len)
|
|||
static Size
|
||||
PageGetFreeSpaceWithFillFactor(Relation relation, Page page)
|
||||
{
|
||||
PageHeader pd = (PageHeader) page;
|
||||
Size freespace = pd->pd_upper - pd->pd_lower;
|
||||
Size freespace = PageGetHeapFreeSpace(page);
|
||||
Size targetfree;
|
||||
|
||||
targetfree = RelationGetTargetPageFreeSpace(relation,
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.96 2007/09/16 02:37:46 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.97 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -326,8 +326,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
|
||||
buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);
|
||||
|
||||
/* Initially, we only need shared access to the buffer */
|
||||
LockBuffer(buf, BUFFER_LOCK_SHARE);
|
||||
/* We need buffer cleanup lock so that we can prune HOT chains. */
|
||||
LockBufferForCleanup(buf);
|
||||
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
|
@ -341,11 +341,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
* We have to be careful here because we could be looking at a
|
||||
* page that someone has just added to the relation and not yet
|
||||
* been able to initialize (see RelationGetBufferForTuple). To
|
||||
* interlock against that, release the buffer read lock (which we
|
||||
* must do anyway) and grab the relation extension lock before
|
||||
* re-locking in exclusive mode. If the page is still
|
||||
* uninitialized by then, it must be left over from a crashed
|
||||
* backend, and we can initialize it.
|
||||
* protect against that, release the buffer lock, grab the
|
||||
* relation extension lock momentarily, and re-lock the buffer.
|
||||
* If the page is still uninitialized by then, it must be left
|
||||
* over from a crashed backend, and we can initialize it.
|
||||
*
|
||||
* We don't really need the relation lock when this is a new or
|
||||
* temp relation, but it's probably not worth the code space to
|
||||
|
@ -357,7 +356,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
LockRelationForExtension(onerel, ExclusiveLock);
|
||||
UnlockRelationForExtension(onerel, ExclusiveLock);
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
LockBufferForCleanup(buf);
|
||||
if (PageIsNew(page))
|
||||
{
|
||||
ereport(WARNING,
|
||||
|
@ -366,7 +365,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
PageInit(page, BufferGetPageSize(buf), 0);
|
||||
empty_pages++;
|
||||
lazy_record_free_space(vacrelstats, blkno,
|
||||
PageGetFreeSpace(page));
|
||||
PageGetHeapFreeSpace(page));
|
||||
}
|
||||
MarkBufferDirty(buf);
|
||||
UnlockReleaseBuffer(buf);
|
||||
|
@ -377,11 +376,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
{
|
||||
empty_pages++;
|
||||
lazy_record_free_space(vacrelstats, blkno,
|
||||
PageGetFreeSpace(page));
|
||||
PageGetHeapFreeSpace(page));
|
||||
UnlockReleaseBuffer(buf);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Prune all HOT-update chains in this page.
|
||||
*
|
||||
* We count tuples removed by the pruning step as removed by VACUUM.
|
||||
*/
|
||||
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
|
||||
false, false);
|
||||
|
||||
/*
|
||||
* Now scan the page to collect vacuumable items and check for
|
||||
* tuples requiring freezing.
|
||||
*/
|
||||
nfrozen = 0;
|
||||
hastup = false;
|
||||
prev_dead_count = vacrelstats->num_dead_tuples;
|
||||
|
@ -394,22 +405,64 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
|
||||
itemid = PageGetItemId(page, offnum);
|
||||
|
||||
/* Unused items require no processing, but we count 'em */
|
||||
if (!ItemIdIsUsed(itemid))
|
||||
{
|
||||
nunused += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Redirect items mustn't be touched */
|
||||
if (ItemIdIsRedirected(itemid))
|
||||
{
|
||||
hastup = true; /* this page won't be truncatable */
|
||||
continue;
|
||||
}
|
||||
|
||||
ItemPointerSet(&(tuple.t_self), blkno, offnum);
|
||||
|
||||
/*
|
||||
* DEAD item pointers are to be vacuumed normally; but we don't
|
||||
* count them in tups_vacuumed, else we'd be double-counting
|
||||
* (at least in the common case where heap_page_prune() just
|
||||
* freed up a non-HOT tuple).
|
||||
*/
|
||||
if (ItemIdIsDead(itemid))
|
||||
{
|
||||
lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
|
||||
continue;
|
||||
}
|
||||
|
||||
Assert(ItemIdIsNormal(itemid));
|
||||
|
||||
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
|
||||
tuple.t_len = ItemIdGetLength(itemid);
|
||||
ItemPointerSet(&(tuple.t_self), blkno, offnum);
|
||||
|
||||
tupgone = false;
|
||||
|
||||
switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
|
||||
{
|
||||
case HEAPTUPLE_DEAD:
|
||||
tupgone = true; /* we can delete the tuple */
|
||||
/*
|
||||
* Ordinarily, DEAD tuples would have been removed by
|
||||
* heap_page_prune(), but it's possible that the tuple
|
||||
* state changed since heap_page_prune() looked. In
|
||||
* particular an INSERT_IN_PROGRESS tuple could have
|
||||
* changed to DEAD if the inserter aborted. So this
|
||||
* cannot be considered an error condition.
|
||||
*
|
||||
* If the tuple is HOT-updated then it must only be
|
||||
* removed by a prune operation; so we keep it just as
|
||||
* if it were RECENTLY_DEAD. Also, if it's a heap-only
|
||||
* tuple, we choose to keep it, because it'll be a
|
||||
* lot cheaper to get rid of it in the next pruning pass
|
||||
* than to treat it like an indexed tuple.
|
||||
*/
|
||||
if (HeapTupleIsHotUpdated(&tuple) ||
|
||||
HeapTupleIsHeapOnly(&tuple))
|
||||
nkeep += 1;
|
||||
else
|
||||
tupgone = true; /* we can delete the tuple */
|
||||
break;
|
||||
case HEAPTUPLE_LIVE:
|
||||
/* Tuple is good --- but let's do some validity checks */
|
||||
|
@ -449,11 +502,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
|
||||
/*
|
||||
* Each non-removable tuple must be checked to see if it
|
||||
* needs freezing. If we already froze anything, then
|
||||
* we've already switched the buffer lock to exclusive.
|
||||
* needs freezing. Note we already have exclusive buffer lock.
|
||||
*/
|
||||
if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
|
||||
(nfrozen > 0) ? InvalidBuffer : buf))
|
||||
InvalidBuffer))
|
||||
frozen[nfrozen++] = offnum;
|
||||
}
|
||||
} /* scan along page */
|
||||
|
@ -485,9 +537,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
if (nindexes == 0 &&
|
||||
vacrelstats->num_dead_tuples > 0)
|
||||
{
|
||||
/* Trade in buffer share lock for super-exclusive lock */
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
LockBufferForCleanup(buf);
|
||||
/* Remove tuples from heap */
|
||||
lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
|
||||
/* Forget the now-vacuumed tuples, and press on */
|
||||
|
@ -505,7 +554,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
|
|||
if (vacrelstats->num_dead_tuples == prev_dead_count)
|
||||
{
|
||||
lazy_record_free_space(vacrelstats, blkno,
|
||||
PageGetFreeSpace(page));
|
||||
PageGetHeapFreeSpace(page));
|
||||
}
|
||||
|
||||
/* Remember the location of the last page with nonremovable tuples */
|
||||
|
@ -598,7 +647,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
|
|||
/* Now that we've compacted the page, record its available space */
|
||||
page = BufferGetPage(buf);
|
||||
lazy_record_free_space(vacrelstats, tblk,
|
||||
PageGetFreeSpace(page));
|
||||
PageGetHeapFreeSpace(page));
|
||||
UnlockReleaseBuffer(buf);
|
||||
npages++;
|
||||
}
|
||||
|
@ -615,7 +664,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
|
|||
* lazy_vacuum_page() -- free dead tuples on a page
|
||||
* and repair its fragmentation.
|
||||
*
|
||||
* Caller must hold pin and lock on the buffer.
|
||||
* Caller must hold pin and buffer cleanup lock on the buffer.
|
||||
*
|
||||
* tupindex is the index in vacrelstats->dead_tuples of the first dead
|
||||
* tuple for this page. We assume the rest follow sequentially.
|
||||
|
@ -625,10 +674,9 @@ static int
|
|||
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
|
||||
int tupindex, LVRelStats *vacrelstats)
|
||||
{
|
||||
OffsetNumber unused[MaxOffsetNumber];
|
||||
int uncnt;
|
||||
Page page = BufferGetPage(buffer);
|
||||
ItemId itemid;
|
||||
OffsetNumber unused[MaxOffsetNumber];
|
||||
int uncnt = 0;
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
|
@ -636,6 +684,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
|
|||
{
|
||||
BlockNumber tblk;
|
||||
OffsetNumber toff;
|
||||
ItemId itemid;
|
||||
|
||||
tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
|
||||
if (tblk != blkno)
|
||||
|
@ -643,9 +692,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
|
|||
toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
|
||||
itemid = PageGetItemId(page, toff);
|
||||
ItemIdSetUnused(itemid);
|
||||
unused[uncnt++] = toff;
|
||||
}
|
||||
|
||||
uncnt = PageRepairFragmentation(page, unused);
|
||||
PageRepairFragmentation(page);
|
||||
|
||||
MarkBufferDirty(buffer);
|
||||
|
||||
|
@ -654,7 +704,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
|
|||
{
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_heap_clean(onerel, buffer, unused, uncnt);
|
||||
recptr = log_heap_clean(onerel, buffer,
|
||||
NULL, 0, NULL, 0,
|
||||
unused, uncnt,
|
||||
false);
|
||||
PageSetLSN(page, recptr);
|
||||
PageSetTLI(page, ThisTimeLineID);
|
||||
}
|
||||
|
@ -980,7 +1033,7 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats,
|
|||
/*
|
||||
* The array shouldn't overflow under normal behavior, but perhaps it
|
||||
* could if we are given a really small maintenance_work_mem. In that
|
||||
* case, just forget the last few tuples.
|
||||
* case, just forget the last few tuples (we'll get 'em next time).
|
||||
*/
|
||||
if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
|
||||
{
|
||||
|
|
|
@ -26,7 +26,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.297 2007/09/07 20:59:26 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.298 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -1813,8 +1813,10 @@ lreplace:;
     *
     * Note: heap_update returns the tid (location) of the new tuple in the
     * t_self field.
     *
     * If it's a HOT update, we mustn't insert new index entries.
     */
    if (resultRelInfo->ri_NumIndices > 0)
    if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple))
        ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false);

    /* AFTER ROW UPDATE Triggers */

@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.150 2007/08/15 21:39:50 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.151 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -981,6 +981,10 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
 *		stuff as it only exists here because the genam stuff
 *		doesn't provide the functionality needed by the
 *		executor.. -cim 9/27/89
 *
 *		CAUTION: this must not be called for a HOT update.
 *		We can't defend against that here for lack of info.
 *		Should we change the API to make it safer?
 * ----------------------------------------------------------------
 */
void

@ -1029,6 +1033,10 @@ ExecInsertIndexTuples(TupleTableSlot *slot,

        indexInfo = indexInfoArray[i];

        /* If the index is marked as read-only, ignore it */
        if (!indexInfo->ii_ReadyForInserts)
            continue;

        /* Check for partial index */
        if (indexInfo->ii_Predicate != NIL)
        {

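Taken together, the two executor hunks above keep HOT consistent at the executor level: ExecUpdate() skips index insertion entirely when heap_update() produced a heap-only tuple, and ExecInsertIndexTuples() now also skips any individual index whose IndexInfo is not yet ready for inserts. A hedged sketch of the resulting per-index loop (the names are those visible in the hunks; the rest of the loop body is only summarized in comments):

    for (i = 0; i < resultRelInfo->ri_NumIndices; i++)
    {
        IndexInfo  *indexInfo = indexInfoArray[i];

        /* new in this commit: ignore indexes not yet accepting inserts */
        if (!indexInfo->ii_ReadyForInserts)
            continue;

        /* pre-existing behavior: evaluate any partial-index predicate,
         * form the index datums, and insert an entry for the heap TID */
    }
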
@ -21,7 +21,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.19 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.20 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -240,12 +240,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
|
|||
BlockNumber page = tbmres->blockno;
|
||||
Buffer buffer;
|
||||
Snapshot snapshot;
|
||||
Page dp;
|
||||
int ntup;
|
||||
int curslot;
|
||||
int minslot;
|
||||
int maxslot;
|
||||
int maxoff;
|
||||
|
||||
/*
|
||||
* Acquire pin on the target heap page, trading in any pin we held before.
|
||||
|
@ -258,6 +253,13 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
|
|||
buffer = scan->rs_cbuf;
|
||||
snapshot = scan->rs_snapshot;
|
||||
|
||||
ntup = 0;
|
||||
|
||||
/*
|
||||
* Prune and repair fragmentation for the whole page, if possible.
|
||||
*/
|
||||
heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
|
||||
|
||||
/*
|
||||
* We must hold share lock on the buffer content while examining tuple
|
||||
* visibility. Afterwards, however, the tuples we have found to be
|
||||
|
@ -265,71 +267,51 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
|
|||
*/
|
||||
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
||||
|
||||
dp = (Page) BufferGetPage(buffer);
|
||||
maxoff = PageGetMaxOffsetNumber(dp);
|
||||
|
||||
/*
|
||||
* Determine how many entries we need to look at on this page. If the
|
||||
* bitmap is lossy then we need to look at each physical item pointer;
|
||||
* otherwise we just look through the offsets listed in tbmres.
|
||||
* We need two separate strategies for lossy and non-lossy cases.
|
||||
*/
|
||||
if (tbmres->ntuples >= 0)
|
||||
{
|
||||
/* non-lossy case */
|
||||
minslot = 0;
|
||||
maxslot = tbmres->ntuples - 1;
|
||||
/*
|
||||
* Bitmap is non-lossy, so we just look through the offsets listed in
|
||||
* tbmres; but we have to follow any HOT chain starting at each such
|
||||
* offset.
|
||||
*/
|
||||
int curslot;
|
||||
|
||||
for (curslot = 0; curslot < tbmres->ntuples; curslot++)
|
||||
{
|
||||
OffsetNumber offnum = tbmres->offsets[curslot];
|
||||
ItemPointerData tid;
|
||||
|
||||
ItemPointerSet(&tid, page, offnum);
|
||||
if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL))
|
||||
scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* lossy case */
|
||||
minslot = FirstOffsetNumber;
|
||||
maxslot = maxoff;
|
||||
}
|
||||
/*
|
||||
* Bitmap is lossy, so we must examine each item pointer on the page.
|
||||
* But we can ignore HOT chains, since we'll check each tuple anyway.
|
||||
*/
|
||||
Page dp = (Page) BufferGetPage(buffer);
|
||||
OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
|
||||
OffsetNumber offnum;
|
||||
|
||||
ntup = 0;
|
||||
for (curslot = minslot; curslot <= maxslot; curslot++)
|
||||
{
|
||||
OffsetNumber targoffset;
|
||||
ItemId lp;
|
||||
HeapTupleData loctup;
|
||||
bool valid;
|
||||
|
||||
if (tbmres->ntuples >= 0)
|
||||
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
|
||||
{
|
||||
/* non-lossy case */
|
||||
targoffset = tbmres->offsets[curslot];
|
||||
ItemId lp;
|
||||
HeapTupleData loctup;
|
||||
|
||||
lp = PageGetItemId(dp, offnum);
|
||||
if (!ItemIdIsNormal(lp))
|
||||
continue;
|
||||
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
|
||||
loctup.t_len = ItemIdGetLength(lp);
|
||||
if (HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer))
|
||||
scan->rs_vistuples[ntup++] = offnum;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* lossy case */
|
||||
targoffset = (OffsetNumber) curslot;
|
||||
}
|
||||
|
||||
/*
|
||||
* We'd better check for out-of-range offnum in case of VACUUM since
|
||||
* the TID was obtained.
|
||||
*/
|
||||
if (targoffset < FirstOffsetNumber || targoffset > maxoff)
|
||||
continue;
|
||||
|
||||
lp = PageGetItemId(dp, targoffset);
|
||||
|
||||
/*
|
||||
* Must check for deleted tuple.
|
||||
*/
|
||||
if (!ItemIdIsNormal(lp))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* check time qualification of tuple, remember it if valid
|
||||
*/
|
||||
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
|
||||
loctup.t_len = ItemIdGetLength(lp);
|
||||
ItemPointerSet(&(loctup.t_self), page, targoffset);
|
||||
|
||||
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
|
||||
if (valid)
|
||||
scan->rs_vistuples[ntup++] = targoffset;
|
||||
}
|
||||
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||
|
|
|
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.180 2007/08/15 19:15:46 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.181 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -1407,6 +1407,7 @@ _SPI_prepare_plan(const char *src, SPIPlanPtr plan)
    plansource->num_params = nargs;
    plansource->fully_planned = true;
    plansource->fixed_result = false;
    /* no need to set search_path, generation or saved_xmin */
    plansource->resultDesc = PlanCacheComputeResultDesc(stmt_list);
    plansource->plan = cplan;

@ -1973,6 +1974,7 @@ _SPI_copy_plan(SPIPlanPtr plan, MemoryContext parentcxt)
    newsource->num_params = newplan->nargs;
    newsource->fully_planned = plansource->fully_planned;
    newsource->fixed_result = plansource->fixed_result;
    /* no need to worry about seach_path, generation or saved_xmin */
    if (plansource->resultDesc)
        newsource->resultDesc = CreateTupleDescCopy(plansource->resultDesc);
    newsource->plan = newcplan;

@ -23,7 +23,7 @@
 * Copyright (c) 2003-2007, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.12 2007/04/26 23:24:44 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.13 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -32,6 +32,7 @@
#include <limits.h>

#include "access/htup.h"
#include "nodes/bitmapset.h"
#include "nodes/tidbitmap.h"
#include "storage/bufpage.h"
#include "utils/hsearch.h"

@ -61,9 +62,7 @@
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* The bitmap unit size can be adjusted by changing these declarations: */
#define BITS_PER_BITMAPWORD 32
typedef uint32 bitmapword;		/* must be an unsigned type */
/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.221 2007/05/26 18:23:01 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.222 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -134,6 +134,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
    glob->subrtables = NIL;
    glob->rewindPlanIDs = NULL;
    glob->finalrtable = NIL;
    glob->transientPlan = false;

    /* Determine what fraction of the plan is likely to be scanned */
    if (cursorOptions & CURSOR_OPT_FAST_PLAN)

@ -183,6 +184,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)

    result->commandType = parse->commandType;
    result->canSetTag = parse->canSetTag;
    result->transientPlan = glob->transientPlan;
    result->planTree = top_plan;
    result->rtable = glob->finalrtable;
    result->resultRelations = root->resultRelations;

@ -9,7 +9,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.136 2007/05/31 16:57:34 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.137 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -19,6 +19,7 @@

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "catalog/pg_inherits.h"
#include "nodes/makefuncs.h"
#include "optimizer/clauses.h"

@ -164,6 +165,20 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
                continue;
            }

            /*
             * If the index is valid, but cannot yet be used, ignore it;
             * but mark the plan we are generating as transient.
             * See src/backend/access/heap/README.HOT for discussion.
             */
            if (index->indcheckxmin &&
                !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data),
                                       TransactionXmin))
            {
                root->glob->transientPlan = true;
                index_close(indexRelation, NoLock);
                continue;
            }

            info = makeNode(IndexOptInfo);

            info->indexoid = index->indexrelid;

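The plancat.c hunk above is the planner half of the transient-plan mechanism; the plancache.c hunk further down in this commit supplies the other half, stamping such a plan with the TransactionXmin it was built under and invalidating it once that horizon moves. A hedged sketch that condenses both sides (the first block is quoted from the hunk above, the second paraphrases the plancache.c hunk below):

    /* planner: skip an index whose pg_index row still demands an xmin check,
     * and remember that the resulting plan is only transiently valid */
    if (index->indcheckxmin &&
        !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data),
                               TransactionXmin))
    {
        root->glob->transientPlan = true;
        index_close(indexRelation, NoLock);
        continue;
    }

    /* plan cache: record the horizon at save time ... */
    if (plansource->fully_planned && plan_list_is_transient(stmt_list))
        plan->saved_xmin = TransactionXmin;

    /* ... and at reuse time mark the plan dead once the horizon has advanced */
    if (TransactionIdIsValid(plan->saved_xmin) &&
        !TransactionIdEquals(plan->saved_xmin, TransactionXmin))
        plan->dead = true;
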
@ -8,12 +8,13 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/optimizer/util/var.c,v 1.70 2007/06/11 01:16:23 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/optimizer/util/var.c,v 1.71 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup.h"
#include "optimizer/clauses.h"
#include "optimizer/prep.h"
#include "optimizer/var.h"

@ -54,6 +55,7 @@ typedef struct

static bool pull_varnos_walker(Node *node,
                               pull_varnos_context *context);
static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos);
static bool contain_var_reference_walker(Node *node,
                                         contain_var_reference_context *context);
static bool contain_var_clause_walker(Node *node, void *context);

@ -134,6 +136,47 @@ pull_varnos_walker(Node *node, pull_varnos_context *context)
                                  (void *) context);
}

/*
 * pull_varattnos
 *		Find all the distinct attribute numbers present in an expression tree,
 *		and add them to the initial contents of *varattnos.
 *		Only Vars that reference RTE 1 of rtable level zero are considered.
 *
 * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
 * we can include system attributes (e.g., OID) in the bitmap representation.
 *
 * Currently, this does not support subqueries nor expressions containing
 * references to multiple tables; not needed since it's only applied to
 * index expressions and predicates.
 */
void
pull_varattnos(Node *node, Bitmapset **varattnos)
{
    (void) pull_varattnos_walker(node, varattnos);
}

static bool
pull_varattnos_walker(Node *node, Bitmapset **varattnos)
{
    if (node == NULL)
        return false;
    if (IsA(node, Var))
    {
        Var		   *var = (Var *) node;

        Assert(var->varno == 1);
        *varattnos = bms_add_member(*varattnos,
                                    var->varattno - FirstLowInvalidHeapAttributeNumber);
        return false;
    }
    /* Should not find a subquery or subplan */
    Assert(!IsA(node, Query));
    Assert(!is_subplan(node));

    return expression_tree_walker(node, pull_varattnos_walker,
                                  (void *) varattnos);
}


/*
 * contain_var_reference

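pull_varattnos() above gives the rest of this commit a way to compute, per index, the set of table columns referenced by its expressions and predicate. A hedged sketch of the sort of caller this enables (illustrative only, not quoted from this diff; the indexInfo fields are assumed from the executor's IndexInfo, and keycol stands for an ordinary key column number):

    Bitmapset  *indexattrs = NULL;

    /* attributes used inside index expressions and the partial-index predicate */
    pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs);
    pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs);

    /* plain key columns use the same FirstLowInvalidHeapAttributeNumber offset */
    indexattrs = bms_add_member(indexattrs,
                                keycol - FirstLowInvalidHeapAttributeNumber);

    /* a later bms_is_member() test on this set answers the question HOT needs:
     * does a given UPDATE modify any indexed column? */
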
@ -13,7 +13,7 @@
|
|||
*
|
||||
* Copyright (c) 2001-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.163 2007/09/11 03:28:05 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.164 2007/09/20 17:56:31 tgl Exp $
|
||||
* ----------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
@ -1294,7 +1294,7 @@ pgstat_count_heap_insert(Relation rel)
|
|||
* pgstat_count_heap_update - count a tuple update
|
||||
*/
|
||||
void
|
||||
pgstat_count_heap_update(Relation rel)
|
||||
pgstat_count_heap_update(Relation rel, bool hot)
|
||||
{
|
||||
PgStat_TableStatus *pgstat_info = rel->pgstat_info;
|
||||
|
||||
|
@ -1304,6 +1304,9 @@ pgstat_count_heap_update(Relation rel)
|
|||
|
||||
/* t_tuples_updated is nontransactional, so just advance it */
|
||||
pgstat_info->t_counts.t_tuples_updated++;
|
||||
/* ditto for the hot_update counter */
|
||||
if (hot)
|
||||
pgstat_info->t_counts.t_tuples_hot_updated++;
|
||||
|
||||
/* We have to log the transactional effect at the proper level */
|
||||
if (pgstat_info->trans == NULL ||
|
||||
|
@ -1340,6 +1343,23 @@ pgstat_count_heap_delete(Relation rel)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* pgstat_update_heap_dead_tuples - update dead-tuples count
|
||||
*
|
||||
* The semantics of this are that we are reporting the nontransactional
|
||||
* recovery of "delta" dead tuples; so t_new_dead_tuples decreases
|
||||
* rather than increasing, and the change goes straight into the per-table
|
||||
* counter, not into transactional state.
|
||||
*/
|
||||
void
|
||||
pgstat_update_heap_dead_tuples(Relation rel, int delta)
|
||||
{
|
||||
PgStat_TableStatus *pgstat_info = rel->pgstat_info;
|
||||
|
||||
if (pgstat_collect_tuplelevel && pgstat_info != NULL)
|
||||
pgstat_info->t_counts.t_new_dead_tuples -= delta;
|
||||
}
|
||||
|
||||
|
||||
/* ----------
|
||||
* AtEOXact_PgStat
|
||||
|
@ -2901,6 +2921,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
|
|||
tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted;
|
||||
tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated;
|
||||
tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted;
|
||||
tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated;
|
||||
tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples;
|
||||
tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples;
|
||||
tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched;
|
||||
|
@ -2923,6 +2944,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
|
|||
tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted;
|
||||
tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated;
|
||||
tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted;
|
||||
tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated;
|
||||
tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples;
|
||||
tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples;
|
||||
tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched;
|
||||
|
@ -2931,6 +2953,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
|
|||
|
||||
/* Clamp n_live_tuples in case of negative new_live_tuples */
|
||||
tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);
|
||||
/* Likewise for n_dead_tuples */
|
||||
tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0);
|
||||
|
||||
/*
|
||||
* Add per-table stats to the per-database entry, too.
|
||||
|
@ -3115,6 +3139,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len)
|
|||
else
|
||||
tabentry->vacuum_timestamp = msg->m_vacuumtime;
|
||||
tabentry->n_live_tuples = msg->m_tuples;
|
||||
/* Resetting dead_tuples to 0 is an approximation ... */
|
||||
tabentry->n_dead_tuples = 0;
|
||||
if (msg->m_analyze)
|
||||
{
|
||||
|
|
|
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.223 2007/06/30 19:12:01 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.224 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -2066,6 +2066,55 @@ LockBufferForCleanup(Buffer buffer)
    }
}

/*
 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
 *
 * We won't loop, but just check once to see if the pin count is OK.  If
 * not, return FALSE with no lock held.
 */
bool
ConditionalLockBufferForCleanup(Buffer buffer)
{
    volatile BufferDesc *bufHdr;

    Assert(BufferIsValid(buffer));

    if (BufferIsLocal(buffer))
    {
        /* There should be exactly one pin */
        Assert(LocalRefCount[-buffer - 1] > 0);
        if (LocalRefCount[-buffer - 1] != 1)
            return false;
        /* Nobody else to wait for */
        return true;
    }

    /* There should be exactly one local pin */
    Assert(PrivateRefCount[buffer - 1] > 0);
    if (PrivateRefCount[buffer - 1] != 1)
        return false;

    /* Try to acquire lock */
    if (!ConditionalLockBuffer(buffer))
        return false;

    bufHdr = &BufferDescriptors[buffer - 1];
    LockBufHdr(bufHdr);
    Assert(bufHdr->refcount > 0);
    if (bufHdr->refcount == 1)
    {
        /* Successfully acquired exclusive lock with pincount 1 */
        UnlockBufHdr(bufHdr);
        return true;
    }

    /* Failed, so release the lock */
    UnlockBufHdr(bufHdr);
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    return false;
}


/*
 * Functions for buffer I/O handling
 *

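ConditionalLockBufferForCleanup() above is the non-blocking variant of LockBufferForCleanup(), meant for opportunistic page pruning where skipping a busy page is acceptable. A hedged sketch of such a caller; the prune call copies the shape used in the vacuumlazy.c hunk earlier in this diff, and heap_page_prune_opt() (seen in the bitmap-scan hunk) is the kind of wrapper expected to sit around logic like this:

    /* prune only if the cleanup lock is available right now */
    if (ConditionalLockBufferForCleanup(buffer))
    {
        /* we hold the only pin, so HOT chains on the page can be pruned */
        (void) heap_page_prune(relation, buffer, OldestXmin, false, false);

        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);   /* keep the pin, drop the lock */
    }
    /* otherwise another backend has the page pinned; try again another time */
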
@ -8,12 +8,13 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.73 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.74 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/htup.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
|
||||
|
@ -108,6 +109,9 @@ PageHeaderIsValid(PageHeader page)
|
|||
* If offsetNumber is not valid, then assign one by finding the first
|
||||
* one that is both unused and deallocated.
|
||||
*
|
||||
* If is_heap is true, we enforce that there can't be more than
|
||||
* MaxHeapTuplesPerPage line pointers on the page.
|
||||
*
|
||||
* !!! EREPORT(ERROR) IS DISALLOWED HERE !!!
|
||||
*/
|
||||
OffsetNumber
|
||||
|
@ -115,7 +119,8 @@ PageAddItem(Page page,
|
|||
Item item,
|
||||
Size size,
|
||||
OffsetNumber offsetNumber,
|
||||
bool overwrite)
|
||||
bool overwrite,
|
||||
bool is_heap)
|
||||
{
|
||||
PageHeader phdr = (PageHeader) page;
|
||||
Size alignedSize;
|
||||
|
@ -200,6 +205,12 @@ PageAddItem(Page page,
|
|||
return InvalidOffsetNumber;
|
||||
}
|
||||
|
||||
if (is_heap && offsetNumber > MaxHeapTuplesPerPage)
|
||||
{
|
||||
elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
|
||||
return InvalidOffsetNumber;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute new lower and upper pointers for page, see if it'll fit.
|
||||
*
|
||||
|
@ -315,11 +326,10 @@ itemoffcompare(const void *itemidp1, const void *itemidp2)
|
|||
*
|
||||
* This routine is usable for heap pages only, but see PageIndexMultiDelete.
|
||||
*
|
||||
* Returns number of unused line pointers on page. If "unused" is not NULL
|
||||
* then the unused[] array is filled with indexes of unused line pointers.
|
||||
* As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated.
|
||||
*/
|
||||
int
|
||||
PageRepairFragmentation(Page page, OffsetNumber *unused)
|
||||
void
|
||||
PageRepairFragmentation(Page page)
|
||||
{
|
||||
Offset pd_lower = ((PageHeader) page)->pd_lower;
|
||||
Offset pd_upper = ((PageHeader) page)->pd_upper;
|
||||
|
@ -329,7 +339,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
|
|||
ItemId lp;
|
||||
int nline,
|
||||
nstorage,
|
||||
nused;
|
||||
nunused;
|
||||
int i;
|
||||
Size totallen;
|
||||
Offset upper;
|
||||
|
@ -352,13 +362,12 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
|
|||
pd_lower, pd_upper, pd_special)));
|
||||
|
||||
nline = PageGetMaxOffsetNumber(page);
|
||||
nused = nstorage = 0;
|
||||
for (i = 0; i < nline; i++)
|
||||
nunused = nstorage = 0;
|
||||
for (i = FirstOffsetNumber; i <= nline; i++)
|
||||
{
|
||||
lp = PageGetItemId(page, i + 1);
|
||||
lp = PageGetItemId(page, i);
|
||||
if (ItemIdIsUsed(lp))
|
||||
{
|
||||
nused++;
|
||||
if (ItemIdHasStorage(lp))
|
||||
nstorage++;
|
||||
}
|
||||
|
@ -366,9 +375,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
|
|||
{
|
||||
/* Unused entries should have lp_len = 0, but make sure */
|
||||
ItemIdSetUnused(lp);
|
||||
/* Report to caller if asked for */
|
||||
if (unused)
|
||||
unused[i - nused] = (OffsetNumber) i;
|
||||
nunused++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -431,18 +438,19 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
|
|||
}
|
||||
|
||||
/* Set hint bit for PageAddItem */
|
||||
if (nused < nline)
|
||||
if (nunused > 0)
|
||||
PageSetHasFreeLinePointers(page);
|
||||
else
|
||||
PageClearHasFreeLinePointers(page);
|
||||
|
||||
return (nline - nused);
|
||||
}
|
||||
|
||||
/*
|
||||
* PageGetFreeSpace
|
||||
* Returns the size of the free (allocatable) space on a page,
|
||||
* reduced by the space needed for a new line pointer.
|
||||
*
|
||||
* Note: this should usually only be used on index pages. Use
|
||||
* PageGetHeapFreeSpace on heap pages.
|
||||
*/
|
||||
Size
|
||||
PageGetFreeSpace(Page page)
|
||||
|
@ -465,7 +473,8 @@ PageGetFreeSpace(Page page)
|
|||
|
||||
/*
|
||||
* PageGetExactFreeSpace
|
||||
* Returns the size of the free (allocatable) space on a page.
|
||||
* Returns the size of the free (allocatable) space on a page,
|
||||
* without any consideration for adding/removing line pointers.
|
||||
*/
|
||||
Size
|
||||
PageGetExactFreeSpace(Page page)
|
||||
|
@ -483,6 +492,73 @@ PageGetExactFreeSpace(Page page)
}


/*
 * PageGetHeapFreeSpace
 *		Returns the size of the free (allocatable) space on a page,
 *		reduced by the space needed for a new line pointer.
 *
 * The difference between this and PageGetFreeSpace is that this will return
 * zero if there are already MaxHeapTuplesPerPage line pointers in the page
 * and none are free.  We use this to enforce that no more than
 * MaxHeapTuplesPerPage line pointers are created on a heap page.  (Although
 * no more tuples than that could fit anyway, in the presence of redirected
 * or dead line pointers it'd be possible to have too many line pointers.
 * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
 * on the number of line pointers, we make this extra check.)
 */
Size
PageGetHeapFreeSpace(Page page)
{
    Size		space;

    space = PageGetFreeSpace(page);
    if (space > 0)
    {
        OffsetNumber offnum, nline;

        /*
         * Are there already MaxHeapTuplesPerPage line pointers in the page?
         */
        nline = PageGetMaxOffsetNumber(page);
        if (nline >= MaxHeapTuplesPerPage)
        {
            if (PageHasFreeLinePointers((PageHeader) page))
            {
                /*
                 * Since this is just a hint, we must confirm that there is
                 * indeed a free line pointer
                 */
                for (offnum = FirstOffsetNumber; offnum <= nline; offnum++)
                {
                    ItemId		lp = PageGetItemId(page, offnum);

                    if (!ItemIdIsUsed(lp))
                        break;
                }

                if (offnum > nline)
                {
                    /*
                     * The hint is wrong, but we can't clear it here since
                     * we don't have the ability to mark the page dirty.
                     */
                    space = 0;
                }
            }
            else
            {
                /*
                 * Although the hint might be wrong, PageAddItem will believe
                 * it anyway, so we must believe it too.
                 */
                space = 0;
            }
        }
    }
    return space;
}

/*
 * PageIndexTupleDelete
 *

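PageGetHeapFreeSpace() above is the heap-aware companion to PageGetFreeSpace() and is what the VACUUM and free-space-recording hunks in this commit switch to. A brief hedged sketch of the intended call-site split; the placement test and the newtuple variable are illustrative, not from this diff:

    /* heap pages: respects the MaxHeapTuplesPerPage line-pointer cap */
    Size        freespace = PageGetHeapFreeSpace(page);

    if (freespace >= MAXALIGN(newtuple->t_len))
    {
        /* room for the tuple plus, if needed, one new line pointer */
    }

    /* index pages keep calling PageGetFreeSpace(), which has no such cap */
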
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/utils/adt/pgstatfuncs.c,v 1.44 2007/09/11 03:28:05 tgl Exp $
 *	  $PostgreSQL: pgsql/src/backend/utils/adt/pgstatfuncs.c,v 1.45 2007/09/20 17:56:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

@ -28,6 +28,7 @@ extern Datum pg_stat_get_tuples_fetched(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_tuples_inserted(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_tuples_updated(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_dead_tuples(PG_FUNCTION_ARGS);
extern Datum pg_stat_get_blocks_fetched(PG_FUNCTION_ARGS);

@ -169,6 +170,22 @@ pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS)
}


Datum
pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS)
{
    Oid			relid = PG_GETARG_OID(0);
    int64		result;
    PgStat_StatTabEntry *tabentry;

    if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL)
        result = 0;
    else
        result = (int64) (tabentry->tuples_hot_updated);

    PG_RETURN_INT64(result);
}


Datum
pg_stat_get_live_tuples(PG_FUNCTION_ARGS)
{

@ -33,13 +33,14 @@
|
|||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/cache/plancache.c,v 1.10 2007/06/05 20:00:41 wieck Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/cache/plancache.c,v 1.11 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "utils/plancache.h"
|
||||
#include "access/transam.h"
|
||||
#include "catalog/namespace.h"
|
||||
#include "executor/executor.h"
|
||||
#include "optimizer/clauses.h"
|
||||
|
@ -79,6 +80,7 @@ static void ScanQueryForRelids(Query *parsetree,
|
|||
void *arg);
|
||||
static bool ScanQueryWalker(Node *node, ScanQueryWalkerContext *context);
|
||||
static bool rowmark_member(List *rowMarks, int rt_index);
|
||||
static bool plan_list_is_transient(List *stmt_list);
|
||||
static void PlanCacheCallback(Datum arg, Oid relid);
|
||||
static void InvalRelid(Oid relid, LOCKMODE lockmode,
|
||||
InvalRelidContext *context);
|
||||
|
@ -322,6 +324,13 @@ StoreCachedPlan(CachedPlanSource *plansource,
|
|||
plan->stmt_list = stmt_list;
|
||||
plan->fully_planned = plansource->fully_planned;
|
||||
plan->dead = false;
|
||||
if (plansource->fully_planned && plan_list_is_transient(stmt_list))
|
||||
{
|
||||
Assert(TransactionIdIsNormal(TransactionXmin));
|
||||
plan->saved_xmin = TransactionXmin;
|
||||
}
|
||||
else
|
||||
plan->saved_xmin = InvalidTransactionId;
|
||||
plan->refcount = 1; /* for the parent's link */
|
||||
plan->generation = ++(plansource->generation);
|
||||
plan->context = plan_context;
|
||||
|
@ -411,6 +420,15 @@ RevalidateCachedPlan(CachedPlanSource *plansource, bool useResOwner)
|
|||
else
|
||||
AcquirePlannerLocks(plan->stmt_list, true);
|
||||
|
||||
/*
|
||||
* If plan was transient, check to see if TransactionXmin has
|
||||
* advanced, and if so invalidate it.
|
||||
*/
|
||||
if (!plan->dead &&
|
||||
TransactionIdIsValid(plan->saved_xmin) &&
|
||||
!TransactionIdEquals(plan->saved_xmin, TransactionXmin))
|
||||
plan->dead = true;
|
||||
|
||||
/*
|
||||
* By now, if any invalidation has happened, PlanCacheCallback
|
||||
* will have marked the plan dead.
|
||||
|
@ -789,6 +807,28 @@ rowmark_member(List *rowMarks, int rt_index)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* plan_list_is_transient: check if any of the plans in the list are transient.
|
||||
*/
|
||||
static bool
|
||||
plan_list_is_transient(List *stmt_list)
|
||||
{
|
||||
ListCell *lc;
|
||||
|
||||
foreach(lc, stmt_list)
|
||||
{
|
||||
PlannedStmt *plannedstmt = (PlannedStmt *) lfirst(lc);
|
||||
|
||||
if (!IsA(plannedstmt, PlannedStmt))
|
||||
continue; /* Ignore utility statements */
|
||||
|
||||
if (plannedstmt->transientPlan)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* PlanCacheComputeResultDesc: given a list of either fully-planned statements
|
||||
* or Queries, determine the result tupledesc it will produce. Returns NULL
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.262 2007/07/25 22:16:18 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.263 2007/09/20 17:56:31 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -34,6 +34,7 @@
|
|||
#include "access/reloptions.h"
|
||||
#include "access/xact.h"
|
||||
#include "catalog/catalog.h"
|
||||
#include "catalog/index.h"
|
||||
#include "catalog/indexing.h"
|
||||
#include "catalog/namespace.h"
|
||||
#include "catalog/pg_amop.h"
|
||||
|
@ -51,6 +52,7 @@
|
|||
#include "optimizer/clauses.h"
|
||||
#include "optimizer/planmain.h"
|
||||
#include "optimizer/prep.h"
|
||||
#include "optimizer/var.h"
|
||||
#include "rewrite/rewriteDefine.h"
|
||||
#include "storage/fd.h"
|
||||
#include "storage/smgr.h"
|
||||
|
@ -1658,6 +1660,10 @@ RelationReloadIndexInfo(Relation relation)
|
|||
index = (Form_pg_index) GETSTRUCT(tuple);
|
||||
|
||||
relation->rd_index->indisvalid = index->indisvalid;
|
||||
relation->rd_index->indcheckxmin = index->indcheckxmin;
|
||||
relation->rd_index->indisready = index->indisready;
|
||||
HeapTupleHeaderSetXmin(relation->rd_indextuple->t_data,
|
||||
HeapTupleHeaderGetXmin(tuple->t_data));
|
||||
|
||||
ReleaseSysCache(tuple);
|
||||
}
|
||||
|
@ -1762,6 +1768,7 @@ RelationClearRelation(Relation relation, bool rebuild)
|
|||
if (relation->rd_options)
|
||||
pfree(relation->rd_options);
|
||||
list_free(relation->rd_indexlist);
|
||||
bms_free(relation->rd_indexattr);
|
||||
if (relation->rd_indexcxt)
|
||||
MemoryContextDelete(relation->rd_indexcxt);
|
||||
|
||||
|
@ -2969,6 +2976,7 @@ RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex)
|
|||
relation->rd_indexvalid = 2; /* mark list as forced */
|
||||
/* must flag that we have a forced index list */
|
||||
need_eoxact_work = true;
|
||||
/* we deliberately do not change rd_indexattr */
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -3140,6 +3148,91 @@ RelationGetIndexPredicate(Relation relation)
    return result;
}

/*
 * RelationGetIndexAttrBitmap -- get a bitmap of index attribute numbers
 *
 * The result has a bit set for each attribute used anywhere in the index
 * definitions of all the indexes on this relation.  (This includes not only
 * simple index keys, but attributes used in expressions and partial-index
 * predicates.)
 *
 * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
 * we can include system attributes (e.g., OID) in the bitmap representation.
 *
 * The returned result is palloc'd in the caller's memory context and should
 * be bms_free'd when not needed anymore.
 */
Bitmapset *
RelationGetIndexAttrBitmap(Relation relation)
{
    Bitmapset  *indexattrs;
    List       *indexoidlist;
    ListCell   *l;
    MemoryContext oldcxt;

    /* Quick exit if we already computed the result. */
    if (relation->rd_indexattr != NULL)
        return bms_copy(relation->rd_indexattr);

    /* Fast path if definitely no indexes */
    if (!RelationGetForm(relation)->relhasindex)
        return NULL;

    /*
     * Get cached list of index OIDs
     */
    indexoidlist = RelationGetIndexList(relation);

    /* Fall out if no indexes (but relhasindex was set) */
    if (indexoidlist == NIL)
        return NULL;

    /*
     * For each index, add referenced attributes to indexattrs.
     */
    indexattrs = NULL;
    foreach(l, indexoidlist)
    {
        Oid         indexOid = lfirst_oid(l);
        Relation    indexDesc;
        IndexInfo  *indexInfo;
        int         i;

        indexDesc = index_open(indexOid, AccessShareLock);

        /* Extract index key information from the index's pg_index row */
        indexInfo = BuildIndexInfo(indexDesc);

        /* Collect simple attribute references */
        for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
        {
            int         attrnum = indexInfo->ii_KeyAttrNumbers[i];

            if (attrnum != 0)
                indexattrs = bms_add_member(indexattrs,
                               attrnum - FirstLowInvalidHeapAttributeNumber);
        }

        /* Collect all attributes used in expressions, too */
        pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs);

        /* Collect all attributes in the index predicate, too */
        pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs);

        index_close(indexDesc, AccessShareLock);
    }

    list_free(indexoidlist);

    /* Now save a copy of the bitmap in the relcache entry. */
    oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
    relation->rd_indexattr = bms_copy(indexattrs);
    MemoryContextSwitchTo(oldcxt);

    /* We return our original working copy for caller to play with */
    return indexattrs;
}
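
A hedged sketch of the intended use of this bitmap (the real test lives in the heap update path, which is not shown in this excerpt): an UPDATE can be considered for HOT only if none of the columns it modifies appear in any index. The helper and the modified_attrs bitmap below are hypothetical; attribute numbers are assumed to be offset by FirstLowInvalidHeapAttributeNumber, as in the relcache bitmap.

#include "postgres.h"
#include "nodes/bitmapset.h"
#include "utils/rel.h"
#include "utils/relcache.h"

/* Hypothetical helper: does the update touch any indexed column? */
static bool
update_touches_indexed_column(Relation rel, Bitmapset *modified_attrs)
{
    Bitmapset  *indexattrs = RelationGetIndexAttrBitmap(rel);
    bool        overlaps = bms_overlap(indexattrs, modified_attrs);

    bms_free(indexattrs);
    return overlaps;        /* false => the update is a HOT candidate */
}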


/*
 * load_relcache_init_file, write_relcache_init_file
@ -3465,6 +3558,7 @@ load_relcache_init_file(void)
|
|||
rel->rd_refcnt = 0;
|
||||
rel->rd_indexvalid = 0;
|
||||
rel->rd_indexlist = NIL;
|
||||
rel->rd_indexattr = NULL;
|
||||
rel->rd_oidindex = InvalidOid;
|
||||
rel->rd_createSubid = InvalidSubTransactionId;
|
||||
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.126 2007/06/09 18:49:55 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.127 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -153,6 +153,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot,
|
|||
extern bool heap_release_fetch(Relation relation, Snapshot snapshot,
|
||||
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
|
||||
Relation stats_relation);
|
||||
extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer,
|
||||
Snapshot snapshot, bool *all_dead);
|
||||
extern bool heap_hot_search(ItemPointer tid, Relation relation,
|
||||
Snapshot snapshot, bool *all_dead);
|
||||
|
||||
extern void heap_get_latest_tid(Relation relation, Snapshot snapshot,
|
||||
ItemPointer tid);
|
||||
|
@ -183,6 +187,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
|
|||
extern void heap_markpos(HeapScanDesc scan);
|
||||
extern void heap_restrpos(HeapScanDesc scan);
|
||||
|
||||
extern void heap_sync(Relation relation);
|
||||
|
||||
extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr);
|
||||
extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec);
|
||||
extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr);
|
||||
|
@ -192,7 +198,10 @@ extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
|
|||
ItemPointerData from,
|
||||
Buffer newbuf, HeapTuple newtup);
|
||||
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
|
||||
OffsetNumber *unused, int uncnt);
|
||||
OffsetNumber *redirected, int nredirected,
|
||||
OffsetNumber *nowdead, int ndead,
|
||||
OffsetNumber *nowunused, int nunused,
|
||||
bool redirect_move);
|
||||
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
|
||||
TransactionId cutoff_xid,
|
||||
OffsetNumber *offsets, int offcnt);
|
||||
|
@@ -240,7 +249,13 @@ extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup);
extern HeapTuple heap_addheader(int natts, bool withoid,
                Size structlen, void *structure);

extern void heap_sync(Relation relation);
/* in heap/pruneheap.c */
extern void heap_page_prune_opt(Relation relation, Buffer buffer,
                TransactionId OldestXmin);
extern int heap_page_prune(Relation relation, Buffer buffer,
                TransactionId OldestXmin,
                bool redirect_move, bool report_stats);
extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);

/* in heap/syncscan.c */
extern void ss_report_location(Relation rel, BlockNumber location);
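
For orientation, a minimal sketch (the function name and calling context are hypothetical, not part of this patch) of how a heap reader is expected to use the new pruning entry point: before examining a page it gives opportunistic pruning a chance to reclaim dead HOT-chain members, passing whatever xmin horizon the caller considers safe (typically something like RecentGlobalXmin).

#include "postgres.h"
#include "access/heapam.h"
#include "storage/bufmgr.h"

/* Hypothetical caller: try an opportunistic prune before reading the page */
static void
scan_heap_page(Relation rel, Buffer buf, TransactionId oldest_xmin)
{
    heap_page_prune_opt(rel, buf, oldest_xmin);
    /* ... then examine tuples on BufferGetPage(buf) as usual ... */
}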
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/access/htup.h,v 1.93 2007/04/06 04:21:43 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/access/htup.h,v 1.94 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -184,8 +184,12 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
|
|||
/*
|
||||
* information stored in t_infomask2:
|
||||
*/
|
||||
#define HEAP_NATTS_MASK 0x7FF /* 11 bits for number of attributes */
|
||||
/* bits 0xF800 are currently unused */
|
||||
#define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */
|
||||
/* bits 0x3800 are available */
|
||||
#define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */
|
||||
#define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */
|
||||
|
||||
#define HEAP2_XACT_MASK 0xC000 /* visibility-related bits */
|
||||
|
||||
/*
|
||||
* HeapTupleHeader accessor macros
|
||||
|
@ -201,7 +205,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
|
|||
|
||||
#define HeapTupleHeaderSetXmin(tup, xid) \
|
||||
( \
|
||||
TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmin) \
|
||||
(tup)->t_choice.t_heap.t_xmin = (xid) \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderGetXmax(tup) \
|
||||
|
@ -211,7 +215,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
|
|||
|
||||
#define HeapTupleHeaderSetXmax(tup, xid) \
|
||||
( \
|
||||
TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmax) \
|
||||
(tup)->t_choice.t_heap.t_xmax = (xid) \
|
||||
)
|
||||
|
||||
/*
|
||||
|
@ -255,7 +259,7 @@ do { \
|
|||
#define HeapTupleHeaderSetXvac(tup, xid) \
|
||||
do { \
|
||||
Assert((tup)->t_infomask & HEAP_MOVED); \
|
||||
TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field3.t_xvac); \
|
||||
(tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \
|
||||
} while (0)
|
||||
|
||||
#define HeapTupleHeaderGetDatumLength(tup) \
|
||||
|
@ -298,6 +302,43 @@ do { \
|
|||
*((Oid *) ((char *)(tup) + (tup)->t_hoff - sizeof(Oid))) = (oid); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Note that we stop considering a tuple HOT-updated as soon as it is known
|
||||
* aborted or the would-be updating transaction is known aborted. For best
|
||||
* efficiency, check tuple visibility before using this macro, so that the
|
||||
* INVALID bits will be as up to date as possible.
|
||||
*/
|
||||
#define HeapTupleHeaderIsHotUpdated(tup) \
|
||||
( \
|
||||
((tup)->t_infomask2 & HEAP_HOT_UPDATED) != 0 && \
|
||||
((tup)->t_infomask & (HEAP_XMIN_INVALID | HEAP_XMAX_INVALID)) == 0 \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderSetHotUpdated(tup) \
|
||||
( \
|
||||
(tup)->t_infomask2 |= HEAP_HOT_UPDATED \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderClearHotUpdated(tup) \
|
||||
( \
|
||||
(tup)->t_infomask2 &= ~HEAP_HOT_UPDATED \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderIsHeapOnly(tup) \
|
||||
( \
|
||||
(tup)->t_infomask2 & HEAP_ONLY_TUPLE \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderSetHeapOnly(tup) \
|
||||
( \
|
||||
(tup)->t_infomask2 |= HEAP_ONLY_TUPLE \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderClearHeapOnly(tup) \
|
||||
( \
|
||||
(tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \
|
||||
)
|
||||
|
||||
#define HeapTupleHeaderGetNatts(tup) \
|
||||
((tup)->t_infomask2 & HEAP_NATTS_MASK)
|
||||
|
||||
|
@@ -331,6 +372,11 @@ do { \
 * fit on one heap page.  (Note that indexes could have more, because they
 * use a smaller tuple header.)  We arrive at the divisor because each tuple
 * must be maxaligned, and it must have an associated item pointer.
 *
 * Note: with HOT, there could theoretically be more line pointers (not actual
 * tuples) than this on a heap page.  However we constrain the number of line
 * pointers to this anyway, to avoid excessive line-pointer bloat and not
 * require increases in the size of work arrays.
 */
#define MaxHeapTuplesPerPage    \
    ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
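
As a back-of-the-envelope check (assuming the default 8 kB BLCKSZ, 8-byte MAXALIGN, and a roughly 24-byte page header; this figure is not part of the patch text): the divisor is MAXALIGN(offsetof(HeapTupleHeaderData, t_bits)) + sizeof(ItemIdData) = 24 + 4 = 28 bytes per tuple plus line pointer, so

    MaxHeapTuplesPerPage = (8192 - 24) / 28 = 291   (integer division)

which is the cap on heap-page line pointers that the pruning code also respects.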
@ -484,6 +530,24 @@ typedef HeapTupleData *HeapTuple;
|
|||
#define HeapTupleHasExternal(tuple) \
|
||||
(((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0)
|
||||
|
||||
#define HeapTupleIsHotUpdated(tuple) \
|
||||
HeapTupleHeaderIsHotUpdated((tuple)->t_data)
|
||||
|
||||
#define HeapTupleSetHotUpdated(tuple) \
|
||||
HeapTupleHeaderSetHotUpdated((tuple)->t_data)
|
||||
|
||||
#define HeapTupleClearHotUpdated(tuple) \
|
||||
HeapTupleHeaderClearHotUpdated((tuple)->t_data)
|
||||
|
||||
#define HeapTupleIsHeapOnly(tuple) \
|
||||
HeapTupleHeaderIsHeapOnly((tuple)->t_data)
|
||||
|
||||
#define HeapTupleSetHeapOnly(tuple) \
|
||||
HeapTupleHeaderSetHeapOnly((tuple)->t_data)
|
||||
|
||||
#define HeapTupleClearHeapOnly(tuple) \
|
||||
HeapTupleHeaderClearHeapOnly((tuple)->t_data)
|
||||
|
||||
#define HeapTupleGetOid(tuple) \
|
||||
HeapTupleHeaderGetOid((tuple)->t_data)
|
||||
|
||||
|
@ -497,27 +561,30 @@ typedef HeapTupleData *HeapTuple;
|
|||
* XLOG allows to store some information in high 4 bits of log
|
||||
* record xl_info field. We use 3 for opcode and one for init bit.
|
||||
*/
|
||||
#define XLOG_HEAP_INSERT 0x00
|
||||
#define XLOG_HEAP_DELETE 0x10
|
||||
#define XLOG_HEAP_UPDATE 0x20
|
||||
#define XLOG_HEAP_MOVE 0x30
|
||||
#define XLOG_HEAP_CLEAN 0x40
|
||||
#define XLOG_HEAP_NEWPAGE 0x50
|
||||
#define XLOG_HEAP_LOCK 0x60
|
||||
#define XLOG_HEAP_INPLACE 0x70
|
||||
#define XLOG_HEAP_OPMASK 0x70
|
||||
#define XLOG_HEAP_INSERT 0x00
|
||||
#define XLOG_HEAP_DELETE 0x10
|
||||
#define XLOG_HEAP_UPDATE 0x20
|
||||
#define XLOG_HEAP_MOVE 0x30
|
||||
#define XLOG_HEAP_HOT_UPDATE 0x40
|
||||
#define XLOG_HEAP_NEWPAGE 0x50
|
||||
#define XLOG_HEAP_LOCK 0x60
|
||||
#define XLOG_HEAP_INPLACE 0x70
|
||||
|
||||
#define XLOG_HEAP_OPMASK 0x70
|
||||
/*
|
||||
* When we insert 1st item on new page in INSERT/UPDATE
|
||||
* we can (and we do) restore entire page in redo
|
||||
*/
|
||||
#define XLOG_HEAP_INIT_PAGE 0x80
|
||||
#define XLOG_HEAP_INIT_PAGE 0x80
|
||||
/*
|
||||
* We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes
|
||||
* are associated with RM_HEAP2_ID, but are not logically different from
|
||||
* the ones above associated with RM_HEAP_ID. We apply XLOG_HEAP_OPMASK,
|
||||
* although currently XLOG_HEAP_INIT_PAGE is not used for any of these.
|
||||
*/
|
||||
#define XLOG_HEAP2_FREEZE 0x00
|
||||
#define XLOG_HEAP2_FREEZE 0x00
|
||||
#define XLOG_HEAP2_CLEAN 0x10
|
||||
#define XLOG_HEAP2_CLEAN_MOVE 0x20
|
||||
|
||||
/*
|
||||
* All what we need to find changed tuple
|
||||
|
@ -569,7 +636,7 @@ typedef struct xl_heap_insert
|
|||
|
||||
#define SizeOfHeapInsert (offsetof(xl_heap_insert, target) + SizeOfHeapTid)
|
||||
|
||||
/* This is what we need to know about update|move */
|
||||
/* This is what we need to know about update|move|hot_update */
|
||||
typedef struct xl_heap_update
|
||||
{
|
||||
xl_heaptid target; /* deleted tuple id */
|
||||
|
@@ -580,15 +647,34 @@ typedef struct xl_heap_update

#define SizeOfHeapUpdate    (offsetof(xl_heap_update, newtid) + SizeOfIptrData)

/* This is what we need to know about vacuum page cleanup */
/*
 * This is what we need to know about vacuum page cleanup/redirect
 *
 * The array of OffsetNumbers following the fixed part of the record contains:
 *   * for each redirected item: the item offset, then the offset redirected to
 *   * for each now-dead item: the item offset
 *   * for each now-unused item: the item offset
 * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused.
 * Note that nunused is not explicitly stored, but may be found by reference
 * to the total record length.
 *
 * If the opcode is CLEAN_MOVE instead of CLEAN, then each redirection pair
 * should be interpreted as physically moving the "to" item pointer to the
 * "from" slot, rather than placing a redirection item in the "from" slot.
 * The moved pointers should be replaced by LP_UNUSED items (there will not
 * be explicit entries in the "now-unused" list for this).  Also, the
 * HEAP_ONLY bit in the moved tuples must be turned off.
 */
typedef struct xl_heap_clean
{
    RelFileNode node;
    BlockNumber block;
    /* UNUSED OFFSET NUMBERS FOLLOW AT THE END */
    uint16      nredirected;
    uint16      ndead;
    /* OFFSET NUMBERS FOLLOW */
} xl_heap_clean;

#define SizeOfHeapClean (offsetof(xl_heap_clean, block) + sizeof(BlockNumber))
#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
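
To make the record layout above concrete, here is an illustrative decoding sketch (the function and its record-length argument are hypothetical; the real consumer is the heap2 redo code): it locates the three OffsetNumber arrays and derives nunused from the total length, exactly as the comment describes.

#include "postgres.h"
#include "access/htup.h"
#include "storage/off.h"

/* Hypothetical reader of an XLOG_HEAP2_CLEAN / CLEAN_MOVE record body */
static void
decode_heap_clean(xl_heap_clean *xlrec, Size datalen)
{
    OffsetNumber *offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
    OffsetNumber *redirected = offsets;         /* 2 * nredirected entries */
    OffsetNumber *nowdead = redirected + 2 * xlrec->nredirected;
    OffsetNumber *nowunused = nowdead + xlrec->ndead;
    int         nunused = (int) ((datalen - SizeOfHeapClean) / sizeof(OffsetNumber))
                          - 2 * xlrec->nredirected - xlrec->ndead;

    /* a real redo routine would now mark these items redirected/dead/unused */
    (void) nowunused;
    (void) nunused;
}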

/* This is for replacing a page's contents in toto */
/* NB: this is used for indexes as well as heaps */

@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.56 2007/06/09 18:49:55 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.57 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -82,6 +82,9 @@ typedef struct IndexScanDescData
|
|||
HeapTupleData xs_ctup; /* current heap tuple, if any */
|
||||
Buffer xs_cbuf; /* current heap buffer in scan, if any */
|
||||
/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
|
||||
TransactionId xs_prev_xmax; /* previous HOT chain member's XMAX, if any */
|
||||
OffsetNumber xs_next_hot; /* next member of HOT chain, if any */
|
||||
bool xs_hot_dead; /* T if all members of HOT chain are dead */
|
||||
} IndexScanDescData;
|
||||
|
||||
typedef IndexScanDescData *IndexScanDesc;
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.425 2007/09/18 17:41:17 adunstan Exp $
|
||||
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.426 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -53,6 +53,6 @@
|
|||
*/
|
||||
|
||||
/* yyyymmddN */
|
||||
#define CATALOG_VERSION_NO 200709181
|
||||
#define CATALOG_VERSION_NO 200709201
|
||||
|
||||
#endif
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/catalog/pg_attribute.h,v 1.132 2007/09/03 00:39:21 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/catalog/pg_attribute.h,v 1.133 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* the genbki.sh script reads this file and generates .bki
|
||||
|
@ -471,10 +471,12 @@ DATA(insert ( 1259 tableoid 26 0 4 -7 0 -1 -1 t p i t f f t 0));
|
|||
{ 0, {"indisprimary"}, 16, -1, 1, 5, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
|
||||
{ 0, {"indisclustered"}, 16, -1, 1, 6, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
|
||||
{ 0, {"indisvalid"}, 16, -1, 1, 7, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
|
||||
{ 0, {"indkey"}, 22, -1, -1, 8, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
|
||||
{ 0, {"indclass"}, 30, -1, -1, 9, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
|
||||
{ 0, {"indoption"}, 22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
|
||||
{ 0, {"indexprs"}, 25, -1, -1, 11, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \
|
||||
{ 0, {"indpred"}, 25, -1, -1, 12, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }
|
||||
{ 0, {"indcheckxmin"}, 16, -1, 1, 8, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
|
||||
{ 0, {"indisready"}, 16, -1, 1, 9, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \
|
||||
{ 0, {"indkey"}, 22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
|
||||
{ 0, {"indclass"}, 30, -1, -1, 11, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
|
||||
{ 0, {"indoption"}, 22, -1, -1, 12, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \
|
||||
{ 0, {"indexprs"}, 25, -1, -1, 13, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \
|
||||
{ 0, {"indpred"}, 25, -1, -1, 14, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }
|
||||
|
||||
#endif /* PG_ATTRIBUTE_H */
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/catalog/pg_index.h,v 1.43 2007/01/09 02:14:15 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/catalog/pg_index.h,v 1.44 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* the genbki.sh script reads this file and generates .bki
|
||||
|
@ -42,6 +42,8 @@ CATALOG(pg_index,2610) BKI_WITHOUT_OIDS
|
|||
bool indisprimary; /* is this index for primary key? */
|
||||
bool indisclustered; /* is this the index last clustered by? */
|
||||
bool indisvalid; /* is this index valid for use by queries? */
|
||||
bool indcheckxmin; /* must we wait for xmin to be old? */
|
||||
bool indisready; /* is this index ready for inserts? */
|
||||
|
||||
/* VARIABLE LENGTH FIELDS: */
|
||||
int2vector indkey; /* column numbers of indexed cols, or 0 */
|
||||
|
@ -65,7 +67,7 @@ typedef FormData_pg_index *Form_pg_index;
|
|||
* compiler constants for pg_index
|
||||
* ----------------
|
||||
*/
|
||||
#define Natts_pg_index 12
|
||||
#define Natts_pg_index 14
|
||||
#define Anum_pg_index_indexrelid 1
|
||||
#define Anum_pg_index_indrelid 2
|
||||
#define Anum_pg_index_indnatts 3
|
||||
|
@ -73,11 +75,13 @@ typedef FormData_pg_index *Form_pg_index;
|
|||
#define Anum_pg_index_indisprimary 5
|
||||
#define Anum_pg_index_indisclustered 6
|
||||
#define Anum_pg_index_indisvalid 7
|
||||
#define Anum_pg_index_indkey 8
|
||||
#define Anum_pg_index_indclass 9
|
||||
#define Anum_pg_index_indoption 10
|
||||
#define Anum_pg_index_indexprs 11
|
||||
#define Anum_pg_index_indpred 12
|
||||
#define Anum_pg_index_indcheckxmin 8
|
||||
#define Anum_pg_index_indisready 9
|
||||
#define Anum_pg_index_indkey 10
|
||||
#define Anum_pg_index_indclass 11
|
||||
#define Anum_pg_index_indoption 12
|
||||
#define Anum_pg_index_indexprs 13
|
||||
#define Anum_pg_index_indpred 14
|
||||
|
||||
/*
|
||||
* Index AMs that support ordered scans must support these two indoption
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.470 2007/09/18 17:41:17 adunstan Exp $
|
||||
* $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.471 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* The script catalog/genbki.sh reads this file and generates .bki
|
||||
|
@ -2873,6 +2873,8 @@ DATA(insert OID = 1932 ( pg_stat_get_tuples_updated PGNSP PGUID 12 1 0 f f t f
|
|||
DESCR("statistics: number of tuples updated");
|
||||
DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_deleted - _null_ _null_ ));
|
||||
DESCR("statistics: number of tuples deleted");
|
||||
DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_hot_updated - _null_ _null_ ));
|
||||
DESCR("statistics: number of tuples hot updated");
|
||||
DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_live_tuples - _null_ _null_ ));
|
||||
DESCR("statistics: number of live tuples");
|
||||
DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_dead_tuples - _null_ _null_ ));
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.177 2007/08/15 21:39:50 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.178 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -37,7 +37,12 @@
|
|||
* Predicate partial-index predicate, or NIL if none
|
||||
* PredicateState exec state for predicate, or NIL if none
|
||||
* Unique is it a unique index?
|
||||
* ReadyForInserts is it valid for inserts?
|
||||
* Concurrent are we doing a concurrent index build?
|
||||
* BrokenHotChain did we detect any broken HOT chains?
|
||||
*
|
||||
* ii_Concurrent and ii_BrokenHotChain are used only during index build;
|
||||
* they're conventionally set to false otherwise.
|
||||
* ----------------
|
||||
*/
|
||||
typedef struct IndexInfo
|
||||
|
@ -50,7 +55,9 @@ typedef struct IndexInfo
|
|||
List *ii_Predicate; /* list of Expr */
|
||||
List *ii_PredicateState; /* list of ExprState */
|
||||
bool ii_Unique;
|
||||
bool ii_ReadyForInserts;
|
||||
bool ii_Concurrent;
|
||||
bool ii_BrokenHotChain;
|
||||
} IndexInfo;
|
||||
|
||||
/* ----------------
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.94 2007/04/27 22:05:49 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.95 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -39,6 +39,8 @@ typedef struct PlannedStmt
|
|||
|
||||
bool canSetTag; /* do I set the command result tag? */
|
||||
|
||||
bool transientPlan; /* redo plan when TransactionXmin changes? */
|
||||
|
||||
struct Plan *planTree; /* tree of Plan nodes */
|
||||
|
||||
List *rtable; /* list of RangeTblEntry nodes */
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.145 2007/08/31 01:44:06 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.146 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -71,6 +71,8 @@ typedef struct PlannerGlobal
|
|||
Bitmapset *rewindPlanIDs; /* indices of subplans that require REWIND */
|
||||
|
||||
List *finalrtable; /* "flat" rangetable for executor */
|
||||
|
||||
bool transientPlan; /* redo plan when TransactionXmin changes? */
|
||||
} PlannerGlobal;
|
||||
|
||||
/* macro for fetching the Plan associated with a SubPlan node */
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/optimizer/var.h,v 1.35 2007/01/05 22:19:56 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/include/optimizer/var.h,v 1.36 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -18,6 +18,7 @@
|
|||
|
||||
|
||||
extern Relids pull_varnos(Node *node);
|
||||
extern void pull_varattnos(Node *node, Bitmapset **varattnos);
|
||||
extern bool contain_var_reference(Node *node, int varno, int varattno,
|
||||
int levelsup);
|
||||
extern bool contain_var_clause(Node *node);
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
*
|
||||
* Copyright (c) 2001-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/pgstat.h,v 1.65 2007/09/11 03:28:05 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/pgstat.h,v 1.66 2007/09/20 17:56:32 tgl Exp $
|
||||
* ----------
|
||||
*/
|
||||
#ifndef PGSTAT_H
|
||||
|
@ -55,10 +55,10 @@ typedef int64 PgStat_Counter;
|
|||
* the index AM, while tuples_fetched is the number of tuples successfully
|
||||
* fetched by heap_fetch under the control of simple indexscans for this index.
|
||||
*
|
||||
* tuples_inserted/tuples_updated/tuples_deleted count attempted actions,
|
||||
* tuples_inserted/updated/deleted/hot_updated count attempted actions,
|
||||
* regardless of whether the transaction committed. new_live_tuples and
|
||||
* new_dead_tuples are properly adjusted depending on commit or abort.
|
||||
* Note that new_live_tuples can be negative!
|
||||
* Note that new_live_tuples and new_dead_tuples can be negative!
|
||||
* ----------
|
||||
*/
|
||||
typedef struct PgStat_TableCounts
|
||||
|
@ -71,6 +71,7 @@ typedef struct PgStat_TableCounts
|
|||
PgStat_Counter t_tuples_inserted;
|
||||
PgStat_Counter t_tuples_updated;
|
||||
PgStat_Counter t_tuples_deleted;
|
||||
PgStat_Counter t_tuples_hot_updated;
|
||||
|
||||
PgStat_Counter t_new_live_tuples;
|
||||
PgStat_Counter t_new_dead_tuples;
|
||||
|
@ -323,7 +324,7 @@ typedef union PgStat_Msg
|
|||
* ------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#define PGSTAT_FILE_FORMAT_ID 0x01A5BC96
|
||||
#define PGSTAT_FILE_FORMAT_ID 0x01A5BC97
|
||||
|
||||
/* ----------
|
||||
* PgStat_StatDBEntry The collector's data per database
|
||||
|
@ -367,6 +368,7 @@ typedef struct PgStat_StatTabEntry
|
|||
PgStat_Counter tuples_inserted;
|
||||
PgStat_Counter tuples_updated;
|
||||
PgStat_Counter tuples_deleted;
|
||||
PgStat_Counter tuples_hot_updated;
|
||||
|
||||
PgStat_Counter n_live_tuples;
|
||||
PgStat_Counter n_dead_tuples;
|
||||
|
@ -545,8 +547,9 @@ extern void pgstat_initstats(Relation rel);
|
|||
} while (0)
|
||||
|
||||
extern void pgstat_count_heap_insert(Relation rel);
|
||||
extern void pgstat_count_heap_update(Relation rel);
|
||||
extern void pgstat_count_heap_update(Relation rel, bool hot);
|
||||
extern void pgstat_count_heap_delete(Relation rel);
|
||||
extern void pgstat_update_heap_dead_tuples(Relation rel, int delta);
|
||||
|
||||
extern void AtEOXact_PgStat(bool isCommit);
|
||||
extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth);
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.106 2007/07/25 12:22:53 mha Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.107 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -156,6 +156,7 @@ extern void UnlockBuffers(void);
|
|||
extern void LockBuffer(Buffer buffer, int mode);
|
||||
extern bool ConditionalLockBuffer(Buffer buffer);
|
||||
extern void LockBufferForCleanup(Buffer buffer);
|
||||
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
|
||||
|
||||
extern void AbortBufferIO(void);
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.73 2007/09/12 22:10:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.74 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@@ -140,10 +140,21 @@ typedef PageHeaderData *PageHeader;
 * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before
 * pd_lower.  This should be considered a hint rather than the truth, since
 * changes to it are not WAL-logged.
 *
 * PD_PRUNABLE is set if there are any prunable tuples in the page.
 * This should be considered a hint rather than the truth, since
 * the transaction which generates a prunable tuple may or may not commit.
 * Also there is a lag before a tuple is declared dead.
 *
 * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
 * page for its new tuple version; this suggests that a prune is needed.
 * Again, this is just a hint.
 */
#define PD_HAS_FREE_LINES   0x0001      /* are there any unused line pointers? */
#define PD_PRUNABLE         0x0002      /* are there any prunable tuples? */
#define PD_PAGE_FULL        0x0004      /* not enough free space for new tuple? */

#define PD_VALID_FLAG_BITS  0x0001      /* OR of all valid pd_flags bits */
#define PD_VALID_FLAG_BITS  0x0007      /* OR of all valid pd_flags bits */
/*
|
||||
* Page layout version number 0 is for pre-7.3 Postgres releases.
|
||||
|
@@ -337,6 +348,20 @@
#define PageClearHasFreeLinePointers(page) \
    (((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES)

#define PageIsPrunable(page) \
    (((PageHeader) (page))->pd_flags & PD_PRUNABLE)
#define PageSetPrunable(page) \
    (((PageHeader) (page))->pd_flags |= PD_PRUNABLE)
#define PageClearPrunable(page) \
    (((PageHeader) (page))->pd_flags &= ~PD_PRUNABLE)

#define PageIsFull(page) \
    (((PageHeader) (page))->pd_flags & PD_PAGE_FULL)
#define PageSetFull(page) \
    (((PageHeader) (page))->pd_flags |= PD_PAGE_FULL)
#define PageClearFull(page) \
    (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL)
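
A small hedged illustration of the hint protocol these flags implement (the helper names are invented; the real logic lives in the heap UPDATE and pruning paths): an UPDATE that cannot fit its new tuple version marks the page full, and a later visitor checks the hints to decide whether a prune attempt is worthwhile.

#include "postgres.h"
#include "storage/bufpage.h"

/* Hypothetical: record that an UPDATE could not find room for a new version */
static void
note_update_lacked_space(Page page)
{
    PageSetFull(page);          /* just a hint; a successful prune clears it */
}

/* Hypothetical: is it worth trying heap_page_prune_opt() on this page? */
static bool
page_prune_looks_worthwhile(Page page)
{
    return PageIsPrunable(page) || PageIsFull(page);
}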


/* ----------------------------------------------------------------
 *      extern declarations
@ -346,12 +371,13 @@ typedef PageHeaderData *PageHeader;
|
|||
extern void PageInit(Page page, Size pageSize, Size specialSize);
|
||||
extern bool PageHeaderIsValid(PageHeader page);
|
||||
extern OffsetNumber PageAddItem(Page page, Item item, Size size,
|
||||
OffsetNumber offsetNumber, bool overwrite);
|
||||
OffsetNumber offsetNumber, bool overwrite, bool is_heap);
|
||||
extern Page PageGetTempPage(Page page, Size specialSize);
|
||||
extern void PageRestoreTempPage(Page tempPage, Page oldPage);
|
||||
extern int PageRepairFragmentation(Page page, OffsetNumber *unused);
|
||||
extern void PageRepairFragmentation(Page page);
|
||||
extern Size PageGetFreeSpace(Page page);
|
||||
extern Size PageGetExactFreeSpace(Page page);
|
||||
extern Size PageGetHeapFreeSpace(Page page);
|
||||
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
|
||||
extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.7 2007/06/05 20:00:41 wieck Exp $
|
||||
* $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.8 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -75,6 +75,8 @@ typedef struct CachedPlan
|
|||
List *stmt_list; /* list of statement or Query nodes */
|
||||
bool fully_planned; /* do we cache planner or rewriter output? */
|
||||
bool dead; /* if true, do not use */
|
||||
TransactionId saved_xmin; /* if valid, replan when TransactionXmin
|
||||
* changes from this value */
|
||||
int refcount; /* count of live references to this struct */
|
||||
int generation; /* counter, starting at 1, for replans */
|
||||
MemoryContext context; /* context containing this CachedPlan */
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.101 2007/05/27 03:50:39 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.102 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -19,6 +19,7 @@
|
|||
#include "catalog/pg_class.h"
|
||||
#include "catalog/pg_index.h"
|
||||
#include "fmgr.h"
|
||||
#include "nodes/bitmapset.h"
|
||||
#include "rewrite/prs2lock.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/relfilenode.h"
|
||||
|
@ -145,6 +146,7 @@ typedef struct RelationData
|
|||
TupleDesc rd_att; /* tuple descriptor */
|
||||
Oid rd_id; /* relation's object id */
|
||||
List *rd_indexlist; /* list of OIDs of indexes on relation */
|
||||
Bitmapset *rd_indexattr; /* identifies columns used in indexes */
|
||||
Oid rd_oidindex; /* OID of unique index on OID, if any */
|
||||
LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */
|
||||
RuleLock *rd_rules; /* rewrite rules */
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.59 2007/03/29 00:15:39 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.60 2007/09/20 17:56:32 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
@ -29,6 +29,7 @@ extern List *RelationGetIndexList(Relation relation);
|
|||
extern Oid RelationGetOidIndex(Relation relation);
|
||||
extern List *RelationGetIndexExpressions(Relation relation);
|
||||
extern List *RelationGetIndexPredicate(Relation relation);
|
||||
extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation);
|
||||
|
||||
extern void RelationSetIndexList(Relation relation,
|
||||
List *indexIds, Oid oidIndex);
|
||||
|
|
|
@ -415,6 +415,7 @@ Table "public.concur_heap"
|
|||
f2 | text |
|
||||
Indexes:
|
||||
"concur_index2" UNIQUE, btree (f1)
|
||||
"concur_index3" UNIQUE, btree (f2) INVALID
|
||||
"concur_index1" btree (f2, f1)
|
||||
"concur_index4" btree (f2) WHERE f1 = 'a'::text
|
||||
"concur_index5" btree (f2) WHERE f1 = 'x'::text
|
||||
|
|
|
@ -1291,13 +1291,13 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
|
|||
pg_shadow | SELECT pg_authid.rolname AS usename, pg_authid.oid AS usesysid, pg_authid.rolcreatedb AS usecreatedb, pg_authid.rolsuper AS usesuper, pg_authid.rolcatupdate AS usecatupd, pg_authid.rolpassword AS passwd, (pg_authid.rolvaliduntil)::abstime AS valuntil, pg_authid.rolconfig AS useconfig FROM pg_authid WHERE pg_authid.rolcanlogin;
|
||||
pg_stat_activity | SELECT d.oid AS datid, d.datname, pg_stat_get_backend_pid(s.backendid) AS procpid, pg_stat_get_backend_userid(s.backendid) AS usesysid, u.rolname AS usename, pg_stat_get_backend_activity(s.backendid) AS current_query, pg_stat_get_backend_waiting(s.backendid) AS waiting, pg_stat_get_backend_xact_start(s.backendid) AS xact_start, pg_stat_get_backend_activity_start(s.backendid) AS query_start, pg_stat_get_backend_start(s.backendid) AS backend_start, pg_stat_get_backend_client_addr(s.backendid) AS client_addr, pg_stat_get_backend_client_port(s.backendid) AS client_port FROM pg_database d, (SELECT pg_stat_get_backend_idset() AS backendid) s, pg_authid u WHERE ((pg_stat_get_backend_dbid(s.backendid) = d.oid) AND (pg_stat_get_backend_userid(s.backendid) = u.oid));
|
||||
pg_stat_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, pg_stat_get_numscans(i.oid) AS idx_scan, pg_stat_get_tuples_returned(i.oid) AS idx_tup_read, pg_stat_get_tuples_fetched(i.oid) AS idx_tup_fetch FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
|
||||
pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
|
||||
pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
|
||||
pg_stat_bgwriter | SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints_timed, pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req, pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint, pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean, pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean;
|
||||
pg_stat_database | SELECT d.oid AS datid, d.datname, pg_stat_get_db_numbackends(d.oid) AS numbackends, pg_stat_get_db_xact_commit(d.oid) AS xact_commit, pg_stat_get_db_xact_rollback(d.oid) AS xact_rollback, (pg_stat_get_db_blocks_fetched(d.oid) - pg_stat_get_db_blocks_hit(d.oid)) AS blks_read, pg_stat_get_db_blocks_hit(d.oid) AS blks_hit, pg_stat_get_db_tuples_returned(d.oid) AS tup_returned, pg_stat_get_db_tuples_fetched(d.oid) AS tup_fetched, pg_stat_get_db_tuples_inserted(d.oid) AS tup_inserted, pg_stat_get_db_tuples_updated(d.oid) AS tup_updated, pg_stat_get_db_tuples_deleted(d.oid) AS tup_deleted FROM pg_database d;
|
||||
pg_stat_sys_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_indexes.schemaname ~ '^pg_toast'::text));
|
||||
pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
|
||||
pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
|
||||
pg_stat_user_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_indexes.schemaname !~ '^pg_toast'::text));
|
||||
pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
|
||||
pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
|
||||
pg_statio_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, (pg_stat_get_blocks_fetched(i.oid) - pg_stat_get_blocks_hit(i.oid)) AS idx_blks_read, pg_stat_get_blocks_hit(i.oid) AS idx_blks_hit FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
|
||||
pg_statio_all_sequences | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS blks_read, pg_stat_get_blocks_hit(c.oid) AS blks_hit FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'S'::"char");
|
||||
pg_statio_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS heap_blks_read, pg_stat_get_blocks_hit(c.oid) AS heap_blks_hit, (sum((pg_stat_get_blocks_fetched(i.indexrelid) - pg_stat_get_blocks_hit(i.indexrelid))))::bigint AS idx_blks_read, (sum(pg_stat_get_blocks_hit(i.indexrelid)))::bigint AS idx_blks_hit, (pg_stat_get_blocks_fetched(t.oid) - pg_stat_get_blocks_hit(t.oid)) AS toast_blks_read, pg_stat_get_blocks_hit(t.oid) AS toast_blks_hit, (pg_stat_get_blocks_fetched(x.oid) - pg_stat_get_blocks_hit(x.oid)) AS tidx_blks_read, pg_stat_get_blocks_hit(x.oid) AS tidx_blks_hit FROM ((((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_class t ON ((c.reltoastrelid = t.oid))) LEFT JOIN pg_class x ON ((t.reltoastidxid = x.oid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname, t.oid, x.oid;
|
||||
|