Teach tuplestore.c to throw away data before the "mark" point when the caller
is using mark/restore but not rewind or backward-scan capability.

Insert a materialize plan node between a mergejoin and its inner child if the
inner child is a sort that is expected to spill to disk. The materialize
shields the sort from the need to do mark/restore and thereby allows it to
perform its final merge pass on-the-fly; the materialize itself is normally
cheap, since it won't spill to disk unless the number of tuples with equal
key values exceeds work_mem.

Greg Stark, with some kibitzing from Tom Lane.
parent 3963574d13
commit 2415ad9831
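Visually, the planner half of this change can be sketched as the following plan shape (illustrative only, not actual EXPLAIN output):

    Merge Join
      -> (outer input)
      -> Materialize            <- inserted by this commit
           -> Sort              (inner input, expected to exceed work_mem)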
src/backend/executor/nodeMaterial.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.58 2007/01/05 22:19:28 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.59 2007/05/21 17:57:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -56,10 +56,10 @@ ExecMaterial(MaterialState *node)
 	/*
 	 * If first time through, and we need a tuplestore, initialize it.
 	 */
-	if (tuplestorestate == NULL && node->randomAccess)
+	if (tuplestorestate == NULL && node->eflags != 0)
 	{
 		tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
-
+		tuplestore_set_eflags(tuplestorestate, node->eflags);
 		node->tuplestorestate = (void *) tuplestorestate;
 	}
 
@@ -162,14 +162,14 @@ ExecInitMaterial(Material *node, EState *estate, int eflags)
 	matstate->ss.ps.state = estate;
 
 	/*
-	 * We must have random access to the subplan output to do backward scan or
-	 * mark/restore.  We also prefer to materialize the subplan output if we
-	 * might be called on to rewind and replay it many times.  However, if none
-	 * of these cases apply, we can skip storing the data.
+	 * We must have a tuplestore buffering the subplan output to do backward
+	 * scan or mark/restore.  We also prefer to materialize the subplan output
+	 * if we might be called on to rewind and replay it many times.  However,
+	 * if none of these cases apply, we can skip storing the data.
 	 */
-	matstate->randomAccess = (eflags & (EXEC_FLAG_REWIND |
-										EXEC_FLAG_BACKWARD |
-										EXEC_FLAG_MARK)) != 0;
+	matstate->eflags = (eflags & (EXEC_FLAG_REWIND |
+								  EXEC_FLAG_BACKWARD |
+								  EXEC_FLAG_MARK));
 
 	matstate->eof_underlying = false;
 	matstate->tuplestorestate = NULL;
@@ -255,7 +255,7 @@ ExecEndMaterial(MaterialState *node)
 void
 ExecMaterialMarkPos(MaterialState *node)
 {
-	Assert(node->randomAccess);
+	Assert(node->eflags & EXEC_FLAG_MARK);
 
 	/*
 	 * if we haven't materialized yet, just return.
@@ -275,7 +275,7 @@ ExecMaterialMarkPos(MaterialState *node)
 void
 ExecMaterialRestrPos(MaterialState *node)
 {
-	Assert(node->randomAccess);
+	Assert(node->eflags & EXEC_FLAG_MARK);
 
 	/*
 	 * if we haven't materialized yet, just return.
@@ -300,7 +300,7 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
 {
 	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
 
-	if (node->randomAccess)
+	if (node->eflags != 0)
 	{
 		/*
 		 * If we haven't materialized yet, just return. If outerplan' chgParam
@@ -312,15 +312,21 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
 
 		/*
 		 * If subnode is to be rescanned then we forget previous stored
-		 * results; we have to re-read the subplan and re-store.
+		 * results; we have to re-read the subplan and re-store.  Also,
+		 * if we told tuplestore it needn't support rescan, we lose and
+		 * must re-read.  (This last should not happen in common cases;
+		 * else our caller lied by not passing EXEC_FLAG_REWIND to us.)
 		 *
 		 * Otherwise we can just rewind and rescan the stored output. The
 		 * state of the subnode does not change.
 		 */
-		if (((PlanState *) node)->lefttree->chgParam != NULL)
+		if (((PlanState *) node)->lefttree->chgParam != NULL ||
+			(node->eflags & EXEC_FLAG_REWIND) == 0)
 		{
 			tuplestore_end((Tuplestorestate *) node->tuplestorestate);
 			node->tuplestorestate = NULL;
+			if (((PlanState *) node)->lefttree->chgParam == NULL)
+				ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
			node->eof_underlying = false;
 		}
 		else
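To make the flag plumbing concrete, here is a minimal sketch assembled from the hunks in this commit (simplified, not a verbatim excerpt) of how the capability bits flow from the merge join, through Material, into the tuplestore:

    /* ExecInitMergeJoin: the inner child must support mark/restore */
    innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate,
                                              eflags | EXEC_FLAG_MARK);

    /* ExecInitMaterial: keep only the capability bits actually requested */
    matstate->eflags = (eflags & (EXEC_FLAG_REWIND |
                                  EXEC_FLAG_BACKWARD |
                                  EXEC_FLAG_MARK));

    /* ExecMaterial: hand those bits straight to the tuplestore, so it can
     * trim before the mark when neither REWIND nor BACKWARD is requested */
    tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
    tuplestore_set_eflags(tuplestorestate, node->eflags);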
src/backend/executor/nodeMergejoin.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.87 2007/02/02 00:07:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.88 2007/05/21 17:57:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -706,6 +706,9 @@ ExecMergeJoin(MergeJoinState *node)
 				}
 				else
 				{
+					/* Mark before advancing, if wanted */
+					if (node->mj_ExtraMarks)
+						ExecMarkPos(innerPlan);
 					/* Stay in same state to fetch next inner tuple */
 					if (doFillInner)
 					{
@@ -830,6 +833,9 @@ ExecMergeJoin(MergeJoinState *node)
 				 * now we get the next inner tuple, if any.  If there's none,
 				 * advance to next outer tuple (which may be able to join to
 				 * previously marked tuples).
+				 *
+				 * NB: must NOT do "extraMarks" here, since we may need to
+				 * return to previously marked tuples.
 				 */
 				innerTupleSlot = ExecProcNode(innerPlan);
 				node->mj_InnerTupleSlot = innerTupleSlot;
@@ -1140,6 +1146,9 @@ ExecMergeJoin(MergeJoinState *node)
 				break;
 
 				/*
 				 * SKIPOUTER_ADVANCE: advance over an outer tuple that is
 				 * known not to join to any inner tuple.
+				 *
+				 * Before advancing, we check to see if we must emit an
+				 * outer-join fill tuple for this outer tuple.
 				 */
@@ -1204,6 +1213,9 @@ ExecMergeJoin(MergeJoinState *node)
 				break;
 
 				/*
 				 * SKIPINNER_ADVANCE: advance over an inner tuple that is
 				 * known not to join to any outer tuple.
+				 *
+				 * Before advancing, we check to see if we must emit an
+				 * outer-join fill tuple for this inner tuple.
 				 */
@@ -1225,6 +1237,10 @@ ExecMergeJoin(MergeJoinState *node)
 					return result;
 				}
 
+				/* Mark before advancing, if wanted */
+				if (node->mj_ExtraMarks)
+					ExecMarkPos(innerPlan);
+
 				/*
 				 * now we get the next inner tuple, if any
 				 */
@@ -1295,6 +1311,10 @@ ExecMergeJoin(MergeJoinState *node)
 					return result;
 				}
 
+				/* Mark before advancing, if wanted */
+				if (node->mj_ExtraMarks)
+					ExecMarkPos(innerPlan);
+
 				/*
 				 * now we get the next inner tuple, if any
 				 */
@@ -1425,6 +1445,22 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags)
 	innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate,
 											  eflags | EXEC_FLAG_MARK);
 
+	/*
+	 * For certain types of inner child nodes, it is advantageous to issue
+	 * MARK every time we advance past an inner tuple we will never return
+	 * to.  For other types, MARK on a tuple we cannot return to is a waste
+	 * of cycles.  Detect which case applies and set mj_ExtraMarks if we
+	 * want to issue "unnecessary" MARK calls.
+	 *
+	 * Currently, only Material wants the extra MARKs, and it will be helpful
+	 * only if eflags doesn't specify REWIND.
+	 */
+	if (IsA(innerPlan(node), Material) &&
+		(eflags & EXEC_FLAG_REWIND) == 0)
+		mergestate->mj_ExtraMarks = true;
+	else
+		mergestate->mj_ExtraMarks = false;
+
 #define MERGEJOIN_NSLOTS 4
 
 	/*
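The payoff of the extra MARK calls is memory, not cycles: each ExecMarkPos() on a Material child reaches tuplestore_markpos(), which (per the tuplestore.c changes below) discards everything before the mark when neither REWIND nor BACKWARD was requested. As a worked example with hypothetical numbers: if the inner side produces 10 million tuples but no key value is shared by more than a thousand of them, the Material node's tuplestore holds on the order of a thousand tuples at any instant rather than all 10 million, which is why the commit message expects the materialize not to spill unless the tuples with equal key values alone exceed work_mem.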
src/backend/optimizer/path/costsize.c

@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.183 2007/05/21 17:57:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1038,6 +1038,23 @@ cost_sort(Path *path, PlannerInfo *root,
 	path->total_cost = startup_cost + run_cost;
 }
 
+/*
+ * sort_exceeds_work_mem
+ *	  Given a finished Sort plan node, detect whether it is expected to
+ *	  spill to disk (ie, will need more than work_mem workspace)
+ *
+ * This assumes there will be no available LIMIT.
+ */
+bool
+sort_exceeds_work_mem(Sort *sort)
+{
+	double		input_bytes = relation_byte_size(sort->plan.plan_rows,
+												 sort->plan.plan_width);
+	long		work_mem_bytes = work_mem * 1024L;
+
+	return (input_bytes > work_mem_bytes);
+}
+
 /*
  * cost_material
  *	  Determines and returns the cost of materializing a relation, including
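sort_exceeds_work_mem relies on relation_byte_size, a pre-existing private helper in costsize.c that is not part of this diff. To my understanding it estimates raw data volume roughly as follows (a sketch, not a verbatim excerpt):

    /* Approximate bytes: aligned data width plus aligned tuple header,
     * per row (sketch of costsize.c's existing helper, not this diff) */
    static double
    relation_byte_size(double tuples, int width)
    {
        return tuples * (MAXALIGN(width) + MAXALIGN(sizeof(HeapTupleHeaderData)));
    }

Working the test through by hand: 1,000,000 rows of width 100 come to roughly (104 + 24) x 1e6 = ~128 MB on a machine with 8-byte alignment, far above the default 1 MB work_mem, so such a Sort is predicted to spill and gets the Materialize shield.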
src/backend/optimizer/plan/createplan.c

@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.231 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1600,6 +1600,30 @@ create_mergejoin_plan(PlannerInfo *root,
 	else
 		innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;
 
+	/*
+	 * If inner plan is a sort that is expected to spill to disk, add a
+	 * materialize node to shield it from the need to handle mark/restore.
+	 * This will allow it to perform the last merge pass on-the-fly, while
+	 * in most cases not requiring the materialize to spill to disk.
+	 *
+	 * XXX really, Sort oughta do this for itself, probably, to avoid the
+	 * overhead of a separate plan node.
+	 */
+	if (IsA(inner_plan, Sort) &&
+		sort_exceeds_work_mem((Sort *) inner_plan))
+	{
+		Plan	   *matplan = (Plan *) make_material(inner_plan);
+
+		/*
+		 * We assume the materialize will not spill to disk, and therefore
+		 * charge just cpu_tuple_cost per tuple.
+		 */
+		copy_plan_costsize(matplan, inner_plan);
+		matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
+
+		inner_plan = matplan;
+	}
+
 	/*
 	 * Compute the opfamily/strategy/nullsfirst arrays needed by the executor.
 	 * The information is in the pathkeys for the two inputs, but we need to
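The added cost is easy to quantify: with the default cpu_tuple_cost of 0.01, shielding a sort expected to return 1,000,000 rows raises the plan's total_cost by 0.01 x 1,000,000 = 10,000, while the startup cost and row estimates are copied unchanged by copy_plan_costsize. The numbers are illustrative; the point is that the planner charges only CPU, consistent with the assumption that the materialize stays in memory.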
src/backend/utils/sort/tuplestore.c

@@ -20,10 +20,12 @@
  * maxKBytes, we dump all the tuples into a temp file and then read from that
  * when needed.
  *
- * When the caller requests random access to the data, we write the temp file
+ * When the caller requests backward-scan capability, we write the temp file
  * in a format that allows either forward or backward scan.  Otherwise, only
- * forward scan is allowed.  But rewind and markpos/restorepos are allowed
- * in any case.
+ * forward scan is allowed.  Rewind and markpos/restorepos are normally allowed
+ * but can be turned off via tuplestore_set_eflags; turning off both backward
+ * scan and rewind enables truncation of the tuplestore at the mark point
+ * (if any) for minimal memory usage.
  *
  * Because we allow reading before writing is complete, there are two
  * interesting positions in the temp file: the current read position and
@@ -36,7 +38,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.30 2007/01/05 22:19:47 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.31 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,6 +46,7 @@
 #include "postgres.h"
 
 #include "access/heapam.h"
+#include "executor/executor.h"
 #include "storage/buffile.h"
 #include "utils/memutils.h"
 #include "utils/tuplestore.h"
@@ -66,7 +69,7 @@ typedef enum
 struct Tuplestorestate
 {
 	TupStoreStatus status;		/* enumerated value as shown above */
-	bool		randomAccess;	/* did caller request random access? */
+	int			eflags;			/* capability flags */
 	bool		interXact;		/* keep open through transactions? */
 	long		availMem;		/* remaining memory available, in bytes */
 	BufFile    *myfile;			/* underlying file, or NULL if none */
@@ -157,11 +160,11 @@ struct Tuplestorestate
  * may or may not match the in-memory representation of the tuple ---
  * any conversion needed is the job of the writetup and readtup routines.
  *
- * If state->randomAccess is true, then the stored representation of the
- * tuple must be followed by another "unsigned int" that is a copy of the
+ * If state->eflags & EXEC_FLAG_BACKWARD, then the stored representation of
+ * the tuple must be followed by another "unsigned int" that is a copy of the
  * length --- so the total tape space used is actually sizeof(unsigned int)
  * more than the stored length value.  This allows read-backwards.  When
- * randomAccess is not true, the write/read routines may omit the extra
+ * EXEC_FLAG_BACKWARD is not set, the write/read routines may omit the extra
  * length word.
 *
 * writetup is expected to write both length words as well as the tuple
@@ -192,11 +195,12 @@ struct Tuplestorestate
 */
 
 
-static Tuplestorestate *tuplestore_begin_common(bool randomAccess,
+static Tuplestorestate *tuplestore_begin_common(int eflags,
 												bool interXact,
 												int maxKBytes);
 static void tuplestore_puttuple_common(Tuplestorestate *state, void *tuple);
 static void dumptuples(Tuplestorestate *state);
+static void tuplestore_trim(Tuplestorestate *state, int ntuples);
 static unsigned int getlen(Tuplestorestate *state, bool eofOK);
 static void *copytup_heap(Tuplestorestate *state, void *tup);
 static void writetup_heap(Tuplestorestate *state, void *tup);
@@ -209,14 +213,14 @@ static void *readtup_heap(Tuplestorestate *state, unsigned int len);
 *	Initialize for a tuple store operation.
 */
 static Tuplestorestate *
-tuplestore_begin_common(bool randomAccess, bool interXact, int maxKBytes)
+tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
 {
 	Tuplestorestate *state;
 
 	state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));
 
 	state->status = TSS_INMEM;
-	state->randomAccess = randomAccess;
+	state->eflags = eflags;
 	state->interXact = interXact;
 	state->availMem = maxKBytes * 1024L;
 	state->myfile = NULL;
@@ -255,9 +259,18 @@ tuplestore_begin_common(bool randomAccess, bool interXact, int maxKBytes)
 Tuplestorestate *
 tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
 {
-	Tuplestorestate *state = tuplestore_begin_common(randomAccess,
-													 interXact,
-													 maxKBytes);
+	Tuplestorestate *state;
+	int			eflags;
+
+	/*
+	 * This interpretation of the meaning of randomAccess is compatible
+	 * with the pre-8.3 behavior of tuplestores.
+	 */
+	eflags = randomAccess ?
+		(EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND | EXEC_FLAG_MARK) :
+		(EXEC_FLAG_REWIND | EXEC_FLAG_MARK);
+
+	state = tuplestore_begin_common(eflags, interXact, maxKBytes);
 
 	state->copytup = copytup_heap;
 	state->writetup = writetup_heap;
@@ -266,6 +279,30 @@ tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
 	return state;
 }
 
+/*
+ * tuplestore_set_eflags
+ *
+ * Set capability flags at a finer grain than is allowed by
+ * tuplestore_begin_xxx.  This must be called before inserting any data
+ * into the tuplestore.
+ *
+ * eflags is a bitmask following the meanings used for executor node
+ * startup flags (see executor.h).  tuplestore pays attention to these bits:
+ *		EXEC_FLAG_REWIND		need rewind to start
+ *		EXEC_FLAG_BACKWARD		need backward fetch
+ *		EXEC_FLAG_MARK			need mark/restore
+ * If tuplestore_set_eflags is not called, REWIND and MARK are allowed,
+ * and BACKWARD is set per "randomAccess" in the tuplestore_begin_xxx call.
+ */
+void
+tuplestore_set_eflags(Tuplestorestate *state, int eflags)
+{
+	Assert(state->status == TSS_INMEM);
+	Assert(state->memtupcount == 0);
+
+	state->eflags = eflags;
+}
+
 /*
 * tuplestore_end
 *
@@ -420,6 +457,9 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
 *	Fetch the next tuple in either forward or back direction.
 *	Returns NULL if no more tuples.  If should_free is set, the
 *	caller must pfree the returned tuple when done with it.
+ *
+ * Backward scan is only allowed if randomAccess was set true or
+ * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
 */
 static void *
 tuplestore_gettuple(Tuplestorestate *state, bool forward,
@@ -428,7 +468,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
 	unsigned int tuplen;
 	void	   *tup;
 
-	Assert(forward || state->randomAccess);
+	Assert(forward || (state->eflags & EXEC_FLAG_BACKWARD));
 
 	switch (state->status)
 	{
@@ -643,6 +683,8 @@ dumptuples(Tuplestorestate *state)
 void
 tuplestore_rescan(Tuplestorestate *state)
 {
+	Assert(state->eflags & EXEC_FLAG_REWIND);
+
 	switch (state->status)
 	{
 		case TSS_INMEM:
@@ -671,10 +713,26 @@ tuplestore_rescan(Tuplestorestate *state)
 void
 tuplestore_markpos(Tuplestorestate *state)
 {
+	Assert(state->eflags & EXEC_FLAG_MARK);
+
 	switch (state->status)
 	{
 		case TSS_INMEM:
 			state->markpos_current = state->current;
+			/*
+			 * We can truncate the tuplestore if neither backward scan nor
+			 * rewind capability are required by the caller.  There will
+			 * never be a need to back up past the mark point.
+			 *
+			 * Note: you might think we could remove all the tuples before
+			 * "current", since that one is the next to be returned.  However,
+			 * since tuplestore_gettuple returns a direct pointer to our
+			 * internal copy of the tuple, it's likely that the caller has
+			 * still got the tuple just before "current" referenced in a slot.
+			 * Don't free it yet.
+			 */
+			if (!(state->eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND)))
+				tuplestore_trim(state, 1);
 			break;
 		case TSS_WRITEFILE:
 			if (state->eof_reached)
@@ -708,6 +766,8 @@ tuplestore_markpos(Tuplestorestate *state)
 void
 tuplestore_restorepos(Tuplestorestate *state)
 {
+	Assert(state->eflags & EXEC_FLAG_MARK);
+
 	switch (state->status)
 	{
 		case TSS_INMEM:
@@ -733,6 +793,55 @@ tuplestore_restorepos(Tuplestorestate *state)
 	}
 }
 
+/*
+ * tuplestore_trim	- remove all but ntuples tuples before current
+ */
+static void
+tuplestore_trim(Tuplestorestate *state, int ntuples)
+{
+	int			nremove;
+	int			i;
+
+	/*
+	 * We don't bother trimming temp files since it usually would mean more
+	 * work than just letting them sit in kernel buffers until they age out.
+	 */
+	if (state->status != TSS_INMEM)
+		return;
+
+	nremove = state->current - ntuples;
+	if (nremove <= 0)
+		return;					/* nothing to do */
+	Assert(nremove <= state->memtupcount);
+
+	/* Release no-longer-needed tuples */
+	for (i = 0; i < nremove; i++)
+	{
+		FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
+		pfree(state->memtuples[i]);
+	}
+
+	/*
+	 * Slide the array down and readjust pointers.  This may look pretty
+	 * stupid, but we expect that there will usually not be very many
+	 * tuple-pointers to move, so this isn't that expensive; and it keeps
+	 * a lot of other logic simple.
+	 *
+	 * In fact, in the current usage for merge joins, it's demonstrable that
+	 * there will always be exactly one non-removed tuple; so optimize that
+	 * case.
+	 */
+	if (nremove + 1 == state->memtupcount)
+		state->memtuples[0] = state->memtuples[nremove];
+	else
+		memmove(state->memtuples, state->memtuples + nremove,
+				(state->memtupcount - nremove) * sizeof(void *));
+
+	state->memtupcount -= nremove;
+	state->current -= nremove;
+	state->markpos_current -= nremove;
+}
+
 
 /*
 * Tape interface routines
@@ -783,7 +892,7 @@ writetup_heap(Tuplestorestate *state, void *tup)
 
 	if (BufFileWrite(state->myfile, (void *) tuple, tuplen) != (size_t) tuplen)
 		elog(ERROR, "write failed");
-	if (state->randomAccess)	/* need trailing length word? */
+	if (state->eflags & EXEC_FLAG_BACKWARD)		/* need trailing length word? */
 		if (BufFileWrite(state->myfile, (void *) &tuplen,
 						 sizeof(tuplen)) != sizeof(tuplen))
 			elog(ERROR, "write failed");
@@ -804,7 +913,7 @@ readtup_heap(Tuplestorestate *state, unsigned int len)
 	if (BufFileRead(state->myfile, (void *) ((char *) tuple + sizeof(int)),
 					len - sizeof(int)) != (size_t) (len - sizeof(int)))
 		elog(ERROR, "unexpected end of data");
-	if (state->randomAccess)	/* need trailing length word? */
+	if (state->eflags & EXEC_FLAG_BACKWARD)		/* need trailing length word? */
 		if (BufFileRead(state->myfile, (void *) &tuplen,
 						sizeof(tuplen)) != sizeof(tuplen))
 			elog(ERROR, "unexpected end of data");
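A worked example of the trim semantics, with hypothetical values:

    /*
     * Suppose memtuples holds [t0 t1 t2 t3 t4], current == 3 (t3 is the
     * next tuple to be returned), and the caller marks.  tuplestore_markpos
     * sets markpos_current = 3 and calls tuplestore_trim(state, 1), which
     * keeps one tuple before "current": nremove = 3 - 1 = 2, so t0 and t1
     * are pfree'd, t2..t4 slide down to slots 0..2, and current and
     * markpos_current both become 1.  t2 is retained because the caller may
     * still reference it in a slot, as the comment in tuplestore_markpos
     * explains.
     */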
src/include/nodes/execnodes.h

@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.174 2007/05/17 19:35:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.175 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1180,6 +1180,7 @@ typedef struct NestLoopState
 *		NumClauses		   number of mergejoinable join clauses
 *		Clauses			   info for each mergejoinable clause
 *		JoinState		   current "state" of join.  see execdefs.h
+ *		ExtraMarks		   true to issue extra Mark operations on inner scan
 *		FillOuter		   true if should emit unjoined outer tuples anyway
 *		FillInner		   true if should emit unjoined inner tuples anyway
 *		MatchedOuter	   true if found a join match for current outer tuple
@@ -1202,6 +1203,7 @@ typedef struct MergeJoinState
 	int			mj_NumClauses;
 	MergeJoinClause mj_Clauses; /* array of length mj_NumClauses */
 	int			mj_JoinState;
+	bool		mj_ExtraMarks;
 	bool		mj_FillOuter;
 	bool		mj_FillInner;
 	bool		mj_MatchedOuter;
@@ -1281,7 +1283,7 @@ typedef struct HashJoinState
 typedef struct MaterialState
 {
 	ScanState	ss;				/* its first field is NodeTag */
-	bool		randomAccess;	/* need random access to subplan output? */
+	int			eflags;			/* capability flags to pass to tuplestore */
 	bool		eof_underlying; /* reached end of underlying plan? */
 	void	   *tuplestorestate;	/* private state of tuplestore.c */
 } MaterialState;
src/include/optimizer/cost.h

@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.86 2007/05/04 01:13:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.87 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -75,6 +75,7 @@ extern void cost_valuesscan(Path *path, PlannerInfo *root,
 extern void cost_sort(Path *path, PlannerInfo *root,
 		  List *pathkeys, Cost input_cost, double tuples, int width,
 		  double limit_tuples);
+extern bool sort_exceeds_work_mem(Sort *sort);
 extern void cost_material(Path *path,
 			  Cost input_cost, double tuples, int width);
 extern void cost_agg(Path *path, PlannerInfo *root,
src/include/utils/tuplestore.h

@@ -22,7 +22,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/tuplestore.h,v 1.20 2007/01/05 22:20:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/tuplestore.h,v 1.21 2007/05/21 17:57:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,6 +46,8 @@ extern Tuplestorestate *tuplestore_begin_heap(bool randomAccess,
 			  bool interXact,
 			  int maxKBytes);
 
+extern void tuplestore_set_eflags(Tuplestorestate *state, int eflags);
+
 extern void tuplestore_puttupleslot(Tuplestorestate *state,
 					TupleTableSlot *slot);
 extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
@@ -53,7 +55,6 @@ extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
 /* tuplestore_donestoring() used to be required, but is no longer used */
 #define tuplestore_donestoring(state)	((void) 0)
 
-/* backwards scan is only allowed if randomAccess was specified 'true' */
 extern bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
 					TupleTableSlot *slot);
 extern bool tuplestore_advance(Tuplestorestate *state, bool forward);