diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 114db38116..f68c992213 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4573,6 +4573,20 @@ ANY num_sync (
+     enable_incrementalsort (boolean)
+     
+      enable_incrementalsort configuration parameter
+     
+     
+     
+      
+       Enables or disables the query planner's use of incremental sort steps.
+       The default is on.
+      
+     
+    
+
      enable_indexscan (boolean)
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 58477ac83a..0dfc3e80e2 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -291,7 +291,47 @@ EXPLAIN SELECT * FROM tenk1 WHERE unique1 = 42;
    often see this plan type for queries that fetch just a single row.  It's
    also often used for queries that have an ORDER BY condition
    that matches the index order, because then no extra sorting step is needed
-   to satisfy the ORDER BY.
+   to satisfy the ORDER BY.  In this example, adding
+   ORDER BY unique1 would use the same plan because the
+   index already implicitly provides the requested ordering.
+  
+
+  
+   The planner may implement an ORDER BY clause in several
+   ways.  The above example shows that such an ordering clause may be
+   implemented implicitly.  The planner may also add an explicit
+   sort step:
+
+
+EXPLAIN SELECT * FROM tenk1 ORDER BY unique1;
+                            QUERY PLAN
+-------------------------------------------------------------------
+ Sort  (cost=1109.39..1134.39 rows=10000 width=244)
+   Sort Key: unique1
+   ->  Seq Scan on tenk1  (cost=0.00..445.00 rows=10000 width=244)
+
+
+   If a part of the plan guarantees an ordering on a prefix of the
+   required sort keys, then the planner may instead decide to use an
+   incremental sort step:
+
+
+EXPLAIN SELECT * FROM tenk1 ORDER BY four, ten LIMIT 100;
+                                              QUERY PLAN
+------------------------------------------------------------------------------------------------------
+ Limit  (cost=521.06..538.05 rows=100 width=244)
+   ->  Incremental Sort  (cost=521.06..2220.95 rows=10000 width=244)
+         Sort Key: four, ten
+         Presorted Key: four
+         ->  Index Scan using index_tenk1_on_four on tenk1  (cost=0.29..1510.08 rows=10000 width=244)
+
+
+   Compared to regular sorts, sorting incrementally allows returning tuples
+   before the entire result set has been sorted, which particularly enables
+   optimizations with LIMIT queries.  It may also reduce
+   memory usage and the likelihood of spilling sorts to disk, but it comes at
+   the cost of the increased overhead of splitting the result set into multiple
+   sorting batches.
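+  
+
+  
+   To compare against the plan the planner would otherwise choose, you can
+   temporarily disable incremental sort with
+   enable_incrementalsort.  (The following is an
+   illustrative sketch; the exact plan shape and costs shown here are
+   hypothetical and depend on your data and settings.)
+
+
+SET enable_incrementalsort = off;
+EXPLAIN SELECT * FROM tenk1 ORDER BY four, ten LIMIT 100;
+                               QUERY PLAN
+----------------------------------------------------------------------
+ Limit  (cost=1072.77..1073.02 rows=100 width=244)
+   ->  Sort  (cost=1072.77..1097.77 rows=10000 width=244)
+         Sort Key: four, ten
+         ->  Seq Scan on tenk1  (cost=0.00..445.00 rows=10000 width=244)
+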
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index bb58f92851..62c86ecdc5 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -82,6 +82,8 @@ static void show_upper_qual(List *qual, const char *qlabel,
 							ExplainState *es);
 static void show_sort_keys(SortState *sortstate, List *ancestors,
 						   ExplainState *es);
+static void show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+									   List *ancestors, ExplainState *es);
 static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
 								   ExplainState *es);
 static void show_agg_keys(AggState *astate, List *ancestors,
@@ -95,7 +97,7 @@ static void show_grouping_set_keys(PlanState *planstate,
 static void show_group_keys(GroupState *gstate, List *ancestors,
 							ExplainState *es);
 static void show_sort_group_keys(PlanState *planstate, const char *qlabel,
-								 int nkeys, AttrNumber *keycols,
+								 int nkeys, int nPresortedKeys, AttrNumber *keycols,
 								 Oid *sortOperators, Oid *collations, bool *nullsFirst,
 								 List *ancestors, ExplainState *es);
 static void show_sortorder_options(StringInfo buf, Node *sortexpr,
@@ -103,6 +105,8 @@ static void show_sortorder_options(StringInfo buf, Node *sortexpr,
 static void show_tablesample(TableSampleClause *tsc,
 							 PlanState *planstate, List *ancestors,
 							 ExplainState *es);
 static void show_sort_info(SortState *sortstate, ExplainState *es);
+static void show_incremental_sort_info(IncrementalSortState *incrsortstate,
+									   ExplainState *es);
 static void show_hash_info(HashState *hashstate, ExplainState *es);
 static void show_hashagg_info(AggState *hashstate, ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
@@ -1278,6 +1282,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Sort:
 			pname = sname = "Sort";
 			break;
+		case T_IncrementalSort:
+			pname = sname = "Incremental Sort";
+			break;
 		case T_Group:
 			pname = sname = "Group";
 			break;
@@ -1937,6 +1944,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_sort_keys(castNode(SortState, planstate), ancestors, es);
 			show_sort_info(castNode(SortState, planstate), es);
 			break;
+		case T_IncrementalSort:
+			show_incremental_sort_keys(castNode(IncrementalSortState, planstate),
+									   ancestors, es);
+			show_incremental_sort_info(castNode(IncrementalSortState, planstate),
+									   es);
+			break;
 		case T_MergeAppend:
 			show_merge_append_keys(castNode(MergeAppendState, planstate),
 								   ancestors, es);
@@ -2270,12 +2283,29 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es)
 	Sort	   *plan = (Sort *) sortstate->ss.ps.plan;
 
 	show_sort_group_keys((PlanState *) sortstate, "Sort Key",
-						 plan->numCols, plan->sortColIdx,
+						 plan->numCols, 0, plan->sortColIdx,
 						 plan->sortOperators, plan->collations,
 						 plan->nullsFirst,
 						 ancestors, es);
 }
 
+/*
+ * Show the sort keys for an IncrementalSort node.
+ */
+static void
+show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+						   List *ancestors, ExplainState *es)
+{
+	IncrementalSort *plan = (IncrementalSort *) incrsortstate->ss.ps.plan;
+
+	show_sort_group_keys((PlanState *) incrsortstate, "Sort Key",
+						 plan->sort.numCols, plan->nPresortedCols,
+						 plan->sort.sortColIdx,
+						 plan->sort.sortOperators, plan->sort.collations,
+						 plan->sort.nullsFirst,
+						 ancestors, es);
+}
+
 /*
  * Likewise, for a MergeAppend node.
 */
@@ -2286,7 +2316,7 @@ show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
 	MergeAppend *plan = (MergeAppend *) mstate->ps.plan;
 
 	show_sort_group_keys((PlanState *) mstate, "Sort Key",
-						 plan->numCols, plan->sortColIdx,
+						 plan->numCols, 0, plan->sortColIdx,
 						 plan->sortOperators, plan->collations,
 						 plan->nullsFirst,
 						 ancestors, es);
@@ -2310,7 +2340,7 @@ show_agg_keys(AggState *astate, List *ancestors,
 			show_grouping_sets(outerPlanState(astate), plan, ancestors, es);
 		else
 			show_sort_group_keys(outerPlanState(astate), "Group Key",
-								 plan->numCols, plan->grpColIdx,
+								 plan->numCols, 0, plan->grpColIdx,
 								 NULL, NULL, NULL,
 								 ancestors, es);
@@ -2379,7 +2409,7 @@ show_grouping_set_keys(PlanState *planstate,
 	if (sortnode)
 	{
 		show_sort_group_keys(planstate, "Sort Key",
-							 sortnode->numCols, sortnode->sortColIdx,
+							 sortnode->numCols, 0, sortnode->sortColIdx,
 							 sortnode->sortOperators, sortnode->collations,
 							 sortnode->nullsFirst,
 							 ancestors, es);
@@ -2436,7 +2466,7 @@ show_group_keys(GroupState *gstate, List *ancestors,
 	/* The key columns refer to the tlist of the child plan */
 	ancestors = lcons(plan, ancestors);
 	show_sort_group_keys(outerPlanState(gstate), "Group Key",
-						 plan->numCols, plan->grpColIdx,
+						 plan->numCols, 0, plan->grpColIdx,
 						 NULL, NULL, NULL,
 						 ancestors, es);
 	ancestors = list_delete_first(ancestors);
@@ -2449,13 +2479,14 @@ show_group_keys(GroupState *gstate, List *ancestors,
  */
 static void
 show_sort_group_keys(PlanState *planstate, const char *qlabel,
-					 int nkeys, AttrNumber *keycols,
+					 int nkeys, int nPresortedKeys, AttrNumber *keycols,
 					 Oid *sortOperators, Oid *collations, bool *nullsFirst,
 					 List *ancestors, ExplainState *es)
 {
 	Plan	   *plan = planstate->plan;
 	List	   *context;
 	List	   *result = NIL;
+	List	   *resultPresorted = NIL;
 	StringInfoData sortkeybuf;
 	bool		useprefix;
 	int			keyno;
@@ -2495,9 +2526,13 @@ show_sort_group_keys(PlanState *planstate, const char *qlabel,
 								 nullsFirst[keyno]);
 		/* Emit one property-list item per sort key */
 		result = lappend(result, pstrdup(sortkeybuf.data));
+		if (keyno < nPresortedKeys)
+			resultPresorted = lappend(resultPresorted, exprstr);
 	}
 
 	ExplainPropertyList(qlabel, result, es);
+	if (nPresortedKeys > 0)
+		ExplainPropertyList("Presorted Key", resultPresorted, es);
 }
 
 /*
@@ -2711,6 +2746,196 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 	}
 }
 
+/*
+ * Incremental sort nodes sort in (a potentially very large number of) batches,
+ * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into
+ * an intelligible summary.
+ *
+ * This function is used for both a non-parallel node and each worker in a
+ * parallel incremental sort node.
+ */
+static void
+show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
+								 const char *groupLabel, bool indent, ExplainState *es)
+{
+	ListCell   *methodCell;
+	List	   *methodNames = NIL;
+
+	/* Generate a list of sort methods used across all groups. */
+	for (int bit = 0; bit < sizeof(bits32) * BITS_PER_BYTE; ++bit)
+	{
+		if (groupInfo->sortMethods & (1 << bit))
+		{
+			TuplesortMethod sortMethod = (1 << bit);
+			const char *methodName;
+
+			methodName = tuplesort_method_name(sortMethod);
+			methodNames = lappend(methodNames, unconstify(char *, methodName));
+		}
+	}
+
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+	{
+		if (indent)
+			appendStringInfoSpaces(es->str, es->indent * 2);
+		appendStringInfo(es->str, "%s Groups: %ld Sort Method", groupLabel,
+						 groupInfo->groupCount);
+		/* plural/singular based on methodNames size */
+		if (list_length(methodNames) > 1)
+			appendStringInfo(es->str, "s: ");
+		else
+			appendStringInfo(es->str, ": ");
+		foreach(methodCell, methodNames)
+		{
+			appendStringInfo(es->str, "%s", (char *) methodCell->ptr_value);
+			if (foreach_current_index(methodCell) < list_length(methodNames) - 1)
+				appendStringInfo(es->str, ", ");
+		}
+
+		if (groupInfo->maxMemorySpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+			appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+							 spaceTypeName, avgSpace,
+							 groupInfo->maxMemorySpaceUsed);
+		}
+
+		if (groupInfo->maxDiskSpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+			/* Add a semicolon separator only if memory stats were printed. */
+			if (groupInfo->maxMemorySpaceUsed > 0)
+				appendStringInfo(es->str, ";");
+			appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+							 spaceTypeName, avgSpace,
+							 groupInfo->maxDiskSpaceUsed);
+		}
+	}
+	else
+	{
+		StringInfoData groupName;
+
+		initStringInfo(&groupName);
+		appendStringInfo(&groupName, "%s Groups", groupLabel);
+		ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es);
+		ExplainPropertyInteger("Group Count", NULL, groupInfo->groupCount, es);
+
+		ExplainPropertyList("Sort Methods Used", methodNames, es);
+
+		if (groupInfo->maxMemorySpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+			StringInfoData memoryName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+			initStringInfo(&memoryName);
+			appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName);
+			ExplainOpenGroup("Sort Space", memoryName.data, true, es);
+
+			ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+			ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+								   groupInfo->maxMemorySpaceUsed, es);
+
+			ExplainCloseGroup("Sort Space", memoryName.data, true, es);
+		}
+		if (groupInfo->maxDiskSpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+			StringInfoData diskName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+			initStringInfo(&diskName);
+			appendStringInfo(&diskName, "Sort Space %s", spaceTypeName);
+			ExplainOpenGroup("Sort Space", diskName.data, true, es);
+
+			ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+			ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+								   groupInfo->maxDiskSpaceUsed, es);
+
+			ExplainCloseGroup("Sort Space", diskName.data, true, es);
+		}
+
+		ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es);
+	}
+}
+
+/*
+ * If it's EXPLAIN ANALYZE, show tuplesort stats for an incremental sort node
+ */
+static void
+show_incremental_sort_info(IncrementalSortState *incrsortstate,
+						   ExplainState *es)
+{
+	IncrementalSortGroupInfo *fullsortGroupInfo;
+	IncrementalSortGroupInfo *prefixsortGroupInfo;
+
+	fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo;
+
+	if (!(es->analyze && fullsortGroupInfo->groupCount > 0))
+		return;
+
+	show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", true, es);
+	prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo;
+	if (prefixsortGroupInfo->groupCount > 0)
+	{
+		if (es->format == EXPLAIN_FORMAT_TEXT)
+			appendStringInfo(es->str, " ");
+		show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+	}
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+		appendStringInfo(es->str, "\n");
+
+	if (incrsortstate->shared_info != NULL)
+	{
+		int			n;
+		bool		indent_first_line;
+
+		for (n = 0; n < incrsortstate->shared_info->num_workers; n++)
+		{
+			IncrementalSortInfo *incsort_info =
+			&incrsortstate->shared_info->sinfo[n];
+
+			/*
+			 * If a worker hasn't processed any sort groups at all, then
+			 * exclude it from output since it either didn't launch or didn't
+			 * contribute anything meaningful.
+			 */
+			fullsortGroupInfo = &incsort_info->fullsortGroupInfo;
+			prefixsortGroupInfo = &incsort_info->prefixsortGroupInfo;
+			if (fullsortGroupInfo->groupCount == 0 &&
+				prefixsortGroupInfo->groupCount == 0)
+				continue;
+
+			if (es->workers_state)
+				ExplainOpenWorker(n, es);
+
+			indent_first_line = es->workers_state == NULL || es->verbose;
+			show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort",
+											 indent_first_line, es);
+			if (prefixsortGroupInfo->groupCount > 0)
+			{
+				if (es->format == EXPLAIN_FORMAT_TEXT)
+					appendStringInfo(es->str, " ");
+				show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+			}
+			if (es->format == EXPLAIN_FORMAT_TEXT)
+				appendStringInfo(es->str, "\n");
+
+			if (es->workers_state)
+				ExplainCloseWorker(n, es);
+		}
+	}
+}
+
 /*
  * Show information on hash buckets/batches.
  */
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index a983800e4b..f990c6473a 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -46,6 +46,7 @@ OBJS = \
 	nodeGroup.o \
 	nodeHash.o \
 	nodeHashjoin.o \
+	nodeIncrementalSort.o \
 	nodeIndexonlyscan.o \
 	nodeIndexscan.o \
 	nodeLimit.o \
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index b12aeb3334..e2154ba86a 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -30,6 +30,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
@@ -252,6 +253,10 @@ ExecReScan(PlanState *node)
 			ExecReScanSort((SortState *) node);
 			break;
 
+		case T_IncrementalSortState:
+			ExecReScanIncrementalSort((IncrementalSortState *) node);
+			break;
+
 		case T_GroupState:
 			ExecReScanGroup((GroupState *) node);
 			break;
@@ -557,8 +562,17 @@ ExecSupportsBackwardScan(Plan *node)
 		case T_CteScan:
 		case T_Material:
 		case T_Sort:
+			/* these don't evaluate tlist */
 			return true;
 
+		case T_IncrementalSort:
+
+			/*
+			 * Unlike full sort, incremental sort keeps only a single group of
+			 * tuples in memory, so it can't scan backwards.
+			 */
+			return false;
+
 		case T_LockRows:
 		case T_Limit:
 			return ExecSupportsBackwardScan(outerPlan(node));
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index b7d0719953..41cb41481d 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -31,6 +31,7 @@
 #include "executor/nodeForeignscan.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeSeqscan.h"
@@ -283,6 +284,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortEstimate((SortState *) planstate, e->pcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
+			break;
 
 		default:
 			break;
@@ -496,6 +501,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
+			break;
 
 		default:
 			break;
@@ -972,6 +981,7 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 			break;
 		case T_HashState:
 		case T_SortState:
+		case T_IncrementalSortState:
 			/* these nodes have DSM state, but no reinitialization is required */
 			break;
 
@@ -1032,6 +1042,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_SortState:
 			ExecSortRetrieveInstrumentation((SortState *) planstate);
 			break;
+		case T_IncrementalSortState:
+			ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate);
+			break;
 		case T_HashState:
 			ExecHashRetrieveInstrumentation((HashState *) planstate);
 			break;
@@ -1318,6 +1331,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeWorker((SortState *) planstate, pwcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
+												pwcxt);
+			break;
 
 		default:
 			break;
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7b2e84f402..5662e7d742 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -88,6 +88,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
@@ -313,6 +314,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 												   estate, eflags);
 			break;
 
+		case T_IncrementalSort:
+			result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
+														   estate, eflags);
+			break;
+
 		case T_Group:
 			result = (PlanState *) ExecInitGroup((Group *) node,
 												 estate, eflags);
@@ -693,6 +699,10 @@ ExecEndNode(PlanState *node)
 			ExecEndSort((SortState *) node);
 			break;
 
+		case T_IncrementalSortState:
+			ExecEndIncrementalSort((IncrementalSortState *) node);
+			break;
+
 		case T_GroupState:
 			ExecEndGroup((GroupState *) node);
 			break;
@@ -839,6 +849,30 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
 			sortState->bound = tuples_needed;
 		}
 	}
+	else if (IsA(child_node, IncrementalSortState))
+	{
+		/*
+		 * If it is an IncrementalSort node, notify it that it can use bounded
+		 * sort.
+		 *
+		 * Note: it is the responsibility of nodeIncrementalSort.c to react
+		 * properly to changes of these parameters.  If we ever redesign this,
+		 * it'd be a good idea to integrate this signaling with the
+		 * parameter-change mechanism.
+		 */
+		IncrementalSortState *sortState = (IncrementalSortState *) child_node;
+
+		if (tuples_needed < 0)
+		{
+			/* make sure flag gets reset if needed upon rescan */
+			sortState->bounded = false;
+		}
+		else
+		{
+			sortState->bounded = true;
+			sortState->bound = tuples_needed;
+		}
+	}
 	else if (IsA(child_node, AppendState))
 	{
 		/*
diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c
new file mode 100644
index 0000000000..bcab7c054c
--- /dev/null
+++ b/src/backend/executor/nodeIncrementalSort.c
@@ -0,0 +1,1263 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIncrementalSort.c
+ *	  Routines to handle incremental sorting of relations.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/executor/nodeIncrementalSort.c
+ *
+ * DESCRIPTION
+ *
+ *	Incremental sort is an optimized variant of multikey sort for cases
+ *	when the input is already sorted by a prefix of the sort keys.  For
+ *	example when a sort by (key1, key2 ... keyN) is requested, and the
+ *	input is already sorted by (key1, key2 ... keyM), M < N, we can
+ *	divide the input into groups where keys (key1, ... keyM) are equal,
+ *	and only sort on the remaining columns.
+ *
+ *	Consider the following example.  We have input tuples consisting of
+ *	two integers (X, Y) already presorted by X, while it's required to
+ *	sort them by both X and Y.  Let the input tuples be the following.
+ *
+ *	(1, 5)
+ *	(1, 2)
+ *	(2, 9)
+ *	(2, 1)
+ *	(2, 5)
+ *	(3, 3)
+ *	(3, 7)
+ *
+ *	An incremental sort algorithm would split the input into the following
+ *	groups, which have equal X, and then sort them by Y individually:
+ *
+ *		(1, 5) (1, 2)
+ *		(2, 9) (2, 1) (2, 5)
+ *		(3, 3) (3, 7)
+ *
+ *	After sorting these groups and putting them altogether, we would get
+ *	the following result which is sorted by X and Y, as requested:
+ *
+ *	(1, 2)
+ *	(1, 5)
+ *	(2, 1)
+ *	(2, 5)
+ *	(2, 9)
+ *	(3, 3)
+ *	(3, 7)
+ *
+ *	Incremental sort may be more efficient than plain sort, particularly
+ *	on large datasets, as it reduces the amount of data to sort at once,
+ *	making it more likely it fits into work_mem (eliminating the need to
+ *	spill to disk).  But the main advantage of incremental sort is that
+ *	it can start producing rows early, before sorting the whole dataset,
+ *	which is a significant benefit especially for queries with LIMIT.
+ *
+ *	The algorithm we've implemented here is modified from the theoretical
+ *	base described above by operating in two different modes:
+ *	  - Fetching a minimum number of tuples without checking prefix key
+ *	    group membership and sorting on all columns when safe.
+ *	  - Fetching all tuples for a single prefix key group and sorting on
+ *	    solely the unsorted columns.
+ *	We always begin in the first mode, and employ a heuristic to switch
+ *	into the second mode if we believe it's beneficial.
+ *
+ *	Sorting incrementally can potentially use less memory, avoid fetching
+ *	and sorting all tuples in the dataset, and begin returning tuples
+ *	before the entire result set is available.
+ *
+ *	The hybrid mode approach allows us to optimize for both very small
+ *	groups (where the overhead of a new tuplesort is high) and very large
+ *	groups (where we can lower cost by not having to sort on already sorted
+ *	columns), albeit at some extra cost while switching between modes.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIncrementalSort.h"
+#include "miscadmin.h"
+#include "utils/lsyscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * We need to store the instrumentation information in either local node's sort
+ * info or, for a parallel worker process, in the shared info (this avoids
+ * having to additionally memcpy the info from local memory to shared memory
+ * at each instrumentation call).  This macro expands to choose the proper sort
+ * state and group info.
+ *
+ * Arguments:
+ * - node: type IncrementalSortState *
+ * - groupName: the token fullsort or prefixsort
+ */
+#define INSTRUMENT_SORT_GROUP(node, groupName) \
+	if (node->ss.ps.instrument != NULL) \
+	{ \
+		if (node->shared_info && node->am_worker) \
+		{ \
+			Assert(IsParallelWorker()); \
+			Assert(ParallelWorkerNumber < node->shared_info->num_workers); \
+			instrumentSortedGroup(&node->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, node->groupName##_state); \
+		} else { \
+			instrumentSortedGroup(&node->incsort_info.groupName##GroupInfo, node->groupName##_state); \
+		} \
+	}
+
+/* ----------------------------------------------------------------
+ * instrumentSortedGroup
+ *
+ * Because incremental sort processes (potentially many) sort batches, we need
+ * to capture tuplesort stats each time we finalize a sort state.  This summary
+ * data is later used for EXPLAIN ANALYZE output.
+ * ----------------------------------------------------------------
+ */
+static void
+instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo,
+					  Tuplesortstate *sortState)
+{
+	TuplesortInstrumentation sort_instr;
+
+	groupInfo->groupCount++;
+
+	tuplesort_get_stats(sortState, &sort_instr);
+
+	/* Calculate total and maximum memory and disk space used. */
+	switch (sort_instr.spaceType)
+	{
+		case SORT_SPACE_TYPE_DISK:
+			groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed;
+			if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed)
+				groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed;
+
+			break;
+		case SORT_SPACE_TYPE_MEMORY:
+			groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed;
+			if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed)
+				groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed;
+
+			break;
+	}
+
+	/* Track each sort method we've used. */
+	groupInfo->sortMethods |= sort_instr.sortMethod;
+}
+
+/* ----------------------------------------------------------------
+ * preparePresortedCols
+ *
+ * Prepare information for presorted_keys comparisons.
+ * ----------------------------------------------------------------
+ */
+static void
+preparePresortedCols(IncrementalSortState *node)
+{
+	IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+	node->presorted_keys =
+		(PresortedKeyData *) palloc(plannode->nPresortedCols *
+									sizeof(PresortedKeyData));
+
+	/* Pre-cache comparison functions for each pre-sorted key. */
+	for (int i = 0; i < plannode->nPresortedCols; i++)
+	{
+		Oid			equalityOp,
+					equalityFunc;
+		PresortedKeyData *key;
+
+		key = &node->presorted_keys[i];
+		key->attno = plannode->sort.sortColIdx[i];
+
+		equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i],
+													 NULL);
+		if (!OidIsValid(equalityOp))
+			elog(ERROR, "missing equality operator for ordering operator %u",
+				 plannode->sort.sortOperators[i]);
+
+		equalityFunc = get_opcode(equalityOp);
+		if (!OidIsValid(equalityFunc))
+			elog(ERROR, "missing function for operator %u", equalityOp);
+
+		/* Lookup the comparison function */
+		fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext);
+
+		/* We can initialize the callinfo just once and re-use it */
+		key->fcinfo = palloc0(SizeForFunctionCallInfo(2));
+		InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2,
+								 plannode->sort.collations[i], NULL, NULL);
+		key->fcinfo->args[0].isnull = false;
+		key->fcinfo->args[1].isnull = false;
+	}
+}
+
+/* ----------------------------------------------------------------
+ * isCurrentGroup
+ *
+ * Check whether a given tuple belongs to the current sort group by comparing
+ * the presorted column values to the pivot tuple of the current group.
+ * ----------------------------------------------------------------
+ */
+static bool
+isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple)
+{
+	int			nPresortedCols;
+
+	nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols;
+
+	/*
+	 * That the input is sorted by keys (0, ... n) implies that the tail keys
+	 * are more likely to change.  Therefore we do our comparison starting
+	 * from the last pre-sorted column to optimize for early detection of
+	 * inequality and minimizing the number of function calls.
+	 */
+	for (int i = nPresortedCols - 1; i >= 0; i--)
+	{
+		Datum		datumA,
+					datumB,
+					result;
+		bool		isnullA,
+					isnullB;
+		AttrNumber	attno = node->presorted_keys[i].attno;
+		PresortedKeyData *key;
+
+		datumA = slot_getattr(pivot, attno, &isnullA);
+		datumB = slot_getattr(tuple, attno, &isnullB);
+
+		/* Special case for NULL-vs-NULL, else use standard comparison */
+		if (isnullA || isnullB)
+		{
+			if (isnullA == isnullB)
+				continue;
+			else
+				return false;
+		}
+
+		key = &node->presorted_keys[i];
+
+		key->fcinfo->args[0].value = datumA;
+		key->fcinfo->args[1].value = datumB;
+
+		/* just for paranoia's sake, we reset isnull each time */
+		key->fcinfo->isnull = false;
+
+		result = FunctionCallInvoke(key->fcinfo);
+
+		/* Check for null result, since caller is clearly not expecting one */
+		if (key->fcinfo->isnull)
+			elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid);
+
+		if (!DatumGetBool(result))
+			return false;
+	}
+	return true;
+}
+
+/* ----------------------------------------------------------------
+ * switchToPresortedPrefixMode
+ *
+ * When we determine that we've likely encountered a large batch of tuples all
+ * having the same presorted prefix values, we want to optimize tuplesort by
+ * only sorting on unsorted suffix keys.
+ *
+ * The problem is that we've already accumulated several tuples in another
+ * tuplesort configured to sort by all columns (assuming that there may be
+ * more than one prefix key group).  So to switch to presorted prefix mode we
+ * have to go back and look at all the tuples we've already accumulated to
+ * verify they're all part of the same prefix key group before sorting them
+ * solely by unsorted suffix keys.
+ *
+ * While it's likely that all of the already-fetched tuples are part of a
+ * single prefix key group, we also have to handle the possibility that there
+ * is at least one different prefix key group before the large prefix key
+ * group.
+ * ----------------------------------------------------------------
+ */
+static void
+switchToPresortedPrefixMode(PlanState *pstate)
+{
+	IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+	ScanDirection dir;
+	int64		nTuples = 0;
+	bool		lastTuple = false;
+	bool		firstTuple = true;
+	TupleDesc	tupDesc;
+	PlanState  *outerNode;
+	IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+	dir = node->ss.ps.state->es_direction;
+	outerNode = outerPlanState(node);
+	tupDesc = ExecGetResultType(outerNode);
+
+	/* Configure the prefix sort state the first time around. */
+	if (node->prefixsort_state == NULL)
+	{
+		Tuplesortstate *prefixsort_state;
+		int			nPresortedCols = plannode->nPresortedCols;
+
+		/*
+		 * Optimize the sort by assuming the prefix columns are all equal and
+		 * thus we only need to sort by any remaining columns.
+		 */
+		prefixsort_state = tuplesort_begin_heap(tupDesc,
+												plannode->sort.numCols - nPresortedCols,
+												&(plannode->sort.sortColIdx[nPresortedCols]),
+												&(plannode->sort.sortOperators[nPresortedCols]),
+												&(plannode->sort.collations[nPresortedCols]),
+												&(plannode->sort.nullsFirst[nPresortedCols]),
+												work_mem,
+												NULL,
+												false);
+		node->prefixsort_state = prefixsort_state;
+	}
+	else
+	{
+		/* Next group of presorted data */
+		tuplesort_reset(node->prefixsort_state);
+	}
+
+	/*
+	 * If the current node has a bound, then it's reasonably likely that a
+	 * large prefix key group will benefit from bounded sort, so configure the
+	 * tuplesort to allow for that optimization.
+	 */
+	if (node->bounded)
+	{
+		SO1_printf("Setting bound on presorted prefix tuplesort to: %ld\n",
+				   node->bound - node->bound_Done);
+		tuplesort_set_bound(node->prefixsort_state,
+							node->bound - node->bound_Done);
+	}
+
+	/*
+	 * Copy as many tuples as we can (i.e., in the same prefix key group) from
+	 * the full sort state to the prefix sort state.
+	 */
+	for (;;)
+	{
+		lastTuple = node->n_fullsort_remaining - nTuples == 1;
+
+		/*
+		 * When we encounter multiple prefix key groups inside the full sort
+		 * tuplesort we have to carry over the last read tuple into the next
+		 * batch.
+		 */
+		if (firstTuple && !TupIsNull(node->transfer_tuple))
+		{
+			tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+			nTuples++;
+
+			/* The carried over tuple is our new group pivot tuple. */
+			ExecCopySlot(node->group_pivot, node->transfer_tuple);
+		}
+		else
+		{
+			tuplesort_gettupleslot(node->fullsort_state,
+								   ScanDirectionIsForward(dir),
+								   false, node->transfer_tuple, NULL);
+
+			/*
+			 * If this is our first time through the loop, then we need to
+			 * save the first tuple we get as our new group pivot.
+			 */
+			if (TupIsNull(node->group_pivot))
+				ExecCopySlot(node->group_pivot, node->transfer_tuple);
+
+			if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple))
+			{
+				tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+				nTuples++;
+			}
+			else
+			{
+				/*
+				 * The tuple isn't part of the current batch so we need to
+				 * carry it over into the next batch of tuples we transfer out
+				 * of the full sort tuplesort into the presorted prefix
+				 * tuplesort.
+				 * We don't actually have to do anything special to save the
+				 * tuple since we've already loaded it into the
+				 * node->transfer_tuple slot, and, even though that slot
+				 * points to memory inside the full sort tuplesort, we can't
+				 * reset that tuplesort anyway until we've fully transferred
+				 * out its tuples, so this reference is safe.  We do need to
+				 * reset the group pivot tuple though since we've finished the
+				 * current prefix key group.
+				 */
+				ExecClearTuple(node->group_pivot);
+				break;
+			}
+		}
+
+		firstTuple = false;
+
+		/*
+		 * If we've copied all of the tuples from the full sort state into the
+		 * prefix sort state, then we don't actually know that we've yet found
+		 * the last tuple in that prefix key group until we check the next
+		 * tuple from the outer plan node, so we retain the current group
+		 * pivot tuple for the prefix key group comparison.
+		 */
+		if (lastTuple)
+			break;
+	}
+
+	/*
+	 * Track how many tuples remain in the full sort batch so that we know if
+	 * we need to sort multiple prefix key groups before processing tuples
+	 * remaining in the large single prefix key group we think we've
+	 * encountered.
+	 */
+	SO1_printf("Moving %ld tuples to presorted prefix tuplesort\n", nTuples);
+	node->n_fullsort_remaining -= nTuples;
+	SO1_printf("Setting n_fullsort_remaining to %ld\n", node->n_fullsort_remaining);
+
+	if (lastTuple)
+	{
+		/*
+		 * We've confirmed that all tuples remaining in the full sort batch
+		 * are in the same prefix key group and moved all of those tuples into
+		 * the presorted prefix tuplesort.  Now we can save our pivot
+		 * comparison tuple and continue fetching tuples from the outer
+		 * execution node to load into the presorted prefix tuplesort.
+		 */
+		ExecCopySlot(node->group_pivot, node->transfer_tuple);
+		SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n");
+		node->execution_status = INCSORT_LOADPREFIXSORT;
+
+		/*
+		 * Make sure we clear the transfer tuple slot so that next time we
+		 * encounter a large prefix key group we don't incorrectly assume we
+		 * have a tuple carried over from the previous group.
+		 */
+		ExecClearTuple(node->transfer_tuple);
+	}
+	else
+	{
+		/*
+		 * We finished a group but didn't consume all of the tuples from the
+		 * full sort state, so we'll sort this batch, let the outer node read
+		 * out all of those tuples, and then come back around to find another
+		 * batch.
+		 */
+		SO1_printf("Sorting presorted prefix tuplesort with %ld tuples\n", nTuples);
+		tuplesort_performsort(node->prefixsort_state);
+
+		INSTRUMENT_SORT_GROUP(node, prefixsort)
+
+		if (node->bounded)
+		{
+			/*
+			 * If the current node has a bound and we've already sorted n
+			 * tuples, then the functional bound remaining is (original bound
+			 * - n), so store the current number of processed tuples for use
+			 * in configuring sorting bound.
+			 */
+			SO2_printf("Changing bound_Done from %ld to %ld\n",
+					   node->bound_Done,
+					   Min(node->bound, node->bound_Done + nTuples));
+			node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+		}
+
+		SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n");
+		node->execution_status = INCSORT_READPREFIXSORT;
+	}
+}
+
+/*
+ * Sorting many small groups with tuplesort is inefficient.  In order to
+ * cope with this problem we don't start a new group until the current one
+ * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also
+ * means we can't assume small groups of tuples all have the same prefix keys.)
+ * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start
+ * looking for the new group as soon as we've met our bound to avoid fetching
+ * more tuples than we absolutely have to fetch.
+ */
+#define DEFAULT_MIN_GROUP_SIZE 32
+
+/*
+ * While we've optimized for small prefix key groups by not starting our prefix
+ * key comparisons until we've reached a minimum number of tuples, we don't want
+ * that optimization to cause us to lose out on the benefits of being able to
+ * assume a large group of tuples is fully presorted by its prefix keys.
+ * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic
+ * for determining when we believe we've encountered a large group, and, if we
+ * get to that point without finding a new prefix key group we transition to
+ * presorted prefix key mode.
+ */
+#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE)
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSort
+ *
+ *		Assuming that the outer subtree returns tuples presorted by some
+ *		prefix of the target sort columns, performs an incremental sort.
+ *
+ *		Conditions:
+ *		  -- none.
+ *
+ *		Initial States:
+ *		  -- the outer child is prepared to return the first tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecIncrementalSort(PlanState *pstate)
+{
+	IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+	EState	   *estate;
+	ScanDirection dir;
+	Tuplesortstate *read_sortstate;
+	Tuplesortstate *fullsort_state;
+	TupleTableSlot *slot;
+	IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan;
+	PlanState  *outerNode;
+	TupleDesc	tupDesc;
+	int64		nTuples = 0;
+	int64		minGroupSize;
+
+	CHECK_FOR_INTERRUPTS();
+
+	estate = node->ss.ps.state;
+	dir = estate->es_direction;
+	fullsort_state = node->fullsort_state;
+
+	/*
+	 * If a previous iteration has sorted a batch, then we need to check to
+	 * see if there are any remaining tuples in that batch that we can return
+	 * before moving on to other execution states.
+	 */
+	if (node->execution_status == INCSORT_READFULLSORT
+		|| node->execution_status == INCSORT_READPREFIXSORT)
+	{
+		/*
+		 * Return next tuple from the current sorted group set if available.
+		 */
+		read_sortstate = node->execution_status == INCSORT_READFULLSORT ?
+			fullsort_state : node->prefixsort_state;
+		slot = node->ss.ps.ps_ResultTupleSlot;
+
+		/*
+		 * We have to populate the slot from the tuplesort before checking
+		 * outerNodeDone because it will set the slot to NULL if no more
+		 * tuples remain.  If the tuplesort is empty, but we don't have any
+		 * more tuples available for sort from the outer node, then
+		 * outerNodeDone will have been set so we'll return that now-empty
+		 * slot to the caller.
+		 */
+		if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir),
+								   false, slot, NULL) || node->outerNodeDone)
+
+			/*
+			 * Note: there isn't a good test case for the node->outerNodeDone
+			 * check directly, but we need it for any plan where the outer
+			 * node will fail when trying to fetch too many tuples.
+			 */
+			return slot;
+		else if (node->n_fullsort_remaining > 0)
+		{
+			/*
+			 * When we transition to presorted prefix mode, we might have
+			 * accumulated at least one additional prefix key group in the
+			 * full sort tuplesort.
The first call to + * switchToPresortedPrefixMode() will have pulled the first one of + * those groups out, and we've returned those tuples to the parent + * node, but if at this point we still have tuples remaining in + * the full sort state (i.e., n_fullsort_remaining > 0), then we + * need to re-execute the prefix mode transition function to pull + * out the next prefix key group. + */ + SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (%ld)\n", + node->n_fullsort_remaining); + switchToPresortedPrefixMode(pstate); + } + else + { + /* + * If we don't have any sorted tuples to read and we're not + * currently transitioning into presorted prefix sort mode, then + * it's time to start the process all over again by building a new + * group in the full sort state. + */ + SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n"); + node->execution_status = INCSORT_LOADFULLSORT; + } + } + + /* + * Scan the subplan in the forward direction while creating the sorted + * data. + */ + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Load tuples into the full sort state. */ + if (node->execution_status == INCSORT_LOADFULLSORT) + { + /* + * Initialize sorting structures. + */ + if (fullsort_state == NULL) + { + /* + * Initialize presorted column support structures for + * isCurrentGroup(). It's correct to do this along with the + * initial intialization for the full sort state (and not for the + * prefix sort state) since we always load the full sort state + * first. + */ + preparePresortedCols(node); + + /* + * Since we optimize small prefix key groups by accumulating a + * minimum number of tuples before sorting, we can't assume that a + * group of tuples all have the same prefix key values. Hence we + * setup the full sort tuplesort to sort by all requested sort + * keys. + */ + fullsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols, + plannode->sort.sortColIdx, + plannode->sort.sortOperators, + plannode->sort.collations, + plannode->sort.nullsFirst, + work_mem, + NULL, + false); + node->fullsort_state = fullsort_state; + } + else + { + /* Reset sort for the next batch. */ + tuplesort_reset(fullsort_state); + } + + /* + * Calculate the remaining tuples left if bounded and configure both + * bounded sort and the minimum group size accordingly. + */ + if (node->bounded) + { + int64 currentBound = node->bound - node->bound_Done; + + /* + * Bounded sort isn't likely to be a useful optimization for full + * sort mode since we limit full sort mode to a relatively small + * number of tuples and tuplesort doesn't switch over to top-n + * heap sort anyway unless it hits (2 * bound) tuples. + */ + if (currentBound < DEFAULT_MIN_GROUP_SIZE) + tuplesort_set_bound(fullsort_state, currentBound); + + minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); + } + else + minGroupSize = DEFAULT_MIN_GROUP_SIZE; + + /* + * Because we have to read the next tuple to find out that we've + * encountered a new prefix key group, on subsequent groups we have to + * carry over that extra tuple and add it to the new group's sort here + * before we read any new tuples from the outer node. 
+ */ + if (!TupIsNull(node->group_pivot)) + { + tuplesort_puttupleslot(fullsort_state, node->group_pivot); + nTuples++; + + /* + * We're in full sort mode accumulating a minimum number of tuples + * and not checking for prefix key equality yet, so we can't + * assume the group pivot tuple will reamin the same -- unless + * we're using a minimum group size of 1, in which case the pivot + * is obviously still the pviot. + */ + if (nTuples != minGroupSize) + ExecClearTuple(node->group_pivot); + } + + + /* + * Pull as many tuples from the outer node as possible given our + * current operating mode. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If the outer node can't provide us any more tuples, then we can + * sort the current group and return those tuples. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + + SO1_printf("Sorting fullsort with %ld tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort) + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + + /* Accumulate the next group of presorted tuples. */ + if (nTuples < minGroupSize) + { + /* + * If we haven't yet hit our target minimum group size, then + * we don't need to bother checking for inclusion in the + * current prefix group since at this point we'll assume that + * we'll full sort this batch to avoid a large number of very + * tiny (and thus inefficient) sorts. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + + /* + * If we've reach our minimum group size, then we need to + * store the most recent tuple as a pivot. + */ + if (nTuples == minGroupSize) + ExecCopySlot(node->group_pivot, slot); + } + else + { + /* + * If we've already accumulated enough tuples to reach our + * minimum group size, then we need to compare any additional + * tuples to our pivot tuple to see if we reach the end of + * that prefix key group. Only after we find changed prefix + * keys can we guarantee sort stability of the tuples we've + * already accumulated. + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + /* + * As long as the prefix keys match the pivot tuple then + * load the tuple into the tuplesort. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + } + else + { + /* + * Since the tuple we fetched isn't part of the current + * prefix key group we don't want to sort it as part of + * the current batch. Instead we use the group_pivot slot + * to carry it over to the next batch (even though we + * won't actually treat it as a group pivot). + */ + ExecCopySlot(node->group_pivot, slot); + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already + * sorted n tuples, then the functional bound + * remaining is (original bound - n), so store the + * current number of processed tuples for later use + * configuring the sort state's bound. + */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + /* + * Once we find changed prefix keys we can complete the + * sort and transition modes to reading out the sorted + * tuples. 
+ */ + SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", + nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort) + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + } + + /* + * Unless we've alrady transitioned modes to reading from the full + * sort state, then we assume that having read at least + * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're + * processing a large group of tuples all having equal prefix keys + * (but haven't yet found the final tuple in that prefix key + * group), so we need to transition in to presorted prefix mode. + */ + if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && + node->execution_status != INCSORT_READFULLSORT) + { + /* + * The group pivot we have stored has already been put into + * the tuplesort; we don't want to carry it over. Since we + * haven't yet found the end of the prefix key group, it might + * seem like we should keep this, but we don't actually know + * how many prefix key groups might be represented in the full + * sort state, so we'll let the mode transition function + * manage this state for us. + */ + ExecClearTuple(node->group_pivot); + + /* + * Unfortunately the tuplesort API doesn't include a way to + * retrieve tuples unless a sort has been performed, so we + * perform the sort even though we could just as easily rely + * on FIFO retrieval semantics when transferring them to the + * presorted prefix tuplesort. + */ + SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort) + + /* + * If the full sort tuplesort happened to switch into top-n + * heapsort mode then we will only be able to retrieve + * currentBound tuples (since the tuplesort will have only + * retained the top-n tuples). This is safe even though we + * haven't yet completed fetching the current prefix key group + * because the tuples we've "lost" already sorted "below" the + * retained ones, and we're already contractually guaranteed + * to not need any more than the currentBound tuples. + */ + if (tuplesort_used_bound(node->fullsort_state)) + { + int64 currentBound = node->bound - node->bound_Done; + + SO2_printf("Read %ld tuples, but setting to %ld because we used bounded sort\n", + nTuples, Min(currentBound, nTuples)); + nTuples = Min(currentBound, nTuples); + } + + SO1_printf("Setting n_fullsort_remaining to %ld and calling switchToPresortedPrefixMode()\n", + nTuples); + + /* + * We might have multiple prefix key groups in the full sort + * state, so the mode transition function needs to know the it + * needs to move from the fullsort to presorted prefix sort. + */ + node->n_fullsort_remaining = nTuples; + + /* Transition the tuples to the presorted prefix tuplesort. */ + switchToPresortedPrefixMode(pstate); + + /* + * Since we know we had tuples to move to the presorted prefix + * tuplesort, we know that unless that transition has verified + * that all tuples belonged to the same prefix key group (in + * which case we can go straight to continuing to load tuples + * into that tuplesort), we should have a tuple to return + * here. + * + * Either way, the appropriate execution status should have + * been set by switchToPresortedPrefixMode(), so we can drop + * out of the loop here and let the appropriate path kick in. 
+ */ + break; + } + } + } + + if (node->execution_status == INCSORT_LOADPREFIXSORT) + { + /* + * We only enter this state after the mode transition function has + * confirmed all remaining tuples from the full sort state have the + * same prefix and moved those tuples to the prefix sort state. That + * function has also set a group pivot tuple (which doesn't need to be + * carried over; it's already been put into the prefix sort state). + */ + Assert(!TupIsNull(node->group_pivot)); + + /* + * Read tuples from the outer node and load them into the prefix sort + * state until we encounter a tuple whose prefix keys don't match the + * current group_pivot tuple, since we can't guarantee sort stability + * until we have all tuples matching those prefix keys. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If we've exhausted tuples from the outer node we're done + * loading the prefix sort state. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + break; + } + + /* + * If the tuple's prefix keys match our pivot tuple, we're not + * done yet and can load it into the prefix sort state. If not, we + * don't want to sort it as part of the current batch. Instead we + * use the group_pivot slot to carry it over to the next batch + * (even though we won't actually treat it as a group pivot). + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + tuplesort_puttupleslot(node->prefixsort_state, slot); + nTuples++; + } + else + { + ExecCopySlot(node->group_pivot, slot); + break; + } + } + + /* + * Perform the sort and begin returning the tuples to the parent plan + * node. + */ + SO1_printf("Sorting presorted prefix tuplesort with >= %ld tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort) + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + } + + /* Restore to user specified direction. */ + estate->es_direction = dir; + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitIncrementalSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. 
+ * ---------------------------------------------------------------- + */ +IncrementalSortState * +ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) +{ + IncrementalSortState *incrsortstate; + + SO_printf("ExecInitIncrementalSort: initializing sort node\n"); + + /* + * Incremental sort can't be used with either EXEC_FLAG_REWIND, + * EXEC_FLAG_BACKWARD or EXEC_FLAG_MARK, because we only one of many sort + * batches in the current sort state. + */ + Assert((eflags & (EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)) == 0); + + /* Initialize state structure. */ + incrsortstate = makeNode(IncrementalSortState); + incrsortstate->ss.ps.plan = (Plan *) node; + incrsortstate->ss.ps.state = estate; + incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; + + incrsortstate->execution_status = INCSORT_LOADFULLSORT; + incrsortstate->bounded = false; + incrsortstate->outerNodeDone = false; + incrsortstate->bound_Done = 0; + incrsortstate->fullsort_state = NULL; + incrsortstate->prefixsort_state = NULL; + incrsortstate->group_pivot = NULL; + incrsortstate->transfer_tuple = NULL; + incrsortstate->n_fullsort_remaining = 0; + incrsortstate->presorted_keys = NULL; + + if (incrsortstate->ss.ps.instrument != NULL) + { + IncrementalSortGroupInfo *fullsortGroupInfo = + &incrsortstate->incsort_info.fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo = + &incrsortstate->incsort_info.prefixsortGroupInfo; + + fullsortGroupInfo->groupCount = 0; + fullsortGroupInfo->maxDiskSpaceUsed = 0; + fullsortGroupInfo->totalDiskSpaceUsed = 0; + fullsortGroupInfo->maxMemorySpaceUsed = 0; + fullsortGroupInfo->totalMemorySpaceUsed = 0; + fullsortGroupInfo->sortMethods = 0; + prefixsortGroupInfo->groupCount = 0; + prefixsortGroupInfo->maxDiskSpaceUsed = 0; + prefixsortGroupInfo->totalDiskSpaceUsed = 0; + prefixsortGroupInfo->maxMemorySpaceUsed = 0; + prefixsortGroupInfo->totalMemorySpaceUsed = 0; + prefixsortGroupInfo->sortMethods = 0; + } + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * Initialize child nodes. + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple); + + /* + * Initialize return slot and type. No need to initialize projection info + * because we don't do any projections. + */ + ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple); + incrsortstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize standalone slots to store a tuple for pivot prefix keys and + * for carrying over a tuple from one batch to the next. 
+ */ + incrsortstate->group_pivot = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + incrsortstate->transfer_tuple = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + + SO_printf("ExecInitIncrementalSort: sort node initialized\n"); + + return incrsortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndIncrementalSort(node) + * ---------------------------------------------------------------- + */ +void +ExecEndIncrementalSort(IncrementalSortState *node) +{ + SO_printf("ExecEndIncrementalSort: shutting down sort node\n"); + + /* clean out the scan tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + /* must drop stanalone tuple slots from outer node */ + ExecDropSingleTupleTableSlot(node->group_pivot); + ExecDropSingleTupleTableSlot(node->transfer_tuple); + + /* + * Release tuplesort resources. + */ + if (node->fullsort_state != NULL) + { + tuplesort_end(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_end(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * Shut down the subplan. + */ + ExecEndNode(outerPlanState(node)); + + SO_printf("ExecEndIncrementalSort: sort node shutdown\n"); +} + +void +ExecReScanIncrementalSort(IncrementalSortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * Incremental sort doesn't support efficient rescan even when paramters + * haven't changed (e.g., rewind) because unlike regular sort we don't + * store all tuples at once for the full sort. + * + * So even if EXEC_FLAG_REWIND is set we just reset all of our state and + * reexecute the sort along with the child node below us. + * + * In theory if we've only fill the full sort with one batch (and haven't + * reset it for a new batch yet) then we could efficiently rewind, but + * that seems a narrow enough case that it's not worth handling specially + * at this time. + */ + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->group_pivot != NULL) + ExecClearTuple(node->group_pivot); + if (node->transfer_tuple != NULL) + ExecClearTuple(node->transfer_tuple); + + node->bounded = false; + node->outerNodeDone = false; + node->n_fullsort_remaining = 0; + node->bound_Done = 0; + node->presorted_keys = NULL; + + node->execution_status = INCSORT_LOADFULLSORT; + + /* + * If we've set up either of the sort states yet, we need to reset them. + * We could end them and null out the pointers, but there's no reason to + * repay the setup cost, and because guard setting up pivot comparator + * state similarly, doing so might actually cause a leak. + */ + if (node->fullsort_state != NULL) + { + tuplesort_reset(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_reset(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * If chgParam of subnode is not null, theni the plan will be re-scanned + * by the first ExecProcNode. 
+	 */
+	if (outerPlan->chgParam == NULL)
+		ExecReScan(outerPlan);
+}
+
+/* ----------------------------------------------------------------
+ *						Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortEstimate
+ *
+ *		Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt)
+{
+	Size		size;
+
+	/* don't need this if not instrumenting or no workers */
+	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+		return;
+
+	size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo));
+	size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo));
+	shm_toc_estimate_chunk(&pcxt->estimator, size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeDSM
+ *
+ *		Initialize DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt)
+{
+	Size		size;
+
+	/* don't need this if not instrumenting or no workers */
+	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+		return;
+
+	size = offsetof(SharedIncrementalSortInfo, sinfo)
+		+ pcxt->nworkers * sizeof(IncrementalSortInfo);
+	node->shared_info = shm_toc_allocate(pcxt->toc, size);
+	/* ensure any unfilled slots will contain zeroes */
+	memset(node->shared_info, 0, size);
+	node->shared_info->num_workers = pcxt->nworkers;
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+				   node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeWorker
+ *
+ *		Attach worker to DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt)
+{
+	node->shared_info =
+		shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	node->am_worker = true;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortRetrieveInstrumentation
+ *
+ *		Transfer sort statistics from DSM to private memory.
+ * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node) +{ + Size size; + SharedIncrementalSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + node->shared_info->num_workers * sizeof(IncrementalSortInfo); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 5d1debc196..9d2bfd7ed6 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -93,7 +93,8 @@ ExecSort(PlanState *pstate) plannode->collations, plannode->nullsFirst, work_mem, - NULL, node->randomAccess); + NULL, + node->randomAccess); if (node->bounded) tuplesort_set_bound(tuplesortstate, node->bound); node->tuplesortstate = (void *) tuplesortstate; diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index eaf93c64b8..f9d86859ee 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -927,6 +927,24 @@ _copyMaterial(const Material *from) } +/* + * CopySortFields + * + * This function copies the fields of the Sort node. It is used by + * all the copy functions for classes which inherit from Sort. + */ +static void +CopySortFields(const Sort *from, Sort *newnode) +{ + CopyPlanFields((const Plan *) from, (Plan *) newnode); + + COPY_SCALAR_FIELD(numCols); + COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); +} + /* * _copySort */ @@ -938,13 +956,29 @@ _copySort(const Sort *from) /* * copy node superclass fields */ - CopyPlanFields((const Plan *) from, (Plan *) newnode); + CopySortFields(from, newnode); - COPY_SCALAR_FIELD(numCols); - COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); - COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); - COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid)); - COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); + return newnode; +} + + +/* + * _copyIncrementalSort + */ +static IncrementalSort * +_copyIncrementalSort(const IncrementalSort *from) +{ + IncrementalSort *newnode = makeNode(IncrementalSort); + + /* + * copy node superclass fields + */ + CopySortFields((const Sort *) from, (Sort *) newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(nPresortedCols); return newnode; } @@ -4898,6 +4932,9 @@ copyObjectImpl(const void *from) case T_Sort: retval = _copySort(from); break; + case T_IncrementalSort: + retval = _copyIncrementalSort(from); + break; case T_Group: retval = _copyGroup(from); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f4aecdcbcd..35ed8c0d53 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -837,10 +837,8 @@ _outMaterial(StringInfo str, const Material *node) } static void -_outSort(StringInfo str, const Sort *node) +_outSortInfo(StringInfo str, const Sort *node) { - WRITE_NODE_TYPE("SORT"); - _outPlanInfo(str, (const Plan *) node); WRITE_INT_FIELD(numCols); @@ -850,6 +848,24 @@ _outSort(StringInfo str, const Sort *node) WRITE_BOOL_ARRAY(nullsFirst, node->numCols); } +static void +_outSort(StringInfo str, const Sort *node) +{ + WRITE_NODE_TYPE("SORT"); + + _outSortInfo(str, node); +} + +static void 
+_outIncrementalSort(StringInfo str, const IncrementalSort *node) +{ + WRITE_NODE_TYPE("INCREMENTALSORT"); + + _outSortInfo(str, (const Sort *) node); + + WRITE_INT_FIELD(nPresortedCols); +} + static void _outUnique(StringInfo str, const Unique *node) { @@ -3786,6 +3802,9 @@ outNode(StringInfo str, const void *obj) case T_Sort: _outSort(str, obj); break; + case T_IncrementalSort: + _outIncrementalSort(str, obj); + break; case T_Unique: _outUnique(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index d5b23a3479..2a2f39bf04 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2150,12 +2150,13 @@ _readMaterial(void) } /* - * _readSort + * ReadCommonSort + * Assign the basic stuff of all nodes that inherit from Sort */ -static Sort * -_readSort(void) +static void +ReadCommonSort(Sort *local_node) { - READ_LOCALS(Sort); + READ_TEMP_LOCALS(); ReadCommonPlan(&local_node->plan); @@ -2164,6 +2165,32 @@ _readSort(void) READ_OID_ARRAY(sortOperators, local_node->numCols); READ_OID_ARRAY(collations, local_node->numCols); READ_BOOL_ARRAY(nullsFirst, local_node->numCols); +} + +/* + * _readSort + */ +static Sort * +_readSort(void) +{ + READ_LOCALS_NO_FIELDS(Sort); + + ReadCommonSort(local_node); + + READ_DONE(); +} + +/* + * _readIncrementalSort + */ +static IncrementalSort * +_readIncrementalSort(void) +{ + READ_LOCALS(IncrementalSort); + + ReadCommonSort(&local_node->sort); + + READ_INT_FIELD(nPresortedCols); READ_DONE(); } @@ -2801,6 +2828,8 @@ parseNodeString(void) return_value = _readMaterial(); else if (MATCH("SORT", 4)) return_value = _readSort(); + else if (MATCH("INCREMENTALSORT", 15)) + return_value = _readIncrementalSort(); else if (MATCH("GROUP", 5)) return_value = _readGroup(); else if (MATCH("AGG", 3)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 905bbe77d8..ccf46dd0aa 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3881,6 +3881,10 @@ print_path(PlannerInfo *root, Path *path, int indent) ptype = "Sort"; subpath = ((SortPath *) path)->subpath; break; + case T_IncrementalSortPath: + ptype = "IncrementalSort"; + subpath = ((SortPath *) path)->subpath; + break; case T_GroupPath: ptype = "Group"; subpath = ((GroupPath *) path)->subpath; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 9e7e57f118..0eef5d7707 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -128,6 +128,7 @@ bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; +bool enable_incrementalsort = true; bool enable_hashagg = true; bool enable_hashagg_disk = true; bool enable_groupingsets_hash_disk = false; @@ -1648,9 +1649,9 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) } /* - * cost_sort - * Determines and returns the cost of sorting a relation, including - * the cost of reading the input data. + * cost_tuplesort + * Determines and returns the cost of sorting a relation using tuplesort, + * not including the cost of reading the input data. 
* * If the total volume of data to sort is less than sort_mem, we will do * an in-memory sort, which requires no I/O and about t*log2(t) tuple @@ -1677,39 +1678,23 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) * specifying nonzero comparison_cost; typically that's used for any extra * work that has to be done to prepare the inputs to the comparison operators. * - * 'pathkeys' is a list of sort keys - * 'input_cost' is the total cost for reading the input data * 'tuples' is the number of tuples in the relation * 'width' is the average tuple width in bytes * 'comparison_cost' is the extra cost per comparison, if any * 'sort_mem' is the number of kilobytes of work memory allowed for the sort * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound - * - * NOTE: some callers currently pass NIL for pathkeys because they - * can't conveniently supply the sort keys. Since this routine doesn't - * currently do anything with pathkeys anyway, that doesn't matter... - * but if it ever does, it should react gracefully to lack of key data. - * (Actually, the thing we'd most likely be interested in is just the number - * of sort keys, which all callers *could* supply.) */ -void -cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width, - Cost comparison_cost, int sort_mem, - double limit_tuples) +static void +cost_tuplesort(Cost *startup_cost, Cost *run_cost, + double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples) { - Cost startup_cost = input_cost; - Cost run_cost = 0; double input_bytes = relation_byte_size(tuples, width); double output_bytes; double output_tuples; long sort_mem_bytes = sort_mem * 1024L; - if (!enable_sort) - startup_cost += disable_cost; - - path->rows = tuples; - /* * We want to be sure the cost of a sort is never estimated as zero, even * if passed-in tuple count is zero. Besides, mustn't do log(0)... @@ -1748,7 +1733,7 @@ cost_sort(Path *path, PlannerInfo *root, * * Assume about N log2 N comparisons */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); /* Disk costs */ @@ -1759,7 +1744,7 @@ cost_sort(Path *path, PlannerInfo *root, log_runs = 1.0; npageaccesses = 2.0 * npages * log_runs; /* Assume 3/4ths of accesses are sequential, 1/4th are not */ - startup_cost += npageaccesses * + *startup_cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); } else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) @@ -1770,12 +1755,12 @@ cost_sort(Path *path, PlannerInfo *root, * factor is a bit higher than for quicksort. Tweak it so that the * cost curve is continuous at the crossover point. */ - startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples); + *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples); } else { /* We'll use plain quicksort on all the input tuples */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); } /* @@ -1786,8 +1771,143 @@ cost_sort(Path *path, PlannerInfo *root, * here --- the upper LIMIT will pro-rate the run cost so we'd be double * counting the LIMIT otherwise. */ - run_cost += cpu_operator_cost * tuples; + *run_cost = cpu_operator_cost * tuples; +} +/* + * cost_incremental_sort + * Determines and returns the cost of sorting a relation incrementally, when + * the input path is presorted by a prefix of the pathkeys. 
+ * + * 'presorted_keys' is the number of leading pathkeys by which the input path + * is sorted. + * + * We estimate the number of groups into which the relation is divided by the + * leading pathkeys, and then calculate the cost of sorting a single group + * with tuplesort using cost_tuplesort(). + */ +void +cost_incremental_sort(Path *path, + PlannerInfo *root, List *pathkeys, int presorted_keys, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples, int width, Cost comparison_cost, int sort_mem, + double limit_tuples) +{ + Cost startup_cost = 0, + run_cost = 0, + input_run_cost = input_total_cost - input_startup_cost; + double group_tuples, + input_groups; + Cost group_startup_cost, + group_run_cost, + group_input_run_cost; + List *presortedExprs = NIL; + ListCell *l; + int i = 0; + + Assert(presorted_keys != 0); + + /* + * We want to be sure the cost of a sort is never estimated as zero, even + * if passed-in tuple count is zero. Besides, mustn't do log(0)... + */ + if (input_tuples < 2.0) + input_tuples = 2.0; + + /* Extract presorted keys as list of expressions */ + foreach(l, pathkeys) + { + PathKey *key = (PathKey *) lfirst(l); + EquivalenceMember *member = (EquivalenceMember *) + linitial(key->pk_eclass->ec_members); + + presortedExprs = lappend(presortedExprs, member->em_expr); + + i++; + if (i >= presorted_keys) + break; + } + + /* Estimate number of groups with equal presorted keys */ + input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL); + group_tuples = input_tuples / input_groups; + group_input_run_cost = input_run_cost / input_groups; + + /* + * Estimate average cost of sorting of one group where presorted keys are + * equal. Incremental sort is sensitive to distribution of tuples to the + * groups, where we're relying on quite rough assumptions. Thus, we're + * pessimistic about incremental sort performance and increase its average + * group size by half. + */ + cost_tuplesort(&group_startup_cost, &group_run_cost, + 1.5 * group_tuples, width, comparison_cost, sort_mem, + limit_tuples); + + /* + * Startup cost of incremental sort is the startup cost of its first group + * plus the cost of its input. + */ + startup_cost += group_startup_cost + + input_startup_cost + group_input_run_cost; + + /* + * After we started producing tuples from the first group, the cost of + * producing all the tuples is given by the cost to finish processing this + * group, plus the total cost to process the remaining groups, plus the + * remaining cost of input. + */ + run_cost += group_run_cost + + (group_run_cost + group_startup_cost) * (input_groups - 1) + + group_input_run_cost * (input_groups - 1); + + /* + * Incremental sort adds some overhead by itself. Firstly, it has to + * detect the sort groups. This is roughly equal to one extra copy and + * comparison per tuple. Secondly, it has to reset the tuplesort context + * for every group. + */ + run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples; + run_cost += 2.0 * cpu_tuple_cost * input_groups; + + path->rows = input_tuples; + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} + +/* + * cost_sort + * Determines and returns the cost of sorting a relation, including + * the cost of reading the input data. + * + * NOTE: some callers currently pass NIL for pathkeys because they + * can't conveniently supply the sort keys. Since this routine doesn't + * currently do anything with pathkeys anyway, that doesn't matter... 
+ * but if it ever does, it should react gracefully to lack of key data.
+ * (Actually, the thing we'd most likely be interested in is just the number
+ * of sort keys, which all callers *could* supply.)
+ */
+void
+cost_sort(Path *path, PlannerInfo *root,
+		  List *pathkeys, Cost input_cost, double tuples, int width,
+		  Cost comparison_cost, int sort_mem,
+		  double limit_tuples)
+{
+	Cost		startup_cost;
+	Cost		run_cost;
+
+	cost_tuplesort(&startup_cost, &run_cost,
+				   tuples, width,
+				   comparison_cost, sort_mem,
+				   limit_tuples);
+
+	if (!enable_sort)
+		startup_cost += disable_cost;
+
+	startup_cost += input_cost;
+
+	path->rows = tuples;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
 }
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 71b9d42c99..21e3f5a987 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -334,6 +334,60 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/*
+ * pathkeys_count_contained_in
+ *    Same as pathkeys_contained_in, but also sets length of longest
+ *    common prefix of keys1 and keys2.
+ */
+bool
+pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common)
+{
+	int			n = 0;
+	ListCell   *key1,
+			   *key2;
+
+	/*
+	 * See if we can avoid looping through both lists.  This optimization
+	 * gains us several percent in planning time in a worst-case test.
+	 */
+	if (keys1 == keys2)
+	{
+		*n_common = list_length(keys1);
+		return true;
+	}
+	else if (keys1 == NIL)
+	{
+		*n_common = 0;
+		return true;
+	}
+	else if (keys2 == NIL)
+	{
+		*n_common = 0;
+		return false;
+	}
+
+	/*
+	 * If both lists are non-empty, iterate through both to find out how many
+	 * items are shared.
+	 */
+	forboth(key1, keys1, key2, keys2)
+	{
+		PathKey    *pathkey1 = (PathKey *) lfirst(key1);
+		PathKey    *pathkey2 = (PathKey *) lfirst(key2);
+
+		if (pathkey1 != pathkey2)
+		{
+			*n_common = n;
+			return false;
+		}
+		n++;
+	}
+
+	/* If we ended with a null value, then we've processed the whole list. */
+	*n_common = n;
+	return (key1 == NULL);
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
@@ -1786,26 +1840,26 @@ right_merge_direction(PlannerInfo *root, PathKey *pathkey)
  *		Count the number of pathkeys that are useful for meeting the
  *		query's requested output ordering.
  *
- * Unlike merge pathkeys, this is an all-or-nothing affair: it does us
- * no good to order by just the first key(s) of the requested ordering.
- * So the result is always either 0 or list_length(root->query_pathkeys).
+ * Because we have the possibility of incremental sort, a prefix list of
+ * keys is potentially useful for improving the performance of the requested
+ * ordering.  Thus we return 0 if no valuable keys are found, or the number
+ * of leading keys shared by the list and the requested ordering.
  */
 static int
 pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 {
+	int			n_common_pathkeys;
+
 	if (root->query_pathkeys == NIL)
 		return 0;				/* no special ordering requested */
 
 	if (pathkeys == NIL)
 		return 0;				/* unordered path */
 
-	if (pathkeys_contained_in(root->query_pathkeys, pathkeys))
-	{
-		/* It's useful ... or at least the first N keys are */
-		return list_length(root->query_pathkeys);
-	}
+	(void) pathkeys_count_contained_in(root->query_pathkeys, pathkeys,
+									   &n_common_pathkeys);
 
-	return 0;					/* path ordering not useful */
+	return n_common_pathkeys;
 }
 
 /*
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index fc25908dc6..6d26bfbeb5 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -98,6 +98,8 @@ static Plan *create_projection_plan(PlannerInfo *root,
 									int flags);
 static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe);
 static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags);
+static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root,
+													IncrementalSortPath *best_path, int flags);
 static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path);
 static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path,
 										int flags);
@@ -244,6 +246,10 @@ static MergeJoin *make_mergejoin(List *tlist,
 static Sort *make_sort(Plan *lefttree, int numCols,
 					   AttrNumber *sortColIdx, Oid *sortOperators,
 					   Oid *collations, bool *nullsFirst);
+static IncrementalSort *make_incrementalsort(Plan *lefttree,
+											 int numCols, int nPresortedCols,
+											 AttrNumber *sortColIdx, Oid *sortOperators,
+											 Oid *collations, bool *nullsFirst);
 static Plan *prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys,
 										Relids relids,
 										const AttrNumber *reqColIdx,
@@ -258,6 +264,8 @@ static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec,
 												 Relids relids);
 static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys,
 									 Relids relids);
+static IncrementalSort *make_incrementalsort_from_pathkeys(Plan *lefttree,
+														   List *pathkeys, Relids relids, int nPresortedCols);
 static Sort *make_sort_from_groupcols(List *groupcls,
 									  AttrNumber *grpColIdx,
 									  Plan *lefttree);
@@ -460,6 +468,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 										   (SortPath *) best_path, flags);
 			break;
+		case T_IncrementalSort:
+			plan = (Plan *) create_incrementalsort_plan(root,
+														(IncrementalSortPath *) best_path,
+														flags);
+			break;
 		case T_Group:
 			plan = (Plan *) create_group_plan(root,
 											  (GroupPath *) best_path);
@@ -1994,6 +2007,32 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 	return plan;
 }
 
+/*
+ * create_incrementalsort_plan
+ *
+ *	  Do the same as create_sort_plan, but create an IncrementalSort plan.
+ */
+static IncrementalSort *
+create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path,
+							int flags)
+{
+	IncrementalSort *plan;
+	Plan	   *subplan;
+
+	/* See comments in create_sort_plan() above */
+	subplan = create_plan_recurse(root, best_path->spath.subpath,
+								  flags | CP_SMALL_TLIST);
+	plan = make_incrementalsort_from_pathkeys(subplan,
+											  best_path->spath.path.pathkeys,
+											  IS_OTHER_REL(best_path->spath.subpath->parent) ?
+											  best_path->spath.path.parent->relids : NULL,
+											  best_path->nPresortedCols);
+
+	copy_generic_path_info(&plan->sort.plan, (Path *) best_path);
+
+	return plan;
+}
+
 /*
  * create_group_plan
  *
@@ -5090,6 +5129,12 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
 	Plan	   *lefttree = plan->plan.lefttree;
 	Path		sort_path;	/* dummy for result of cost_sort */
 
+	/*
+	 * This function shouldn't have to deal with IncrementalSort plans because
+	 * they are only created from corresponding Path nodes.
+ */ + Assert(IsA(plan, Sort)); + cost_sort(&sort_path, root, NIL, lefttree->total_cost, lefttree->plan_rows, @@ -5677,9 +5722,12 @@ make_sort(Plan *lefttree, int numCols, AttrNumber *sortColIdx, Oid *sortOperators, Oid *collations, bool *nullsFirst) { - Sort *node = makeNode(Sort); - Plan *plan = &node->plan; + Sort *node; + Plan *plan; + node = makeNode(Sort); + + plan = &node->plan; plan->targetlist = lefttree->targetlist; plan->qual = NIL; plan->lefttree = lefttree; @@ -5693,6 +5741,37 @@ make_sort(Plan *lefttree, int numCols, return node; } +/* + * make_incrementalsort --- basic routine to build an IncrementalSort plan node + * + * Caller must have built the sortColIdx, sortOperators, collations, and + * nullsFirst arrays already. + */ +static IncrementalSort * +make_incrementalsort(Plan *lefttree, int numCols, int nPresortedCols, + AttrNumber *sortColIdx, Oid *sortOperators, + Oid *collations, bool *nullsFirst) +{ + IncrementalSort *node; + Plan *plan; + + node = makeNode(IncrementalSort); + + plan = &node->sort.plan; + plan->targetlist = lefttree->targetlist; + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + node->nPresortedCols = nPresortedCols; + node->sort.numCols = numCols; + node->sort.sortColIdx = sortColIdx; + node->sort.sortOperators = sortOperators; + node->sort.collations = collations; + node->sort.nullsFirst = nullsFirst; + + return node; +} + /* * prepare_sort_from_pathkeys * Prepare to sort according to given pathkeys @@ -6039,6 +6118,42 @@ make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids) collations, nullsFirst); } +/* + * make_incrementalsort_from_pathkeys + * Create sort plan to sort according to given pathkeys + * + * 'lefttree' is the node which yields input tuples + * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'relids' is the set of relations required by prepare_sort_from_pathkeys() + * 'nPresortedCols' is the number of presorted columns in input tuples + */ +static IncrementalSort * +make_incrementalsort_from_pathkeys(Plan *lefttree, List *pathkeys, + Relids relids, int nPresortedCols) +{ + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* Compute sort column info, and adjust lefttree as needed */ + lefttree = prepare_sort_from_pathkeys(lefttree, pathkeys, + relids, + NULL, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &collations, + &nullsFirst); + + /* Now build the Sort node */ + return make_incrementalsort(lefttree, numsortkeys, nPresortedCols, + sortColIdx, sortOperators, + collations, nullsFirst); +} + /* * make_sort_from_sortclauses * Create sort plan to sort according to given sortclauses @@ -6774,6 +6889,7 @@ is_projection_capable_path(Path *path) case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_LockRows: diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index f52226ccec..aeb83841d7 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4924,13 +4924,16 @@ create_distinct_paths(PlannerInfo *root, * Build a new upperrel containing Paths for ORDER BY evaluation. * * All paths in the result must satisfy the ORDER BY ordering. - * The only new path we need consider is an explicit sort on the - * cheapest-total existing path. + * The only new paths we need consider are an explicit full sort + * and incremental sort on the cheapest-total existing path. 
* * input_rel: contains the source-data Paths * target: the output tlist the result Paths must emit * limit_tuples: estimated bound on the number of output tuples, * or -1 if no LIMIT or couldn't estimate + * + * XXX This only looks at sort_pathkeys. I wonder if it needs to look at the + * other pathkeys (grouping, ...) like generate_useful_gather_paths. */ static RelOptInfo * create_ordered_paths(PlannerInfo *root, @@ -4964,29 +4967,77 @@ create_ordered_paths(PlannerInfo *root, foreach(lc, input_rel->pathlist) { - Path *path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path = input_path; bool is_sorted; + int presorted_keys; - is_sorted = pathkeys_contained_in(root->sort_pathkeys, - path->pathkeys); - if (path == cheapest_input_path || is_sorted) + is_sorted = pathkeys_count_contained_in(root->sort_pathkeys, + input_path->pathkeys, &presorted_keys); + + if (is_sorted) { - if (!is_sorted) + /* Use the input path as is, but add a projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); + } + else + { + /* + * Try adding an explicit sort, but only to the cheapest total path + * since a full sort should generally add the same cost to all + * paths. + */ + if (input_path == cheapest_input_path) { - /* An explicit sort here can take advantage of LIMIT */ - path = (Path *) create_sort_path(root, - ordered_rel, - path, - root->sort_pathkeys, - limit_tuples); + /* + * Sort the cheapest input path. An explicit sort here can + * take advantage of LIMIT. + */ + sorted_path = (Path *) create_sort_path(root, + ordered_rel, + input_path, + root->sort_pathkeys, + limit_tuples); + /* Add projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); } - /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); + /* + * If incremental sort is enabled, then try it as well. Unlike with + * regular sorts, we can't just look at the cheapest path, because + * the cost of incremental sort depends on how well presorted the + * path is. Additionally incremental sort may enable a cheaper + * startup path to win out despite higher total cost. + */ + if (!enable_incrementalsort) + continue; - add_path(ordered_rel, path); + /* Likewise, if the path can't be used for incremental sort. */ + if (!presorted_keys) + continue; + + /* Also consider incremental sort. 
*/ + sorted_path = (Path *) create_incremental_sort_path(root, + ordered_rel, + input_path, + root->sort_pathkeys, + presorted_keys, + limit_tuples); + + /* Add projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); } } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 3dcded506b..2b676bf406 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -678,6 +678,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3650e8329d..b02fcb9bfe 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2688,6 +2688,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_Group: diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 8ba8122ee2..4538ed88e0 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2753,6 +2753,57 @@ create_set_projection_path(PlannerInfo *root, return pathnode; } +/* + * create_incremental_sort_path + * Creates a pathnode that represents performing an incremental sort. + * + * 'rel' is the parent relation associated with the result + * 'subpath' is the path representing the source of data + * 'pathkeys' represents the desired sort order + * 'presorted_keys' is the number of keys by which the input path is + * already sorted + * 'limit_tuples' is the estimated bound on the number of output tuples, + * or -1 if no LIMIT or couldn't estimate + */ +SortPath * +create_incremental_sort_path(PlannerInfo *root, + RelOptInfo *rel, + Path *subpath, + List *pathkeys, + int presorted_keys, + double limit_tuples) +{ + IncrementalSortPath *sort = makeNode(IncrementalSortPath); + SortPath *pathnode = &sort->spath; + + pathnode->path.pathtype = T_IncrementalSort; + pathnode->path.parent = rel; + /* Sort doesn't project, so use source path's pathtarget */ + pathnode->path.pathtarget = subpath->pathtarget; + /* For now, assume we are above any joins, so no parameterization */ + pathnode->path.param_info = NULL; + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = rel->consider_parallel && + subpath->parallel_safe; + pathnode->path.parallel_workers = subpath->parallel_workers; + pathnode->path.pathkeys = pathkeys; + + pathnode->subpath = subpath; + + cost_incremental_sort(&pathnode->path, + root, pathkeys, presorted_keys, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, /* XXX comparison_cost shouldn't be 0? */ + work_mem, limit_tuples); + + sort->nPresortedCols = presorted_keys; + + return pathnode; +} + /* * create_sort_path * Creates a pathnode that represents performing an explicit sort. 
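
To make the cost model in cost_incremental_sort() above easier to review, here is a minimal standalone sketch (not part of the patch; all names and constants are illustrative) of how the per-group tuplesort cost composes into the node's startup and total cost:

#include <stdio.h>

/*
 * Toy model of cost_incremental_sort(): n_groups equal-prefix groups, each
 * costed like a small tuplesort (group_startup/group_run stand in for the
 * output of cost_tuplesort() on one group).
 */
static void
toy_incremental_sort_cost(double input_startup, double input_run,
						  double group_startup, double group_run,
						  double n_groups, double n_tuples,
						  double cpu_tuple_cost, double comparison_cost,
						  double *startup_cost, double *total_cost)
{
	double		group_input_run = input_run / n_groups;
	double		startup;
	double		run;

	/* first tuple: read the first group's input and sort that group once */
	startup = input_startup + group_input_run + group_startup;

	/* remaining output: finish group 1, then read and sort the other groups */
	run = group_run
		+ (group_startup + group_run) * (n_groups - 1)
		+ group_input_run * (n_groups - 1);

	/* overhead: group-boundary detection plus per-group tuplesort reset */
	run += (cpu_tuple_cost + comparison_cost) * n_tuples;
	run += 2.0 * cpu_tuple_cost * n_groups;

	*startup_cost = startup;
	*total_cost = startup + run;
}

int
main(void)
{
	double		startup, total;

	/* e.g. 10000 tuples in 100 groups; arbitrary unit costs */
	toy_incremental_sort_cost(0.0, 445.0, 5.0, 0.25, 100.0, 10000.0,
							  0.01, 0.0033, &startup, &total);
	printf("startup=%g total=%g\n", startup, total);
	return 0;
}

Note how the startup cost covers only the first group, which is why incremental sort can win for LIMIT queries even when its total cost is higher than a full sort's.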
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 477af5d552..03a22d71ac 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -991,6 +991,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_incrementalsort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of incremental sort steps."), + NULL + }, + &enable_incrementalsort, + true, + NULL, NULL, NULL + }, { {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of hashed aggregation plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 91fa185053..1ae8b77306 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -360,6 +360,7 @@ #enable_parallel_append = on #enable_seqscan = on #enable_sort = on +#enable_incrementalsort = on #enable_tidscan = on #enable_partitionwise_join = off #enable_partitionwise_aggregate = off diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d02e676aa3..cc33a85731 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -125,6 +125,16 @@ #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ (state)->worker >= 0 ? 1 : 2) +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + /* GUC variables */ #ifdef TRACE_SORT bool trace_sort = false; @@ -241,6 +251,14 @@ struct Tuplesortstate int64 allowedMem; /* total memory allowed, in bytes */ int maxTapes; /* number of tapes (Knuth's T) */ int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ MemoryContext sortcontext; /* memory context holding most sort data */ MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ @@ -591,6 +609,7 @@ struct Sharedsort static Tuplesortstate *tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state, bool mergeruns); @@ -647,6 +666,8 @@ static void worker_freeze_result_tape(Tuplesortstate *state); static void worker_nomergeruns(Tuplesortstate *state); static void leader_takeover_tapes(Tuplesortstate *state); static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); /* * Special versions of qsort just for SortTuple objects. 
qsort_tuple() sorts @@ -682,8 +703,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess) { Tuplesortstate *state; + MemoryContext maincontext; MemoryContext sortcontext; - MemoryContext tuplecontext; MemoryContext oldcontext; /* See leader_takeover_tapes() remarks on randomAccess support */ @@ -691,31 +712,31 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, elog(ERROR, "random access disallowed under parallel sort"); /* - * Create a working memory context for this sort operation. All data - * needed by the sort will live inside this context. + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. */ - sortcontext = AllocSetContextCreate(CurrentMemoryContext, + maincontext = AllocSetContextCreate(CurrentMemoryContext, "TupleSort main", ALLOCSET_DEFAULT_SIZES); /* - * Caller tuple (e.g. IndexTuple) memory context. - * - * A dedicated child context used exclusively for caller passed tuples - * eases memory management. Resetting at key points reduces - * fragmentation. Note that the memtuples array of SortTuples is allocated - * in the parent context, not this context, because there is no need to - * free memtuples early. + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. */ - tuplecontext = AllocSetContextCreate(sortcontext, - "Caller tuples", - ALLOCSET_DEFAULT_SIZES); + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); /* - * Make the Tuplesortstate within the per-sort context. This way, we + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we * don't need a separate pfree() operation for it at shutdown. */ - oldcontext = MemoryContextSwitchTo(sortcontext); + oldcontext = MemoryContextSwitchTo(maincontext); state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); @@ -724,11 +745,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, pg_rusage_init(&state->ru_start); #endif - state->status = TSS_INITIAL; state->randomAccess = randomAccess; - state->bounded = false; state->tuples = true; - state->boundUsed = false; /* * workMem is forced to be at least 64KB, the current minimum valid value @@ -737,38 +755,21 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, * with very little memory. */ state->allowedMem = Max(workMem, 64) * (int64) 1024; - state->availMem = state->allowedMem; state->sortcontext = sortcontext; - state->tuplecontext = tuplecontext; - state->tapeset = NULL; - - state->memtupcount = 0; + state->maincontext = maincontext; /* * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; * see comments in grow_memtuples(). 
 	 */
-	state->memtupsize = Max(1024,
-							ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
-
-	state->growmemtuples = true;
-	state->slabAllocatorUsed = false;
-	state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
-
-	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
-
-	/* workMem must be large enough for the minimal memtuples array */
-	if (LACKMEM(state))
-		elog(ERROR, "insufficient memory allowed for sort");
-
-	state->currentRun = 0;
+	state->memtupsize = INITIAL_MEMTUPSIZE;
+	state->memtuples = NULL;
 
 	/*
-	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
-	 * inittapes(), if needed
+	 * After all of the other non-parallel-related state, we set up all of
+	 * the state needed for each batch.
 	 */
-
-	state->result_tape = -1;	/* flag that result tape has not been formed */
+	tuplesort_begin_batch(state);
 
 	/*
 	 * Initialize parallel-related state based on coordination information
@@ -802,6 +803,77 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	return state;
 }
 
+/*
+ * tuplesort_begin_batch
+ *
+ * Set up, or reset, all state needed for processing a new set of tuples with
+ * this sort state.  Called both from tuplesort_begin_common (the first time
+ * sorting with this sort state) and tuplesort_reset (for subsequent usages).
+ */
+static void
+tuplesort_begin_batch(Tuplesortstate *state)
+{
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
+
+	/*
+	 * Caller tuple (e.g. IndexTuple) memory context.
+	 *
+	 * A dedicated child context used exclusively for caller passed tuples
+	 * eases memory management.  Resetting at key points reduces
+	 * fragmentation.  Note that the memtuples array of SortTuples is
+	 * allocated in the parent context, not this context, because there is no
+	 * need to free memtuples early.
+	 */
+	state->tuplecontext = AllocSetContextCreate(state->sortcontext,
+												"Caller tuples",
+												ALLOCSET_DEFAULT_SIZES);
+
+	state->status = TSS_INITIAL;
+	state->bounded = false;
+	state->boundUsed = false;
+
+	state->availMem = state->allowedMem;
+
+	state->tapeset = NULL;
+
+	state->memtupcount = 0;
+
+	/*
+	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
+	 * see comments in grow_memtuples().
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, @@ -814,7 +886,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); AssertArg(nkeys > 0); @@ -890,7 +962,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, Assert(indexRel->rd_rel->relam == BTREE_AM_OID); - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -985,7 +1057,7 @@ tuplesort_begin_index_btree(Relation heapRel, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1063,7 +1135,7 @@ tuplesort_begin_index_hash(Relation heapRel, randomAccess); MemoryContext oldcontext; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1106,7 +1178,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, int16 typlen; bool typbyval; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1224,16 +1296,23 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound) } /* - * tuplesort_end + * tuplesort_used_bound * - * Release resources and clean up. - * - * NOTE: after calling this, any pointers returned by tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! + * Allow callers to find out if the sort state was able to use a bound. */ -void -tuplesort_end(Tuplesortstate *state) +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) { /* context swap probably not needed, but let's be safe */ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); @@ -1291,10 +1370,104 @@ tuplesort_end(Tuplesortstate *state) MemoryContextSwitchTo(oldcontext); /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. + * Free the per-sort memory context, thereby releasing all working memory. */ - MemoryContextDelete(state->sortcontext); + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. 
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage.  Be careful not to attempt to use or free such
+ * pointers afterwards!
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+	tuplesort_free(state);
+
+	/*
+	 * Free the main memory context, including the Tuplesortstate struct
+	 * itself.
+	 */
+	MemoryContextDelete(state->maincontext);
+}
+
+/*
+ * tuplesort_updatemax
+ *
+ *		Update maximum resource usage statistics.
+ */
+static void
+tuplesort_updatemax(Tuplesortstate *state)
+{
+	int64		spaceUsed;
+	bool		isSpaceDisk;
+
+	/*
+	 * Note: it might seem we should provide both memory and disk usage for a
+	 * disk-based sort.  However, the current code doesn't track memory space
+	 * accurately once we have begun to return tuples to the caller (since we
+	 * don't account for pfree's the caller is expected to do), so we cannot
+	 * rely on availMem in a disk sort.  This does not seem worth the overhead
+	 * to fix.  Is it worth creating an API for the memory context code to
+	 * tell us how much is actually used in sortcontext?
+	 */
+	if (state->tapeset)
+	{
+		isSpaceDisk = true;
+		spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
+	}
+	else
+	{
+		isSpaceDisk = false;
+		spaceUsed = state->allowedMem - state->availMem;
+	}
+
+	/*
+	 * Sort evicts data to disk when it fails to fit the data into main
+	 * memory.  This is why we assume space used on disk to be more important
+	 * for tracking resource usage than space used in memory.  Note that the
+	 * amount of space occupied by some tuple set on disk might be less than
+	 * the amount of space occupied by the same tuple set in memory due to a
+	 * more compact representation.
+	 */
+	if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
+		(isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
+	{
+		state->maxSpace = spaceUsed;
+		state->isMaxSpaceDisk = isSpaceDisk;
+		state->maxSpaceStatus = state->status;
+	}
+}
+
+/*
+ * tuplesort_reset
+ *
+ *		Reset the tuplesort.  Reset all the data in the tuplesort, but leave
+ *		the meta-information in.  After tuplesort_reset, the tuplesort is
+ *		ready to start a new sort.  This allows avoiding recreation of tuple
+ *		sort states (and saves resources) when sorting multiple small batches.
+ */
+void
+tuplesort_reset(Tuplesortstate *state)
+{
+	tuplesort_updatemax(state);
+	tuplesort_free(state);
+
+	/*
+	 * After we've freed up per-batch memory, re-setup all of the state common
+	 * to both the first batch and any subsequent batch.
+	 */
+	tuplesort_begin_batch(state);
+
+	state->lastReturnedTuple = NULL;
+	state->slabMemoryBegin = NULL;
+	state->slabMemoryEnd = NULL;
+	state->slabFreeHead = NULL;
+}
 
 /*
@@ -2591,8 +2764,7 @@ mergeruns(Tuplesortstate *state)
 	 * Reset tuple memory.  We've freed all the tuples that we previously
 	 * allocated.  We will use the slab allocator from now on.
 	 */
-	MemoryContextDelete(state->tuplecontext);
-	state->tuplecontext = NULL;
+	MemoryContextResetOnly(state->tuplecontext);
 
 	/*
 	 * We no longer need a large memtuples array.  (We will allocate a smaller
@@ -2642,7 +2814,8 @@ mergeruns(Tuplesortstate *state)
 	 * from each input tape.
 	 */
 	state->memtupsize = numInputTapes;
-	state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+	state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
+														numInputTapes * sizeof(SortTuple));
 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 
 	/*
@@ -3138,18 +3311,15 @@ tuplesort_get_stats(Tuplesortstate *state,
 	 * to fix.
Is it worth creating an API for the memory context code to * tell us how much is actually used in sortcontext? */ - if (state->tapeset) - { - stats->spaceType = SORT_SPACE_TYPE_DISK; - stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); - } - else - { - stats->spaceType = SORT_SPACE_TYPE_MEMORY; - stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; - } + tuplesort_updatemax(state); - switch (state->status) + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) { case TSS_SORTEDINMEM: if (state->boundUsed) diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h index 2e9920111f..4af6e0013d 100644 --- a/src/include/executor/execdebug.h +++ b/src/include/executor/execdebug.h @@ -86,10 +86,12 @@ #define SO_nodeDisplay(l) nodeDisplay(l) #define SO_printf(s) printf(s) #define SO1_printf(s, p) printf(s, p) +#define SO2_printf(s, p1, p2) printf(s, p1, p2) #else #define SO_nodeDisplay(l) #define SO_printf(s) #define SO1_printf(s, p) +#define SO2_printf(s, p1, p2) #endif /* EXEC_SORTDEBUG */ /* ---------------- diff --git a/src/include/executor/nodeIncrementalSort.h b/src/include/executor/nodeIncrementalSort.h new file mode 100644 index 0000000000..e62c02a4f3 --- /dev/null +++ b/src/include/executor/nodeIncrementalSort.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.h + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeIncrementalSort.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODEINCREMENTALSORT_H +#define NODEINCREMENTALSORT_H + +#include "access/parallel.h" +#include "nodes/execnodes.h" + +extern IncrementalSortState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags); +extern void ExecEndIncrementalSort(IncrementalSortState *node); +extern void ExecReScanIncrementalSort(IncrementalSortState *node); + +/* parallel instrumentation support */ +extern void ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt); +extern void ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt); +extern void ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pcxt); +extern void ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node); + +#endif /* NODEINCREMENTALSORT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0fb5d61a3f..fb490b404c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1982,6 +1982,21 @@ typedef struct MaterialState Tuplestorestate *tuplestorestate; } MaterialState; + +/* ---------------- + * When performing sorting by multiple keys, it's possible that the input + * dataset is already sorted on a prefix of those keys. We call these + * "presorted keys". + * PresortedKeyData represents information about one such key. 
+ * ---------------- + */ +typedef struct PresortedKeyData +{ + FmgrInfo flinfo; /* comparison function info */ + FunctionCallInfo fcinfo; /* comparison function call info */ + OffsetNumber attno; /* attribute number in tuple */ +} PresortedKeyData; + /* ---------------- * Shared memory container for per-worker sort information * ---------------- @@ -2010,6 +2025,71 @@ typedef struct SortState SharedSortInfo *shared_info; /* one entry per worker */ } SortState; +/* ---------------- + * Instrumentation information for IncrementalSort + * ---------------- + */ +typedef struct IncrementalSortGroupInfo +{ + int64 groupCount; + long maxDiskSpaceUsed; + long totalDiskSpaceUsed; + long maxMemorySpaceUsed; + long totalMemorySpaceUsed; + bits32 sortMethods; /* bitmask of TuplesortMethod */ +} IncrementalSortGroupInfo; + +typedef struct IncrementalSortInfo +{ + IncrementalSortGroupInfo fullsortGroupInfo; + IncrementalSortGroupInfo prefixsortGroupInfo; +} IncrementalSortInfo; + +/* ---------------- + * Shared memory container for per-worker incremental sort information + * ---------------- + */ +typedef struct SharedIncrementalSortInfo +{ + int num_workers; + IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER]; +} SharedIncrementalSortInfo; + +/* ---------------- + * IncrementalSortState information + * ---------------- + */ +typedef enum +{ + INCSORT_LOADFULLSORT, + INCSORT_LOADPREFIXSORT, + INCSORT_READFULLSORT, + INCSORT_READPREFIXSORT, +} IncrementalSortExecutionStatus; + +typedef struct IncrementalSortState +{ + ScanState ss; /* its first field is NodeTag */ + bool bounded; /* is the result set bounded? */ + int64 bound; /* if bounded, how many tuples are needed */ + bool outerNodeDone; /* finished fetching tuples from outer node */ + int64 bound_Done; /* value of bound we did the sort with */ + IncrementalSortExecutionStatus execution_status; + int64 n_fullsort_remaining; + Tuplesortstate *fullsort_state; /* private state of tuplesort.c */ + Tuplesortstate *prefixsort_state; /* private state of tuplesort.c */ + /* the keys by which the input path is already sorted */ + PresortedKeyData *presorted_keys; + + IncrementalSortInfo incsort_info; + + /* slot for pivot tuple defining values of presorted keys within group */ + TupleTableSlot *group_pivot; + TupleTableSlot *transfer_tuple; + bool am_worker; /* are we a worker? 
*/ + SharedIncrementalSortInfo *shared_info; /* one entry per worker */ +} IncrementalSortState; + /* --------------------- * GroupState information * --------------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 8a76afe8cc..50b1ba5186 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -74,6 +74,7 @@ typedef enum NodeTag T_HashJoin, T_Material, T_Sort, + T_IncrementalSort, T_Group, T_Agg, T_WindowAgg, @@ -130,6 +131,7 @@ typedef enum NodeTag T_HashJoinState, T_MaterialState, T_SortState, + T_IncrementalSortState, T_GroupState, T_AggState, T_WindowAggState, @@ -245,6 +247,7 @@ typedef enum NodeTag T_ProjectionPath, T_ProjectSetPath, T_SortPath, + T_IncrementalSortPath, T_GroupPath, T_UpperUniquePath, T_AggPath, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 469c686e3f..a6d206b25a 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1638,6 +1638,15 @@ typedef struct SortPath Path *subpath; /* path representing input source */ } SortPath; +/* + * IncrementalSortPath + */ +typedef struct IncrementalSortPath +{ + SortPath spath; + int nPresortedCols; /* number of presorted columns */ +} IncrementalSortPath; + /* * GroupPath represents grouping (of presorted input) * diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4869fe7b6d..be8ef54a1e 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -774,6 +774,16 @@ typedef struct Sort bool *nullsFirst; /* NULLS FIRST/LAST directions */ } Sort; +/* ---------------- + * incremental sort node + * ---------------- + */ +typedef struct IncrementalSort +{ + Sort sort; + int nPresortedCols; /* number of presorted columns */ +} IncrementalSort; + /* --------------- * group node - * Used for queries with GROUP BY (but no aggregates) specified. 
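
The IncrementalSort plan node above relies on the same struct-embedding convention used throughout the planner: because Sort is the first member of IncrementalSort, a pointer to the derived node can be handled by any code that expects a Sort (this is how CopySortFields, _outSortInfo, and ReadCommonSort are shared). A minimal standalone illustration, with trimmed hypothetical types rather than the real node structs:

#include <stdio.h>

/* Trimmed stand-ins for the node structs above. */
typedef struct Sort
{
	int			numCols;		/* number of sort-key columns */
} Sort;

typedef struct IncrementalSort
{
	Sort		sort;			/* embedded "superclass"; must be first */
	int			nPresortedCols; /* number of presorted columns */
} IncrementalSort;

/* Works for both node types, analogous to the shared copy/read routines. */
static int
sort_num_cols(const Sort *node)
{
	return node->numCols;
}

int
main(void)
{
	IncrementalSort incsort = {{2}, 1};

	/* &incsort.sort and (Sort *) &incsort point at the same prefix */
	printf("numCols=%d presorted=%d\n",
		   sort_num_cols(&incsort.sort), incsort.nPresortedCols);
	return 0;
}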
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 735ba09650..9710e5c0a4 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -53,6 +53,7 @@ extern PGDLLIMPORT bool enable_indexonlyscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
+extern PGDLLIMPORT bool enable_incrementalsort;
 extern PGDLLIMPORT bool enable_hashagg;
 extern PGDLLIMPORT bool enable_hashagg_disk;
 extern PGDLLIMPORT bool enable_groupingsets_hash_disk;
@@ -103,6 +104,11 @@ extern void cost_sort(Path *path, PlannerInfo *root,
 					  List *pathkeys, Cost input_cost, double tuples, int width,
 					  Cost comparison_cost, int sort_mem,
 					  double limit_tuples);
+extern void cost_incremental_sort(Path *path,
+								  PlannerInfo *root, List *pathkeys, int presorted_keys,
+								  Cost input_startup_cost, Cost input_total_cost,
+								  double input_tuples, int width, Cost comparison_cost, int sort_mem,
+								  double limit_tuples);
 extern void cost_append(AppendPath *path);
 extern void cost_merge_append(Path *path, PlannerInfo *root,
 							  List *pathkeys, int n_streams,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index e450fe112a..bcd08af753 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -184,6 +184,12 @@ extern ProjectSetPath *create_set_projection_path(PlannerInfo *root,
 												  RelOptInfo *rel,
 												  Path *subpath,
 												  PathTarget *target);
+extern SortPath *create_incremental_sort_path(PlannerInfo *root,
+											  RelOptInfo *rel,
+											  Path *subpath,
+											  List *pathkeys,
+											  int presorted_keys,
+											  double limit_tuples);
 extern SortPath *create_sort_path(PlannerInfo *root,
 								  RelOptInfo *rel,
 								  Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index c689fe8e26..ab61f306cb 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -185,6 +185,7 @@ typedef enum
 extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
 extern bool pathkeys_contained_in(List *keys1, List *keys2);
+extern bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 											Relids required_outer,
 											CostSelector cost_criterion,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index a2fdd3fcd3..8d00a9e501 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -61,14 +61,17 @@ typedef struct SortCoordinateData *SortCoordinate;
  * Data structures for reporting sort statistics.  Note that
  * TuplesortInstrumentation can't contain any pointers because we
  * sometimes put it in shared memory.
+ *
+ * TuplesortMethod is used in a bitmask in Incremental Sort's shared memory
+ * instrumentation so needs to have each value be a separate bit.
*/ typedef enum { - SORT_TYPE_STILL_IN_PROGRESS = 0, - SORT_TYPE_TOP_N_HEAPSORT, - SORT_TYPE_QUICKSORT, - SORT_TYPE_EXTERNAL_SORT, - SORT_TYPE_EXTERNAL_MERGE + SORT_TYPE_STILL_IN_PROGRESS = 1 << 0, + SORT_TYPE_TOP_N_HEAPSORT = 1 << 1, + SORT_TYPE_QUICKSORT = 1 << 2, + SORT_TYPE_EXTERNAL_SORT = 1 << 3, + SORT_TYPE_EXTERNAL_MERGE = 1 << 4 } TuplesortMethod; typedef enum @@ -215,6 +218,7 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, bool randomAccess); extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); +extern bool tuplesort_used_bound(Tuplesortstate *state); extern void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot); @@ -239,6 +243,8 @@ extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, extern void tuplesort_end(Tuplesortstate *state); +extern void tuplesort_reset(Tuplesortstate *state); + extern void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats); extern const char *tuplesort_method_name(TuplesortMethod m); diff --git a/src/test/isolation/expected/drop-index-concurrently-1.out b/src/test/isolation/expected/drop-index-concurrently-1.out index 75dff56bc4..8e6adb66bb 100644 --- a/src/test/isolation/expected/drop-index-concurrently-1.out +++ b/src/test/isolation/expected/drop-index-concurrently-1.out @@ -21,7 +21,7 @@ QUERY PLAN Sort Sort Key: id, data - -> Seq Scan on test_dc + -> Index Scan using test_dc_pkey on test_dc Filter: ((data)::text = '34'::text) step select2: SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; id data diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out new file mode 100644 index 0000000000..f130c606c8 --- /dev/null +++ b/src/test/regress/expected/incremental_sort.out @@ -0,0 +1,1441 @@ +-- When we have to sort the entire table, incremental sort will +-- be slower than plain sort, so it should not be used. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + QUERY PLAN +----------------------------------- + Sort + Sort Key: tenk1.four, tenk1.ten + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(5 rows) + +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + QUERY PLAN +----------------------------------------- + Limit + -> Incremental Sort + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(7 rows) + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. 
+set work_mem to '2MB';
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten;
+            QUERY PLAN
+-----------------------------------
+ Incremental Sort
+   Sort Key: tenk1.four, tenk1.ten
+   Presorted Key: tenk1.four
+   ->  Sort
+         Sort Key: tenk1.four
+         ->  Seq Scan on tenk1
+(6 rows)
+
+reset work_mem;
+create table t(a integer, b integer);
+create or replace function explain_analyze_without_memory(query text)
+returns table (out_line text) language plpgsql
+as
+$$
+declare
+  line text;
+begin
+  for line in
+    execute 'explain (analyze, costs off, summary off, timing off) ' || query
+  loop
+    out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g');
+    return next;
+  end loop;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+  elements jsonb;
+  element jsonb;
+  matching_nodes jsonb := '[]'::jsonb;
+begin
+  execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements;
+  while jsonb_array_length(elements) > 0 loop
+    element := elements->0;
+    elements := elements - 0;
+    case jsonb_typeof(element)
+    when 'array' then
+      if jsonb_array_length(element) > 0 then
+        elements := elements || element;
+      end if;
+    when 'object' then
+      if element ? 'Plan' then
+        elements := elements || jsonb_build_array(element->'Plan');
+        element := element - 'Plan';
+      else
+        if element ? 'Plans' then
+          elements := elements || jsonb_build_array(element->'Plans');
+          element := element - 'Plans';
+        end if;
+        if (element->>'Node Type')::text = 'Incremental Sort' then
+          matching_nodes := matching_nodes || element;
+        end if;
+      end if;
+    end case;
+  end loop;
+  return matching_nodes;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+  nodes jsonb := '[]'::jsonb;
+  node jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
+        node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false);
+      end loop;
+    end loop;
+    nodes := nodes || node;
+  end loop;
+  return nodes;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
+returns bool language plpgsql
+as
+$$
+declare
+  node jsonb;
+  group_stats jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      group_stats := node->group_key;
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
+          raise exception '% has invalid max space < average space', group_key;
+        end if;
+      end loop;
+    end loop;
+  end loop;
+  return true;
+end;
+$$;
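The helper functions above only normalize EXPLAIN output (memory figures vary across runs); the behavior the following tests exercise is the executor's group-at-a-time sorting. Conceptually: input arrives already ordered on the presorted key, the node buffers one run of equal prefix values, sorts just that run on the remaining keys, and emits it before reading more input. A standalone C sketch of that idea; it is illustrative only, since the real executor node also batches very small groups together and reuses its sort state (see tuplesort_reset above):

#include <stdio.h>
#include <stdlib.h>

typedef struct Row { int a; int b; } Row;

/* Comparator for the non-presorted key "b". */
static int
cmp_b(const void *x, const void *y)
{
	return ((const Row *) x)->b - ((const Row *) y)->b;
}

int
main(void)
{
	/* Input is already sorted on the presorted key "a", but not on "b". */
	Row in[] = {{1, 3}, {1, 1}, {1, 2}, {2, 9}, {2, 7}, {3, 5}};
	int n = (int) (sizeof in / sizeof in[0]);
	int start = 0;

	for (int i = 1; i <= n; i++)
	{
		/* A change in the prefix key closes the current group. */
		if (i == n || in[i].a != in[start].a)
		{
			/* Sort only this group on the remaining key "b"... */
			qsort(in + start, (size_t) (i - start), sizeof(Row), cmp_b);
			/* ...and emit it before reading any further input. */
			for (int j = start; j < i; j++)
				printf("%d | %d\n", in[j].a, in[j].b);
			start = i;
		}
	}
	return 0;
}

+-- A single large group tested around each mode transition point.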
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 
| 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 + 1 | 66 +(66 rows) + +delete from t; +-- An initial large group followed by a small group. +insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 55; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 2 | 50 + 2 | 51 + 2 | 52 + 2 | 53 + 2 | 54 + 2 | 55 +(55 rows) + +-- Test EXPLAIN ANALYZE with only a fullsort group. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_without_memory +------------------------------------------------------------------------------------------------ + Limit (actual rows=55 loops=1) + -> Incremental Sort (actual rows=55 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 2 Sort Methods: top-N heapsort, quicksort Memory: avg=NNkB peak=NNkB + -> Sort (actual rows=100 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=100 loops=1) +(9 rows) + +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); + jsonb_pretty +-------------------------------------------------- + [ + + { + + "Sort Key": [ + + "t.a", + + "t.b" + + ], + + "Node Type": "Incremental Sort", + + "Actual Rows": 55, + + "Actual Loops": 1, + + "Presorted Key": [ + + "t.a" + + ], + + "Parallel Aware": false, + + "Full-sort Groups": { + + "Group Count": 2, + + "Sort Methods Used": [ + + "top-N heapsort", + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Parent Relationship": "Outer" + + } + + ] +(1 row) + +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_inc_sort_nodes_verify_invariants +-------------------------------------------------- + t +(1 row) + +delete from t; +-- An initial small group followed by a large group. 
+insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 70; + a | b +---+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 9 | 5 + 9 | 6 + 9 | 7 + 9 | 8 + 9 | 9 + 9 | 10 + 9 | 11 + 9 | 12 + 9 | 13 + 9 | 14 + 9 | 15 + 9 | 16 + 9 | 17 + 9 | 18 + 9 | 19 + 9 | 20 + 9 | 21 + 9 | 22 + 9 | 23 + 9 | 24 + 9 | 25 + 9 | 26 + 9 | 27 + 9 | 28 + 9 | 29 + 9 | 30 + 9 | 31 + 9 | 32 + 9 | 33 + 9 | 34 + 9 | 35 + 9 | 36 + 9 | 37 + 9 | 38 + 9 | 39 + 9 | 40 + 9 | 41 + 9 | 42 + 9 | 43 + 9 | 44 + 9 | 45 + 9 | 46 + 9 | 47 + 9 | 48 + 9 | 49 + 9 | 50 + 9 | 51 + 9 | 52 + 9 | 53 + 9 | 54 + 9 | 55 + 9 | 56 + 9 | 57 + 9 | 58 + 9 | 59 + 9 | 60 + 9 | 61 + 9 | 62 + 9 | 63 + 9 | 64 + 9 | 65 + 9 | 66 + 9 | 67 + 9 | 68 + 9 | 69 + 9 | 70 +(70 rows) + +-- Test rescan. +begin; +-- We force the planner to choose a plan with incremental sort on the right side +-- of a nested loop join node. That way we trigger the rescan code path. +set local enable_hashjoin = off; +set local enable_mergejoin = off; +set local enable_material = off; +set local enable_sort = off; +explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); + QUERY PLAN +------------------------------------------------ + Nested Loop Left Join + Join Filter: (t_1.a = t.a) + -> Seq Scan on t + Filter: (a = ANY ('{1,2}'::integer[])) + -> Incremental Sort + Sort Key: t_1.a, t_1.b + Presorted Key: t_1.a + -> Sort + Sort Key: t_1.a + -> Seq Scan on t t_1 +(10 rows) + +select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); + a | b | a | b +---+---+---+--- + 1 | 1 | 1 | 1 + 2 | 2 | 2 | 2 +(2 rows) + +rollback; +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. 
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_without_memory +----------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (actual rows=70 loops=1) + -> Incremental Sort (actual rows=70 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 1 Sort Method: quicksort Memory: avg=NNkB peak=NNkB Presorted Groups: 5 Sort Method: quicksort Memory: avg=NNkB peak=NNkB + -> Sort (actual rows=100 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=100 loops=1) +(9 rows) + +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); + jsonb_pretty +-------------------------------------------------- + [ + + { + + "Sort Key": [ + + "t.a", + + "t.b" + + ], + + "Node Type": "Incremental Sort", + + "Actual Rows": 70, + + "Actual Loops": 1, + + "Presorted Key": [ + + "t.a" + + ], + + "Parallel Aware": false, + + "Full-sort Groups": { + + "Group Count": 1, + + "Sort Methods Used": [ + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Presorted Groups": { + + "Group Count": 5, + + "Sort Methods Used": [ + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Parent Relationship": "Outer" + + } + + ] +(1 row) + +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_inc_sort_nodes_verify_invariants +-------------------------------------------------- + t +(1 row) + +delete from t; +-- Small groups of 10 tuples each tested around each mode transition point. 
+insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 
+ 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 + 6 | 66 +(66 rows) + +delete from t; +-- Small groups of only 1 tuple each tested around each mode transition point. +insert into t(a, b) select i, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 +(65 rows) + +explain (costs off) 
select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 + 66 | 66 +(66 rows) + +delete from t; +drop table t; +-- Incremental sort vs. parallel queries +set min_parallel_table_scan_size = '1kB'; +set min_parallel_index_scan_size = '1kB'; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +create table t (a int, b int, c int); +insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i); +create index on t (a); +analyze t; +set enable_incrementalsort = off; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + QUERY PLAN +------------------------------------------------------ + Limit + -> Sort + Sort Key: a, b, (sum(c)) + -> Finalize HashAggregate + Group Key: a, b + -> Gather + Workers Planned: 2 + -> Partial HashAggregate + Group Key: a, b + -> Parallel Seq Scan on t +(10 rows) + +set enable_incrementalsort = on; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + QUERY PLAN +------------------------------------------------------ + Limit + -> Sort + Sort Key: a, b, (sum(c)) + -> Finalize HashAggregate + Group Key: a, b + -> Gather + Workers Planned: 2 + -> Partial HashAggregate + Group Key: a, b + -> Parallel Seq Scan on t +(10 rows) + +drop table t; diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out index fb4b342261..c36970575f 100644 --- a/src/test/regress/expected/partition_aggregate.out +++ b/src/test/regress/expected/partition_aggregate.out @@ -11,6 +11,8 @@ SET enable_partitionwise_aggregate TO true; SET enable_partitionwise_join TO true; -- Disable parallel plans. SET max_parallel_workers_per_gather TO 0; +-- Disable incremental sort, which can influence selected plans due to fuzz factor. +SET enable_incrementalsort TO off; -- -- Tests for list partitioned tables. -- diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 715842b87a..a126f0ad61 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashagg | on enable_hashagg_disk | on enable_hashjoin | on + enable_incrementalsort | on enable_indexonlyscan | on enable_indexscan | on enable_material | on @@ -91,7 +92,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(19 rows) +(20 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index a98dba7b2f..a741e89616 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 incremental_sort # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 3f66e0b859..1a6821ca46 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -89,6 +89,7 @@ test: select_distinct_on test: select_implicit test: select_having test: subselect +test: incremental_sort test: union test: case test: join diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql new file mode 100644 index 0000000000..3b359efa29 --- /dev/null +++ b/src/test/regress/sql/incremental_sort.sql @@ -0,0 +1,213 @@ +-- When we have to sort the entire table, incremental sort will +-- be slower than plain sort, so it should not be used. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. +set work_mem to '2MB'; +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; +reset work_mem; + +create table t(a integer, b integer); + +create or replace function explain_analyze_without_memory(query text) +returns table (out_line text) language plpgsql +as +$$ +declare + line text; +begin + for line in + execute 'explain (analyze, costs off, summary off, timing off) ' || query + loop + out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g'); + return next; + end loop; +end; +$$; + +create or replace function explain_analyze_inc_sort_nodes(query text) +returns jsonb language plpgsql +as +$$ +declare + elements jsonb; + element jsonb; + matching_nodes jsonb := '[]'::jsonb; +begin + execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements; + while jsonb_array_length(elements) > 0 loop + element := elements->0; + elements := elements - 0; + case jsonb_typeof(element) + when 'array' then + if jsonb_array_length(element) > 0 then + elements := elements || element; + end if; + when 'object' then + if element ? 'Plan' then + elements := elements || jsonb_build_array(element->'Plan'); + element := element - 'Plan'; + else + if element ? 
'Plans' then
+          elements := elements || jsonb_build_array(element->'Plans');
+          element := element - 'Plans';
+        end if;
+        if (element->>'Node Type')::text = 'Incremental Sort' then
+          matching_nodes := matching_nodes || element;
+        end if;
+      end if;
+    end case;
+  end loop;
+  return matching_nodes;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+  nodes jsonb := '[]'::jsonb;
+  node jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
+        node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false);
+      end loop;
+    end loop;
+    nodes := nodes || node;
+  end loop;
+  return nodes;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
+returns bool language plpgsql
+as
+$$
+declare
+  node jsonb;
+  group_stats jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      group_stats := node->group_key;
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
+          raise exception '% has invalid max space < average space', group_key;
+        end if;
+      end loop;
+    end loop;
+  end loop;
+  return true;
+end;
+$$;
+
+-- A single large group tested around each mode transition point.
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+select * from (select * from t order by a) s order by a, b limit 31;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+select * from (select * from t order by a) s order by a, b limit 32;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+select * from (select * from t order by a) s order by a, b limit 33;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+select * from (select * from t order by a) s order by a, b limit 65;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+select * from (select * from t order by a) s order by a, b limit 66;
+delete from t;
+
+-- An initial large group followed by a small group.
+insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
+select * from (select * from t order by a) s order by a, b limit 55;
+-- Test EXPLAIN ANALYZE with only a fullsort group.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); +delete from t; + +-- An initial small group followed by a large group. +insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; +select * from (select * from t order by a) s order by a, b limit 70; +-- Test rescan. +begin; +-- We force the planner to choose a plan with incremental sort on the right side +-- of a nested loop join node. That way we trigger the rescan code path. +set local enable_hashjoin = off; +set local enable_mergejoin = off; +set local enable_material = off; +set local enable_sort = off; +explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); +select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); +rollback; +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); +delete from t; + +-- Small groups of 10 tuples each tested around each mode transition point. +insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +-- Small groups of only 1 tuple each tested around each mode transition point. 
+insert into t(a, b) select i, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +drop table t; + +-- Incremental sort vs. parallel queries +set min_parallel_table_scan_size = '1kB'; +set min_parallel_index_scan_size = '1kB'; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; + +create table t (a int, b int, c int); +insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i); +create index on t (a); +analyze t; + +set enable_incrementalsort = off; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + +set enable_incrementalsort = on; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + +drop table t; diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql index ba4fed4d43..178f2079fa 100644 --- a/src/test/regress/sql/partition_aggregate.sql +++ b/src/test/regress/sql/partition_aggregate.sql @@ -12,6 +12,8 @@ SET enable_partitionwise_aggregate TO true; SET enable_partitionwise_join TO true; -- Disable parallel plans. SET max_parallel_workers_per_gather TO 0; +-- Disable incremental sort, which can influence selected plans due to fuzz factor. +SET enable_incrementalsort TO off; -- -- Tests for list partitioned tables.
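A final note on the TuplesortMethod change earlier in the patch: making each enum value a distinct bit lets the many per-group sorts inside one Incremental Sort node OR their methods into a single integer, which is how EXPLAIN can later print lines such as "Sort Methods: top-N heapsort, quicksort" in the expected output above. A standalone C sketch of that accumulation; the enum values mirror the patch, while main() and the reporting loop are illustrative:

#include <stdio.h>

/* Distinct bits, as in the patched TuplesortMethod enum. */
typedef enum
{
	SORT_TYPE_TOP_N_HEAPSORT = 1 << 1,
	SORT_TYPE_QUICKSORT = 1 << 2,
	SORT_TYPE_EXTERNAL_SORT = 1 << 3
} SortMethod;

int
main(void)
{
	unsigned int used = 0;

	/* Each per-group sort ORs its method into the shared bitmask... */
	used |= SORT_TYPE_TOP_N_HEAPSORT;
	used |= SORT_TYPE_QUICKSORT;
	used |= SORT_TYPE_QUICKSORT;	/* duplicates are absorbed */

	/* ...so the node can report every method that occurred at least once. */
	if (used & SORT_TYPE_TOP_N_HEAPSORT)
		puts("top-N heapsort");
	if (used & SORT_TYPE_QUICKSORT)
		puts("quicksort");
	if (used & SORT_TYPE_EXTERNAL_SORT)
		puts("external sort");
	return 0;
}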