diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index fdeda90f03..2f04702b6e 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1472,6 +1472,45 @@ include_dir 'conf.d'
+
+ replacement_sort_tuples (integer)
+
+ replacement_sort_tuples configuration parameter
+
+
+
+
+ When the number of tuples to be sorted is smaller than this number,
+ a sort will produce its first output run using replacement selection
+ rather than quicksort. This may be useful in memory-constrained
+ environments where tuples that are input into larger sort operations
+ have a strong physical-to-logical correlation. Note that this does
+ not include input tuples with an inverse
+ correlation. It is possible for the replacement selection algorithm
+ to generate one long run that requires no merging, where use of the
+ default strategy would result in many runs that must be merged
+ to produce a final sorted output. This may allow sort
+ operations to complete sooner.
+
+
+ The default is 150,000 tuples. Note that higher values are typically
+ not much more effective, and may be counter-productive, since the
+ priority queue is sensitive to the size of available CPU cache, whereas
+ the default strategy sorts runs using a cache
+ oblivious algorithm. This property allows the default sort
+ strategy to automatically and transparently make effective use
+ of available CPU cache.
+
+
+ Setting maintenance_work_mem to its default
+ value usually prevents utility command external sorts (e.g.,
+ sorts used by CREATE INDEX to build B-Tree
+ indexes) from ever using replacement selection sort, unless the
+ input tuples are quite wide.
+
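+
+ For example, replacement selection can be disabled outright for a
+ session by setting this parameter to its minimum of zero (a reasonable
+ experiment when the input is known to be uncorrelated):
+
+SET replacement_sort_tuples = 0;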
+
+
+
autovacuum_work_mem (integer)
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 8e4caeba0a..4917922ab5 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1432,8 +1432,8 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
* total, but we will also need to write and read each tuple once per
* merge pass. We expect about ceil(logM(r)) merge passes where r is the
* number of initial runs formed and M is the merge order used by tuplesort.c.
- * Since the average initial run should be about twice sort_mem, we have
- * disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))
+ * Since the average initial run should be about sort_mem, we have
+ * disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
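+ * (Illustrative example: with input of 100 times sort_mem, about 100
+ * initial runs form; if the merge order M is at least 100, then
+ * ceil(log100(100)) = 1 merge pass, for disk traffic of about 2 * relsize.)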
* cpu = comparison_cost * t * log2(t)
*
* If the sort is bounded (i.e., only the first k result tuples are needed)
@@ -1509,7 +1509,7 @@ cost_sort(Path *path, PlannerInfo *root,
* We'll have to use a disk-based sort of all the tuples
*/
double npages = ceil(input_bytes / BLCKSZ);
- double nruns = (input_bytes / sort_mem_bytes) * 0.5;
+ double nruns = input_bytes / sort_mem_bytes;
double mergeorder = tuplesort_merge_order(sort_mem_bytes);
double log_runs;
double npageaccesses;
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 597dab4301..5a7783b981 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -109,6 +109,7 @@ bool enableFsync = true;
bool allowSystemTableMods = false;
int work_mem = 1024;
int maintenance_work_mem = 16384;
+int replacement_sort_tuples = 150000;
/*
* Primary determinants of sizes of shared-memory structures.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a07050017e..f7ed167d7f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1928,6 +1928,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"replacement_sort_tuples", PGC_USERSET, RESOURCES_MEM,
+ gettext_noop("Sets the maximum number of tuples to be sorted using replacement selection."),
+ gettext_noop("When more tuples than this are present, quicksort will be used.")
+ },
+ &replacement_sort_tuples,
+ 150000, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
+
/*
* We use the hopefully-safely-small value of 100kB as the compiled-in
* default for max_stack_depth. InitializeGUCOptions will increase it if
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 8da3ff14c6..bcc86e29d2 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -125,6 +125,7 @@
# actively intend to use prepared transactions.
#work_mem = 4MB # min 64kB
#maintenance_work_mem = 64MB # min 1MB
+#replacement_sort_tuples = 150000 # limits use of replacement selection sort
#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem
#max_stack_depth = 2MB # min 100kB
#dynamic_shared_memory_type = posix # the default is the first option
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index d033c95e78..78c5cfc8af 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -11,14 +11,16 @@
* algorithm.
*
* See Knuth, volume 3, for more than you want to know about the external
- * sorting algorithm. We divide the input into sorted runs using replacement
- * selection, in the form of a priority tree implemented as a heap
- * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
- * merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D
- * are implemented by logtape.c, which avoids space wastage by recycling
- * disk space as soon as each block is read from its "tape".
+ * sorting algorithm. Historically, we divided the input into sorted runs
+ * using replacement selection, in the form of a priority tree implemented
+ * as a heap (essentially his Algorithm 5.2.3H -- although that strategy is
+ * often avoided altogether), but that can now only happen for the first
+ * run. We merge the runs using polyphase merge, Knuth's Algorithm
+ * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by
+ * logtape.c, which avoids space wastage by recycling disk space as soon
+ * as each block is read from its "tape".
*
- * We do not form the initial runs using Knuth's recommended replacement
+ * We never form the initial runs using Knuth's recommended replacement
* selection data structure (Algorithm 5.4.1R), because it uses a fixed
* number of records in memory at all times. Since we are dealing with
* tuples that may vary considerably in size, we want to be able to vary
@@ -28,7 +30,30 @@
* Algorithm 5.4.1R, each record is stored with the run number that it
* must go into, and we use (run number, key) as the ordering key for the
* heap. When the run number at the top of the heap changes, we know that
- * no more records of the prior run are left in the heap.
+ * no more records of the prior run are left in the heap. Note that there
+ * are in practice only ever two distinct run numbers, due to the greatly
+ * reduced use of replacement selection in PostgreSQL 9.6.
+ *
+ * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small
+ * customizations) is only used with the aim of producing just one run,
+ * thereby avoiding all merging. Only the first run can use replacement
+ * selection, which is why there are now only two possible valid run
+ * numbers, and why heapification is customized to not distinguish between
+ * tuples in the second run (those will be quicksorted). We generally
+ * prefer a simple hybrid sort-merge strategy, where runs are sorted in much
+ * the same way as the entire input of an internal sort is sorted (using
+ * qsort()). The replacement_sort_tuples GUC controls the limited remaining
+ * use of replacement selection for the first run.
+ *
+ * There are several reasons to favor a hybrid sort-merge strategy.
+ * Maintaining a priority tree/heap has poor CPU cache characteristics.
+ * Furthermore, the growth in main memory sizes has greatly diminished the
+ * value of having runs that are larger than available memory, even in the
+ * case where there is partially sorted input and runs can be made far
+ * larger by using a heap. In most cases, a single-pass merge step is all
+ * that is required even when runs are no larger than available memory.
+ * Avoiding multiple merge passes was traditionally considered to be the
+ * major advantage of using replacement selection.
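+ *
+ * (As an illustration: sorting 1GB of input with 64MB of workMem now
+ * produces roughly 16 quicksorted runs, comfortably within the merge
+ * order of a single final merge pass.)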
*
* The approximate amount of memory allowed for any one sort operation
* is specified in kilobytes by the caller (most pass work_mem). Initially,
@@ -36,13 +61,12 @@
* we haven't exceeded workMem. If we reach the end of the input without
* exceeding workMem, we sort the array using qsort() and subsequently return
* tuples just by scanning the tuple array sequentially. If we do exceed
- * workMem, we construct a heap using Algorithm H and begin to emit tuples
- * into sorted runs in temporary tapes, emitting just enough tuples at each
- * step to get back within the workMem limit. Whenever the run number at
- * the top of the heap changes, we begin a new run with a new output tape
- * (selected per Algorithm D). After the end of the input is reached,
- * we dump out remaining tuples in memory into a final run (or two),
- * then merge the runs using Algorithm D.
+ * workMem, we begin to emit tuples into sorted runs in temporary tapes.
+ * When tuples are dumped in batch after quicksorting, we begin a new run
+ * with a new output tape (selected per Algorithm D). After the end of the
+ * input is reached, we dump out remaining tuples in memory into a final run
+ * (or two, when replacement selection is still used), then merge the runs
+ * using Algorithm D.
*
* When merging runs, we use a heap containing just the frontmost tuple from
* each source run; we repeatedly output the smallest tuple and insert the
@@ -162,15 +186,18 @@ bool optimize_bounded_sort = true;
* described above. Accordingly, "tuple" is always used in preference to
* datum1 as the authoritative value for pass-by-reference cases.
*
- * While building initial runs, tupindex holds the tuple's run number. During
- * merge passes, we re-use it to hold the input tape number that each tuple in
- * the heap was read from, or to hold the index of the next tuple pre-read
- * from the same tape in the case of pre-read entries. tupindex goes unused
- * if the sort occurs entirely in memory.
+ * While building initial runs, tupindex holds the tuple's run number.
+ * Historically, the run number could meaningfully distinguish many runs, but
+ * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
+ * selection is always abandoned after the first run; no other run number
+ * should be represented here. During merge passes, we re-use it to hold the
+ * input tape number that each tuple in the heap was read from, or to hold the
+ * index of the next tuple pre-read from the same tape in the case of pre-read
+ * entries. tupindex goes unused if the sort occurs entirely in memory.
*/
typedef struct
{
- void *tuple; /* the tuple proper */
+ void *tuple; /* the tuple itself */
Datum datum1; /* value of first key column */
bool isnull1; /* is first key column NULL? */
int tupindex; /* see notes above */
@@ -206,6 +233,15 @@ typedef enum
#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)
#define MERGE_BUFFER_SIZE (BLCKSZ * 32)
+ /*
+ * Run numbers, used during external sort operations.
+ *
+ * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun.
+ */
+#define RUN_FIRST 0
+#define HEAP_RUN_NEXT INT_MAX
+#define RUN_SECOND 1
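+
+/*
+ * HEAP_RUN_NEXT is INT_MAX so that, in checkIndex comparisons, tuples
+ * destined for the next run always compare after every RUN_FIRST tuple
+ * in the heap (see HEAPCOMPARE()).
+ */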
+
typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
@@ -292,9 +328,17 @@ struct Tuplesortstate
*/
bool batchUsed;
+ /*
+ * While building initial runs, this indicates if the replacement
+ * selection strategy is in use. When it isn't, then a simple hybrid
+ * sort-merge strategy is in use instead (runs are quicksorted).
+ */
+ bool replaceActive;
+
/*
* While building initial runs, this is the current output run number
- * (starting at 0). Afterwards, it is the number of initial runs we made.
+ * (starting at RUN_FIRST). Afterwards, it is the number of initial
+ * runs we made.
*/
int currentRun;
@@ -493,6 +537,7 @@ struct Tuplesortstate
static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
static bool consider_abort_common(Tuplesortstate *state);
+static bool useselection(Tuplesortstate *state);
static void inittapes(Tuplesortstate *state);
static void selectnewtape(Tuplesortstate *state);
static void mergeruns(Tuplesortstate *state);
@@ -508,8 +553,10 @@ static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen);
static void mergepreread(Tuplesortstate *state);
static void mergeprereadone(Tuplesortstate *state, int srcTape);
static void dumptuples(Tuplesortstate *state, bool alltuples);
+static void dumpbatch(Tuplesortstate *state, bool alltuples);
static void make_bounded_heap(Tuplesortstate *state);
static void sort_bounded_heap(Tuplesortstate *state);
+static void tuplesort_sort_memtuples(Tuplesortstate *state);
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
int tupleindex, bool checkIndex);
static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
@@ -654,7 +701,7 @@ tuplesort_begin_common(int workMem, bool randomAccess)
if (LACKMEM(state))
elog(ERROR, "insufficient memory allowed for sort");
- state->currentRun = 0;
+ state->currentRun = RUN_FIRST;
/*
* maxTapes, tapeRange, and Algorithm D variables will be initialized by
@@ -1566,22 +1613,61 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
/*
* Insert the tuple into the heap, with run number currentRun if
- * it can go into the current run, else run number currentRun+1.
- * The tuple can go into the current run if it is >= the first
+ * it can go into the current run, else HEAP_RUN_NEXT. The tuple
+ * can go into the current run if it is >= the first
* not-yet-output tuple. (Actually, it could go into the current
* run if it is >= the most recently output tuple ... but that
* would require keeping around the tuple we last output, and it's
* simplest to let writetup free each tuple as soon as it's
* written.)
*
- * Note there will always be at least one tuple in the heap at
- * this point; see dumptuples.
+ * Note that this only applies when:
+ *
+ * - currentRun is RUN_FIRST
+ *
+ * - Replacement selection is in use (typically it is never used).
+ *
+ * When these two conditions are not both true, all tuples are
+ * appended indifferently, much like the TSS_INITIAL case.
+ *
+ * There should always be room to store the incoming tuple.
*/
- Assert(state->memtupcount > 0);
- if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
+ Assert(!state->replaceActive || state->memtupcount > 0);
+ if (state->replaceActive &&
+ COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
+ {
+ Assert(state->currentRun == RUN_FIRST);
+
+ /*
+ * Insert tuple into first, fully heapified run.
+ *
+ * Unlike classic replacement selection, which this module was
+ * previously based on, only RUN_FIRST tuples are fully
+ * heapified. Any second/next run tuples are appended
+ * indifferently. While HEAP_RUN_NEXT tuples may be sifted
+ * out of the way of first run tuples, COMPARETUP() will never
+ * be called for the run's tuples during sifting (only our
+ * initial COMPARETUP() call is required for the tuple, to
+ * determine that the tuple does not belong in RUN_FIRST).
+ */
tuplesort_heap_insert(state, tuple, state->currentRun, true);
+ }
else
- tuplesort_heap_insert(state, tuple, state->currentRun + 1, true);
+ {
+ /*
+ * Tuple was determined to not belong to heapified RUN_FIRST,
+ * or replacement selection not in play. Append the tuple to
+ * memtuples indifferently.
+ *
+ * dumptuples() does not trust that the next run's tuples are
+ * heapified. Anything past the first run will always be
+ * quicksorted even when replacement selection is initially
+ * used. (When it's never used, every tuple still takes this
+ * path.)
+ */
+ tuple->tupindex = HEAP_RUN_NEXT;
+ state->memtuples[state->memtupcount++] = *tuple;
+ }
/*
* If we are over the memory limit, dump tuples till we're under.
@@ -1658,18 +1744,7 @@ tuplesort_performsort(Tuplesortstate *state)
* We were able to accumulate all the tuples within the allowed
* amount of memory. Just qsort 'em and we're done.
*/
- if (state->memtupcount > 1)
- {
- /* Can we use the single-key sort function? */
- if (state->onlyKey != NULL)
- qsort_ssup(state->memtuples, state->memtupcount,
- state->onlyKey);
- else
- qsort_tuple(state->memtuples,
- state->memtupcount,
- state->comparetup,
- state);
- }
+ tuplesort_sort_memtuples(state);
state->current = 0;
state->eof_reached = false;
state->markpos_offset = 0;
@@ -2181,6 +2256,28 @@ tuplesort_merge_order(int64 allowedMem)
return mOrder;
}
+/*
+ * useselection - determine algorithm to use to sort the first run.
+ *
+ * It can sometimes be useful to use the replacement selection algorithm if it
+ * results in one large run, and there is little available workMem. See
+ * remarks on RUN_SECOND optimization within dumptuples().
+ */
+static bool
+useselection(Tuplesortstate *state)
+{
+ /*
+ * memtupsize might be noticeably higher than memtupcount here in atypical
+ * cases. It seems slightly preferable to not allow recent outliers to
+ * impact this determination. Note that caller's trace_sort output reports
+ * memtupcount instead.
+ */
+ if (state->memtupsize <= replacement_sort_tuples)
+ return true;
+
+ return false;
+}
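+
+/*
+ * (For scale: with a 24 byte SortTuple on typical 64-bit builds, the
+ * default replacement_sort_tuples of 150000 amounts to only a few MB of
+ * memtuples slots, so sorts given maintenance_work_mem-sized budgets
+ * will usually have grown memtupsize past the threshold and take the
+ * quicksort path.)
+ */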
+
/*
* inittapes - initialize for tape sorting.
*
@@ -2190,7 +2287,6 @@ static void
inittapes(Tuplesortstate *state)
{
int maxTapes,
- ntuples,
j;
int64 tapeSpace;
@@ -2253,25 +2349,42 @@ inittapes(Tuplesortstate *state)
state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
/*
- * Convert the unsorted contents of memtuples[] into a heap. Each tuple is
- * marked as belonging to run number zero.
- *
- * NOTE: we pass false for checkIndex since there's no point in comparing
- * indexes in this step, even though we do intend the indexes to be part
- * of the sort key...
+ * Give replacement selection a try based on user setting. There will
+ * be a switch to a simple hybrid sort-merge strategy after the first
+ * run (iff we could not output one long run).
*/
- ntuples = state->memtupcount;
- state->memtupcount = 0; /* make the heap empty */
- for (j = 0; j < ntuples; j++)
+ state->replaceActive = useselection(state);
+
+ if (state->replaceActive)
{
- /* Must copy source tuple to avoid possible overwrite */
- SortTuple stup = state->memtuples[j];
+ int ntuples = state->memtupcount;
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "replacement selection will sort %d first run tuples",
+ state->memtupcount);
+#endif
+
+ /*
+ * Convert the unsorted contents of memtuples[] into a heap. Each
+ * tuple is marked as belonging to run number zero.
+ *
+ * NOTE: we pass false for checkIndex since there's no point in
+ * comparing indexes in this step, even though we do intend the
+ * indexes to be part of the sort key...
+ */
- tuplesort_heap_insert(state, &stup, 0, false);
+ state->memtupcount = 0; /* make the heap empty */
+
+ for (j = 0; j < ntuples; j++)
+ {
+ /* Must copy source tuple to avoid possible overwrite */
+ SortTuple stup = state->memtuples[j];
+
+ tuplesort_heap_insert(state, &stup, 0, false);
+ }
+ Assert(state->memtupcount == ntuples);
}
- Assert(state->memtupcount == ntuples);
- state->currentRun = 0;
+ state->currentRun = RUN_FIRST;
/*
* Initialize variables of Algorithm D (step D1).
@@ -2362,11 +2475,12 @@ mergeruns(Tuplesortstate *state)
/*
* If we produced only one initial run (quite likely if the total data
- * volume is between 1X and 2X workMem), we can just use that tape as the
- * finished output, rather than doing a useless merge. (This obvious
- * optimization is not in Knuth's algorithm.)
+ * volume is between 1X and 2X workMem when replacement selection is used,
+ * but something we particularly count on when input is presorted), we can
+ * just use that tape as the finished output, rather than doing a useless
+ * merge. (This obvious optimization is not in Knuth's algorithm.)
*/
- if (state->currentRun == 1)
+ if (state->currentRun == RUN_SECOND)
{
state->result_tape = state->tp_tapenum[state->destTape];
/* must freeze and rewind the finished output tape */
@@ -3094,21 +3208,25 @@ mergeprereadone(Tuplesortstate *state, int srcTape)
}
/*
- * dumptuples - remove tuples from heap and write to tape
+ * dumptuples - remove tuples from memtuples and write to tape
*
* This is used during initial-run building, but not during merging.
*
- * When alltuples = false, dump only enough tuples to get under the
- * availMem limit (and leave at least one tuple in the heap in any case,
- * since puttuple assumes it always has a tuple to compare to). We also
- * insist there be at least one free slot in the memtuples[] array.
+ * When alltuples = false and replacement selection is still active, dump
+ * only enough tuples to get under the availMem limit (and leave at least
+ * one tuple in memtuples, since puttuple will then assume it is a heap that
+ * has a tuple to compare to). We always insist there be at least one free
+ * slot in the memtuples[] array.
*
- * When alltuples = true, dump everything currently in memory.
- * (This case is only used at end of input data.)
+ * When alltuples = true, dump everything currently in memory. (This
+ * case is only used at end of input data, although in practice only the
+ * first run could fail to dump all tuples when we LACKMEM(), and only
+ * when replacement selection is active.)
*
- * If we empty the heap, close out the current run and return (this should
- * only happen at end of input data). If we see that the tuple run number
- * at the top of the heap has changed, start a new run.
+ * If, when replacement selection is active, we see that the tuple run
+ * number at the top of the heap has changed, start a new run. This must be
+ * the first run, because replacement selection is always abandoned for all
+ * further runs.
*/
static void
dumptuples(Tuplesortstate *state, bool alltuples)
@@ -3117,46 +3235,183 @@ dumptuples(Tuplesortstate *state, bool alltuples)
(LACKMEM(state) && state->memtupcount > 1) ||
state->memtupcount >= state->memtupsize)
{
- /*
- * Dump the heap's frontmost entry, and sift up to remove it from the
- * heap.
- */
- Assert(state->memtupcount > 0);
- WRITETUP(state, state->tp_tapenum[state->destTape],
- &state->memtuples[0]);
- tuplesort_heap_siftup(state, true);
+ if (state->replaceActive)
+ {
+ /*
+ * Still holding out for a case favorable to replacement selection.
+ * Still incrementally spilling using heap.
+ *
+ * Dump the heap's frontmost entry, and sift up to remove it from
+ * the heap.
+ */
+ Assert(state->memtupcount > 0);
+ WRITETUP(state, state->tp_tapenum[state->destTape],
+ &state->memtuples[0]);
+ tuplesort_heap_siftup(state, true);
+ }
+ else
+ {
+ /*
+ * Once committed to quicksorting runs, never incrementally
+ * spill
+ */
+ dumpbatch(state, alltuples);
+ break;
+ }
/*
- * If the heap is empty *or* top run number has changed, we've
- * finished the current run.
+ * If top run number has changed, we've finished the current run
+ * (this can only be the first run), and will no longer spill
+ * incrementally.
*/
if (state->memtupcount == 0 ||
- state->currentRun != state->memtuples[0].tupindex)
+ state->memtuples[0].tupindex == HEAP_RUN_NEXT)
{
markrunend(state, state->tp_tapenum[state->destTape]);
+ Assert(state->currentRun == RUN_FIRST);
state->currentRun++;
state->tp_runs[state->destTape]++;
state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
#ifdef TRACE_SORT
if (trace_sort)
- elog(LOG, "finished writing%s run %d to tape %d: %s",
- (state->memtupcount == 0) ? " final" : "",
+ elog(LOG, "finished incrementally writing %s run %d to tape %d: %s",
+ (state->memtupcount == 0) ? "only" : "first",
state->currentRun, state->destTape,
pg_rusage_show(&state->ru_start));
#endif
+ /*
+ * Done if heap is empty, which is possible when there is only one
+ * long run.
+ */
+ Assert(state->currentRun == RUN_SECOND);
+ if (state->memtupcount == 0)
+ {
+ /*
+ * Replacement selection best case; no final merge required,
+ * because there was only one initial run (second run has no
+ * tuples). See RUN_SECOND case in mergeruns().
+ */
+ break;
+ }
/*
- * Done if heap is empty, else prepare for new run.
+ * Abandon replacement selection for second run (as well as any
+ * subsequent runs).
*/
- if (state->memtupcount == 0)
- break;
- Assert(state->currentRun == state->memtuples[0].tupindex);
+ state->replaceActive = false;
+
+ /*
+ * First tuple of next run should not be heapified, and so will
+ * bear placeholder run number. In practice this must actually be
+ * the second run, which just became the currentRun, so we're
+ * clear to quicksort and dump the tuples in batch next time
+ * memtuples becomes full.
+ */
+ Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT);
selectnewtape(state);
}
}
}
+/*
+ * dumpbatch - sort and dump all memtuples, forming one run on tape
+ *
+ * Second or subsequent runs are never heapified by this module (although
+ * heapification still respects run number differences between the first and
+ * second runs), and a heap (replacement selection priority queue) is often
+ * avoided in the first place.
+ */
+static void
+dumpbatch(Tuplesortstate *state, bool alltuples)
+{
+ int memtupwrite;
+ int i;
+
+ /*
+ * Final call might require no sorting, in rare cases where we just so
+ * happen to have previously LACKMEM()'d at the point where exactly all
+ * remaining tuples are loaded into memory, just before input was
+ * exhausted.
+ *
+ * In general, short final runs are quite possible. Rather than
+ * allowing a special case where there was a superfluous
+ * selectnewtape() call (i.e. a call with no subsequent run actually
+ * written to destTape), we prefer to write out a 0 tuple run.
+ *
+ * mergepreread()/mergeprereadone() are prepared for 0 tuple runs, and
+ * will reliably mark the tape inactive for the merge when called from
+ * beginmerge(). This case is therefore similar to the case where
+ * mergeonerun() finds a dummy run for the tape, and so doesn't need to
+ * merge a run from the tape (or conceptually "merges" the dummy run,
+ * if you prefer). According to Knuth, Algorithm D "isn't strictly
+ * optimal" in its method of distribution and dummy run assignment;
+ * this edge case seems very unlikely to make that appreciably worse.
+ */
+ Assert(state->status == TSS_BUILDRUNS);
+
+ /*
+ * It seems unlikely that this limit will ever be exceeded, but take no
+ * chances
+ */
+ if (state->currentRun == INT_MAX)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot have more than %d runs for an external sort",
+ INT_MAX)));
+
+ state->currentRun++;
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "starting quicksort of run %d: %s",
+ state->currentRun, pg_rusage_show(&state->ru_start));
+#endif
+
+ /*
+ * Sort all tuples accumulated within the allowed amount of memory for this
+ * run using quicksort
+ */
+ tuplesort_sort_memtuples(state);
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "finished quicksort of run %d: %s",
+ state->currentRun, pg_rusage_show(&state->ru_start));
+#endif
+
+ memtupwrite = state->memtupcount;
+ for (i = 0; i < memtupwrite; i++)
+ {
+ WRITETUP(state, state->tp_tapenum[state->destTape],
+ &state->memtuples[i]);
+ state->memtupcount--;
+ }
+
+ /*
+ * Reset tuple memory. We've freed all of the tuples that we previously
+ * allocated. It's important to avoid fragmentation when there is a stark
+ * change in allocation patterns due to the use of batch memory.
+ * Fragmentation due to AllocSetFree's bucketing by size class might be
+ * particularly bad if this step wasn't taken.
+ */
+ MemoryContextReset(state->tuplecontext);
+
+ markrunend(state, state->tp_tapenum[state->destTape]);
+ state->tp_runs[state->destTape]++;
+ state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "finished writing run %d to tape %d: %s",
+ state->currentRun, state->destTape,
+ pg_rusage_show(&state->ru_start));
+#endif
+
+ if (!alltuples)
+ selectnewtape(state);
+}
+
/*
* tuplesort_rescan - rewind and replay the scan
*/
@@ -3315,10 +3570,15 @@ tuplesort_get_stats(Tuplesortstate *state,
*
* Compare two SortTuples. If checkIndex is true, use the tuple index
* as the front of the sort key; otherwise, no.
+ *
+ * Note that for checkIndex callers, the heap invariant is never
+ * maintained beyond the first run, and so there are no COMPARETUP()
+ * calls needed to distinguish tuples in HEAP_RUN_NEXT.
*/
#define HEAPCOMPARE(tup1,tup2) \
- (checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \
+ (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \
+ (tup1)->tupindex == HEAP_RUN_NEXT) ? \
((tup1)->tupindex) - ((tup2)->tupindex) : \
COMPARETUP(state, tup1, tup2))
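+
+/*
+ * (Note the extra HEAP_RUN_NEXT disjunct above: when both tuples are in
+ * the next run, the tupindex difference is zero and COMPARETUP() is never
+ * invoked; the real order of next-run tuples is settled later by
+ * quicksort.)
+ */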
@@ -3416,6 +3676,31 @@ sort_bounded_heap(Tuplesortstate *state)
state->boundUsed = true;
}
+/*
+ * Sort all memtuples using specialized qsort() routines.
+ *
+ * Quicksort is used for small in-memory sorts. Quicksort is also generally
+ * preferred to replacement selection for generating runs during external sort
+ * operations, although replacement selection is sometimes used for the first
+ * run.
+ */
+static void
+tuplesort_sort_memtuples(Tuplesortstate *state)
+{
+ if (state->memtupcount > 1)
+ {
+ /* Can we use the single-key sort function? */
+ if (state->onlyKey != NULL)
+ qsort_ssup(state->memtuples, state->memtupcount,
+ state->onlyKey);
+ else
+ qsort_tuple(state->memtuples,
+ state->memtupcount,
+ state->comparetup,
+ state);
+ }
+}
+
/*
* Insert a new tuple into an empty or existing heap, maintaining the
* heap invariant. Caller is responsible for ensuring there's room.
@@ -3443,6 +3728,7 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
memtuples = state->memtuples;
Assert(state->memtupcount < state->memtupsize);
+ Assert(!checkIndex || tupleindex == RUN_FIRST);
CHECK_FOR_INTERRUPTS();
@@ -3475,6 +3761,7 @@ tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
int i,
n;
+ Assert(!checkIndex || state->currentRun == RUN_FIRST);
if (--state->memtupcount <= 0)
return;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 9200f045a2..356fcc6eda 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -239,6 +239,7 @@ extern bool enableFsync;
extern bool allowSystemTableMods;
extern PGDLLIMPORT int work_mem;
extern PGDLLIMPORT int maintenance_work_mem;
+extern PGDLLIMPORT int replacement_sort_tuples;
extern int VacuumCostPageHit;
extern int VacuumCostPageMiss;