diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 114db38116..f68c992213 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4573,6 +4573,20 @@ ANY num_sync (
+ enable_incrementalsort (boolean)
+
+ enable_incrementalsort configuration parameter
+
+
+
+
+ Enables or disables the query planner's use of incremental sort steps.
+ The default is on.
+
+
+
+
enable_indexscan (boolean)
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 58477ac83a..0dfc3e80e2 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -291,7 +291,47 @@ EXPLAIN SELECT * FROM tenk1 WHERE unique1 = 42;
often see this plan type for queries that fetch just a single row. It's
also often used for queries that have an ORDER BY condition
that matches the index order, because then no extra sorting step is needed
- to satisfy the ORDER BY.
+ to satisfy the ORDER BY. In this example, adding
+ ORDER BY unique1 would use the same plan because the
+ index already implicitly provides the requested ordering.
+
+
+
+ The planner may satisfy an ORDER BY clause in several
+ ways. The above example shows that such an ordering clause may be
+ satisfied implicitly by the underlying scan. The planner may also add
+ an explicit sort step:
+
+
+EXPLAIN SELECT * FROM tenk1 ORDER BY unique1;
+ QUERY PLAN
+-------------------------------------------------------------------
+ Sort (cost=1109.39..1134.39 rows=10000 width=244)
+ Sort Key: unique1
+ -> Seq Scan on tenk1 (cost=0.00..445.00 rows=10000 width=244)
+
+
+ If a part of the plan guarantees an ordering on a prefix of the
+ required sort keys, then the planner may instead decide to use an
+ incremental sort step:
+
+
+EXPLAIN SELECT * FROM tenk1 ORDER BY four, ten LIMIT 100;
+ QUERY PLAN
+------------------------------------------------------------------------------------------------------
+ Limit (cost=521.06..538.05 rows=100 width=244)
+ -> Incremental Sort (cost=521.06..2220.95 rows=10000 width=244)
+ Sort Key: four, ten
+ Presorted Key: four
+ -> Index Scan using index_tenk1_on_four on tenk1 (cost=0.29..1510.08 rows=10000 width=244)
+
+
+ Compared to regular sorts, sorting incrementally allows returning tuples
+ before the entire result set has been sorted, which particularly benefits
+ queries with LIMIT. It may also reduce memory usage and the
+ likelihood of spilling sorts to disk, but it comes at the cost of the
+ increased overhead of splitting the result set into multiple sorting
+ batches.
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index bb58f92851..62c86ecdc5 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -82,6 +82,8 @@ static void show_upper_qual(List *qual, const char *qlabel,
ExplainState *es);
static void show_sort_keys(SortState *sortstate, List *ancestors,
ExplainState *es);
+static void show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+ List *ancestors, ExplainState *es);
static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
ExplainState *es);
static void show_agg_keys(AggState *astate, List *ancestors,
@@ -95,7 +97,7 @@ static void show_grouping_set_keys(PlanState *planstate,
static void show_group_keys(GroupState *gstate, List *ancestors,
ExplainState *es);
static void show_sort_group_keys(PlanState *planstate, const char *qlabel,
- int nkeys, AttrNumber *keycols,
+ int nkeys, int nPresortedKeys, AttrNumber *keycols,
Oid *sortOperators, Oid *collations, bool *nullsFirst,
List *ancestors, ExplainState *es);
static void show_sortorder_options(StringInfo buf, Node *sortexpr,
@@ -103,6 +105,8 @@ static void show_sortorder_options(StringInfo buf, Node *sortexpr,
static void show_tablesample(TableSampleClause *tsc, PlanState *planstate,
List *ancestors, ExplainState *es);
static void show_sort_info(SortState *sortstate, ExplainState *es);
+static void show_incremental_sort_info(IncrementalSortState *incrsortstate,
+ ExplainState *es);
static void show_hash_info(HashState *hashstate, ExplainState *es);
static void show_hashagg_info(AggState *hashstate, ExplainState *es);
static void show_tidbitmap_info(BitmapHeapScanState *planstate,
@@ -1278,6 +1282,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
case T_Sort:
pname = sname = "Sort";
break;
+ case T_IncrementalSort:
+ pname = sname = "Incremental Sort";
+ break;
case T_Group:
pname = sname = "Group";
break;
@@ -1937,6 +1944,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_sort_keys(castNode(SortState, planstate), ancestors, es);
show_sort_info(castNode(SortState, planstate), es);
break;
+ case T_IncrementalSort:
+ show_incremental_sort_keys(castNode(IncrementalSortState, planstate),
+ ancestors, es);
+ show_incremental_sort_info(castNode(IncrementalSortState, planstate),
+ es);
+ break;
case T_MergeAppend:
show_merge_append_keys(castNode(MergeAppendState, planstate),
ancestors, es);
@@ -2270,12 +2283,29 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es)
Sort *plan = (Sort *) sortstate->ss.ps.plan;
show_sort_group_keys((PlanState *) sortstate, "Sort Key",
- plan->numCols, plan->sortColIdx,
+ plan->numCols, 0, plan->sortColIdx,
plan->sortOperators, plan->collations,
plan->nullsFirst,
ancestors, es);
}
+/*
+ * Show the sort keys for an IncrementalSort node.
+ */
+static void
+show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+ List *ancestors, ExplainState *es)
+{
+ IncrementalSort *plan = (IncrementalSort *) incrsortstate->ss.ps.plan;
+
+ show_sort_group_keys((PlanState *) incrsortstate, "Sort Key",
+ plan->sort.numCols, plan->nPresortedCols,
+ plan->sort.sortColIdx,
+ plan->sort.sortOperators, plan->sort.collations,
+ plan->sort.nullsFirst,
+ ancestors, es);
+}
+
/*
* Likewise, for a MergeAppend node.
*/
@@ -2286,7 +2316,7 @@ show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
MergeAppend *plan = (MergeAppend *) mstate->ps.plan;
show_sort_group_keys((PlanState *) mstate, "Sort Key",
- plan->numCols, plan->sortColIdx,
+ plan->numCols, 0, plan->sortColIdx,
plan->sortOperators, plan->collations,
plan->nullsFirst,
ancestors, es);
@@ -2310,7 +2340,7 @@ show_agg_keys(AggState *astate, List *ancestors,
show_grouping_sets(outerPlanState(astate), plan, ancestors, es);
else
show_sort_group_keys(outerPlanState(astate), "Group Key",
- plan->numCols, plan->grpColIdx,
+ plan->numCols, 0, plan->grpColIdx,
NULL, NULL, NULL,
ancestors, es);
@@ -2379,7 +2409,7 @@ show_grouping_set_keys(PlanState *planstate,
if (sortnode)
{
show_sort_group_keys(planstate, "Sort Key",
- sortnode->numCols, sortnode->sortColIdx,
+ sortnode->numCols, 0, sortnode->sortColIdx,
sortnode->sortOperators, sortnode->collations,
sortnode->nullsFirst,
ancestors, es);
@@ -2436,7 +2466,7 @@ show_group_keys(GroupState *gstate, List *ancestors,
/* The key columns refer to the tlist of the child plan */
ancestors = lcons(plan, ancestors);
show_sort_group_keys(outerPlanState(gstate), "Group Key",
- plan->numCols, plan->grpColIdx,
+ plan->numCols, 0, plan->grpColIdx,
NULL, NULL, NULL,
ancestors, es);
ancestors = list_delete_first(ancestors);
@@ -2449,13 +2479,14 @@ show_group_keys(GroupState *gstate, List *ancestors,
*/
static void
show_sort_group_keys(PlanState *planstate, const char *qlabel,
- int nkeys, AttrNumber *keycols,
+ int nkeys, int nPresortedKeys, AttrNumber *keycols,
Oid *sortOperators, Oid *collations, bool *nullsFirst,
List *ancestors, ExplainState *es)
{
Plan *plan = planstate->plan;
List *context;
List *result = NIL;
+ List *resultPresorted = NIL;
StringInfoData sortkeybuf;
bool useprefix;
int keyno;
@@ -2495,9 +2526,13 @@ show_sort_group_keys(PlanState *planstate, const char *qlabel,
nullsFirst[keyno]);
/* Emit one property-list item per sort key */
result = lappend(result, pstrdup(sortkeybuf.data));
+ if (keyno < nPresortedKeys)
+ resultPresorted = lappend(resultPresorted, exprstr);
}
ExplainPropertyList(qlabel, result, es);
+ if (nPresortedKeys > 0)
+ ExplainPropertyList("Presorted Key", resultPresorted, es);
}
/*
@@ -2711,6 +2746,196 @@ show_sort_info(SortState *sortstate, ExplainState *es)
}
}
+/*
+ * Incremental sort nodes sort in (a potentially very large number of) batches,
+ * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into
+ * an intelligible summary.
+ *
+ * This function is used for both a non-parallel node and each worker in a
+ * parallel incremental sort node.
+ */
+static void
+show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
+ const char *groupLabel, bool indent, ExplainState *es)
+{
+ ListCell *methodCell;
+ List *methodNames = NIL;
+
+ /* Generate a list of sort methods used across all groups. */
+ for (int bit = 0; bit < sizeof(bits32); ++bit)
+ {
+ if (groupInfo->sortMethods & (1 << bit))
+ {
+ TuplesortMethod sortMethod = (1 << bit);
+ const char *methodName;
+
+ methodName = tuplesort_method_name(sortMethod);
+ methodNames = lappend(methodNames, unconstify(char *, methodName));
+ }
+ }
+
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ if (indent)
+ appendStringInfoSpaces(es->str, es->indent * 2);
+ appendStringInfo(es->str, "%s Groups: %ld Sort Method", groupLabel,
+ groupInfo->groupCount);
+ /* plural/singular based on methodNames size */
+ if (list_length(methodNames) > 1)
+ appendStringInfo(es->str, "s: ");
+ else
+ appendStringInfo(es->str, ": ");
+ foreach(methodCell, methodNames)
+ {
+ appendStringInfo(es->str, "%s", (char *) methodCell->ptr_value);
+ if (foreach_current_index(methodCell) < list_length(methodNames) - 1)
+ appendStringInfo(es->str, ", ");
+ }
+
+ if (groupInfo->maxMemorySpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+ const char *spaceTypeName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+ appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+ spaceTypeName, avgSpace,
+ groupInfo->maxMemorySpaceUsed);
+ }
+
+ if (groupInfo->maxDiskSpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+
+ const char *spaceTypeName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+ /* Add a semicolon separator only if memory stats were printed. */
+ if (groupInfo->maxMemorySpaceUsed > 0)
+ appendStringInfo(es->str, ";");
+ appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+ spaceTypeName, avgSpace,
+ groupInfo->maxDiskSpaceUsed);
+ }
+ }
+ else
+ {
+ StringInfoData groupName;
+
+ initStringInfo(&groupName);
+ appendStringInfo(&groupName, "%s Groups", groupLabel);
+ ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es);
+ ExplainPropertyInteger("Group Count", NULL, groupInfo->groupCount, es);
+
+ ExplainPropertyList("Sort Methods Used", methodNames, es);
+
+ if (groupInfo->maxMemorySpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+ const char *spaceTypeName;
+ StringInfoData memoryName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+ initStringInfo(&memoryName);
+ appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName);
+ ExplainOpenGroup("Sort Space", memoryName.data, true, es);
+
+ ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+ ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+ groupInfo->maxMemorySpaceUsed, es);
+
+ ExplainCloseGroup("Sort Spaces", memoryName.data, true, es);
+ }
+ if (groupInfo->maxDiskSpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+ const char *spaceTypeName;
+ StringInfoData diskName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+ initStringInfo(&diskName);
+ appendStringInfo(&diskName, "Sort Space %s", spaceTypeName);
+ ExplainOpenGroup("Sort Space", diskName.data, true, es);
+
+ ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+ ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+ groupInfo->maxDiskSpaceUsed, es);
+
+ ExplainCloseGroup("Sort Spaces", diskName.data, true, es);
+ }
+
+ ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es);
+ }
+}
+
+/*
+ * If it's EXPLAIN ANALYZE, show tuplesort stats for an incremental sort node
+ */
+static void
+show_incremental_sort_info(IncrementalSortState *incrsortstate,
+ ExplainState *es)
+{
+ IncrementalSortGroupInfo *fullsortGroupInfo;
+ IncrementalSortGroupInfo *prefixsortGroupInfo;
+
+ fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo;
+
+ if (!(es->analyze && fullsortGroupInfo->groupCount > 0))
+ return;
+
+ show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", true, es);
+ prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo;
+ if (prefixsortGroupInfo->groupCount > 0)
+ {
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " ");
+ show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+ }
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, "\n");
+
+ if (incrsortstate->shared_info != NULL)
+ {
+ int n;
+ bool indent_first_line;
+
+ for (n = 0; n < incrsortstate->shared_info->num_workers; n++)
+ {
+ IncrementalSortInfo *incsort_info =
+ &incrsortstate->shared_info->sinfo[n];
+
+ /*
+ * If a worker hasn't processed any sort groups at all, then exclude
+ * it from output since it either didn't launch or didn't
+ * contribute anything meaningful.
+ */
+ fullsortGroupInfo = &incsort_info->fullsortGroupInfo;
+ prefixsortGroupInfo = &incsort_info->prefixsortGroupInfo;
+ if (fullsortGroupInfo->groupCount == 0 &&
+ prefixsortGroupInfo->groupCount == 0)
+ continue;
+
+ if (es->workers_state)
+ ExplainOpenWorker(n, es);
+
+ indent_first_line = es->workers_state == NULL || es->verbose;
+ show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort",
+ indent_first_line, es);
+ if (prefixsortGroupInfo->groupCount > 0)
+ {
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " ");
+ show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+ }
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, "\n");
+
+ if (es->workers_state)
+ ExplainCloseWorker(n, es);
+ }
+ }
+}
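Given the text-format branches above (note that in this version the full-sort and presorted summaries share a single output line, separated by one space), the EXPLAIN ANALYZE output for an instrumented node is shaped roughly like the following, with illustrative numbers:

    Full-sort Groups: 4 Sort Method: quicksort Memory: avg=28kB peak=30kB Presorted Groups: 3 Sort Methods: quicksort, top-N heapsort Memory: avg=26kB peak=28kB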
+
/*
* Show information on hash buckets/batches.
*/
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index a983800e4b..f990c6473a 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -46,6 +46,7 @@ OBJS = \
nodeGroup.o \
nodeHash.o \
nodeHashjoin.o \
+ nodeIncrementalSort.o \
nodeIndexonlyscan.o \
nodeIndexscan.o \
nodeLimit.o \
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index b12aeb3334..e2154ba86a 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -30,6 +30,7 @@
#include "executor/nodeGroup.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "executor/nodeLimit.h"
@@ -252,6 +253,10 @@ ExecReScan(PlanState *node)
ExecReScanSort((SortState *) node);
break;
+ case T_IncrementalSortState:
+ ExecReScanIncrementalSort((IncrementalSortState *) node);
+ break;
+
case T_GroupState:
ExecReScanGroup((GroupState *) node);
break;
@@ -557,8 +562,17 @@ ExecSupportsBackwardScan(Plan *node)
case T_CteScan:
case T_Material:
case T_Sort:
+ /* these don't evaluate tlist */
return true;
+ case T_IncrementalSort:
+
+ /*
+ * Unlike full sort, incremental sort keeps only a single group of
+ * tuples in memory, so it can't scan backwards.
+ */
+ return false;
+
case T_LockRows:
case T_Limit:
return ExecSupportsBackwardScan(outerPlan(node));
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index b7d0719953..41cb41481d 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -31,6 +31,7 @@
#include "executor/nodeForeignscan.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "executor/nodeSeqscan.h"
@@ -283,6 +284,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
/* even when not parallel-aware, for EXPLAIN ANALYZE */
ExecSortEstimate((SortState *) planstate, e->pcxt);
break;
+ case T_IncrementalSortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
+ break;
default:
break;
@@ -496,6 +501,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
/* even when not parallel-aware, for EXPLAIN ANALYZE */
ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
break;
+ case T_IncrementalSortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
+ break;
default:
break;
@@ -972,6 +981,7 @@ ExecParallelReInitializeDSM(PlanState *planstate,
break;
case T_HashState:
case T_SortState:
+ case T_IncrementalSortState:
/* these nodes have DSM state, but no reinitialization is required */
break;
@@ -1032,6 +1042,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
case T_SortState:
ExecSortRetrieveInstrumentation((SortState *) planstate);
break;
+ case T_IncrementalSortState:
+ ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate);
+ break;
case T_HashState:
ExecHashRetrieveInstrumentation((HashState *) planstate);
break;
@@ -1318,6 +1331,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
/* even when not parallel-aware, for EXPLAIN ANALYZE */
ExecSortInitializeWorker((SortState *) planstate, pwcxt);
break;
+ case T_IncrementalSortState:
+ /* even when not parallel-aware, for EXPLAIN ANALYZE */
+ ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
+ pwcxt);
+ break;
default:
break;
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7b2e84f402..5662e7d742 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -88,6 +88,7 @@
#include "executor/nodeGroup.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "executor/nodeLimit.h"
@@ -313,6 +314,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
estate, eflags);
break;
+ case T_IncrementalSort:
+ result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
+ estate, eflags);
+ break;
+
case T_Group:
result = (PlanState *) ExecInitGroup((Group *) node,
estate, eflags);
@@ -693,6 +699,10 @@ ExecEndNode(PlanState *node)
ExecEndSort((SortState *) node);
break;
+ case T_IncrementalSortState:
+ ExecEndIncrementalSort((IncrementalSortState *) node);
+ break;
+
case T_GroupState:
ExecEndGroup((GroupState *) node);
break;
@@ -839,6 +849,30 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
sortState->bound = tuples_needed;
}
}
+ else if (IsA(child_node, IncrementalSortState))
+ {
+ /*
+ * If it is an IncrementalSort node, notify it that it can use bounded
+ * sort.
+ *
+ * Note: it is the responsibility of nodeIncrementalSort.c to react
+ * properly to changes of these parameters. If we ever redesign this,
+ * it'd be a good idea to integrate this signaling with the
+ * parameter-change mechanism.
+ */
+ IncrementalSortState *sortState = (IncrementalSortState *) child_node;
+
+ if (tuples_needed < 0)
+ {
+ /* make sure flag gets reset if needed upon rescan */
+ sortState->bounded = false;
+ }
+ else
+ {
+ sortState->bounded = true;
+ sortState->bound = tuples_needed;
+ }
+ }
else if (IsA(child_node, AppendState))
{
/*
diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c
new file mode 100644
index 0000000000..bcab7c054c
--- /dev/null
+++ b/src/backend/executor/nodeIncrementalSort.c
@@ -0,0 +1,1263 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIncrementalSort.c
+ * Routines to handle incremental sorting of relations.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/executor/nodeIncrementalSort.c
+ *
+ * DESCRIPTION
+ *
+ * Incremental sort is an optimized variant of multikey sort for cases
+ * when the input is already sorted by a prefix of the sort keys. For
+ * example when a sort by (key1, key2 ... keyN) is requested, and the
+ * input is already sorted by (key1, key2 ... keyM), M < N, we can
+ * divide the input into groups where keys (key1, ... keyM) are equal,
+ * and only sort on the remaining columns.
+ *
+ * Consider the following example. We have input tuples consisting of
+ * two integers (X, Y) already presorted by X, while it's required to
+ * sort them by both X and Y. Let input tuples be following.
+ *
+ * (1, 5)
+ * (1, 2)
+ * (2, 9)
+ * (2, 1)
+ * (2, 5)
+ * (3, 3)
+ * (3, 7)
+ *
+ * An incremental sort algorithm would split the input into the following
+ * groups, which have equal X, and then sort them by Y individually:
+ *
+ * (1, 5) (1, 2)
+ * (2, 9) (2, 1) (2, 5)
+ * (3, 3) (3, 7)
+ *
+ * After sorting these groups and putting them altogether, we would get
+ * the following result which is sorted by X and Y, as requested:
+ *
+ * (1, 2)
+ * (1, 5)
+ * (2, 1)
+ * (2, 5)
+ * (2, 9)
+ * (3, 3)
+ * (3, 7)
+ *
+ * Incremental sort may be more efficient than plain sort, particularly
+ * on large datasets, as it reduces the amount of data to sort at once,
+ * making it more likely it fits into work_mem (eliminating the need to
+ * spill to disk). But the main advantage of incremental sort is that
+ * it can start producing rows early, before sorting the whole dataset,
+ * which is a significant benefit especially for queries with LIMIT.
+ *
+ * The algorithm we've implemented here is modified from the theoretical
+ * base described above by operating in two different modes:
+ * - Fetching a minimum number of tuples without checking prefix key
+ * group membership and sorting on all columns when safe.
+ * - Fetching all tuples for a single prefix key group and sorting on
+ * solely the unsorted columns.
+ * We always begin in the first mode, and employ a heuristic to switch
+ * into the second mode if we believe it's beneficial.
+ *
+ * Sorting incrementally can potentially use less memory, avoid fetching
+ * and sorting all tuples in the dataset, and begin returning tuples
+ * before the entire result set is available.
+ *
+ * The hybrid mode approach allows us to optimize for both very small
+ * groups (where the overhead of a new tuplesort is high) and very large
+ * groups (where we can lower cost by not having to sort on already sorted
+ * columns), albeit at some extra cost while switching between modes.
+ *
+ *-------------------------------------------------------------------------
+ */
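To make the batching idea concrete outside the executor, here is a minimal standalone sketch (illustrative only, not part of this patch): it sorts an array of (x, y) pairs that arrive presorted by x, running a suffix-only sort over each equal-x run, and prints the fully ordered result from the example above.

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int x; int y; } Pair;

    /* Comparator for the suffix key (y) only; the prefix key (x) is
     * already ordered across runs. */
    static int
    cmp_y(const void *a, const void *b)
    {
        int ya = ((const Pair *) a)->y;
        int yb = ((const Pair *) b)->y;

        return (ya > yb) - (ya < yb);
    }

    int
    main(void)
    {
        /* Input already sorted by the prefix key x, as in the example. */
        Pair tuples[] = {{1, 5}, {1, 2}, {2, 9}, {2, 1}, {2, 5},
                         {3, 3}, {3, 7}};
        int  ntuples = sizeof(tuples) / sizeof(tuples[0]);
        int  start = 0;

        for (int i = 1; i <= ntuples; i++)
        {
            /* At each prefix-key boundary, sort the completed run on y. */
            if (i == ntuples || tuples[i].x != tuples[start].x)
            {
                qsort(&tuples[start], i - start, sizeof(Pair), cmp_y);
                start = i;
            }
        }

        for (int i = 0; i < ntuples; i++)
            printf("(%d, %d)\n", tuples[i].x, tuples[i].y);
        return 0;
    }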
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIncrementalSort.h"
+#include "miscadmin.h"
+#include "utils/lsyscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * We need to store the instrumentation information either in the local node's
+ * sort info or, for a parallel worker process, in the shared info (this avoids
+ * having to additionally memcpy the info from local memory to shared memory
+ * at each instrumentation call). This macro expands to choose the proper sort
+ * state and group info.
+ *
+ * Arguments:
+ * - node: type IncrementalSortState *
+ * - groupName: the token fullsort or prefixsort
+ */
+#define INSTRUMENT_SORT_GROUP(node, groupName) \
+ if (node->ss.ps.instrument != NULL) \
+ { \
+ if (node->shared_info && node->am_worker) \
+ { \
+ Assert(IsParallelWorker()); \
+ Assert(ParallelWorkerNumber <= node->shared_info->num_workers); \
+ instrumentSortedGroup(&node->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, node->groupName##_state); \
+ } else { \
+ instrumentSortedGroup(&node->incsort_info.groupName##GroupInfo, node->groupName##_state); \
+ } \
+ }
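The groupName argument works by token pasting: groupName##GroupInfo selects either the fullsortGroupInfo or prefixsortGroupInfo member, and groupName##_state selects the matching tuplesort. A standalone sketch of the idiom, with toy names rather than the executor structs:

    #include <stdio.h>

    typedef struct
    {
        long fullsortGroupCount;
        long prefixsortGroupCount;
    } ToyInfo;

    /* groupName##GroupCount expands to one of the two members above. */
    #define TOY_INSTRUMENT(info, groupName) \
        ((info)->groupName##GroupCount++)

    int
    main(void)
    {
        ToyInfo info = {0, 0};

        TOY_INSTRUMENT(&info, fullsort);   /* bumps fullsortGroupCount */
        TOY_INSTRUMENT(&info, prefixsort); /* bumps prefixsortGroupCount */
        printf("%ld %ld\n", info.fullsortGroupCount,
               info.prefixsortGroupCount);
        return 0;
    }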
+
+/* ----------------------------------------------------------------
+ * instrumentSortedGroup
+ *
+ * Because incremental sort processes (potentially many) sort batches, we need
+ * to capture tuplesort stats each time we finalize a sort state. This summary
+ * data is later used for EXPLAIN ANALYZE output.
+ * ----------------------------------------------------------------
+ */
+static void
+instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo,
+ Tuplesortstate *sortState)
+{
+ TuplesortInstrumentation sort_instr;
+ groupInfo->groupCount++;
+
+ tuplesort_get_stats(sortState, &sort_instr);
+
+ /* Calculate total and maximum memory and disk space used. */
+ switch (sort_instr.spaceType)
+ {
+ case SORT_SPACE_TYPE_DISK:
+ groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed;
+ if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed)
+ groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed;
+
+ break;
+ case SORT_SPACE_TYPE_MEMORY:
+ groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed;
+ if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed)
+ groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed;
+
+ break;
+ }
+
+ /* Track each sort method we've used. */
+ groupInfo->sortMethods |= sort_instr.sortMethod;
+}
+
+/* ----------------------------------------------------------------
+ * preparePresortedCols
+ *
+ * Prepare information for presorted_keys comparisons.
+ * ----------------------------------------------------------------
+ */
+static void
+preparePresortedCols(IncrementalSortState *node)
+{
+ IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+ node->presorted_keys =
+ (PresortedKeyData *) palloc(plannode->nPresortedCols *
+ sizeof(PresortedKeyData));
+
+ /* Pre-cache comparison functions for each pre-sorted key. */
+ for (int i = 0; i < plannode->nPresortedCols; i++)
+ {
+ Oid equalityOp,
+ equalityFunc;
+ PresortedKeyData *key;
+
+ key = &node->presorted_keys[i];
+ key->attno = plannode->sort.sortColIdx[i];
+
+ equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i],
+ NULL);
+ if (!OidIsValid(equalityOp))
+ elog(ERROR, "missing equality operator for ordering operator %u",
+ plannode->sort.sortOperators[i]);
+
+ equalityFunc = get_opcode(equalityOp);
+ if (!OidIsValid(equalityFunc))
+ elog(ERROR, "missing function for operator %u", equalityOp);
+
+ /* Lookup the comparison function */
+ fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext);
+
+ /* We can initialize the callinfo just once and re-use it */
+ key->fcinfo = palloc0(SizeForFunctionCallInfo(2));
+ InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2,
+ plannode->sort.collations[i], NULL, NULL);
+ key->fcinfo->args[0].isnull = false;
+ key->fcinfo->args[1].isnull = false;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * isCurrentGroup
+ *
+ * Check whether a given tuple belongs to the current sort group by comparing
+ * the presorted column values to the pivot tuple of the current group.
+ * ----------------------------------------------------------------
+ */
+static bool
+isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple)
+{
+ int nPresortedCols;
+
+ nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols;
+
+ /*
+ * That the input is sorted by keys (0, ... n) implies that the tail
+ * keys are more likely to change. Therefore we do our comparison starting
+ * from the last pre-sorted column to optimize for early detection of
+ * inequality and to minimize the number of function calls.
+ */
+ for (int i = nPresortedCols - 1; i >= 0; i--)
+ {
+ Datum datumA,
+ datumB,
+ result;
+ bool isnullA,
+ isnullB;
+ AttrNumber attno = node->presorted_keys[i].attno;
+ PresortedKeyData *key;
+
+ datumA = slot_getattr(pivot, attno, &isnullA);
+ datumB = slot_getattr(tuple, attno, &isnullB);
+
+ /* Special case for NULL-vs-NULL, else use standard comparison */
+ if (isnullA || isnullB)
+ {
+ if (isnullA == isnullB)
+ continue;
+ else
+ return false;
+ }
+
+ key = &node->presorted_keys[i];
+
+ key->fcinfo->args[0].value = datumA;
+ key->fcinfo->args[1].value = datumB;
+
+ /* just for paranoia's sake, we reset isnull each time */
+ key->fcinfo->isnull = false;
+
+ result = FunctionCallInvoke(key->fcinfo);
+
+ /* Check for null result, since caller is clearly not expecting one */
+ if (key->fcinfo->isnull)
+ elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid);
+
+ if (!DatumGetBool(result))
+ return false;
+ }
+ return true;
+}
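Because the input arrives ordered by the prefix keys, adjacent tuples that differ tend to differ in the last prefix column first, so the back-to-front loop usually finds a mismatch on its first probe. A toy sketch (hypothetical data, not executor code) that counts probes in each direction:

    #include <stdbool.h>
    #include <stdio.h>

    /* Count column comparisons needed to detect inequality when scanning
     * keys first-to-last versus last-to-first. */
    static int
    probes_until_mismatch(const int *a, const int *b, int nkeys, bool reverse)
    {
        int probes = 0;

        if (reverse)
        {
            for (int i = nkeys - 1; i >= 0; i--)
            {
                probes++;
                if (a[i] != b[i])
                    break;
            }
        }
        else
        {
            for (int i = 0; i < nkeys; i++)
            {
                probes++;
                if (a[i] != b[i])
                    break;
            }
        }
        return probes;
    }

    int
    main(void)
    {
        /* Presorted input: leading keys equal, only the last key differs. */
        int a[] = {7, 7, 7, 1};
        int b[] = {7, 7, 7, 2};

        printf("first-to-last: %d probes\n",
               probes_until_mismatch(a, b, 4, false));   /* prints 4 */
        printf("last-to-first: %d probes\n",
               probes_until_mismatch(a, b, 4, true));    /* prints 1 */
        return 0;
    }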
+
+/* ----------------------------------------------------------------
+ * switchToPresortedPrefixMode
+ *
+ * When we determine that we've likely encountered a large batch of tuples all
+ * having the same presorted prefix values, we want to optimize tuplesort by
+ * only sorting on unsorted suffix keys.
+ *
+ * The problem is that we've already accumulated several tuples in another
+ * tuplesort configured to sort by all columns (assuming that there may be
+ * more than one prefix key group). So to switch to presorted prefix mode we
+ * have to go back and look at all the tuples we've already accumulated to
+ * verify they're all part of the same prefix key group before sorting them
+ * solely by unsorted suffix keys.
+ *
+ * While it's likely that all of the already-fetched tuples are part of a single
+ * prefix group, we also have to handle the possibility that there is at least
+ * one different prefix key group before the large prefix key group.
+ * ----------------------------------------------------------------
+ */
+static void
+switchToPresortedPrefixMode(PlanState *pstate)
+{
+ IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+ ScanDirection dir;
+ int64 nTuples = 0;
+ bool lastTuple = false;
+ bool firstTuple = true;
+ TupleDesc tupDesc;
+ PlanState *outerNode;
+ IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+ dir = node->ss.ps.state->es_direction;
+ outerNode = outerPlanState(node);
+ tupDesc = ExecGetResultType(outerNode);
+
+ /* Configure the prefix sort state the first time around. */
+ if (node->prefixsort_state == NULL)
+ {
+ Tuplesortstate *prefixsort_state;
+ int nPresortedCols = plannode->nPresortedCols;
+
+ /*
+ * Optimize the sort by assuming the prefix columns are all equal and
+ * thus we only need to sort by any remaining columns.
+ */
+ prefixsort_state = tuplesort_begin_heap(tupDesc,
+ plannode->sort.numCols - nPresortedCols,
+ &(plannode->sort.sortColIdx[nPresortedCols]),
+ &(plannode->sort.sortOperators[nPresortedCols]),
+ &(plannode->sort.collations[nPresortedCols]),
+ &(plannode->sort.nullsFirst[nPresortedCols]),
+ work_mem,
+ NULL,
+ false);
+ node->prefixsort_state = prefixsort_state;
+ }
+ else
+ {
+ /* Next group of presorted data */
+ tuplesort_reset(node->prefixsort_state);
+ }
+
+ /*
+ * If the current node has a bound, then it's reasonably likely that a
+ * large prefix key group will benefit from bounded sort, so configure the
+ * tuplesort to allow for that optimization.
+ */
+ if (node->bounded)
+ {
+ SO1_printf("Setting bound on presorted prefix tuplesort to: %ld\n",
+ node->bound - node->bound_Done);
+ tuplesort_set_bound(node->prefixsort_state,
+ node->bound - node->bound_Done);
+ }
+
+ /*
+ * Copy as many tuples as we can (i.e., in the same prefix key group) from
+ * the full sort state to the prefix sort state.
+ */
+ for (;;)
+ {
+ lastTuple = node->n_fullsort_remaining - nTuples == 1;
+
+ /*
+ * When we encounter multiple prefix key groups inside the full sort
+ * tuplesort we have to carry over the last read tuple into the next
+ * batch.
+ */
+ if (firstTuple && !TupIsNull(node->transfer_tuple))
+ {
+ tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+ nTuples++;
+
+ /* The carried over tuple is our new group pivot tuple. */
+ ExecCopySlot(node->group_pivot, node->transfer_tuple);
+ }
+ else
+ {
+ tuplesort_gettupleslot(node->fullsort_state,
+ ScanDirectionIsForward(dir),
+ false, node->transfer_tuple, NULL);
+
+ /*
+ * If this is our first time through the loop, then we need to
+ * save the first tuple we get as our new group pivot.
+ */
+ if (TupIsNull(node->group_pivot))
+ ExecCopySlot(node->group_pivot, node->transfer_tuple);
+
+ if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple))
+ {
+ tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+ nTuples++;
+ }
+ else
+ {
+ /*
+ * The tuple isn't part of the current batch so we need to
+ * carry it over into the next batch of tuples we transfer out
+ * of the full sort tuplesort into the presorted prefix
+ * tuplesort. We don't actually have to do anything special to
+ * save the tuple since we've already loaded it into the
+ * node->transfer_tuple slot, and, even though that slot
+ * points to memory inside the full sort tuplesort, we can't
+ * reset that tuplesort anyway until we've fully transferred
+ * out of its tuples, so this reference is safe. We do need to
+ * reset the group pivot tuple though since we've finished the
+ * current prefix key group.
+ */
+ ExecClearTuple(node->group_pivot);
+ break;
+ }
+ }
+
+ firstTuple = false;
+
+ /*
+ * If we've copied all of the tuples from the full sort state into the
+ * prefix sort state, then we don't actually know that we've yet found
+ * the last tuple in that prefix key group until we check the next
+ * tuple from the outer plan node, so we retain the current group
+ * pivot tuple for the next prefix key group comparison.
+ */
+ if (lastTuple)
+ break;
+ }
+
+ /*
+ * Track how many tuples remain in the full sort batch so that we know if
+ * we need to sort multiple prefix key groups before processing tuples
+ * remaining in the large single prefix key group we think we've
+ * encountered.
+ */
+ SO1_printf("Moving %ld tuples to presorted prefix tuplesort\n", nTuples);
+ node->n_fullsort_remaining -= nTuples;
+ SO1_printf("Setting n_fullsort_remaining to %ld\n", node->n_fullsort_remaining);
+
+ if (lastTuple)
+ {
+ /*
+ * We've confirmed that all tuples remaining in the full sort batch are
+ * in the same prefix key group and moved all of those tuples into the
+ * presorted prefix tuplesort. Now we can save our pivot comparison
+ * tuple and continue fetching tuples from the outer execution node to
+ * load into the presorted prefix tuplesort.
+ */
+ ExecCopySlot(node->group_pivot, node->transfer_tuple);
+ SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n");
+ node->execution_status = INCSORT_LOADPREFIXSORT;
+
+ /*
+ * Make sure we clear the transfer tuple slot so that next time we
+ * encounter a large prefix key group we don't incorrectly assume we
+ * have a tuple carried over from the previous group.
+ */
+ ExecClearTuple(node->transfer_tuple);
+ }
+ else
+ {
+ /*
+ * We finished a group but didn't consume all of the tuples from the
+ * full sort state, so we'll sort this batch, let the outer node read
+ * out all of those tuples, and then come back around to find another
+ * batch.
+ */
+ SO1_printf("Sorting presorted prefix tuplesort with %ld tuples\n", nTuples);
+ tuplesort_performsort(node->prefixsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, prefixsort)
+
+ if (node->bounded)
+ {
+ /*
+ * If the current node has a bound and we've already sorted n
+ * tuples, then the functional bound remaining is (original bound
+ * - n), so store the current number of processed tuples for use
+ * in configuring sorting bound.
+ */
+ SO2_printf("Changing bound_Done from %ld to %ld\n",
+ node->bound_Done, Min(node->bound, node->bound_Done + nTuples));
+ node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+ }
+
+ SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n");
+ node->execution_status = INCSORT_READPREFIXSORT;
+ }
+}
+
+/*
+ * Sorting many small groups with tuplesort is inefficient. In order to
+ * cope with this problem we don't start a new group until the current one
+ * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also
+ * means we can't assume small groups of tuples all have the same prefix keys).
+ * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start looking
+ * for the new group as soon as we've met our bound to avoid fetching more
+ * tuples than we absolutely have to fetch.
+ */
+#define DEFAULT_MIN_GROUP_SIZE 32
+
+/*
+ * While we've optimized for small prefix key groups by not starting our prefix
+ * key comparisons until we've reached a minimum number of tuples, we don't want
+ * that optimization to cause us to lose out on the benefits of being able to
+ * assume a large group of tuples is fully presorted by its prefix keys.
+ * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic
+ * for determining when we believe we've encountered a large group, and, if we
+ * get to that point without finding a new prefix key group we transition to
+ * presorted prefix key mode.
+ */
+#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE)
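Taken together, the two constants drive a per-tuple decision in the full sort loop below; condensed into a sketch (illustrative only -- the real loop also handles bounds, pivots, and instrumentation):

    #include <stdbool.h>

    #define DEFAULT_MIN_GROUP_SIZE 32
    #define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE)

    typedef enum
    {
        KEEP_ACCUMULATING,      /* stay in full sort mode, no key checks yet */
        FINISH_BATCH,           /* prefix keys changed: sort and read out */
        SWITCH_TO_PREFIX_MODE   /* large group: sort remaining columns only */
    } ToyDecision;

    static ToyDecision
    full_sort_step(long ntuples, bool matches_pivot)
    {
        if (ntuples < DEFAULT_MIN_GROUP_SIZE)
            return KEEP_ACCUMULATING;
        if (!matches_pivot)
            return FINISH_BATCH;
        if (ntuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE)
            return SWITCH_TO_PREFIX_MODE;
        return KEEP_ACCUMULATING;
    }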
+
+/* ----------------------------------------------------------------
+ * ExecIncrementalSort
+ *
+ * Assuming that outer subtree returns tuple presorted by some prefix
+ * of target sort columns, performs incremental sort.
+ *
+ * Conditions:
+ * -- none.
+ *
+ * Initial States:
+ * -- the outer child is prepared to return the first tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecIncrementalSort(PlanState *pstate)
+{
+ IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+ EState *estate;
+ ScanDirection dir;
+ Tuplesortstate *read_sortstate;
+ Tuplesortstate *fullsort_state;
+ TupleTableSlot *slot;
+ IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan;
+ PlanState *outerNode;
+ TupleDesc tupDesc;
+ int64 nTuples = 0;
+ int64 minGroupSize;
+
+ CHECK_FOR_INTERRUPTS();
+
+ estate = node->ss.ps.state;
+ dir = estate->es_direction;
+ fullsort_state = node->fullsort_state;
+
+ /*
+ * If a previous iteration has sorted a batch, then we need to check to
+ * see if there are any remaining tuples in that batch that we can return
+ * before moving on to other execution states.
+ */
+ if (node->execution_status == INCSORT_READFULLSORT
+ || node->execution_status == INCSORT_READPREFIXSORT)
+ {
+ /*
+ * Return next tuple from the current sorted group set if available.
+ */
+ read_sortstate = node->execution_status == INCSORT_READFULLSORT ?
+ fullsort_state : node->prefixsort_state;
+ slot = node->ss.ps.ps_ResultTupleSlot;
+
+ /*
+ * We have to populate the slot from the tuplesort before checking
+ * outerNodeDone because it will set the slot to NULL if no more
+ * tuples remain. If the tuplesort is empty, but we don't have any
+ * more tuples available for sort from the outer node, then
+ * outerNodeDone will have been set so we'll return that now-empty
+ * slot to the caller.
+ */
+ if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir),
+ false, slot, NULL) || node->outerNodeDone)
+
+ /*
+ * Note: there isn't a good test case for the node->outerNodeDone
+ * check directly, but we need it for any plan where the outer
+ * node will fail when trying to fetch too many tuples.
+ */
+ return slot;
+ else if (node->n_fullsort_remaining > 0)
+ {
+ /*
+ * When we transition to presorted prefix mode, we might have
+ * accumulated at least one additional prefix key group in the
+ * full sort tuplesort. The first call to
+ * switchToPresortedPrefixMode() will have pulled the first one of
+ * those groups out, and we've returned those tuples to the parent
+ * node, but if at this point we still have tuples remaining in
+ * the full sort state (i.e., n_fullsort_remaining > 0), then we
+ * need to re-execute the prefix mode transition function to pull
+ * out the next prefix key group.
+ */
+ SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (%ld)\n",
+ node->n_fullsort_remaining);
+ switchToPresortedPrefixMode(pstate);
+ }
+ else
+ {
+ /*
+ * If we don't have any sorted tuples to read and we're not
+ * currently transitioning into presorted prefix sort mode, then
+ * it's time to start the process all over again by building a new
+ * group in the full sort state.
+ */
+ SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n");
+ node->execution_status = INCSORT_LOADFULLSORT;
+ }
+ }
+
+ /*
+ * Scan the subplan in the forward direction while creating the sorted
+ * data.
+ */
+ estate->es_direction = ForwardScanDirection;
+
+ outerNode = outerPlanState(node);
+ tupDesc = ExecGetResultType(outerNode);
+
+ /* Load tuples into the full sort state. */
+ if (node->execution_status == INCSORT_LOADFULLSORT)
+ {
+ /*
+ * Initialize sorting structures.
+ */
+ if (fullsort_state == NULL)
+ {
+ /*
+ * Initialize presorted column support structures for
+ * isCurrentGroup(). It's correct to do this along with the
+ * initial initialization for the full sort state (and not for the
+ * prefix sort state) since we always load the full sort state
+ * first.
+ */
+ preparePresortedCols(node);
+
+ /*
+ * Since we optimize small prefix key groups by accumulating a
+ * minimum number of tuples before sorting, we can't assume that a
+ * group of tuples all have the same prefix key values. Hence we
+ * setup the full sort tuplesort to sort by all requested sort
+ * keys.
+ */
+ fullsort_state = tuplesort_begin_heap(tupDesc,
+ plannode->sort.numCols,
+ plannode->sort.sortColIdx,
+ plannode->sort.sortOperators,
+ plannode->sort.collations,
+ plannode->sort.nullsFirst,
+ work_mem,
+ NULL,
+ false);
+ node->fullsort_state = fullsort_state;
+ }
+ else
+ {
+ /* Reset sort for the next batch. */
+ tuplesort_reset(fullsort_state);
+ }
+
+ /*
+ * Calculate the remaining tuples left if bounded and configure both
+ * bounded sort and the minimum group size accordingly.
+ */
+ if (node->bounded)
+ {
+ int64 currentBound = node->bound - node->bound_Done;
+
+ /*
+ * Bounded sort isn't likely to be a useful optimization for full
+ * sort mode since we limit full sort mode to a relatively small
+ * number of tuples and tuplesort doesn't switch over to top-n
+ * heap sort anyway unless it hits (2 * bound) tuples.
+ */
+ if (currentBound < DEFAULT_MIN_GROUP_SIZE)
+ tuplesort_set_bound(fullsort_state, currentBound);
+
+ minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound);
+ }
+ else
+ minGroupSize = DEFAULT_MIN_GROUP_SIZE;
+
+ /*
+ * Because we have to read the next tuple to find out that we've
+ * encountered a new prefix key group, on subsequent groups we have to
+ * carry over that extra tuple and add it to the new group's sort here
+ * before we read any new tuples from the outer node.
+ */
+ if (!TupIsNull(node->group_pivot))
+ {
+ tuplesort_puttupleslot(fullsort_state, node->group_pivot);
+ nTuples++;
+
+ /*
+ * We're in full sort mode accumulating a minimum number of tuples
+ * and not checking for prefix key equality yet, so we can't
+ * assume the group pivot tuple will remain the same -- unless
+ * we're using a minimum group size of 1, in which case the pivot
+ * is obviously still the pivot.
+ */
+ if (nTuples != minGroupSize)
+ ExecClearTuple(node->group_pivot);
+ }
+
+
+ /*
+ * Pull as many tuples from the outer node as possible given our
+ * current operating mode.
+ */
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+
+ /*
+ * If the outer node can't provide us any more tuples, then we can
+ * sort the current group and return those tuples.
+ */
+ if (TupIsNull(slot))
+ {
+ /*
+ * We need to know later if the outer node has completed to be
+ * able to distinguish between being done with a batch and
+ * being done with the whole node.
+ */
+ node->outerNodeDone = true;
+
+ SO1_printf("Sorting fullsort with %ld tuples\n", nTuples);
+ tuplesort_performsort(fullsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, fullsort)
+
+ SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n");
+ node->execution_status = INCSORT_READFULLSORT;
+ break;
+ }
+
+ /* Accumulate the next group of presorted tuples. */
+ if (nTuples < minGroupSize)
+ {
+ /*
+ * If we haven't yet hit our target minimum group size, then
+ * we don't need to bother checking for inclusion in the
+ * current prefix group since at this point we'll assume that
+ * we'll full sort this batch to avoid a large number of very
+ * tiny (and thus inefficient) sorts.
+ */
+ tuplesort_puttupleslot(fullsort_state, slot);
+ nTuples++;
+
+ /*
+ * If we've reached our minimum group size, then we need to
+ * store the most recent tuple as a pivot.
+ */
+ if (nTuples == minGroupSize)
+ ExecCopySlot(node->group_pivot, slot);
+ }
+ else
+ {
+ /*
+ * If we've already accumulated enough tuples to reach our
+ * minimum group size, then we need to compare any additional
+ * tuples to our pivot tuple to see if we reach the end of
+ * that prefix key group. Only after we find changed prefix
+ * keys can we guarantee sort stability of the tuples we've
+ * already accumulated.
+ */
+ if (isCurrentGroup(node, node->group_pivot, slot))
+ {
+ /*
+ * As long as the prefix keys match the pivot tuple then
+ * load the tuple into the tuplesort.
+ */
+ tuplesort_puttupleslot(fullsort_state, slot);
+ nTuples++;
+ }
+ else
+ {
+ /*
+ * Since the tuple we fetched isn't part of the current
+ * prefix key group we don't want to sort it as part of
+ * the current batch. Instead we use the group_pivot slot
+ * to carry it over to the next batch (even though we
+ * won't actually treat it as a group pivot).
+ */
+ ExecCopySlot(node->group_pivot, slot);
+
+ if (node->bounded)
+ {
+ /*
+ * If the current node has a bound, and we've already
+ * sorted n tuples, then the functional bound
+ * remaining is (original bound - n), so store the
+ * current number of processed tuples for later use
+ * configuring the sort state's bound.
+ */
+ SO2_printf("Changing bound_Done from %ld to %ld\n",
+ node->bound_Done,
+ Min(node->bound, node->bound_Done + nTuples));
+ node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+ }
+
+ /*
+ * Once we find changed prefix keys we can complete the
+ * sort and transition modes to reading out the sorted
+ * tuples.
+ */
+ SO1_printf("Sorting fullsort tuplesort with %ld tuples\n",
+ nTuples);
+ tuplesort_performsort(fullsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, fullsort)
+
+ SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n");
+ node->execution_status = INCSORT_READFULLSORT;
+ break;
+ }
+ }
+
+ /*
+ * Unless we've already transitioned modes to reading from the full
+ * sort state, we assume that having read at least
+ * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're
+ * processing a large group of tuples all having equal prefix keys
+ * (but haven't yet found the final tuple in that prefix key
+ * group), so we need to transition into presorted prefix mode.
+ */
+ if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE &&
+ node->execution_status != INCSORT_READFULLSORT)
+ {
+ /*
+ * The group pivot we have stored has already been put into
+ * the tuplesort; we don't want to carry it over. Since we
+ * haven't yet found the end of the prefix key group, it might
+ * seem like we should keep this, but we don't actually know
+ * how many prefix key groups might be represented in the full
+ * sort state, so we'll let the mode transition function
+ * manage this state for us.
+ */
+ ExecClearTuple(node->group_pivot);
+
+ /*
+ * Unfortunately the tuplesort API doesn't include a way to
+ * retrieve tuples unless a sort has been performed, so we
+ * perform the sort even though we could just as easily rely
+ * on FIFO retrieval semantics when transferring them to the
+ * presorted prefix tuplesort.
+ */
+ SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", nTuples);
+ tuplesort_performsort(fullsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, fullsort)
+
+ /*
+ * If the full sort tuplesort happened to switch into top-n
+ * heapsort mode then we will only be able to retrieve
+ * currentBound tuples (since the tuplesort will have only
+ * retained the top-n tuples). This is safe even though we
+ * haven't yet completed fetching the current prefix key group
+ * because the tuples we've "lost" already sorted "below" the
+ * retained ones, and we're already contractually guaranteed
+ * to not need any more than the currentBound tuples.
+ */
+ if (tuplesort_used_bound(node->fullsort_state))
+ {
+ int64 currentBound = node->bound - node->bound_Done;
+
+ SO2_printf("Read %ld tuples, but setting to %ld because we used bounded sort\n",
+ nTuples, Min(currentBound, nTuples));
+ nTuples = Min(currentBound, nTuples);
+ }
+
+ SO1_printf("Setting n_fullsort_remaining to %ld and calling switchToPresortedPrefixMode()\n",
+ nTuples);
+
+ /*
+ * We might have multiple prefix key groups in the full sort
+ * state, so the mode transition function needs to know that it
+ * needs to move tuples from the full sort to the presorted prefix
+ * tuplesort.
+ */
+ node->n_fullsort_remaining = nTuples;
+
+ /* Transition the tuples to the presorted prefix tuplesort. */
+ switchToPresortedPrefixMode(pstate);
+
+ /*
+ * Since we know we had tuples to move to the presorted prefix
+ * tuplesort, we know that unless that transition has verified
+ * that all tuples belonged to the same prefix key group (in
+ * which case we can go straight to continuing to load tuples
+ * into that tuplesort), we should have a tuple to return
+ * here.
+ *
+ * Either way, the appropriate execution status should have
+ * been set by switchToPresortedPrefixMode(), so we can drop
+ * out of the loop here and let the appropriate path kick in.
+ */
+ break;
+ }
+ }
+ }
+
+ if (node->execution_status == INCSORT_LOADPREFIXSORT)
+ {
+ /*
+ * We only enter this state after the mode transition function has
+ * confirmed all remaining tuples from the full sort state have the
+ * same prefix and moved those tuples to the prefix sort state. That
+ * function has also set a group pivot tuple (which doesn't need to be
+ * carried over; it's already been put into the prefix sort state).
+ */
+ Assert(!TupIsNull(node->group_pivot));
+
+ /*
+ * Read tuples from the outer node and load them into the prefix sort
+ * state until we encounter a tuple whose prefix keys don't match the
+ * current group_pivot tuple, since we can't guarantee sort stability
+ * until we have all tuples matching those prefix keys.
+ */
+ for (;;)
+ {
+ slot = ExecProcNode(outerNode);
+
+ /*
+ * If we've exhausted tuples from the outer node we're done
+ * loading the prefix sort state.
+ */
+ if (TupIsNull(slot))
+ {
+ /*
+ * We need to know later if the outer node has completed to be
+ * able to distinguish between being done with a batch and
+ * being done with the whole node.
+ */
+ node->outerNodeDone = true;
+ break;
+ }
+
+ /*
+ * If the tuple's prefix keys match our pivot tuple, we're not
+ * done yet and can load it into the prefix sort state. If not, we
+ * don't want to sort it as part of the current batch. Instead we
+ * use the group_pivot slot to carry it over to the next batch
+ * (even though we won't actually treat it as a group pivot).
+ */
+ if (isCurrentGroup(node, node->group_pivot, slot))
+ {
+ tuplesort_puttupleslot(node->prefixsort_state, slot);
+ nTuples++;
+ }
+ else
+ {
+ ExecCopySlot(node->group_pivot, slot);
+ break;
+ }
+ }
+
+ /*
+ * Perform the sort and begin returning the tuples to the parent plan
+ * node.
+ */
+ SO1_printf("Sorting presorted prefix tuplesort with >= %ld tuples\n", nTuples);
+ tuplesort_performsort(node->prefixsort_state);
+
+ INSTRUMENT_SORT_GROUP(node, prefixsort)
+
+ SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n");
+ node->execution_status = INCSORT_READPREFIXSORT;
+
+ if (node->bounded)
+ {
+ /*
+ * If the current node has a bound, and we've already sorted n
+ * tuples, then the functional bound remaining is (original bound
+ * - n), so store the current number of processed tuples for use
+ * in configuring sorting bound.
+ */
+ SO2_printf("Changing bound_Done from %ld to %ld\n",
+ node->bound_Done,
+ Min(node->bound, node->bound_Done + nTuples));
+ node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+ }
+ }
+
+ /* Restore to user specified direction. */
+ estate->es_direction = dir;
+
+ /*
+ * Get the first or next tuple from tuplesort. Returns NULL if no more
+ * tuples.
+ */
+ read_sortstate = node->execution_status == INCSORT_READFULLSORT ?
+ fullsort_state : node->prefixsort_state;
+ slot = node->ss.ps.ps_ResultTupleSlot;
+ (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir),
+ false, slot, NULL);
+ return slot;
+}
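The function above amounts to a state machine over execution_status; the transitions it and switchToPresortedPrefixMode() implement can be summarized as follows (the INCSORT_* values come from the accompanying header changes, not shown in this hunk):

    /*
     * INCSORT_LOADFULLSORT
     *   -- prefix keys change, or outer node exhausted -->
     *        INCSORT_READFULLSORT
     *   -- group grows past DEFAULT_MAX_FULL_SORT_GROUP_SIZE -->
     *        switchToPresortedPrefixMode(), which sets either
     *        INCSORT_LOADPREFIXSORT (all remaining tuples share one prefix)
     *        or INCSORT_READPREFIXSORT (multiple prefix groups accumulated)
     *
     * INCSORT_LOADPREFIXSORT
     *   -- prefix keys change, or outer node exhausted -->
     *        INCSORT_READPREFIXSORT
     *
     * INCSORT_READFULLSORT / INCSORT_READPREFIXSORT
     *   -- batch drained -->  INCSORT_LOADFULLSORT, or back through
     *        switchToPresortedPrefixMode() while n_fullsort_remaining > 0
     */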
+
+/* ----------------------------------------------------------------
+ * ExecInitIncrementalSort
+ *
+ * Creates the run-time state information for the sort node
+ * produced by the planner and initializes its outer subtree.
+ * ----------------------------------------------------------------
+ */
+IncrementalSortState *
+ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags)
+{
+ IncrementalSortState *incrsortstate;
+
+ SO_printf("ExecInitIncrementalSort: initializing sort node\n");
+
+ /*
+ * Incremental sort can't be used with EXEC_FLAG_BACKWARD or
+ * EXEC_FLAG_MARK, because the current sort state contains only one sort
+ * batch rather than the full result set.
+ */
+ Assert((eflags & (EXEC_FLAG_BACKWARD |
+ EXEC_FLAG_MARK)) == 0);
+
+ /* Initialize state structure. */
+ incrsortstate = makeNode(IncrementalSortState);
+ incrsortstate->ss.ps.plan = (Plan *) node;
+ incrsortstate->ss.ps.state = estate;
+ incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort;
+
+ incrsortstate->execution_status = INCSORT_LOADFULLSORT;
+ incrsortstate->bounded = false;
+ incrsortstate->outerNodeDone = false;
+ incrsortstate->bound_Done = 0;
+ incrsortstate->fullsort_state = NULL;
+ incrsortstate->prefixsort_state = NULL;
+ incrsortstate->group_pivot = NULL;
+ incrsortstate->transfer_tuple = NULL;
+ incrsortstate->n_fullsort_remaining = 0;
+ incrsortstate->presorted_keys = NULL;
+
+ if (incrsortstate->ss.ps.instrument != NULL)
+ {
+ IncrementalSortGroupInfo *fullsortGroupInfo =
+ &incrsortstate->incsort_info.fullsortGroupInfo;
+ IncrementalSortGroupInfo *prefixsortGroupInfo =
+ &incrsortstate->incsort_info.prefixsortGroupInfo;
+
+ fullsortGroupInfo->groupCount = 0;
+ fullsortGroupInfo->maxDiskSpaceUsed = 0;
+ fullsortGroupInfo->totalDiskSpaceUsed = 0;
+ fullsortGroupInfo->maxMemorySpaceUsed = 0;
+ fullsortGroupInfo->totalMemorySpaceUsed = 0;
+ fullsortGroupInfo->sortMethods = 0;
+ prefixsortGroupInfo->groupCount = 0;
+ prefixsortGroupInfo->maxDiskSpaceUsed = 0;
+ prefixsortGroupInfo->totalDiskSpaceUsed = 0;
+ prefixsortGroupInfo->maxMemorySpaceUsed = 0;
+ prefixsortGroupInfo->totalMemorySpaceUsed = 0;
+ prefixsortGroupInfo->sortMethods = 0;
+ }
+
+ /*
+ * Miscellaneous initialization
+ *
+ * Sort nodes don't initialize their ExprContexts because they never call
+ * ExecQual or ExecProject.
+ */
+
+ /*
+ * Initialize child nodes.
+ *
+ * We shield the child node from the need to support REWIND, BACKWARD, or
+ * MARK/RESTORE.
+ */
+ eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);
+
+ outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags);
+
+ /*
+ * Initialize scan slot and type.
+ */
+ ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple);
+
+ /*
+ * Initialize return slot and type. No need to initialize projection info
+ * because we don't do any projections.
+ */
+ ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple);
+ incrsortstate->ss.ps.ps_ProjInfo = NULL;
+
+ /*
+ * Initialize standalone slots to store a tuple for pivot prefix keys and
+ * for carrying over a tuple from one batch to the next.
+ */
+ incrsortstate->group_pivot =
+ MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)),
+ &TTSOpsMinimalTuple);
+ incrsortstate->transfer_tuple =
+ MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)),
+ &TTSOpsMinimalTuple);
+
+ SO_printf("ExecInitIncrementalSort: sort node initialized\n");
+
+ return incrsortstate;
+}
+
+/* ----------------------------------------------------------------
+ * ExecEndIncrementalSort(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndIncrementalSort(IncrementalSortState *node)
+{
+ SO_printf("ExecEndIncrementalSort: shutting down sort node\n");
+
+ /* clean out the scan tuple */
+ ExecClearTuple(node->ss.ss_ScanTupleSlot);
+ /* must drop pointer to sort result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+	/* must drop standalone tuple slots from outer node */
+ ExecDropSingleTupleTableSlot(node->group_pivot);
+ ExecDropSingleTupleTableSlot(node->transfer_tuple);
+
+ /*
+ * Release tuplesort resources.
+ */
+ if (node->fullsort_state != NULL)
+ {
+ tuplesort_end(node->fullsort_state);
+ node->fullsort_state = NULL;
+ }
+ if (node->prefixsort_state != NULL)
+ {
+ tuplesort_end(node->prefixsort_state);
+ node->prefixsort_state = NULL;
+ }
+
+ /*
+ * Shut down the subplan.
+ */
+ ExecEndNode(outerPlanState(node));
+
+ SO_printf("ExecEndIncrementalSort: sort node shutdown\n");
+}
+
+void
+ExecReScanIncrementalSort(IncrementalSortState *node)
+{
+ PlanState *outerPlan = outerPlanState(node);
+
+ /*
+	 * Incremental sort doesn't support efficient rescan even when parameters
+ * haven't changed (e.g., rewind) because unlike regular sort we don't
+ * store all tuples at once for the full sort.
+ *
+ * So even if EXEC_FLAG_REWIND is set we just reset all of our state and
+ * reexecute the sort along with the child node below us.
+ *
+	 * In theory if we've only filled the full sort with one batch (and haven't
+ * reset it for a new batch yet) then we could efficiently rewind, but
+ * that seems a narrow enough case that it's not worth handling specially
+ * at this time.
+ */
+
+ /* must drop pointer to sort result tuple */
+ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+ if (node->group_pivot != NULL)
+ ExecClearTuple(node->group_pivot);
+ if (node->transfer_tuple != NULL)
+ ExecClearTuple(node->transfer_tuple);
+
+ node->bounded = false;
+ node->outerNodeDone = false;
+ node->n_fullsort_remaining = 0;
+ node->bound_Done = 0;
+ node->presorted_keys = NULL;
+
+ node->execution_status = INCSORT_LOADFULLSORT;
+
+ /*
+	 * If we've already set up either of the sort states, we need to reset
+	 * them. We could end them and null out the pointers, but there's no
+	 * reason to repay the setup cost, and because the pivot comparator state
+	 * setup is guarded similarly, doing so might actually cause a leak.
+ */
+ if (node->fullsort_state != NULL)
+ {
+ tuplesort_reset(node->fullsort_state);
+ node->fullsort_state = NULL;
+ }
+ if (node->prefixsort_state != NULL)
+ {
+ tuplesort_reset(node->prefixsort_state);
+ node->prefixsort_state = NULL;
+ }
+
+ /*
+ * If chgParam of subnode is not null, theni the plan will be re-scanned
+ * by the first ExecProcNode.
+ */
+ if (outerPlan->chgParam == NULL)
+ ExecReScan(outerPlan);
+}
+
+/* ----------------------------------------------------------------
+ * Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortEstimate
+ *
+ * Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo));
+ size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo));
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeDSM
+ *
+ * Initialize DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt)
+{
+ Size size;
+
+ /* don't need this if not instrumenting or no workers */
+ if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+ return;
+
+ size = offsetof(SharedIncrementalSortInfo, sinfo)
+ + pcxt->nworkers * sizeof(IncrementalSortInfo);
+ node->shared_info = shm_toc_allocate(pcxt->toc, size);
+ /* ensure any unfilled slots will contain zeroes */
+ memset(node->shared_info, 0, size);
+ node->shared_info->num_workers = pcxt->nworkers;
+ shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+ node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeWorker
+ *
+ * Attach worker to DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt)
+{
+ node->shared_info =
+ shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+ node->am_worker = true;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortRetrieveInstrumentation
+ *
+ * Transfer sort statistics from DSM to private memory.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node)
+{
+ Size size;
+ SharedIncrementalSortInfo *si;
+
+ if (node->shared_info == NULL)
+ return;
+
+ size = offsetof(SharedIncrementalSortInfo, sinfo)
+ + node->shared_info->num_workers * sizeof(IncrementalSortInfo);
+ si = palloc(size);
+ memcpy(si, node->shared_info, size);
+ node->shared_info = si;
+}
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index 5d1debc196..9d2bfd7ed6 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -93,7 +93,8 @@ ExecSort(PlanState *pstate)
plannode->collations,
plannode->nullsFirst,
work_mem,
- NULL, node->randomAccess);
+ NULL,
+ node->randomAccess);
if (node->bounded)
tuplesort_set_bound(tuplesortstate, node->bound);
node->tuplesortstate = (void *) tuplesortstate;
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index eaf93c64b8..f9d86859ee 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -927,6 +927,24 @@ _copyMaterial(const Material *from)
}
+/*
+ * CopySortFields
+ *
+ * This function copies the fields of the Sort node. It is used by
+ * all the copy functions for classes which inherit from Sort.
+ */
+static void
+CopySortFields(const Sort *from, Sort *newnode)
+{
+ CopyPlanFields((const Plan *) from, (Plan *) newnode);
+
+ COPY_SCALAR_FIELD(numCols);
+ COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber));
+ COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid));
+ COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid));
+ COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool));
+}
+
/*
* _copySort
*/
@@ -938,13 +956,29 @@ _copySort(const Sort *from)
/*
* copy node superclass fields
*/
- CopyPlanFields((const Plan *) from, (Plan *) newnode);
+ CopySortFields(from, newnode);
- COPY_SCALAR_FIELD(numCols);
- COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber));
- COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid));
- COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid));
- COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool));
+ return newnode;
+}
+
+
+/*
+ * _copyIncrementalSort
+ */
+static IncrementalSort *
+_copyIncrementalSort(const IncrementalSort *from)
+{
+ IncrementalSort *newnode = makeNode(IncrementalSort);
+
+ /*
+ * copy node superclass fields
+ */
+ CopySortFields((const Sort *) from, (Sort *) newnode);
+
+ /*
+ * copy remainder of node
+ */
+ COPY_SCALAR_FIELD(nPresortedCols);
return newnode;
}
@@ -4898,6 +4932,9 @@ copyObjectImpl(const void *from)
case T_Sort:
retval = _copySort(from);
break;
+ case T_IncrementalSort:
+ retval = _copyIncrementalSort(from);
+ break;
case T_Group:
retval = _copyGroup(from);
break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index f4aecdcbcd..35ed8c0d53 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -837,10 +837,8 @@ _outMaterial(StringInfo str, const Material *node)
}
static void
-_outSort(StringInfo str, const Sort *node)
+_outSortInfo(StringInfo str, const Sort *node)
{
- WRITE_NODE_TYPE("SORT");
-
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(numCols);
@@ -850,6 +848,24 @@ _outSort(StringInfo str, const Sort *node)
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
}
+static void
+_outSort(StringInfo str, const Sort *node)
+{
+ WRITE_NODE_TYPE("SORT");
+
+ _outSortInfo(str, node);
+}
+
+static void
+_outIncrementalSort(StringInfo str, const IncrementalSort *node)
+{
+ WRITE_NODE_TYPE("INCREMENTALSORT");
+
+ _outSortInfo(str, (const Sort *) node);
+
+ WRITE_INT_FIELD(nPresortedCols);
+}
+
static void
_outUnique(StringInfo str, const Unique *node)
{
@@ -3786,6 +3802,9 @@ outNode(StringInfo str, const void *obj)
case T_Sort:
_outSort(str, obj);
break;
+ case T_IncrementalSort:
+ _outIncrementalSort(str, obj);
+ break;
case T_Unique:
_outUnique(str, obj);
break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index d5b23a3479..2a2f39bf04 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -2150,12 +2150,13 @@ _readMaterial(void)
}
/*
- * _readSort
+ * ReadCommonSort
+ * Assign the basic stuff of all nodes that inherit from Sort
*/
-static Sort *
-_readSort(void)
+static void
+ReadCommonSort(Sort *local_node)
{
- READ_LOCALS(Sort);
+ READ_TEMP_LOCALS();
ReadCommonPlan(&local_node->plan);
@@ -2164,6 +2165,32 @@ _readSort(void)
READ_OID_ARRAY(sortOperators, local_node->numCols);
READ_OID_ARRAY(collations, local_node->numCols);
READ_BOOL_ARRAY(nullsFirst, local_node->numCols);
+}
+
+/*
+ * _readSort
+ */
+static Sort *
+_readSort(void)
+{
+ READ_LOCALS_NO_FIELDS(Sort);
+
+ ReadCommonSort(local_node);
+
+ READ_DONE();
+}
+
+/*
+ * _readIncrementalSort
+ */
+static IncrementalSort *
+_readIncrementalSort(void)
+{
+ READ_LOCALS(IncrementalSort);
+
+ ReadCommonSort(&local_node->sort);
+
+ READ_INT_FIELD(nPresortedCols);
READ_DONE();
}
@@ -2801,6 +2828,8 @@ parseNodeString(void)
return_value = _readMaterial();
else if (MATCH("SORT", 4))
return_value = _readSort();
+ else if (MATCH("INCREMENTALSORT", 15))
+ return_value = _readIncrementalSort();
else if (MATCH("GROUP", 5))
return_value = _readGroup();
else if (MATCH("AGG", 3))
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 905bbe77d8..ccf46dd0aa 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -3881,6 +3881,10 @@ print_path(PlannerInfo *root, Path *path, int indent)
ptype = "Sort";
subpath = ((SortPath *) path)->subpath;
break;
+ case T_IncrementalSortPath:
+ ptype = "IncrementalSort";
+ subpath = ((SortPath *) path)->subpath;
+ break;
case T_GroupPath:
ptype = "Group";
subpath = ((GroupPath *) path)->subpath;
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 9e7e57f118..0eef5d7707 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -128,6 +128,7 @@ bool enable_indexonlyscan = true;
bool enable_bitmapscan = true;
bool enable_tidscan = true;
bool enable_sort = true;
+bool enable_incrementalsort = true;
bool enable_hashagg = true;
bool enable_hashagg_disk = true;
bool enable_groupingsets_hash_disk = false;
@@ -1648,9 +1649,9 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
}
/*
- * cost_sort
- * Determines and returns the cost of sorting a relation, including
- * the cost of reading the input data.
+ * cost_tuplesort
+ * Determines and returns the cost of sorting a relation using tuplesort,
+ * not including the cost of reading the input data.
*
* If the total volume of data to sort is less than sort_mem, we will do
* an in-memory sort, which requires no I/O and about t*log2(t) tuple
@@ -1677,39 +1678,23 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
* specifying nonzero comparison_cost; typically that's used for any extra
* work that has to be done to prepare the inputs to the comparison operators.
*
- * 'pathkeys' is a list of sort keys
- * 'input_cost' is the total cost for reading the input data
* 'tuples' is the number of tuples in the relation
* 'width' is the average tuple width in bytes
* 'comparison_cost' is the extra cost per comparison, if any
* 'sort_mem' is the number of kilobytes of work memory allowed for the sort
* 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
- *
- * NOTE: some callers currently pass NIL for pathkeys because they
- * can't conveniently supply the sort keys. Since this routine doesn't
- * currently do anything with pathkeys anyway, that doesn't matter...
- * but if it ever does, it should react gracefully to lack of key data.
- * (Actually, the thing we'd most likely be interested in is just the number
- * of sort keys, which all callers *could* supply.)
*/
-void
-cost_sort(Path *path, PlannerInfo *root,
- List *pathkeys, Cost input_cost, double tuples, int width,
- Cost comparison_cost, int sort_mem,
- double limit_tuples)
+static void
+cost_tuplesort(Cost *startup_cost, Cost *run_cost,
+ double tuples, int width,
+ Cost comparison_cost, int sort_mem,
+ double limit_tuples)
{
- Cost startup_cost = input_cost;
- Cost run_cost = 0;
double input_bytes = relation_byte_size(tuples, width);
double output_bytes;
double output_tuples;
long sort_mem_bytes = sort_mem * 1024L;
- if (!enable_sort)
- startup_cost += disable_cost;
-
- path->rows = tuples;
-
/*
* We want to be sure the cost of a sort is never estimated as zero, even
* if passed-in tuple count is zero. Besides, mustn't do log(0)...
@@ -1748,7 +1733,7 @@ cost_sort(Path *path, PlannerInfo *root,
*
* Assume about N log2 N comparisons
*/
- startup_cost += comparison_cost * tuples * LOG2(tuples);
+ *startup_cost = comparison_cost * tuples * LOG2(tuples);
/* Disk costs */
@@ -1759,7 +1744,7 @@ cost_sort(Path *path, PlannerInfo *root,
log_runs = 1.0;
npageaccesses = 2.0 * npages * log_runs;
/* Assume 3/4ths of accesses are sequential, 1/4th are not */
- startup_cost += npageaccesses *
+ *startup_cost += npageaccesses *
(seq_page_cost * 0.75 + random_page_cost * 0.25);
}
else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
@@ -1770,12 +1755,12 @@ cost_sort(Path *path, PlannerInfo *root,
* factor is a bit higher than for quicksort. Tweak it so that the
* cost curve is continuous at the crossover point.
*/
- startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
+ *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
}
else
{
/* We'll use plain quicksort on all the input tuples */
- startup_cost += comparison_cost * tuples * LOG2(tuples);
+ *startup_cost = comparison_cost * tuples * LOG2(tuples);
}
/*
@@ -1786,8 +1771,143 @@ cost_sort(Path *path, PlannerInfo *root,
* here --- the upper LIMIT will pro-rate the run cost so we'd be double
* counting the LIMIT otherwise.
*/
- run_cost += cpu_operator_cost * tuples;
+ *run_cost = cpu_operator_cost * tuples;
+}
+/*
+ * cost_incremental_sort
+ * Determines and returns the cost of sorting a relation incrementally, when
+ * the input path is presorted by a prefix of the pathkeys.
+ *
+ * 'presorted_keys' is the number of leading pathkeys by which the input path
+ * is sorted.
+ *
+ * We estimate the number of groups into which the relation is divided by the
+ * leading pathkeys, and then calculate the cost of sorting a single group
+ * with tuplesort using cost_tuplesort().
+ */
+void
+cost_incremental_sort(Path *path,
+ PlannerInfo *root, List *pathkeys, int presorted_keys,
+ Cost input_startup_cost, Cost input_total_cost,
+ double input_tuples, int width, Cost comparison_cost, int sort_mem,
+ double limit_tuples)
+{
+ Cost startup_cost = 0,
+ run_cost = 0,
+ input_run_cost = input_total_cost - input_startup_cost;
+ double group_tuples,
+ input_groups;
+ Cost group_startup_cost,
+ group_run_cost,
+ group_input_run_cost;
+ List *presortedExprs = NIL;
+ ListCell *l;
+ int i = 0;
+
+ Assert(presorted_keys != 0);
+
+ /*
+ * We want to be sure the cost of a sort is never estimated as zero, even
+ * if passed-in tuple count is zero. Besides, mustn't do log(0)...
+ */
+ if (input_tuples < 2.0)
+ input_tuples = 2.0;
+
+ /* Extract presorted keys as list of expressions */
+ foreach(l, pathkeys)
+ {
+ PathKey *key = (PathKey *) lfirst(l);
+ EquivalenceMember *member = (EquivalenceMember *)
+ linitial(key->pk_eclass->ec_members);
+
+ presortedExprs = lappend(presortedExprs, member->em_expr);
+
+ i++;
+ if (i >= presorted_keys)
+ break;
+ }
+
+ /* Estimate number of groups with equal presorted keys */
+ input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL);
+ group_tuples = input_tuples / input_groups;
+ group_input_run_cost = input_run_cost / input_groups;
+
+ /*
+	 * Estimate the average cost of sorting one group in which the presorted
+	 * keys are equal. Incremental sort is sensitive to the distribution of
+	 * tuples across groups, and we're relying on quite rough assumptions
+	 * here. Thus, we're pessimistic about incremental sort performance and
+	 * increase its assumed average group size by half.
+ */
+ cost_tuplesort(&group_startup_cost, &group_run_cost,
+ 1.5 * group_tuples, width, comparison_cost, sort_mem,
+ limit_tuples);
+
+ /*
+ * Startup cost of incremental sort is the startup cost of its first group
+ * plus the cost of its input.
+ */
+ startup_cost += group_startup_cost
+ + input_startup_cost + group_input_run_cost;
+
+ /*
+ * After we started producing tuples from the first group, the cost of
+ * producing all the tuples is given by the cost to finish processing this
+ * group, plus the total cost to process the remaining groups, plus the
+ * remaining cost of input.
+ */
+ run_cost += group_run_cost
+ + (group_run_cost + group_startup_cost) * (input_groups - 1)
+ + group_input_run_cost * (input_groups - 1);
+
+ /*
+ * Incremental sort adds some overhead by itself. Firstly, it has to
+ * detect the sort groups. This is roughly equal to one extra copy and
+ * comparison per tuple. Secondly, it has to reset the tuplesort context
+ * for every group.
+ */
+ run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
+ run_cost += 2.0 * cpu_tuple_cost * input_groups;
+
+ path->rows = input_tuples;
+ path->startup_cost = startup_cost;
+ path->total_cost = startup_cost + run_cost;
+}
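+
+/*
+ * Worked example with assumed numbers: for input_tuples = 10000 split into
+ * input_groups = 100 groups, each group holds about 100 tuples, so
+ * cost_tuplesort() above is asked to cost a 150-tuple sort (the 1.5x
+ * pessimism factor). The startup cost then covers only that first small
+ * sort plus the input's startup cost and 1/100th of its run cost, which is
+ * why bounded (LIMIT) queries benefit the most.
+ */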
+
+/*
+ * cost_sort
+ * Determines and returns the cost of sorting a relation, including
+ * the cost of reading the input data.
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys. Since this routine doesn't
+ * currently do anything with pathkeys anyway, that doesn't matter...
+ * but if it ever does, it should react gracefully to lack of key data.
+ * (Actually, the thing we'd most likely be interested in is just the number
+ * of sort keys, which all callers *could* supply.)
+ */
+void
+cost_sort(Path *path, PlannerInfo *root,
+ List *pathkeys, Cost input_cost, double tuples, int width,
+ Cost comparison_cost, int sort_mem,
+ double limit_tuples)
+{
+ Cost startup_cost;
+ Cost run_cost;
+
+ cost_tuplesort(&startup_cost, &run_cost,
+ tuples, width,
+ comparison_cost, sort_mem,
+ limit_tuples);
+
+ if (!enable_sort)
+ startup_cost += disable_cost;
+
+ startup_cost += input_cost;
+
+ path->rows = tuples;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 71b9d42c99..21e3f5a987 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -334,6 +334,60 @@ pathkeys_contained_in(List *keys1, List *keys2)
return false;
}
+/*
+ * pathkeys_count_contained_in
+ * Same as pathkeys_contained_in, but also sets length of longest
+ * common prefix of keys1 and keys2.
+ */
+bool
+pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common)
+{
+ int n = 0;
+ ListCell *key1,
+ *key2;
+
+ /*
+	 * See if we can avoid looping through both lists. This optimization
+ * gains us several percent in planning time in a worst-case test.
+ */
+ if (keys1 == keys2)
+ {
+ *n_common = list_length(keys1);
+ return true;
+ }
+ else if (keys1 == NIL)
+ {
+ *n_common = 0;
+ return true;
+ }
+ else if (keys2 == NIL)
+ {
+ *n_common = 0;
+ return false;
+ }
+
+ /*
+ * If both lists are non-empty, iterate through both to find out how many
+ * items are shared.
+ */
+ forboth(key1, keys1, key2, keys2)
+ {
+ PathKey *pathkey1 = (PathKey *) lfirst(key1);
+ PathKey *pathkey2 = (PathKey *) lfirst(key2);
+
+ if (pathkey1 != pathkey2)
+ {
+ *n_common = n;
+ return false;
+ }
+ n++;
+ }
+
+ /* If we ended with a null value, then we've processed the whole list. */
+ *n_common = n;
+ return (key1 == NULL);
+}
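+
+/*
+ * Example: with keys1 = (a, b, c) and keys2 = (a, b), we return false and
+ * set *n_common = 2; an incremental sort can then treat (a, b) as presorted
+ * and only needs to sort on c.
+ */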
+
/*
* get_cheapest_path_for_pathkeys
* Find the cheapest path (according to the specified criterion) that
@@ -1786,26 +1840,26 @@ right_merge_direction(PlannerInfo *root, PathKey *pathkey)
* Count the number of pathkeys that are useful for meeting the
* query's requested output ordering.
*
- * Unlike merge pathkeys, this is an all-or-nothing affair: it does us
- * no good to order by just the first key(s) of the requested ordering.
- * So the result is always either 0 or list_length(root->query_pathkeys).
+ * Because we have the possibility of incremental sort, a prefix of the
+ * requested keys is potentially useful for improving the performance of the
+ * requested ordering. Thus we return 0 if no valuable keys are found, or
+ * the number of leading keys shared by the list and the requested ordering.
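+ *
+ * For example, if the query requests ORDER BY a, b and a path is already
+ * sorted by a, we return 1, since an incremental sort can treat "a" as
+ * presorted and sort only on "b".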
*/
static int
pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
{
+ int n_common_pathkeys;
+
if (root->query_pathkeys == NIL)
return 0; /* no special ordering requested */
if (pathkeys == NIL)
return 0; /* unordered path */
- if (pathkeys_contained_in(root->query_pathkeys, pathkeys))
- {
- /* It's useful ... or at least the first N keys are */
- return list_length(root->query_pathkeys);
- }
+ (void) pathkeys_count_contained_in(root->query_pathkeys, pathkeys,
+ &n_common_pathkeys);
- return 0; /* path ordering not useful */
+ return n_common_pathkeys;
}
/*
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index fc25908dc6..6d26bfbeb5 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -98,6 +98,8 @@ static Plan *create_projection_plan(PlannerInfo *root,
int flags);
static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe);
static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags);
+static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root,
+ IncrementalSortPath *best_path, int flags);
static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path);
static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path,
int flags);
@@ -244,6 +246,10 @@ static MergeJoin *make_mergejoin(List *tlist,
static Sort *make_sort(Plan *lefttree, int numCols,
AttrNumber *sortColIdx, Oid *sortOperators,
Oid *collations, bool *nullsFirst);
+static IncrementalSort *make_incrementalsort(Plan *lefttree,
+ int numCols, int nPresortedCols,
+ AttrNumber *sortColIdx, Oid *sortOperators,
+ Oid *collations, bool *nullsFirst);
static Plan *prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys,
Relids relids,
const AttrNumber *reqColIdx,
@@ -258,6 +264,8 @@ static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec,
Relids relids);
static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys,
Relids relids);
+static IncrementalSort *make_incrementalsort_from_pathkeys(Plan *lefttree,
+ List *pathkeys, Relids relids, int nPresortedCols);
static Sort *make_sort_from_groupcols(List *groupcls,
AttrNumber *grpColIdx,
Plan *lefttree);
@@ -460,6 +468,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
(SortPath *) best_path,
flags);
break;
+ case T_IncrementalSort:
+ plan = (Plan *) create_incrementalsort_plan(root,
+ (IncrementalSortPath *) best_path,
+ flags);
+ break;
case T_Group:
plan = (Plan *) create_group_plan(root,
(GroupPath *) best_path);
@@ -1994,6 +2007,32 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
return plan;
}
+/*
+ * create_incrementalsort_plan
+ *
+ * Do the same as create_sort_plan, but create an IncrementalSort plan
+ * instead.
+ */
+static IncrementalSort *
+create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path,
+ int flags)
+{
+ IncrementalSort *plan;
+ Plan *subplan;
+
+ /* See comments in create_sort_plan() above */
+ subplan = create_plan_recurse(root, best_path->spath.subpath,
+ flags | CP_SMALL_TLIST);
+ plan = make_incrementalsort_from_pathkeys(subplan,
+ best_path->spath.path.pathkeys,
+ IS_OTHER_REL(best_path->spath.subpath->parent) ?
+ best_path->spath.path.parent->relids : NULL,
+ best_path->nPresortedCols);
+
+ copy_generic_path_info(&plan->sort.plan, (Path *) best_path);
+
+ return plan;
+}
+
/*
* create_group_plan
*
@@ -5090,6 +5129,12 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
Plan *lefttree = plan->plan.lefttree;
Path sort_path; /* dummy for result of cost_sort */
+ /*
+ * This function shouldn't have to deal with IncrementalSort plans because
+ * they are only created from corresponding Path nodes.
+ */
+ Assert(IsA(plan, Sort));
+
cost_sort(&sort_path, root, NIL,
lefttree->total_cost,
lefttree->plan_rows,
@@ -5677,9 +5722,12 @@ make_sort(Plan *lefttree, int numCols,
AttrNumber *sortColIdx, Oid *sortOperators,
Oid *collations, bool *nullsFirst)
{
- Sort *node = makeNode(Sort);
- Plan *plan = &node->plan;
+ Sort *node;
+ Plan *plan;
+ node = makeNode(Sort);
+
+ plan = &node->plan;
plan->targetlist = lefttree->targetlist;
plan->qual = NIL;
plan->lefttree = lefttree;
@@ -5693,6 +5741,37 @@ make_sort(Plan *lefttree, int numCols,
return node;
}
+/*
+ * make_incrementalsort --- basic routine to build an IncrementalSort plan node
+ *
+ * Caller must have built the sortColIdx, sortOperators, collations, and
+ * nullsFirst arrays already.
+ */
+static IncrementalSort *
+make_incrementalsort(Plan *lefttree, int numCols, int nPresortedCols,
+ AttrNumber *sortColIdx, Oid *sortOperators,
+ Oid *collations, bool *nullsFirst)
+{
+ IncrementalSort *node;
+ Plan *plan;
+
+ node = makeNode(IncrementalSort);
+
+ plan = &node->sort.plan;
+ plan->targetlist = lefttree->targetlist;
+ plan->qual = NIL;
+ plan->lefttree = lefttree;
+ plan->righttree = NULL;
+ node->nPresortedCols = nPresortedCols;
+ node->sort.numCols = numCols;
+ node->sort.sortColIdx = sortColIdx;
+ node->sort.sortOperators = sortOperators;
+ node->sort.collations = collations;
+ node->sort.nullsFirst = nullsFirst;
+
+ return node;
+}
+
/*
* prepare_sort_from_pathkeys
* Prepare to sort according to given pathkeys
@@ -6039,6 +6118,42 @@ make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids)
collations, nullsFirst);
}
+/*
+ * make_incrementalsort_from_pathkeys
+ *	  Create an incremental sort plan to sort according to given pathkeys
+ *
+ * 'lefttree' is the node which yields input tuples
+ * 'pathkeys' is the list of pathkeys by which the result is to be sorted
+ * 'relids' is the set of relations required by prepare_sort_from_pathkeys()
+ * 'nPresortedCols' is the number of presorted columns in input tuples
+ */
+static IncrementalSort *
+make_incrementalsort_from_pathkeys(Plan *lefttree, List *pathkeys,
+ Relids relids, int nPresortedCols)
+{
+ int numsortkeys;
+ AttrNumber *sortColIdx;
+ Oid *sortOperators;
+ Oid *collations;
+ bool *nullsFirst;
+
+ /* Compute sort column info, and adjust lefttree as needed */
+ lefttree = prepare_sort_from_pathkeys(lefttree, pathkeys,
+ relids,
+ NULL,
+ false,
+ &numsortkeys,
+ &sortColIdx,
+ &sortOperators,
+ &collations,
+ &nullsFirst);
+
+	/* Now build the IncrementalSort node */
+ return make_incrementalsort(lefttree, numsortkeys, nPresortedCols,
+ sortColIdx, sortOperators,
+ collations, nullsFirst);
+}
+
/*
* make_sort_from_sortclauses
* Create sort plan to sort according to given sortclauses
@@ -6774,6 +6889,7 @@ is_projection_capable_path(Path *path)
case T_Hash:
case T_Material:
case T_Sort:
+ case T_IncrementalSort:
case T_Unique:
case T_SetOp:
case T_LockRows:
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f52226ccec..aeb83841d7 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4924,13 +4924,16 @@ create_distinct_paths(PlannerInfo *root,
* Build a new upperrel containing Paths for ORDER BY evaluation.
*
* All paths in the result must satisfy the ORDER BY ordering.
- * The only new path we need consider is an explicit sort on the
- * cheapest-total existing path.
+ * The only new paths we need consider are an explicit full sort
+ * and incremental sort on the cheapest-total existing path.
*
* input_rel: contains the source-data Paths
* target: the output tlist the result Paths must emit
* limit_tuples: estimated bound on the number of output tuples,
* or -1 if no LIMIT or couldn't estimate
+ *
+ * XXX This only looks at sort_pathkeys. I wonder if it needs to look at the
+ * other pathkeys (grouping, ...) like generate_useful_gather_paths.
*/
static RelOptInfo *
create_ordered_paths(PlannerInfo *root,
@@ -4964,29 +4967,77 @@ create_ordered_paths(PlannerInfo *root,
foreach(lc, input_rel->pathlist)
{
- Path *path = (Path *) lfirst(lc);
+ Path *input_path = (Path *) lfirst(lc);
+ Path *sorted_path = input_path;
bool is_sorted;
+ int presorted_keys;
- is_sorted = pathkeys_contained_in(root->sort_pathkeys,
- path->pathkeys);
- if (path == cheapest_input_path || is_sorted)
+ is_sorted = pathkeys_count_contained_in(root->sort_pathkeys,
+ input_path->pathkeys, &presorted_keys);
+
+ if (is_sorted)
{
- if (!is_sorted)
+ /* Use the input path as is, but add a projection step if needed */
+ if (sorted_path->pathtarget != target)
+ sorted_path = apply_projection_to_path(root, ordered_rel,
+ sorted_path, target);
+
+ add_path(ordered_rel, sorted_path);
+ }
+ else
+ {
+ /*
+ * Try adding an explicit sort, but only to the cheapest total path
+ * since a full sort should generally add the same cost to all
+ * paths.
+ */
+ if (input_path == cheapest_input_path)
{
- /* An explicit sort here can take advantage of LIMIT */
- path = (Path *) create_sort_path(root,
- ordered_rel,
- path,
- root->sort_pathkeys,
- limit_tuples);
+ /*
+ * Sort the cheapest input path. An explicit sort here can
+ * take advantage of LIMIT.
+ */
+ sorted_path = (Path *) create_sort_path(root,
+ ordered_rel,
+ input_path,
+ root->sort_pathkeys,
+ limit_tuples);
+ /* Add projection step if needed */
+ if (sorted_path->pathtarget != target)
+ sorted_path = apply_projection_to_path(root, ordered_rel,
+ sorted_path, target);
+
+ add_path(ordered_rel, sorted_path);
}
- /* Add projection step if needed */
- if (path->pathtarget != target)
- path = apply_projection_to_path(root, ordered_rel,
- path, target);
+ /*
+ * If incremental sort is enabled, then try it as well. Unlike with
+ * regular sorts, we can't just look at the cheapest path, because
+ * the cost of incremental sort depends on how well presorted the
+ * path is. Additionally incremental sort may enable a cheaper
+ * startup path to win out despite higher total cost.
+ */
+ if (!enable_incrementalsort)
+ continue;
- add_path(ordered_rel, path);
+ /* Likewise, if the path can't be used for incremental sort. */
+ if (!presorted_keys)
+ continue;
+
+ /* Also consider incremental sort. */
+ sorted_path = (Path *) create_incremental_sort_path(root,
+ ordered_rel,
+ input_path,
+ root->sort_pathkeys,
+ presorted_keys,
+ limit_tuples);
+
+ /* Add projection step if needed */
+ if (sorted_path->pathtarget != target)
+ sorted_path = apply_projection_to_path(root, ordered_rel,
+ sorted_path, target);
+
+ add_path(ordered_rel, sorted_path);
}
}
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index 3dcded506b..2b676bf406 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -678,6 +678,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
case T_Material:
case T_Sort:
+ case T_IncrementalSort:
case T_Unique:
case T_SetOp:
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index 3650e8329d..b02fcb9bfe 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -2688,6 +2688,7 @@ finalize_plan(PlannerInfo *root, Plan *plan,
case T_Hash:
case T_Material:
case T_Sort:
+ case T_IncrementalSort:
case T_Unique:
case T_SetOp:
case T_Group:
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 8ba8122ee2..4538ed88e0 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2753,6 +2753,57 @@ create_set_projection_path(PlannerInfo *root,
return pathnode;
}
+/*
+ * create_incremental_sort_path
+ * Creates a pathnode that represents performing an incremental sort.
+ *
+ * 'rel' is the parent relation associated with the result
+ * 'subpath' is the path representing the source of data
+ * 'pathkeys' represents the desired sort order
+ * 'presorted_keys' is the number of keys by which the input path is
+ * already sorted
+ * 'limit_tuples' is the estimated bound on the number of output tuples,
+ * or -1 if no LIMIT or couldn't estimate
+ */
+SortPath *
+create_incremental_sort_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ List *pathkeys,
+ int presorted_keys,
+ double limit_tuples)
+{
+ IncrementalSortPath *sort = makeNode(IncrementalSortPath);
+ SortPath *pathnode = &sort->spath;
+
+ pathnode->path.pathtype = T_IncrementalSort;
+ pathnode->path.parent = rel;
+ /* Sort doesn't project, so use source path's pathtarget */
+ pathnode->path.pathtarget = subpath->pathtarget;
+ /* For now, assume we are above any joins, so no parameterization */
+ pathnode->path.param_info = NULL;
+ pathnode->path.parallel_aware = false;
+ pathnode->path.parallel_safe = rel->consider_parallel &&
+ subpath->parallel_safe;
+ pathnode->path.parallel_workers = subpath->parallel_workers;
+ pathnode->path.pathkeys = pathkeys;
+
+ pathnode->subpath = subpath;
+
+ cost_incremental_sort(&pathnode->path,
+ root, pathkeys, presorted_keys,
+ subpath->startup_cost,
+ subpath->total_cost,
+ subpath->rows,
+ subpath->pathtarget->width,
+ 0.0, /* XXX comparison_cost shouldn't be 0? */
+ work_mem, limit_tuples);
+
+ sort->nPresortedCols = presorted_keys;
+
+ return pathnode;
+}
+
/*
* create_sort_path
* Creates a pathnode that represents performing an explicit sort.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 477af5d552..03a22d71ac 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -991,6 +991,15 @@ static struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+ {
+ {"enable_incrementalsort", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of incremental sort steps."),
+ NULL
+ },
+ &enable_incrementalsort,
+ true,
+ NULL, NULL, NULL
+ },
{
{"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of hashed aggregation plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 91fa185053..1ae8b77306 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -360,6 +360,7 @@
#enable_parallel_append = on
#enable_seqscan = on
#enable_sort = on
+#enable_incrementalsort = on
#enable_tidscan = on
#enable_partitionwise_join = off
#enable_partitionwise_aggregate = off
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index d02e676aa3..cc33a85731 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -125,6 +125,16 @@
#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \
(state)->worker >= 0 ? 1 : 2)
+/*
+ * Initial size of the memtuples array. We're trying to select this size so
+ * that the array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the
+ * overhead of allocation might possibly be lowered. However, we don't
+ * consider array sizes less than 1024.
+ */
+#define INITIAL_MEMTUPSIZE Max(1024, \
+ ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
+
/* GUC variables */
#ifdef TRACE_SORT
bool trace_sort = false;
@@ -241,6 +251,14 @@ struct Tuplesortstate
int64 allowedMem; /* total memory allowed, in bytes */
int maxTapes; /* number of tapes (Knuth's T) */
int tapeRange; /* maxTapes-1 (Knuth's P) */
+ int64 maxSpace; /* maximum amount of space occupied among sort
+ * of groups, either in-memory or on-disk */
+ bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk
+ * space, false when it's value for in-memory
+ * space */
+ TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
+ MemoryContext maincontext; /* memory context for tuple sort metadata that
+ * persists across multiple batches */
MemoryContext sortcontext; /* memory context holding most sort data */
MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
@@ -591,6 +609,7 @@ struct Sharedsort
static Tuplesortstate *tuplesort_begin_common(int workMem,
SortCoordinate coordinate,
bool randomAccess);
+static void tuplesort_begin_batch(Tuplesortstate *state);
static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
static bool consider_abort_common(Tuplesortstate *state);
static void inittapes(Tuplesortstate *state, bool mergeruns);
@@ -647,6 +666,8 @@ static void worker_freeze_result_tape(Tuplesortstate *state);
static void worker_nomergeruns(Tuplesortstate *state);
static void leader_takeover_tapes(Tuplesortstate *state);
static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
+static void tuplesort_free(Tuplesortstate *state);
+static void tuplesort_updatemax(Tuplesortstate *state);
/*
* Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
@@ -682,8 +703,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
bool randomAccess)
{
Tuplesortstate *state;
+ MemoryContext maincontext;
MemoryContext sortcontext;
- MemoryContext tuplecontext;
MemoryContext oldcontext;
/* See leader_takeover_tapes() remarks on randomAccess support */
@@ -691,31 +712,31 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
elog(ERROR, "random access disallowed under parallel sort");
/*
- * Create a working memory context for this sort operation. All data
- * needed by the sort will live inside this context.
+ * Memory context surviving tuplesort_reset. This memory context holds
+ * data which is useful to keep while sorting multiple similar batches.
*/
- sortcontext = AllocSetContextCreate(CurrentMemoryContext,
+ maincontext = AllocSetContextCreate(CurrentMemoryContext,
"TupleSort main",
ALLOCSET_DEFAULT_SIZES);
/*
- * Caller tuple (e.g. IndexTuple) memory context.
- *
- * A dedicated child context used exclusively for caller passed tuples
- * eases memory management. Resetting at key points reduces
- * fragmentation. Note that the memtuples array of SortTuples is allocated
- * in the parent context, not this context, because there is no need to
- * free memtuples early.
+ * Create a working memory context for one sort operation. The content of
+ * this context is deleted by tuplesort_reset.
*/
- tuplecontext = AllocSetContextCreate(sortcontext,
- "Caller tuples",
- ALLOCSET_DEFAULT_SIZES);
+ sortcontext = AllocSetContextCreate(maincontext,
+ "TupleSort sort",
+ ALLOCSET_DEFAULT_SIZES);
/*
- * Make the Tuplesortstate within the per-sort context. This way, we
+	 * Additionally, a working memory context for tuples is set up in
+ * tuplesort_begin_batch.
+ */
+
+ /*
+ * Make the Tuplesortstate within the per-sortstate context. This way, we
* don't need a separate pfree() operation for it at shutdown.
*/
- oldcontext = MemoryContextSwitchTo(sortcontext);
+ oldcontext = MemoryContextSwitchTo(maincontext);
state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
@@ -724,11 +745,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
pg_rusage_init(&state->ru_start);
#endif
- state->status = TSS_INITIAL;
state->randomAccess = randomAccess;
- state->bounded = false;
state->tuples = true;
- state->boundUsed = false;
/*
* workMem is forced to be at least 64KB, the current minimum valid value
@@ -737,38 +755,21 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
* with very little memory.
*/
state->allowedMem = Max(workMem, 64) * (int64) 1024;
- state->availMem = state->allowedMem;
state->sortcontext = sortcontext;
- state->tuplecontext = tuplecontext;
- state->tapeset = NULL;
-
- state->memtupcount = 0;
+ state->maincontext = maincontext;
/*
* Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
* see comments in grow_memtuples().
*/
- state->memtupsize = Max(1024,
- ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
-
- state->growmemtuples = true;
- state->slabAllocatorUsed = false;
- state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
-
- USEMEM(state, GetMemoryChunkSpace(state->memtuples));
-
- /* workMem must be large enough for the minimal memtuples array */
- if (LACKMEM(state))
- elog(ERROR, "insufficient memory allowed for sort");
-
- state->currentRun = 0;
+ state->memtupsize = INITIAL_MEMTUPSIZE;
+ state->memtuples = NULL;
/*
- * maxTapes, tapeRange, and Algorithm D variables will be initialized by
- * inittapes(), if needed
+	 * After all of the other non-parallel-related state, we set up all of the
+ * state needed for each batch.
*/
-
- state->result_tape = -1; /* flag that result tape has not been formed */
+ tuplesort_begin_batch(state);
/*
* Initialize parallel-related state based on coordination information
@@ -802,6 +803,77 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
return state;
}
+/*
+ * tuplesort_begin_batch
+ *
+ * Setup, or reset, all state need for processing a new set of tuples with this
+ * Setup, or reset, all state needed for processing a new set of tuples with
+ * this sort state. Called both from tuplesort_begin_common (the first time
+ * sorting with this sort state) and tuplesort_reset (for subsequent usages).
+static void
+tuplesort_begin_batch(Tuplesortstate *state)
+{
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(state->maincontext);
+
+ /*
+ * Caller tuple (e.g. IndexTuple) memory context.
+ *
+ * A dedicated child context used exclusively for caller passed tuples
+ * eases memory management. Resetting at key points reduces
+ * fragmentation. Note that the memtuples array of SortTuples is allocated
+ * in the parent context, not this context, because there is no need to
+ * free memtuples early.
+ */
+ state->tuplecontext = AllocSetContextCreate(state->sortcontext,
+ "Caller tuples",
+ ALLOCSET_DEFAULT_SIZES);
+
+ state->status = TSS_INITIAL;
+ state->bounded = false;
+ state->boundUsed = false;
+
+ state->availMem = state->allowedMem;
+
+ state->tapeset = NULL;
+
+ state->memtupcount = 0;
+
+ /*
+ * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
+ * see comments in grow_memtuples().
+ */
+ state->growmemtuples = true;
+ state->slabAllocatorUsed = false;
+ if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
+ {
+ pfree(state->memtuples);
+ state->memtuples = NULL;
+ state->memtupsize = INITIAL_MEMTUPSIZE;
+ }
+ if (state->memtuples == NULL)
+ {
+ state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
+ USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+ }
+
+ /* workMem must be large enough for the minimal memtuples array */
+ if (LACKMEM(state))
+ elog(ERROR, "insufficient memory allowed for sort");
+
+ state->currentRun = 0;
+
+ /*
+ * maxTapes, tapeRange, and Algorithm D variables will be initialized by
+ * inittapes(), if needed
+ */
+
+ state->result_tape = -1; /* flag that result tape has not been formed */
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
Tuplesortstate *
tuplesort_begin_heap(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
@@ -814,7 +886,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
MemoryContext oldcontext;
int i;
- oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ oldcontext = MemoryContextSwitchTo(state->maincontext);
AssertArg(nkeys > 0);
@@ -890,7 +962,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
- oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ oldcontext = MemoryContextSwitchTo(state->maincontext);
#ifdef TRACE_SORT
if (trace_sort)
@@ -985,7 +1057,7 @@ tuplesort_begin_index_btree(Relation heapRel,
MemoryContext oldcontext;
int i;
- oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ oldcontext = MemoryContextSwitchTo(state->maincontext);
#ifdef TRACE_SORT
if (trace_sort)
@@ -1063,7 +1135,7 @@ tuplesort_begin_index_hash(Relation heapRel,
randomAccess);
MemoryContext oldcontext;
- oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ oldcontext = MemoryContextSwitchTo(state->maincontext);
#ifdef TRACE_SORT
if (trace_sort)
@@ -1106,7 +1178,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
int16 typlen;
bool typbyval;
- oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ oldcontext = MemoryContextSwitchTo(state->maincontext);
#ifdef TRACE_SORT
if (trace_sort)
@@ -1224,16 +1296,23 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound)
}
/*
- * tuplesort_end
+ * tuplesort_used_bound
*
- * Release resources and clean up.
- *
- * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
- * pointing to garbage. Be careful not to attempt to use or free such
- * pointers afterwards!
+ * Allow callers to find out if the sort state was able to use a bound.
*/
-void
-tuplesort_end(Tuplesortstate *state)
+bool
+tuplesort_used_bound(Tuplesortstate *state)
+{
+ return state->boundUsed;
+}
+
+/*
+ * tuplesort_free
+ *
+ * Internal routine for freeing resources of tuplesort.
+ */
+static void
+tuplesort_free(Tuplesortstate *state)
{
/* context swap probably not needed, but let's be safe */
MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
@@ -1291,10 +1370,104 @@ tuplesort_end(Tuplesortstate *state)
MemoryContextSwitchTo(oldcontext);
/*
- * Free the per-sort memory context, thereby releasing all working memory,
- * including the Tuplesortstate struct itself.
+ * Free the per-sort memory context, thereby releasing all working memory.
*/
- MemoryContextDelete(state->sortcontext);
+ MemoryContextReset(state->sortcontext);
+}
+
+/*
+ * tuplesort_end
+ *
+ * Release resources and clean up.
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage. Be careful not to attempt to use or free such
+ * pointers afterwards!
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+ tuplesort_free(state);
+
+ /*
+ * Free the main memory context, including the Tuplesortstate struct
+ * itself.
+ */
+ MemoryContextDelete(state->maincontext);
+}
+
+/*
+ * tuplesort_updatemax
+ *
+ * Update maximum resource usage statistics.
+ */
+static void
+tuplesort_updatemax(Tuplesortstate *state)
+{
+ int64 spaceUsed;
+ bool isSpaceDisk;
+
+ /*
+ * Note: it might seem we should provide both memory and disk usage for a
+ * disk-based sort. However, the current code doesn't track memory space
+ * accurately once we have begun to return tuples to the caller (since we
+ * don't account for pfree's the caller is expected to do), so we cannot
+ * rely on availMem in a disk sort. This does not seem worth the overhead
+ * to fix. Is it worth creating an API for the memory context code to
+ * tell us how much is actually used in sortcontext?
+ */
+ if (state->tapeset)
+ {
+ isSpaceDisk = true;
+ spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
+ }
+ else
+ {
+ isSpaceDisk = false;
+ spaceUsed = state->allowedMem - state->availMem;
+ }
+
+ /*
+	 * Sort evicts data to disk when it fails to fit the data into main
+	 * memory. This is why we treat space used on disk as more important for
+	 * tracking resource usage than space used in memory. Note that the
+	 * amount of space occupied by a set of tuples on disk might be less than
+	 * the amount occupied by the same tuples in memory, due to a more
+	 * compact representation.
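+	 *
+	 * For example, a batch that spilled 1MB to disk replaces an earlier
+	 * record of 2MB used in memory, since spilling implies the in-memory
+	 * budget was exhausted.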
+ */
+ if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
+ (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
+ {
+ state->maxSpace = spaceUsed;
+ state->isMaxSpaceDisk = isSpaceDisk;
+ state->maxSpaceStatus = state->status;
+ }
+}
+
+/*
+ * tuplesort_reset
+ *
+ * Reset the tuplesort. Reset all the data in the tuplesort, but leave the
+ * meta-information in place. After tuplesort_reset, the tuplesort is ready
+ * to start a new sort. This avoids recreating tuplesort states (and saves
+ * resources) when sorting multiple small batches.
+ */
+void
+tuplesort_reset(Tuplesortstate *state)
+{
+ tuplesort_updatemax(state);
+ tuplesort_free(state);
+
+ /*
+ * After we've freed up per-batch memory, re-setup all of the state common
+ * to both the first batch and any subsequent batch.
+ */
+ tuplesort_begin_batch(state);
+
+ state->lastReturnedTuple = NULL;
+ state->slabMemoryBegin = NULL;
+ state->slabMemoryEnd = NULL;
+ state->slabFreeHead = NULL;
}
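+
+/*
+ * Sketch of the multi-batch usage pattern this enables (as driven by
+ * incremental sort):
+ *
+ *		state = tuplesort_begin_heap(...);
+ *		for each sort group:
+ *			tuplesort_puttupleslot(state, slot);	-- each tuple in group
+ *			tuplesort_performsort(state);
+ *			tuplesort_gettupleslot(state, ...);		-- drain the group
+ *			tuplesort_reset(state);					-- ready for next group
+ *		tuplesort_end(state);
+ */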
/*
@@ -2591,8 +2764,7 @@ mergeruns(Tuplesortstate *state)
* Reset tuple memory. We've freed all the tuples that we previously
* allocated. We will use the slab allocator from now on.
*/
- MemoryContextDelete(state->tuplecontext);
- state->tuplecontext = NULL;
+ MemoryContextResetOnly(state->tuplecontext);
/*
* We no longer need a large memtuples array. (We will allocate a smaller
@@ -2642,7 +2814,8 @@ mergeruns(Tuplesortstate *state)
* from each input tape.
*/
state->memtupsize = numInputTapes;
- state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+ state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
+ numInputTapes * sizeof(SortTuple));
USEMEM(state, GetMemoryChunkSpace(state->memtuples));
/*
@@ -3138,18 +3311,15 @@ tuplesort_get_stats(Tuplesortstate *state,
* to fix. Is it worth creating an API for the memory context code to
* tell us how much is actually used in sortcontext?
*/
- if (state->tapeset)
- {
- stats->spaceType = SORT_SPACE_TYPE_DISK;
- stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
- }
- else
- {
- stats->spaceType = SORT_SPACE_TYPE_MEMORY;
- stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
- }
+ tuplesort_updatemax(state);
- switch (state->status)
+ if (state->isMaxSpaceDisk)
+ stats->spaceType = SORT_SPACE_TYPE_DISK;
+ else
+ stats->spaceType = SORT_SPACE_TYPE_MEMORY;
+ stats->spaceUsed = (state->maxSpace + 1023) / 1024;
+
+ switch (state->maxSpaceStatus)
{
case TSS_SORTEDINMEM:
if (state->boundUsed)
diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h
index 2e9920111f..4af6e0013d 100644
--- a/src/include/executor/execdebug.h
+++ b/src/include/executor/execdebug.h
@@ -86,10 +86,12 @@
#define SO_nodeDisplay(l) nodeDisplay(l)
#define SO_printf(s) printf(s)
#define SO1_printf(s, p) printf(s, p)
+#define SO2_printf(s, p1, p2) printf(s, p1, p2)
#else
#define SO_nodeDisplay(l)
#define SO_printf(s)
#define SO1_printf(s, p)
+#define SO2_printf(s, p1, p2)
#endif /* EXEC_SORTDEBUG */
/* ----------------
diff --git a/src/include/executor/nodeIncrementalSort.h b/src/include/executor/nodeIncrementalSort.h
new file mode 100644
index 0000000000..e62c02a4f3
--- /dev/null
+++ b/src/include/executor/nodeIncrementalSort.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIncrementalSort.h
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/executor/nodeIncrementalSort.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NODEINCREMENTALSORT_H
+#define NODEINCREMENTALSORT_H
+
+#include "access/parallel.h"
+#include "nodes/execnodes.h"
+
+extern IncrementalSortState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags);
+extern void ExecEndIncrementalSort(IncrementalSortState *node);
+extern void ExecReScanIncrementalSort(IncrementalSortState *node);
+
+/* parallel instrumentation support */
+extern void ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt);
+extern void ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt);
+extern void ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pcxt);
+extern void ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node);
+
+#endif /* NODEINCREMENTALSORT_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 0fb5d61a3f..fb490b404c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1982,6 +1982,21 @@ typedef struct MaterialState
Tuplestorestate *tuplestorestate;
} MaterialState;
+
+/* ----------------
+ * When performing sorting by multiple keys, it's possible that the input
+ * dataset is already sorted on a prefix of those keys. We call these
+ * "presorted keys".
+ * PresortedKeyData represents information about one such key.
+ * ----------------
+ */
+typedef struct PresortedKeyData
+{
+ FmgrInfo flinfo; /* comparison function info */
+ FunctionCallInfo fcinfo; /* comparison function call info */
+ OffsetNumber attno; /* attribute number in tuple */
+} PresortedKeyData;
+
/* ----------------
* Shared memory container for per-worker sort information
* ----------------
@@ -2010,6 +2025,71 @@ typedef struct SortState
SharedSortInfo *shared_info; /* one entry per worker */
} SortState;
+/* ----------------
+ * Instrumentation information for IncrementalSort
+ * ----------------
+ */
+typedef struct IncrementalSortGroupInfo
+{
+ int64 groupCount;
+ long maxDiskSpaceUsed;
+ long totalDiskSpaceUsed;
+ long maxMemorySpaceUsed;
+ long totalMemorySpaceUsed;
+ bits32 sortMethods; /* bitmask of TuplesortMethod */
+} IncrementalSortGroupInfo;
+
+typedef struct IncrementalSortInfo
+{
+ IncrementalSortGroupInfo fullsortGroupInfo;
+ IncrementalSortGroupInfo prefixsortGroupInfo;
+} IncrementalSortInfo;
+
+/* ----------------
+ * Shared memory container for per-worker incremental sort information
+ * ----------------
+ */
+typedef struct SharedIncrementalSortInfo
+{
+ int num_workers;
+ IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER];
+} SharedIncrementalSortInfo;
+
+/* ----------------
+ * IncrementalSortState information
+ * ----------------
+ */
+typedef enum
+{
+ INCSORT_LOADFULLSORT,
+ INCSORT_LOADPREFIXSORT,
+ INCSORT_READFULLSORT,
+ INCSORT_READPREFIXSORT,
+} IncrementalSortExecutionStatus;
+
+typedef struct IncrementalSortState
+{
+ ScanState ss; /* its first field is NodeTag */
+ bool bounded; /* is the result set bounded? */
+ int64 bound; /* if bounded, how many tuples are needed */
+ bool outerNodeDone; /* finished fetching tuples from outer node */
+ int64 bound_Done; /* value of bound we did the sort with */
+ IncrementalSortExecutionStatus execution_status;
+ int64 n_fullsort_remaining;
+ Tuplesortstate *fullsort_state; /* private state of tuplesort.c */
+ Tuplesortstate *prefixsort_state; /* private state of tuplesort.c */
+ /* the keys by which the input path is already sorted */
+ PresortedKeyData *presorted_keys;
+
+ IncrementalSortInfo incsort_info;
+
+ /* slot for pivot tuple defining values of presorted keys within group */
+ TupleTableSlot *group_pivot;
+ TupleTableSlot *transfer_tuple;
+ bool am_worker; /* are we a worker? */
+ SharedIncrementalSortInfo *shared_info; /* one entry per worker */
+} IncrementalSortState;
+
/* ---------------------
* GroupState information
* ---------------------
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 8a76afe8cc..50b1ba5186 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -74,6 +74,7 @@ typedef enum NodeTag
T_HashJoin,
T_Material,
T_Sort,
+ T_IncrementalSort,
T_Group,
T_Agg,
T_WindowAgg,
@@ -130,6 +131,7 @@ typedef enum NodeTag
T_HashJoinState,
T_MaterialState,
T_SortState,
+ T_IncrementalSortState,
T_GroupState,
T_AggState,
T_WindowAggState,
@@ -245,6 +247,7 @@ typedef enum NodeTag
T_ProjectionPath,
T_ProjectSetPath,
T_SortPath,
+ T_IncrementalSortPath,
T_GroupPath,
T_UpperUniquePath,
T_AggPath,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 469c686e3f..a6d206b25a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1638,6 +1638,15 @@ typedef struct SortPath
Path *subpath; /* path representing input source */
} SortPath;
+/*
+ * IncrementalSortPath represents an incremental sort, i.e. a sort of input
+ * that is already ordered on a prefix of the required sort keys
+ */
+typedef struct IncrementalSortPath
+{
+ SortPath spath;
+ int nPresortedCols; /* number of presorted columns */
+} IncrementalSortPath;
+
/*
* GroupPath represents grouping (of presorted input)
*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 4869fe7b6d..be8ef54a1e 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -774,6 +774,16 @@ typedef struct Sort
bool *nullsFirst; /* NULLS FIRST/LAST directions */
} Sort;
+/* ----------------
+ * incremental sort node
+ *
+ * Sorts input that is already ordered on the first nPresortedCols of the
+ * sort columns.
+ * ----------------
+ */
+typedef struct IncrementalSort
+{
+ Sort sort;
+ int nPresortedCols; /* number of presorted columns */
+} IncrementalSort;
+
/* ---------------
* group node -
* Used for queries with GROUP BY (but no aggregates) specified.
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 735ba09650..9710e5c0a4 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -53,6 +53,7 @@ extern PGDLLIMPORT bool enable_indexonlyscan;
extern PGDLLIMPORT bool enable_bitmapscan;
extern PGDLLIMPORT bool enable_tidscan;
extern PGDLLIMPORT bool enable_sort;
+extern PGDLLIMPORT bool enable_incrementalsort;
extern PGDLLIMPORT bool enable_hashagg;
extern PGDLLIMPORT bool enable_hashagg_disk;
extern PGDLLIMPORT bool enable_groupingsets_hash_disk;
@@ -103,6 +104,11 @@ extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
Cost comparison_cost, int sort_mem,
double limit_tuples);
+extern void cost_incremental_sort(Path *path,
+ PlannerInfo *root, List *pathkeys, int presorted_keys,
+ Cost input_startup_cost, Cost input_total_cost,
+ double input_tuples, int width, Cost comparison_cost, int sort_mem,
+ double limit_tuples);
extern void cost_append(AppendPath *path);
extern void cost_merge_append(Path *path, PlannerInfo *root,
List *pathkeys, int n_streams,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index e450fe112a..bcd08af753 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -184,6 +184,12 @@ extern ProjectSetPath *create_set_projection_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
PathTarget *target);
+extern SortPath *create_incremental_sort_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ List *pathkeys,
+ int presorted_keys,
+ double limit_tuples);
extern SortPath *create_sort_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index c689fe8e26..ab61f306cb 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -185,6 +185,7 @@ typedef enum
extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
extern bool pathkeys_contained_in(List *keys1, List *keys2);
+extern bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common);
extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
Relids required_outer,
CostSelector cost_criterion,
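A sketch of the intended planner usage (illustrative, not part of the patch; it assumes the caller holds the required ordering in root->sort_pathkeys and a limit_tuples estimate): when the subpath satisfies only a leading prefix of the needed keys, pathkeys_count_contained_in reports how many keys match, and the planner can then choose an incremental sort over a full sort:

/* Illustrative fragment: prefer incremental sort when a useful prefix
 * of the required pathkeys is already provided by the subpath. */
int     presorted_keys;
bool    is_sorted;

is_sorted = pathkeys_count_contained_in(root->sort_pathkeys,
                                        subpath->pathkeys,
                                        &presorted_keys);
if (!is_sorted)
{
    if (presorted_keys > 0 && enable_incrementalsort)
        subpath = (Path *) create_incremental_sort_path(root, rel, subpath,
                                                        root->sort_pathkeys,
                                                        presorted_keys,
                                                        limit_tuples);
    else
        subpath = (Path *) create_sort_path(root, rel, subpath,
                                            root->sort_pathkeys,
                                            limit_tuples);
}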
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index a2fdd3fcd3..8d00a9e501 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -61,14 +61,17 @@ typedef struct SortCoordinateData *SortCoordinate;
* Data structures for reporting sort statistics. Note that
* TuplesortInstrumentation can't contain any pointers because we
* sometimes put it in shared memory.
+ *
+ * TuplesortMethod is used as a bitmask in Incremental Sort's shared-memory
+ * instrumentation, so each value must be a distinct bit.
*/
typedef enum
{
- SORT_TYPE_STILL_IN_PROGRESS = 0,
- SORT_TYPE_TOP_N_HEAPSORT,
- SORT_TYPE_QUICKSORT,
- SORT_TYPE_EXTERNAL_SORT,
- SORT_TYPE_EXTERNAL_MERGE
+ SORT_TYPE_STILL_IN_PROGRESS = 1 << 0,
+ SORT_TYPE_TOP_N_HEAPSORT = 1 << 1,
+ SORT_TYPE_QUICKSORT = 1 << 2,
+ SORT_TYPE_EXTERNAL_SORT = 1 << 3,
+ SORT_TYPE_EXTERNAL_MERGE = 1 << 4
} TuplesortMethod;
typedef enum
@@ -215,6 +218,7 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
bool randomAccess);
extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
+extern bool tuplesort_used_bound(Tuplesortstate *state);
extern void tuplesort_puttupleslot(Tuplesortstate *state,
TupleTableSlot *slot);
@@ -239,6 +243,8 @@ extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
extern void tuplesort_end(Tuplesortstate *state);
+extern void tuplesort_reset(Tuplesortstate *state);
+
extern void tuplesort_get_stats(Tuplesortstate *state,
TuplesortInstrumentation *stats);
extern const char *tuplesort_method_name(TuplesortMethod m);
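These additions support a pattern of many small sorts over one recycled sort state. Below is a rough sketch, with hypothetical fetch/emit helpers, of how tuplesort_reset() and the bitmask form of TuplesortMethod are meant to combine: one Tuplesortstate serves every group, and each group's sort method is OR'ed into an accumulated bitmask.

/* Illustrative fragment only (not part of the patch); the helpers marked
 * below are hypothetical. */
TuplesortInstrumentation stats;
bits32      methods_used = 0;

for (;;)
{
    while (fetch_next_group_tuple(slot))        /* hypothetical helper */
        tuplesort_puttupleslot(state, slot);

    tuplesort_performsort(state);
    while (tuplesort_gettupleslot(state, true, false, result_slot, NULL))
        emit_tuple(result_slot);                /* hypothetical helper */

    tuplesort_get_stats(state, &stats);
    methods_used |= stats.sortMethod;           /* works: each method is one bit */

    if (!have_more_groups())                    /* hypothetical helper */
        break;

    /* keep work_mem and buffers, discard the sorted tuples */
    tuplesort_reset(state);
}
tuplesort_end(state);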
diff --git a/src/test/isolation/expected/drop-index-concurrently-1.out b/src/test/isolation/expected/drop-index-concurrently-1.out
index 75dff56bc4..8e6adb66bb 100644
--- a/src/test/isolation/expected/drop-index-concurrently-1.out
+++ b/src/test/isolation/expected/drop-index-concurrently-1.out
@@ -21,7 +21,7 @@ QUERY PLAN
Sort
Sort Key: id, data
- -> Seq Scan on test_dc
+ -> Index Scan using test_dc_pkey on test_dc
Filter: ((data)::text = '34'::text)
step select2: SELECT * FROM test_dc WHERE data=34 ORDER BY id,data;
id data
diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
new file mode 100644
index 0000000000..f130c606c8
--- /dev/null
+++ b/src/test/regress/expected/incremental_sort.out
@@ -0,0 +1,1441 @@
+-- When we have to sort the entire table, incremental sort will
+-- be slower than plain sort, so it should not be used.
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten;
+ QUERY PLAN
+-----------------------------------
+ Sort
+ Sort Key: tenk1.four, tenk1.ten
+ -> Sort
+ Sort Key: tenk1.four
+ -> Seq Scan on tenk1
+(5 rows)
+
+-- When there is a LIMIT clause, incremental sort is beneficial because
+-- it only has to sort some of the groups, and not the entire table.
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten
+limit 1;
+ QUERY PLAN
+-----------------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: tenk1.four, tenk1.ten
+ Presorted Key: tenk1.four
+ -> Sort
+ Sort Key: tenk1.four
+ -> Seq Scan on tenk1
+(7 rows)
+
+-- When work_mem is not enough to sort the entire table, incremental sort
+-- may be faster if individual groups still fit into work_mem.
+set work_mem to '2MB';
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten;
+ QUERY PLAN
+-----------------------------------
+ Incremental Sort
+ Sort Key: tenk1.four, tenk1.ten
+ Presorted Key: tenk1.four
+ -> Sort
+ Sort Key: tenk1.four
+ -> Seq Scan on tenk1
+(6 rows)
+
+reset work_mem;
+create table t(a integer, b integer);
+create or replace function explain_analyze_without_memory(query text)
+returns table (out_line text) language plpgsql
+as
+$$
+declare
+ line text;
+begin
+ for line in
+ execute 'explain (analyze, costs off, summary off, timing off) ' || query
+ loop
+ out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g');
+ return next;
+ end loop;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+ elements jsonb;
+ element jsonb;
+ matching_nodes jsonb := '[]'::jsonb;
+begin
+ execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements;
+ while jsonb_array_length(elements) > 0 loop
+ element := elements->0;
+ elements := elements - 0;
+ case jsonb_typeof(element)
+ when 'array' then
+ if jsonb_array_length(element) > 0 then
+ elements := elements || element;
+ end if;
+ when 'object' then
+ if element ? 'Plan' then
+ elements := elements || jsonb_build_array(element->'Plan');
+ element := element - 'Plan';
+ else
+ if element ? 'Plans' then
+ elements := elements || jsonb_build_array(element->'Plans');
+ element := element - 'Plans';
+ end if;
+ if (element->>'Node Type')::text = 'Incremental Sort' then
+ matching_nodes := matching_nodes || element;
+ end if;
+ end if;
+ end case;
+ end loop;
+ return matching_nodes;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+ nodes jsonb := '[]'::jsonb;
+ node jsonb;
+ group_key text;
+ space_key text;
+begin
+ for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+ for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+ for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+ node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
+ node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false);
+ end loop;
+ end loop;
+ nodes := nodes || node;
+ end loop;
+ return nodes;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
+returns bool language plpgsql
+as
+$$
+declare
+ node jsonb;
+ group_stats jsonb;
+ group_key text;
+ space_key text;
+begin
+ for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+ for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+ group_stats := node->group_key;
+ for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+ if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
+ raise exception '% has invalid max space < average space', group_key;
+ end if;
+ end loop;
+ end loop;
+ end loop;
+ return true;
+end;
+$$;
+-- A single large group tested around each mode transition point.
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 31;
+ a | b
+---+----
+ 1 | 1
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 5
+ 1 | 6
+ 1 | 7
+ 1 | 8
+ 1 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 1 | 20
+ 1 | 21
+ 1 | 22
+ 1 | 23
+ 1 | 24
+ 1 | 25
+ 1 | 26
+ 1 | 27
+ 1 | 28
+ 1 | 29
+ 1 | 30
+ 1 | 31
+(31 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 32;
+ a | b
+---+----
+ 1 | 1
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 5
+ 1 | 6
+ 1 | 7
+ 1 | 8
+ 1 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 1 | 20
+ 1 | 21
+ 1 | 22
+ 1 | 23
+ 1 | 24
+ 1 | 25
+ 1 | 26
+ 1 | 27
+ 1 | 28
+ 1 | 29
+ 1 | 30
+ 1 | 31
+ 1 | 32
+(32 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 33;
+ a | b
+---+----
+ 1 | 1
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 5
+ 1 | 6
+ 1 | 7
+ 1 | 8
+ 1 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 1 | 20
+ 1 | 21
+ 1 | 22
+ 1 | 23
+ 1 | 24
+ 1 | 25
+ 1 | 26
+ 1 | 27
+ 1 | 28
+ 1 | 29
+ 1 | 30
+ 1 | 31
+ 1 | 32
+ 1 | 33
+(33 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 65;
+ a | b
+---+----
+ 1 | 1
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 5
+ 1 | 6
+ 1 | 7
+ 1 | 8
+ 1 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 1 | 20
+ 1 | 21
+ 1 | 22
+ 1 | 23
+ 1 | 24
+ 1 | 25
+ 1 | 26
+ 1 | 27
+ 1 | 28
+ 1 | 29
+ 1 | 30
+ 1 | 31
+ 1 | 32
+ 1 | 33
+ 1 | 34
+ 1 | 35
+ 1 | 36
+ 1 | 37
+ 1 | 38
+ 1 | 39
+ 1 | 40
+ 1 | 41
+ 1 | 42
+ 1 | 43
+ 1 | 44
+ 1 | 45
+ 1 | 46
+ 1 | 47
+ 1 | 48
+ 1 | 49
+ 1 | 50
+ 1 | 51
+ 1 | 52
+ 1 | 53
+ 1 | 54
+ 1 | 55
+ 1 | 56
+ 1 | 57
+ 1 | 58
+ 1 | 59
+ 1 | 60
+ 1 | 61
+ 1 | 62
+ 1 | 63
+ 1 | 64
+ 1 | 65
+(65 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 66;
+ a | b
+---+----
+ 1 | 1
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 5
+ 1 | 6
+ 1 | 7
+ 1 | 8
+ 1 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 1 | 20
+ 1 | 21
+ 1 | 22
+ 1 | 23
+ 1 | 24
+ 1 | 25
+ 1 | 26
+ 1 | 27
+ 1 | 28
+ 1 | 29
+ 1 | 30
+ 1 | 31
+ 1 | 32
+ 1 | 33
+ 1 | 34
+ 1 | 35
+ 1 | 36
+ 1 | 37
+ 1 | 38
+ 1 | 39
+ 1 | 40
+ 1 | 41
+ 1 | 42
+ 1 | 43
+ 1 | 44
+ 1 | 45
+ 1 | 46
+ 1 | 47
+ 1 | 48
+ 1 | 49
+ 1 | 50
+ 1 | 51
+ 1 | 52
+ 1 | 53
+ 1 | 54
+ 1 | 55
+ 1 | 56
+ 1 | 57
+ 1 | 58
+ 1 | 59
+ 1 | 60
+ 1 | 61
+ 1 | 62
+ 1 | 63
+ 1 | 64
+ 1 | 65
+ 1 | 66
+(66 rows)
+
+delete from t;
+-- An initial large group followed by a small group.
+insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 55;
+ a | b
+---+----
+ 1 | 1
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 5
+ 1 | 6
+ 1 | 7
+ 1 | 8
+ 1 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 1 | 20
+ 1 | 21
+ 1 | 22
+ 1 | 23
+ 1 | 24
+ 1 | 25
+ 1 | 26
+ 1 | 27
+ 1 | 28
+ 1 | 29
+ 1 | 30
+ 1 | 31
+ 1 | 32
+ 1 | 33
+ 1 | 34
+ 1 | 35
+ 1 | 36
+ 1 | 37
+ 1 | 38
+ 1 | 39
+ 1 | 40
+ 1 | 41
+ 1 | 42
+ 1 | 43
+ 1 | 44
+ 1 | 45
+ 1 | 46
+ 1 | 47
+ 1 | 48
+ 1 | 49
+ 2 | 50
+ 2 | 51
+ 2 | 52
+ 2 | 53
+ 2 | 54
+ 2 | 55
+(55 rows)
+
+-- Test EXPLAIN ANALYZE with only a fullsort group.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55');
+ explain_analyze_without_memory
+------------------------------------------------------------------------------------------------
+ Limit (actual rows=55 loops=1)
+ -> Incremental Sort (actual rows=55 loops=1)
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ Full-sort Groups: 2 Sort Methods: top-N heapsort, quicksort Memory: avg=NNkB peak=NNkB
+ -> Sort (actual rows=100 loops=1)
+ Sort Key: t.a
+ Sort Method: quicksort Memory: NNkB
+ -> Seq Scan on t (actual rows=100 loops=1)
+(9 rows)
+
+select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55'));
+ jsonb_pretty
+--------------------------------------------------
+ [ +
+ { +
+ "Sort Key": [ +
+ "t.a", +
+ "t.b" +
+ ], +
+ "Node Type": "Incremental Sort", +
+ "Actual Rows": 55, +
+ "Actual Loops": 1, +
+ "Presorted Key": [ +
+ "t.a" +
+ ], +
+ "Parallel Aware": false, +
+ "Full-sort Groups": { +
+ "Group Count": 2, +
+ "Sort Methods Used": [ +
+ "top-N heapsort", +
+ "quicksort" +
+ ], +
+ "Sort Space Memory": { +
+ "Average Sort Space Used": "NN",+
+ "Maximum Sort Space Used": "NN" +
+ } +
+ }, +
+ "Parent Relationship": "Outer" +
+ } +
+ ]
+(1 row)
+
+select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55');
+ explain_analyze_inc_sort_nodes_verify_invariants
+--------------------------------------------------
+ t
+(1 row)
+
+delete from t;
+-- An initial small group followed by a large group.
+insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 70;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 70;
+ a | b
+---+----
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 9 | 5
+ 9 | 6
+ 9 | 7
+ 9 | 8
+ 9 | 9
+ 9 | 10
+ 9 | 11
+ 9 | 12
+ 9 | 13
+ 9 | 14
+ 9 | 15
+ 9 | 16
+ 9 | 17
+ 9 | 18
+ 9 | 19
+ 9 | 20
+ 9 | 21
+ 9 | 22
+ 9 | 23
+ 9 | 24
+ 9 | 25
+ 9 | 26
+ 9 | 27
+ 9 | 28
+ 9 | 29
+ 9 | 30
+ 9 | 31
+ 9 | 32
+ 9 | 33
+ 9 | 34
+ 9 | 35
+ 9 | 36
+ 9 | 37
+ 9 | 38
+ 9 | 39
+ 9 | 40
+ 9 | 41
+ 9 | 42
+ 9 | 43
+ 9 | 44
+ 9 | 45
+ 9 | 46
+ 9 | 47
+ 9 | 48
+ 9 | 49
+ 9 | 50
+ 9 | 51
+ 9 | 52
+ 9 | 53
+ 9 | 54
+ 9 | 55
+ 9 | 56
+ 9 | 57
+ 9 | 58
+ 9 | 59
+ 9 | 60
+ 9 | 61
+ 9 | 62
+ 9 | 63
+ 9 | 64
+ 9 | 65
+ 9 | 66
+ 9 | 67
+ 9 | 68
+ 9 | 69
+ 9 | 70
+(70 rows)
+
+-- Test rescan.
+begin;
+-- We force the planner to choose a plan with incremental sort on the right side
+-- of a nested loop join node. That way we trigger the rescan code path.
+set local enable_hashjoin = off;
+set local enable_mergejoin = off;
+set local enable_material = off;
+set local enable_sort = off;
+explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
+ QUERY PLAN
+------------------------------------------------
+ Nested Loop Left Join
+ Join Filter: (t_1.a = t.a)
+ -> Seq Scan on t
+ Filter: (a = ANY ('{1,2}'::integer[]))
+ -> Incremental Sort
+ Sort Key: t_1.a, t_1.b
+ Presorted Key: t_1.a
+ -> Sort
+ Sort Key: t_1.a
+ -> Seq Scan on t t_1
+(10 rows)
+
+select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
+ a | b | a | b
+---+---+---+---
+ 1 | 1 | 1 | 1
+ 2 | 2 | 2 | 2
+(2 rows)
+
+rollback;
+-- Test EXPLAIN ANALYZE with both fullsort and presorted groups.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70');
+ explain_analyze_without_memory
+-----------------------------------------------------------------------------------------------------------------------------------------------------
+ Limit (actual rows=70 loops=1)
+ -> Incremental Sort (actual rows=70 loops=1)
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ Full-sort Groups: 1 Sort Method: quicksort Memory: avg=NNkB peak=NNkB Presorted Groups: 5 Sort Method: quicksort Memory: avg=NNkB peak=NNkB
+ -> Sort (actual rows=100 loops=1)
+ Sort Key: t.a
+ Sort Method: quicksort Memory: NNkB
+ -> Seq Scan on t (actual rows=100 loops=1)
+(9 rows)
+
+select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70'));
+ jsonb_pretty
+--------------------------------------------------
+ [ +
+ { +
+ "Sort Key": [ +
+ "t.a", +
+ "t.b" +
+ ], +
+ "Node Type": "Incremental Sort", +
+ "Actual Rows": 70, +
+ "Actual Loops": 1, +
+ "Presorted Key": [ +
+ "t.a" +
+ ], +
+ "Parallel Aware": false, +
+ "Full-sort Groups": { +
+ "Group Count": 1, +
+ "Sort Methods Used": [ +
+ "quicksort" +
+ ], +
+ "Sort Space Memory": { +
+ "Average Sort Space Used": "NN",+
+ "Maximum Sort Space Used": "NN" +
+ } +
+ }, +
+ "Presorted Groups": { +
+ "Group Count": 5, +
+ "Sort Methods Used": [ +
+ "quicksort" +
+ ], +
+ "Sort Space Memory": { +
+ "Average Sort Space Used": "NN",+
+ "Maximum Sort Space Used": "NN" +
+ } +
+ }, +
+ "Parent Relationship": "Outer" +
+ } +
+ ]
+(1 row)
+
+select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70');
+ explain_analyze_inc_sort_nodes_verify_invariants
+--------------------------------------------------
+ t
+(1 row)
+
+delete from t;
+-- Small groups of 10 tuples each tested around each mode transition point.
+insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 31;
+ a | b
+---+----
+ 0 | 1
+ 0 | 2
+ 0 | 3
+ 0 | 4
+ 0 | 5
+ 0 | 6
+ 0 | 7
+ 0 | 8
+ 0 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 2 | 20
+ 2 | 21
+ 2 | 22
+ 2 | 23
+ 2 | 24
+ 2 | 25
+ 2 | 26
+ 2 | 27
+ 2 | 28
+ 2 | 29
+ 3 | 30
+ 3 | 31
+(31 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 32;
+ a | b
+---+----
+ 0 | 1
+ 0 | 2
+ 0 | 3
+ 0 | 4
+ 0 | 5
+ 0 | 6
+ 0 | 7
+ 0 | 8
+ 0 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 2 | 20
+ 2 | 21
+ 2 | 22
+ 2 | 23
+ 2 | 24
+ 2 | 25
+ 2 | 26
+ 2 | 27
+ 2 | 28
+ 2 | 29
+ 3 | 30
+ 3 | 31
+ 3 | 32
+(32 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 33;
+ a | b
+---+----
+ 0 | 1
+ 0 | 2
+ 0 | 3
+ 0 | 4
+ 0 | 5
+ 0 | 6
+ 0 | 7
+ 0 | 8
+ 0 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 2 | 20
+ 2 | 21
+ 2 | 22
+ 2 | 23
+ 2 | 24
+ 2 | 25
+ 2 | 26
+ 2 | 27
+ 2 | 28
+ 2 | 29
+ 3 | 30
+ 3 | 31
+ 3 | 32
+ 3 | 33
+(33 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 65;
+ a | b
+---+----
+ 0 | 1
+ 0 | 2
+ 0 | 3
+ 0 | 4
+ 0 | 5
+ 0 | 6
+ 0 | 7
+ 0 | 8
+ 0 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 2 | 20
+ 2 | 21
+ 2 | 22
+ 2 | 23
+ 2 | 24
+ 2 | 25
+ 2 | 26
+ 2 | 27
+ 2 | 28
+ 2 | 29
+ 3 | 30
+ 3 | 31
+ 3 | 32
+ 3 | 33
+ 3 | 34
+ 3 | 35
+ 3 | 36
+ 3 | 37
+ 3 | 38
+ 3 | 39
+ 4 | 40
+ 4 | 41
+ 4 | 42
+ 4 | 43
+ 4 | 44
+ 4 | 45
+ 4 | 46
+ 4 | 47
+ 4 | 48
+ 4 | 49
+ 5 | 50
+ 5 | 51
+ 5 | 52
+ 5 | 53
+ 5 | 54
+ 5 | 55
+ 5 | 56
+ 5 | 57
+ 5 | 58
+ 5 | 59
+ 6 | 60
+ 6 | 61
+ 6 | 62
+ 6 | 63
+ 6 | 64
+ 6 | 65
+(65 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 66;
+ a | b
+---+----
+ 0 | 1
+ 0 | 2
+ 0 | 3
+ 0 | 4
+ 0 | 5
+ 0 | 6
+ 0 | 7
+ 0 | 8
+ 0 | 9
+ 1 | 10
+ 1 | 11
+ 1 | 12
+ 1 | 13
+ 1 | 14
+ 1 | 15
+ 1 | 16
+ 1 | 17
+ 1 | 18
+ 1 | 19
+ 2 | 20
+ 2 | 21
+ 2 | 22
+ 2 | 23
+ 2 | 24
+ 2 | 25
+ 2 | 26
+ 2 | 27
+ 2 | 28
+ 2 | 29
+ 3 | 30
+ 3 | 31
+ 3 | 32
+ 3 | 33
+ 3 | 34
+ 3 | 35
+ 3 | 36
+ 3 | 37
+ 3 | 38
+ 3 | 39
+ 4 | 40
+ 4 | 41
+ 4 | 42
+ 4 | 43
+ 4 | 44
+ 4 | 45
+ 4 | 46
+ 4 | 47
+ 4 | 48
+ 4 | 49
+ 5 | 50
+ 5 | 51
+ 5 | 52
+ 5 | 53
+ 5 | 54
+ 5 | 55
+ 5 | 56
+ 5 | 57
+ 5 | 58
+ 5 | 59
+ 6 | 60
+ 6 | 61
+ 6 | 62
+ 6 | 63
+ 6 | 64
+ 6 | 65
+ 6 | 66
+(66 rows)
+
+delete from t;
+-- Small groups of only 1 tuple each tested around each mode transition point.
+insert into t(a, b) select i, i from generate_series(1, 70) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 31;
+ a | b
+----+----
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+ 10 | 10
+ 11 | 11
+ 12 | 12
+ 13 | 13
+ 14 | 14
+ 15 | 15
+ 16 | 16
+ 17 | 17
+ 18 | 18
+ 19 | 19
+ 20 | 20
+ 21 | 21
+ 22 | 22
+ 23 | 23
+ 24 | 24
+ 25 | 25
+ 26 | 26
+ 27 | 27
+ 28 | 28
+ 29 | 29
+ 30 | 30
+ 31 | 31
+(31 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 32;
+ a | b
+----+----
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+ 10 | 10
+ 11 | 11
+ 12 | 12
+ 13 | 13
+ 14 | 14
+ 15 | 15
+ 16 | 16
+ 17 | 17
+ 18 | 18
+ 19 | 19
+ 20 | 20
+ 21 | 21
+ 22 | 22
+ 23 | 23
+ 24 | 24
+ 25 | 25
+ 26 | 26
+ 27 | 27
+ 28 | 28
+ 29 | 29
+ 30 | 30
+ 31 | 31
+ 32 | 32
+(32 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 33;
+ a | b
+----+----
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+ 10 | 10
+ 11 | 11
+ 12 | 12
+ 13 | 13
+ 14 | 14
+ 15 | 15
+ 16 | 16
+ 17 | 17
+ 18 | 18
+ 19 | 19
+ 20 | 20
+ 21 | 21
+ 22 | 22
+ 23 | 23
+ 24 | 24
+ 25 | 25
+ 26 | 26
+ 27 | 27
+ 28 | 28
+ 29 | 29
+ 30 | 30
+ 31 | 31
+ 32 | 32
+ 33 | 33
+(33 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 65;
+ a | b
+----+----
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+ 10 | 10
+ 11 | 11
+ 12 | 12
+ 13 | 13
+ 14 | 14
+ 15 | 15
+ 16 | 16
+ 17 | 17
+ 18 | 18
+ 19 | 19
+ 20 | 20
+ 21 | 21
+ 22 | 22
+ 23 | 23
+ 24 | 24
+ 25 | 25
+ 26 | 26
+ 27 | 27
+ 28 | 28
+ 29 | 29
+ 30 | 30
+ 31 | 31
+ 32 | 32
+ 33 | 33
+ 34 | 34
+ 35 | 35
+ 36 | 36
+ 37 | 37
+ 38 | 38
+ 39 | 39
+ 40 | 40
+ 41 | 41
+ 42 | 42
+ 43 | 43
+ 44 | 44
+ 45 | 45
+ 46 | 46
+ 47 | 47
+ 48 | 48
+ 49 | 49
+ 50 | 50
+ 51 | 51
+ 52 | 52
+ 53 | 53
+ 54 | 54
+ 55 | 55
+ 56 | 56
+ 57 | 57
+ 58 | 58
+ 59 | 59
+ 60 | 60
+ 61 | 61
+ 62 | 62
+ 63 | 63
+ 64 | 64
+ 65 | 65
+(65 rows)
+
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+ QUERY PLAN
+---------------------------------
+ Limit
+ -> Incremental Sort
+ Sort Key: t.a, t.b
+ Presorted Key: t.a
+ -> Sort
+ Sort Key: t.a
+ -> Seq Scan on t
+(7 rows)
+
+select * from (select * from t order by a) s order by a, b limit 66;
+ a | b
+----+----
+ 1 | 1
+ 2 | 2
+ 3 | 3
+ 4 | 4
+ 5 | 5
+ 6 | 6
+ 7 | 7
+ 8 | 8
+ 9 | 9
+ 10 | 10
+ 11 | 11
+ 12 | 12
+ 13 | 13
+ 14 | 14
+ 15 | 15
+ 16 | 16
+ 17 | 17
+ 18 | 18
+ 19 | 19
+ 20 | 20
+ 21 | 21
+ 22 | 22
+ 23 | 23
+ 24 | 24
+ 25 | 25
+ 26 | 26
+ 27 | 27
+ 28 | 28
+ 29 | 29
+ 30 | 30
+ 31 | 31
+ 32 | 32
+ 33 | 33
+ 34 | 34
+ 35 | 35
+ 36 | 36
+ 37 | 37
+ 38 | 38
+ 39 | 39
+ 40 | 40
+ 41 | 41
+ 42 | 42
+ 43 | 43
+ 44 | 44
+ 45 | 45
+ 46 | 46
+ 47 | 47
+ 48 | 48
+ 49 | 49
+ 50 | 50
+ 51 | 51
+ 52 | 52
+ 53 | 53
+ 54 | 54
+ 55 | 55
+ 56 | 56
+ 57 | 57
+ 58 | 58
+ 59 | 59
+ 60 | 60
+ 61 | 61
+ 62 | 62
+ 63 | 63
+ 64 | 64
+ 65 | 65
+ 66 | 66
+(66 rows)
+
+delete from t;
+drop table t;
+-- Incremental sort vs. parallel queries
+set min_parallel_table_scan_size = '1kB';
+set min_parallel_index_scan_size = '1kB';
+set parallel_setup_cost = 0;
+set parallel_tuple_cost = 0;
+create table t (a int, b int, c int);
+insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
+create index on t (a);
+analyze t;
+set enable_incrementalsort = off;
+explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
+ QUERY PLAN
+------------------------------------------------------
+ Limit
+ -> Sort
+ Sort Key: a, b, (sum(c))
+ -> Finalize HashAggregate
+ Group Key: a, b
+ -> Gather
+ Workers Planned: 2
+ -> Partial HashAggregate
+ Group Key: a, b
+ -> Parallel Seq Scan on t
+(10 rows)
+
+set enable_incrementalsort = on;
+explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
+ QUERY PLAN
+------------------------------------------------------
+ Limit
+ -> Sort
+ Sort Key: a, b, (sum(c))
+ -> Finalize HashAggregate
+ Group Key: a, b
+ -> Gather
+ Workers Planned: 2
+ -> Partial HashAggregate
+ Group Key: a, b
+ -> Parallel Seq Scan on t
+(10 rows)
+
+drop table t;
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out
index fb4b342261..c36970575f 100644
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -11,6 +11,8 @@ SET enable_partitionwise_aggregate TO true;
SET enable_partitionwise_join TO true;
-- Disable parallel plans.
SET max_parallel_workers_per_gather TO 0;
+-- Disable incremental sort, which can influence selected plans due to the cost fuzz factor.
+SET enable_incrementalsort TO off;
--
-- Tests for list partitioned tables.
--
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 715842b87a..a126f0ad61 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_hashagg | on
enable_hashagg_disk | on
enable_hashjoin | on
+ enable_incrementalsort | on
enable_indexonlyscan | on
enable_indexscan | on
enable_material | on
@@ -91,7 +92,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_seqscan | on
enable_sort | on
enable_tidscan | on
-(19 rows)
+(20 rows)
-- Test that the pg_timezone_names and pg_timezone_abbrevs views are
-- more-or-less working. We can't test their contents in any great detail
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index a98dba7b2f..a741e89616 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview
# ----------
# Another group of parallel tests
# ----------
-test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8
+test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 incremental_sort
# rules cannot run concurrently with any test that creates
# a view or rule in the public schema
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 3f66e0b859..1a6821ca46 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -89,6 +89,7 @@ test: select_distinct_on
test: select_implicit
test: select_having
test: subselect
+test: incremental_sort
test: union
test: case
test: join
diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql
new file mode 100644
index 0000000000..3b359efa29
--- /dev/null
+++ b/src/test/regress/sql/incremental_sort.sql
@@ -0,0 +1,213 @@
+-- When we have to sort the entire table, incremental sort will
+-- be slower than plain sort, so it should not be used.
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten;
+
+-- When there is a LIMIT clause, incremental sort is beneficial because
+-- it only has to sort some of the groups, and not the entire table.
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten
+limit 1;
+
+-- When work_mem is not enough to sort the entire table, incremental sort
+-- may be faster if individual groups still fit into work_mem.
+set work_mem to '2MB';
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten;
+reset work_mem;
+
+create table t(a integer, b integer);
+
+create or replace function explain_analyze_without_memory(query text)
+returns table (out_line text) language plpgsql
+as
+$$
+declare
+ line text;
+begin
+ for line in
+ execute 'explain (analyze, costs off, summary off, timing off) ' || query
+ loop
+ out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g');
+ return next;
+ end loop;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+ elements jsonb;
+ element jsonb;
+ matching_nodes jsonb := '[]'::jsonb;
+begin
+ execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements;
+ while jsonb_array_length(elements) > 0 loop
+ element := elements->0;
+ elements := elements - 0;
+ case jsonb_typeof(element)
+ when 'array' then
+ if jsonb_array_length(element) > 0 then
+ elements := elements || element;
+ end if;
+ when 'object' then
+ if element ? 'Plan' then
+ elements := elements || jsonb_build_array(element->'Plan');
+ element := element - 'Plan';
+ else
+ if element ? 'Plans' then
+ elements := elements || jsonb_build_array(element->'Plans');
+ element := element - 'Plans';
+ end if;
+ if (element->>'Node Type')::text = 'Incremental Sort' then
+ matching_nodes := matching_nodes || element;
+ end if;
+ end if;
+ end case;
+ end loop;
+ return matching_nodes;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+ nodes jsonb := '[]'::jsonb;
+ node jsonb;
+ group_key text;
+ space_key text;
+begin
+ for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+ for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+ for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+ node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
+ node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false);
+ end loop;
+ end loop;
+ nodes := nodes || node;
+ end loop;
+ return nodes;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
+returns bool language plpgsql
+as
+$$
+declare
+ node jsonb;
+ group_stats jsonb;
+ group_key text;
+ space_key text;
+begin
+ for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+ for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+ group_stats := node->group_key;
+ for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+ if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
+ raise exception '% has invalid max space < average space', group_key;
+ end if;
+ end loop;
+ end loop;
+ end loop;
+ return true;
+end;
+$$;
+
+-- A single large group tested around each mode transition point.
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+select * from (select * from t order by a) s order by a, b limit 31;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+select * from (select * from t order by a) s order by a, b limit 32;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+select * from (select * from t order by a) s order by a, b limit 33;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+select * from (select * from t order by a) s order by a, b limit 65;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+select * from (select * from t order by a) s order by a, b limit 66;
+delete from t;
+
+-- An initial large group followed by a small group.
+insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
+select * from (select * from t order by a) s order by a, b limit 55;
+-- Test EXPLAIN ANALYZE with only a fullsort group.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55');
+select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55'));
+select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55');
+delete from t;
+
+-- An initial small group followed by a large group.
+insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 70;
+select * from (select * from t order by a) s order by a, b limit 70;
+-- Test rescan.
+begin;
+-- We force the planner to choose a plan with incremental sort on the right side
+-- of a nested loop join node. That way we trigger the rescan code path.
+set local enable_hashjoin = off;
+set local enable_mergejoin = off;
+set local enable_material = off;
+set local enable_sort = off;
+explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
+select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
+rollback;
+-- Test EXPLAIN ANALYZE with both fullsort and presorted groups.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70');
+select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70'));
+select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70');
+delete from t;
+
+-- Small groups of 10 tuples each tested around each mode transition point.
+insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+select * from (select * from t order by a) s order by a, b limit 31;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+select * from (select * from t order by a) s order by a, b limit 32;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+select * from (select * from t order by a) s order by a, b limit 33;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+select * from (select * from t order by a) s order by a, b limit 65;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+select * from (select * from t order by a) s order by a, b limit 66;
+delete from t;
+
+-- Small groups of only 1 tuple each tested around each mode transition point.
+insert into t(a, b) select i, i from generate_series(1, 70) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+select * from (select * from t order by a) s order by a, b limit 31;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+select * from (select * from t order by a) s order by a, b limit 32;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+select * from (select * from t order by a) s order by a, b limit 33;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+select * from (select * from t order by a) s order by a, b limit 65;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+select * from (select * from t order by a) s order by a, b limit 66;
+delete from t;
+
+drop table t;
+
+-- Incremental sort vs. parallel queries
+set min_parallel_table_scan_size = '1kB';
+set min_parallel_index_scan_size = '1kB';
+set parallel_setup_cost = 0;
+set parallel_tuple_cost = 0;
+
+create table t (a int, b int, c int);
+insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
+create index on t (a);
+analyze t;
+
+set enable_incrementalsort = off;
+explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
+
+set enable_incrementalsort = on;
+explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
+
+drop table t;
diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql
index ba4fed4d43..178f2079fa 100644
--- a/src/test/regress/sql/partition_aggregate.sql
+++ b/src/test/regress/sql/partition_aggregate.sql
@@ -12,6 +12,8 @@ SET enable_partitionwise_aggregate TO true;
SET enable_partitionwise_join TO true;
-- Disable parallel plans.
SET max_parallel_workers_per_gather TO 0;
+-- Disable incremental sort, which can influence selected plans due to the cost fuzz factor.
+SET enable_incrementalsort TO off;
--
-- Tests for list partitioned tables.