diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 114db38116..f68c992213 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4573,6 +4573,20 @@ ANY num_sync (
+     enable_incrementalsort (boolean)
+     
+      enable_incrementalsort configuration parameter
+     
+     
+     
+      
+       Enables or disables the query planner's use of incremental sort steps.
+       The default is on.
+      
+     
+    
+
      enable_indexscan (boolean)
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 58477ac83a..0dfc3e80e2 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -291,7 +291,47 @@ EXPLAIN SELECT * FROM tenk1 WHERE unique1 = 42;
    often see this plan type for queries that fetch just a single row.  It's
    also often used for queries that have an ORDER BY condition
    that matches the index order, because then no extra sorting step is needed
-   to satisfy the ORDER BY.
+   to satisfy the ORDER BY.  In this example, adding
+   ORDER BY unique1 would use the same plan because the
+   index already implicitly provides the requested ordering.
+  
+
+  
+   The planner may implement an ORDER BY clause in several
+   ways.  The above example shows that such an ordering clause may be
+   implemented implicitly.  The planner may also add an explicit
+   sort step:
+
+
+EXPLAIN SELECT * FROM tenk1 ORDER BY unique1;
+                            QUERY PLAN
+-------------------------------------------------------------------
+ Sort  (cost=1109.39..1134.39 rows=10000 width=244)
+   Sort Key: unique1
+   ->  Seq Scan on tenk1  (cost=0.00..445.00 rows=10000 width=244)
+
+
+   If a part of the plan guarantees an ordering on a prefix of the
+   required sort keys, then the planner may instead decide to use an
+   incremental sort step:
+
+
+EXPLAIN SELECT * FROM tenk1 ORDER BY four, ten LIMIT 100;
+                                              QUERY PLAN
+------------------------------------------------------------------------------------------------------
+ Limit  (cost=521.06..538.05 rows=100 width=244)
+   ->  Incremental Sort  (cost=521.06..2220.95 rows=10000 width=244)
+         Sort Key: four, ten
+         Presorted Key: four
+         ->  Index Scan using index_tenk1_on_four on tenk1  (cost=0.29..1510.08 rows=10000 width=244)
+
+
+   Compared to regular sorts, sorting incrementally allows returning tuples
+   before the entire result set has been sorted, which particularly enables
+   optimizations with LIMIT queries.  It may also reduce
+   memory usage and the likelihood of spilling sorts to disk, but it comes at
+   the cost of the increased overhead of splitting the result set into multiple
+   sorting batches.
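+  
+
+  
+   To compare against the plan the planner would otherwise choose, you can
+   temporarily disable incremental sort with
+   enable_incrementalsort.  (The following is an
+   illustrative sketch; the exact plan shape and costs shown here are
+   hypothetical and depend on your data and settings.)
+
+
+SET enable_incrementalsort = off;
+EXPLAIN SELECT * FROM tenk1 ORDER BY four, ten LIMIT 100;
+                               QUERY PLAN
+----------------------------------------------------------------------
+ Limit  (cost=1072.77..1073.02 rows=100 width=244)
+   ->  Sort  (cost=1072.77..1097.77 rows=10000 width=244)
+         Sort Key: four, ten
+         ->  Seq Scan on tenk1  (cost=0.00..445.00 rows=10000 width=244)
+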
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index bb58f92851..62c86ecdc5 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -82,6 +82,8 @@ static void show_upper_qual(List *qual, const char *qlabel,
 							ExplainState *es);
 static void show_sort_keys(SortState *sortstate, List *ancestors,
 						   ExplainState *es);
+static void show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+									   List *ancestors, ExplainState *es);
 static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
 								   ExplainState *es);
 static void show_agg_keys(AggState *astate, List *ancestors,
@@ -95,7 +97,7 @@ static void show_grouping_set_keys(PlanState *planstate,
 static void show_group_keys(GroupState *gstate, List *ancestors,
 							ExplainState *es);
 static void show_sort_group_keys(PlanState *planstate, const char *qlabel,
-								 int nkeys, AttrNumber *keycols,
+								 int nkeys, int nPresortedKeys, AttrNumber *keycols,
 								 Oid *sortOperators, Oid *collations, bool *nullsFirst,
 								 List *ancestors, ExplainState *es);
 static void show_sortorder_options(StringInfo buf, Node *sortexpr,
@@ -103,6 +105,8 @@ static void show_sortorder_options(StringInfo buf, Node *sortexpr,
 static void show_tablesample(TableSampleClause *tsc,
 							 PlanState *planstate, List *ancestors,
 							 ExplainState *es);
 static void show_sort_info(SortState *sortstate, ExplainState *es);
+static void show_incremental_sort_info(IncrementalSortState *incrsortstate,
+									   ExplainState *es);
 static void show_hash_info(HashState *hashstate, ExplainState *es);
 static void show_hashagg_info(AggState *hashstate, ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
@@ -1278,6 +1282,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Sort:
 			pname = sname = "Sort";
 			break;
+		case T_IncrementalSort:
+			pname = sname = "Incremental Sort";
+			break;
 		case T_Group:
 			pname = sname = "Group";
 			break;
@@ -1937,6 +1944,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_sort_keys(castNode(SortState, planstate), ancestors, es);
 			show_sort_info(castNode(SortState, planstate), es);
 			break;
+		case T_IncrementalSort:
+			show_incremental_sort_keys(castNode(IncrementalSortState, planstate),
+									   ancestors, es);
+			show_incremental_sort_info(castNode(IncrementalSortState, planstate),
+									   es);
+			break;
 		case T_MergeAppend:
 			show_merge_append_keys(castNode(MergeAppendState, planstate),
 								   ancestors, es);
@@ -2270,12 +2283,29 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es)
 	Sort	   *plan = (Sort *) sortstate->ss.ps.plan;
 
 	show_sort_group_keys((PlanState *) sortstate, "Sort Key",
-						 plan->numCols, plan->sortColIdx,
+						 plan->numCols, 0, plan->sortColIdx,
 						 plan->sortOperators, plan->collations,
 						 plan->nullsFirst,
 						 ancestors, es);
 }
 
+/*
+ * Show the sort keys for an IncrementalSort node.
+ */
+static void
+show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+						   List *ancestors, ExplainState *es)
+{
+	IncrementalSort *plan = (IncrementalSort *) incrsortstate->ss.ps.plan;
+
+	show_sort_group_keys((PlanState *) incrsortstate, "Sort Key",
+						 plan->sort.numCols, plan->nPresortedCols,
+						 plan->sort.sortColIdx,
+						 plan->sort.sortOperators, plan->sort.collations,
+						 plan->sort.nullsFirst,
+						 ancestors, es);
+}
+
 /*
  * Likewise, for a MergeAppend node.
 */
@@ -2286,7 +2316,7 @@ show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
 	MergeAppend *plan = (MergeAppend *) mstate->ps.plan;
 
 	show_sort_group_keys((PlanState *) mstate, "Sort Key",
-						 plan->numCols, plan->sortColIdx,
+						 plan->numCols, 0, plan->sortColIdx,
 						 plan->sortOperators, plan->collations,
 						 plan->nullsFirst,
 						 ancestors, es);
@@ -2310,7 +2340,7 @@ show_agg_keys(AggState *astate, List *ancestors,
 			show_grouping_sets(outerPlanState(astate), plan, ancestors, es);
 		else
 			show_sort_group_keys(outerPlanState(astate), "Group Key",
-								 plan->numCols, plan->grpColIdx,
+								 plan->numCols, 0, plan->grpColIdx,
 								 NULL, NULL, NULL,
 								 ancestors, es);
@@ -2379,7 +2409,7 @@ show_grouping_set_keys(PlanState *planstate,
 	if (sortnode)
 	{
 		show_sort_group_keys(planstate, "Sort Key",
-							 sortnode->numCols, sortnode->sortColIdx,
+							 sortnode->numCols, 0, sortnode->sortColIdx,
 							 sortnode->sortOperators, sortnode->collations,
 							 sortnode->nullsFirst,
 							 ancestors, es);
@@ -2436,7 +2466,7 @@ show_group_keys(GroupState *gstate, List *ancestors,
 	/* The key columns refer to the tlist of the child plan */
 	ancestors = lcons(plan, ancestors);
 	show_sort_group_keys(outerPlanState(gstate), "Group Key",
-						 plan->numCols, plan->grpColIdx,
+						 plan->numCols, 0, plan->grpColIdx,
 						 NULL, NULL, NULL,
 						 ancestors, es);
 	ancestors = list_delete_first(ancestors);
@@ -2449,13 +2479,14 @@ show_group_keys(GroupState *gstate, List *ancestors,
  */
 static void
 show_sort_group_keys(PlanState *planstate, const char *qlabel,
-					 int nkeys, AttrNumber *keycols,
+					 int nkeys, int nPresortedKeys, AttrNumber *keycols,
 					 Oid *sortOperators, Oid *collations, bool *nullsFirst,
 					 List *ancestors, ExplainState *es)
 {
 	Plan	   *plan = planstate->plan;
 	List	   *context;
 	List	   *result = NIL;
+	List	   *resultPresorted = NIL;
 	StringInfoData sortkeybuf;
 	bool		useprefix;
 	int			keyno;
@@ -2495,9 +2526,13 @@ show_sort_group_keys(PlanState *planstate, const char *qlabel,
 								 nullsFirst[keyno]);
 		/* Emit one property-list item per sort key */
 		result = lappend(result, pstrdup(sortkeybuf.data));
+		if (keyno < nPresortedKeys)
+			resultPresorted = lappend(resultPresorted, exprstr);
 	}
 
 	ExplainPropertyList(qlabel, result, es);
+	if (nPresortedKeys > 0)
+		ExplainPropertyList("Presorted Key", resultPresorted, es);
 }
 
 /*
@@ -2711,6 +2746,196 @@ show_sort_info(SortState *sortstate, ExplainState *es)
 	}
 }
 
+/*
+ * Incremental sort nodes sort in (a potentially very large number of) batches,
+ * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into
+ * an intelligible summary.
+ *
+ * This function is used for both a non-parallel node and each worker in a
+ * parallel incremental sort node.
+ */
+static void
+show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
+								 const char *groupLabel, bool indent, ExplainState *es)
+{
+	ListCell   *methodCell;
+	List	   *methodNames = NIL;
+
+	/* Generate a list of sort methods used across all groups. */
+	for (int bit = 0; bit < sizeof(bits32) * BITS_PER_BYTE; ++bit)
+	{
+		if (groupInfo->sortMethods & (1 << bit))
+		{
+			TuplesortMethod sortMethod = (1 << bit);
+			const char *methodName;
+
+			methodName = tuplesort_method_name(sortMethod);
+			methodNames = lappend(methodNames, unconstify(char *, methodName));
+		}
+	}
+
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+	{
+		if (indent)
+			appendStringInfoSpaces(es->str, es->indent * 2);
+		appendStringInfo(es->str, "%s Groups: %ld Sort Method", groupLabel,
+						 groupInfo->groupCount);
+		/* plural/singular based on methodNames size */
+		if (list_length(methodNames) > 1)
+			appendStringInfo(es->str, "s: ");
+		else
+			appendStringInfo(es->str, ": ");
+		foreach(methodCell, methodNames)
+		{
+			appendStringInfo(es->str, "%s", (char *) methodCell->ptr_value);
+			if (foreach_current_index(methodCell) < list_length(methodNames) - 1)
+				appendStringInfo(es->str, ", ");
+		}
+
+		if (groupInfo->maxMemorySpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+			appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+							 spaceTypeName, avgSpace,
+							 groupInfo->maxMemorySpaceUsed);
+		}
+
+		if (groupInfo->maxDiskSpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+			/* Add a semicolon separator only if memory stats were printed. */
+			if (groupInfo->maxMemorySpaceUsed > 0)
+				appendStringInfo(es->str, ";");
+			appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+							 spaceTypeName, avgSpace,
+							 groupInfo->maxDiskSpaceUsed);
+		}
+	}
+	else
+	{
+		StringInfoData groupName;
+
+		initStringInfo(&groupName);
+		appendStringInfo(&groupName, "%s Groups", groupLabel);
+		ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es);
+		ExplainPropertyInteger("Group Count", NULL, groupInfo->groupCount, es);
+
+		ExplainPropertyList("Sort Methods Used", methodNames, es);
+
+		if (groupInfo->maxMemorySpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+			StringInfoData memoryName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+			initStringInfo(&memoryName);
+			appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName);
+			ExplainOpenGroup("Sort Space", memoryName.data, true, es);
+
+			ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+			ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+								   groupInfo->maxMemorySpaceUsed, es);
+
+			ExplainCloseGroup("Sort Space", memoryName.data, true, es);
+		}
+		if (groupInfo->maxDiskSpaceUsed > 0)
+		{
+			long		avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+			const char *spaceTypeName;
+			StringInfoData diskName;
+
+			spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+			initStringInfo(&diskName);
+			appendStringInfo(&diskName, "Sort Space %s", spaceTypeName);
+			ExplainOpenGroup("Sort Space", diskName.data, true, es);
+
+			ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+			ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+								   groupInfo->maxDiskSpaceUsed, es);
+
+			ExplainCloseGroup("Sort Space", diskName.data, true, es);
+		}
+
+		ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es);
+	}
+}
+
+/*
+ * If it's EXPLAIN ANALYZE, show tuplesort stats for an incremental sort node
+ */
+static void
+show_incremental_sort_info(IncrementalSortState *incrsortstate,
+						   ExplainState *es)
+{
+	IncrementalSortGroupInfo *fullsortGroupInfo;
+	IncrementalSortGroupInfo *prefixsortGroupInfo;
+
+	fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo;
+
+	if (!(es->analyze && fullsortGroupInfo->groupCount > 0))
+		return;
+
+	show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", true, es);
+	prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo;
+	if (prefixsortGroupInfo->groupCount > 0)
+	{
+		if (es->format == EXPLAIN_FORMAT_TEXT)
+			appendStringInfo(es->str, " ");
+		show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+	}
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+		appendStringInfo(es->str, "\n");
+
+	if (incrsortstate->shared_info != NULL)
+	{
+		int			n;
+		bool		indent_first_line;
+
+		for (n = 0; n < incrsortstate->shared_info->num_workers; n++)
+		{
+			IncrementalSortInfo *incsort_info =
+			&incrsortstate->shared_info->sinfo[n];
+
+			/*
+			 * If a worker hasn't processed any sort groups at all, then
+			 * exclude it from output since it either didn't launch or didn't
+			 * contribute anything meaningful.
+			 */
+			fullsortGroupInfo = &incsort_info->fullsortGroupInfo;
+			prefixsortGroupInfo = &incsort_info->prefixsortGroupInfo;
+			if (fullsortGroupInfo->groupCount == 0 &&
+				prefixsortGroupInfo->groupCount == 0)
+				continue;
+
+			if (es->workers_state)
+				ExplainOpenWorker(n, es);
+
+			indent_first_line = es->workers_state == NULL || es->verbose;
+			show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort",
+											 indent_first_line, es);
+			if (prefixsortGroupInfo->groupCount > 0)
+			{
+				if (es->format == EXPLAIN_FORMAT_TEXT)
+					appendStringInfo(es->str, " ");
+				show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+			}
+			if (es->format == EXPLAIN_FORMAT_TEXT)
+				appendStringInfo(es->str, "\n");
+
+			if (es->workers_state)
+				ExplainCloseWorker(n, es);
+		}
+	}
+}
+
 /*
  * Show information on hash buckets/batches.
  */
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index a983800e4b..f990c6473a 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -46,6 +46,7 @@ OBJS = \
 	nodeGroup.o \
 	nodeHash.o \
 	nodeHashjoin.o \
+	nodeIncrementalSort.o \
 	nodeIndexonlyscan.o \
 	nodeIndexscan.o \
 	nodeLimit.o \
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index b12aeb3334..e2154ba86a 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -30,6 +30,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
@@ -252,6 +253,10 @@ ExecReScan(PlanState *node)
 			ExecReScanSort((SortState *) node);
 			break;
 
+		case T_IncrementalSortState:
+			ExecReScanIncrementalSort((IncrementalSortState *) node);
+			break;
+
 		case T_GroupState:
 			ExecReScanGroup((GroupState *) node);
 			break;
@@ -557,8 +562,17 @@ ExecSupportsBackwardScan(Plan *node)
 		case T_CteScan:
 		case T_Material:
 		case T_Sort:
+			/* these don't evaluate tlist */
 			return true;
 
+		case T_IncrementalSort:
+
+			/*
+			 * Unlike full sort, incremental sort keeps only a single group of
+			 * tuples in memory, so it can't scan backwards.
+			 */
+			return false;
+
 		case T_LockRows:
 		case T_Limit:
 			return ExecSupportsBackwardScan(outerPlan(node));
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index b7d0719953..41cb41481d 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -31,6 +31,7 @@
 #include "executor/nodeForeignscan.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeSeqscan.h"
@@ -283,6 +284,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortEstimate((SortState *) planstate, e->pcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt);
+			break;
 
 		default:
 			break;
@@ -496,6 +501,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeDSM((SortState *) planstate, d->pcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt);
+			break;
 
 		default:
 			break;
@@ -972,6 +981,7 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 			break;
 		case T_HashState:
 		case T_SortState:
+		case T_IncrementalSortState:
 			/* these nodes have DSM state, but no reinitialization is required */
 			break;
 
@@ -1032,6 +1042,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_SortState:
 			ExecSortRetrieveInstrumentation((SortState *) planstate);
 			break;
+		case T_IncrementalSortState:
+			ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate);
+			break;
 		case T_HashState:
 			ExecHashRetrieveInstrumentation((HashState *) planstate);
 			break;
@@ -1318,6 +1331,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			/* even when not parallel-aware, for EXPLAIN ANALYZE */
 			ExecSortInitializeWorker((SortState *) planstate, pwcxt);
 			break;
+		case T_IncrementalSortState:
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate,
+												pwcxt);
+			break;
 
 		default:
 			break;
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7b2e84f402..5662e7d742 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -88,6 +88,7 @@
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
+#include "executor/nodeIncrementalSort.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "executor/nodeLimit.h"
@@ -313,6 +314,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 												   estate, eflags);
 			break;
 
+		case T_IncrementalSort:
+			result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node,
+														   estate, eflags);
+			break;
+
 		case T_Group:
 			result = (PlanState *) ExecInitGroup((Group *) node,
 												 estate, eflags);
@@ -693,6 +699,10 @@ ExecEndNode(PlanState *node)
 			ExecEndSort((SortState *) node);
 			break;
 
+		case T_IncrementalSortState:
+			ExecEndIncrementalSort((IncrementalSortState *) node);
+			break;
+
 		case T_GroupState:
 			ExecEndGroup((GroupState *) node);
 			break;
@@ -839,6 +849,30 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node)
 			sortState->bound = tuples_needed;
 		}
 	}
+	else if (IsA(child_node, IncrementalSortState))
+	{
+		/*
+		 * If it is an IncrementalSort node, notify it that it can use bounded
+		 * sort.
+		 *
+		 * Note: it is the responsibility of nodeIncrementalSort.c to react
+		 * properly to changes of these parameters.  If we ever redesign this,
+		 * it'd be a good idea to integrate this signaling with the
+		 * parameter-change mechanism.
+		 */
+		IncrementalSortState *sortState = (IncrementalSortState *) child_node;
+
+		if (tuples_needed < 0)
+		{
+			/* make sure flag gets reset if needed upon rescan */
+			sortState->bounded = false;
+		}
+		else
+		{
+			sortState->bounded = true;
+			sortState->bound = tuples_needed;
+		}
+	}
 	else if (IsA(child_node, AppendState))
 	{
 		/*
diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c
new file mode 100644
index 0000000000..bcab7c054c
--- /dev/null
+++ b/src/backend/executor/nodeIncrementalSort.c
@@ -0,0 +1,1263 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIncrementalSort.c
+ *	  Routines to handle incremental sorting of relations.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/executor/nodeIncrementalSort.c
+ *
+ * DESCRIPTION
+ *
+ *	Incremental sort is an optimized variant of multikey sort for cases
+ *	when the input is already sorted by a prefix of the sort keys.  For
+ *	example when a sort by (key1, key2 ... keyN) is requested, and the
+ *	input is already sorted by (key1, key2 ... keyM), M < N, we can
+ *	divide the input into groups where keys (key1, ... keyM) are equal,
+ *	and only sort on the remaining columns.
+ *
+ *	Consider the following example.  We have input tuples consisting of
+ *	two integers (X, Y) already presorted by X, while it's required to
+ *	sort them by both X and Y.  Let the input tuples be the following.
+ *
+ *	(1, 5)
+ *	(1, 2)
+ *	(2, 9)
+ *	(2, 1)
+ *	(2, 5)
+ *	(3, 3)
+ *	(3, 7)
+ *
+ *	An incremental sort algorithm would split the input into the following
+ *	groups, which have equal X, and then sort them by Y individually:
+ *
+ *		(1, 5) (1, 2)
+ *		(2, 9) (2, 1) (2, 5)
+ *		(3, 3) (3, 7)
+ *
+ *	After sorting these groups and putting them altogether, we would get
+ *	the following result which is sorted by X and Y, as requested:
+ *
+ *	(1, 2)
+ *	(1, 5)
+ *	(2, 1)
+ *	(2, 5)
+ *	(2, 9)
+ *	(3, 3)
+ *	(3, 7)
+ *
+ *	Incremental sort may be more efficient than plain sort, particularly
+ *	on large datasets, as it reduces the amount of data to sort at once,
+ *	making it more likely it fits into work_mem (eliminating the need to
+ *	spill to disk).  But the main advantage of incremental sort is that
+ *	it can start producing rows early, before sorting the whole dataset,
+ *	which is a significant benefit especially for queries with LIMIT.
+ *
+ *	The algorithm we've implemented here is modified from the theoretical
+ *	base described above by operating in two different modes:
+ *	  - Fetching a minimum number of tuples without checking prefix key
+ *	    group membership and sorting on all columns when safe.
+ *	  - Fetching all tuples for a single prefix key group and sorting on
+ *	    solely the unsorted columns.
+ *	We always begin in the first mode, and employ a heuristic to switch
+ *	into the second mode if we believe it's beneficial.
+ *
+ *	Sorting incrementally can potentially use less memory, avoid fetching
+ *	and sorting all tuples in the dataset, and begin returning tuples
+ *	before the entire result set is available.
+ *
+ *	The hybrid mode approach allows us to optimize for both very small
+ *	groups (where the overhead of a new tuplesort is high) and very large
+ *	groups (where we can lower cost by not having to sort on already sorted
+ *	columns), albeit at some extra cost while switching between modes.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "executor/execdebug.h"
+#include "executor/nodeIncrementalSort.h"
+#include "miscadmin.h"
+#include "utils/lsyscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * We need to store the instrumentation information in either local node's sort
+ * info or, for a parallel worker process, in the shared info (this avoids
+ * having to additionally memcpy the info from local memory to shared memory
+ * at each instrumentation call).  This macro expands to choose the proper sort
+ * state and group info.
+ *
+ * Arguments:
+ * - node: type IncrementalSortState *
+ * - groupName: the token fullsort or prefixsort
+ */
+#define INSTRUMENT_SORT_GROUP(node, groupName) \
+	if (node->ss.ps.instrument != NULL) \
+	{ \
+		if (node->shared_info && node->am_worker) \
+		{ \
+			Assert(IsParallelWorker()); \
+			Assert(ParallelWorkerNumber < node->shared_info->num_workers); \
+			instrumentSortedGroup(&node->shared_info->sinfo[ParallelWorkerNumber].groupName##GroupInfo, node->groupName##_state); \
+		} else { \
+			instrumentSortedGroup(&node->incsort_info.groupName##GroupInfo, node->groupName##_state); \
+		} \
+	}
+
+/* ----------------------------------------------------------------
+ * instrumentSortedGroup
+ *
+ * Because incremental sort processes (potentially many) sort batches, we need
+ * to capture tuplesort stats each time we finalize a sort state.  This summary
+ * data is later used for EXPLAIN ANALYZE output.
+ * ----------------------------------------------------------------
+ */
+static void
+instrumentSortedGroup(IncrementalSortGroupInfo *groupInfo,
+					  Tuplesortstate *sortState)
+{
+	TuplesortInstrumentation sort_instr;
+
+	groupInfo->groupCount++;
+
+	tuplesort_get_stats(sortState, &sort_instr);
+
+	/* Calculate total and maximum memory and disk space used. */
+	switch (sort_instr.spaceType)
+	{
+		case SORT_SPACE_TYPE_DISK:
+			groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed;
+			if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed)
+				groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed;
+
+			break;
+		case SORT_SPACE_TYPE_MEMORY:
+			groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed;
+			if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed)
+				groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed;
+
+			break;
+	}
+
+	/* Track each sort method we've used. */
+	groupInfo->sortMethods |= sort_instr.sortMethod;
+}
+
+/* ----------------------------------------------------------------
+ * preparePresortedCols
+ *
+ * Prepare information for presorted_keys comparisons.
+ * ----------------------------------------------------------------
+ */
+static void
+preparePresortedCols(IncrementalSortState *node)
+{
+	IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+	node->presorted_keys =
+		(PresortedKeyData *) palloc(plannode->nPresortedCols *
+									sizeof(PresortedKeyData));
+
+	/* Pre-cache comparison functions for each pre-sorted key. */
+	for (int i = 0; i < plannode->nPresortedCols; i++)
+	{
+		Oid			equalityOp,
+					equalityFunc;
+		PresortedKeyData *key;
+
+		key = &node->presorted_keys[i];
+		key->attno = plannode->sort.sortColIdx[i];
+
+		equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i],
+													 NULL);
+		if (!OidIsValid(equalityOp))
+			elog(ERROR, "missing equality operator for ordering operator %u",
+				 plannode->sort.sortOperators[i]);
+
+		equalityFunc = get_opcode(equalityOp);
+		if (!OidIsValid(equalityFunc))
+			elog(ERROR, "missing function for operator %u", equalityOp);
+
+		/* Lookup the comparison function */
+		fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext);
+
+		/* We can initialize the callinfo just once and re-use it */
+		key->fcinfo = palloc0(SizeForFunctionCallInfo(2));
+		InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2,
+								 plannode->sort.collations[i], NULL, NULL);
+		key->fcinfo->args[0].isnull = false;
+		key->fcinfo->args[1].isnull = false;
+	}
+}
+
+/* ----------------------------------------------------------------
+ * isCurrentGroup
+ *
+ * Check whether a given tuple belongs to the current sort group by comparing
+ * the presorted column values to the pivot tuple of the current group.
+ * ----------------------------------------------------------------
+ */
+static bool
+isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple)
+{
+	int			nPresortedCols;
+
+	nPresortedCols = castNode(IncrementalSort, node->ss.ps.plan)->nPresortedCols;
+
+	/*
+	 * That the input is sorted by keys (0, ... n) implies that the tail keys
+	 * are more likely to change.  Therefore we do our comparison starting
+	 * from the last pre-sorted column to optimize for early detection of
+	 * inequality and minimizing the number of function calls.
+	 */
+	for (int i = nPresortedCols - 1; i >= 0; i--)
+	{
+		Datum		datumA,
+					datumB,
+					result;
+		bool		isnullA,
+					isnullB;
+		AttrNumber	attno = node->presorted_keys[i].attno;
+		PresortedKeyData *key;
+
+		datumA = slot_getattr(pivot, attno, &isnullA);
+		datumB = slot_getattr(tuple, attno, &isnullB);
+
+		/* Special case for NULL-vs-NULL, else use standard comparison */
+		if (isnullA || isnullB)
+		{
+			if (isnullA == isnullB)
+				continue;
+			else
+				return false;
+		}
+
+		key = &node->presorted_keys[i];
+
+		key->fcinfo->args[0].value = datumA;
+		key->fcinfo->args[1].value = datumB;
+
+		/* just for paranoia's sake, we reset isnull each time */
+		key->fcinfo->isnull = false;
+
+		result = FunctionCallInvoke(key->fcinfo);
+
+		/* Check for null result, since caller is clearly not expecting one */
+		if (key->fcinfo->isnull)
+			elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid);
+
+		if (!DatumGetBool(result))
+			return false;
+	}
+	return true;
+}
+
+/* ----------------------------------------------------------------
+ * switchToPresortedPrefixMode
+ *
+ * When we determine that we've likely encountered a large batch of tuples all
+ * having the same presorted prefix values, we want to optimize tuplesort by
+ * only sorting on unsorted suffix keys.
+ *
+ * The problem is that we've already accumulated several tuples in another
+ * tuplesort configured to sort by all columns (assuming that there may be
+ * more than one prefix key group).  So to switch to presorted prefix mode we
+ * have to go back and look at all the tuples we've already accumulated to
+ * verify they're all part of the same prefix key group before sorting them
+ * solely by unsorted suffix keys.
+ *
+ * While it's likely that all of the already-fetched tuples are part of a
+ * single prefix key group, we also have to handle the possibility that there
+ * is at least one different prefix key group before the large prefix key
+ * group.
+ * ----------------------------------------------------------------
+ */
+static void
+switchToPresortedPrefixMode(PlanState *pstate)
+{
+	IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+	ScanDirection dir;
+	int64		nTuples = 0;
+	bool		lastTuple = false;
+	bool		firstTuple = true;
+	TupleDesc	tupDesc;
+	PlanState  *outerNode;
+	IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan);
+
+	dir = node->ss.ps.state->es_direction;
+	outerNode = outerPlanState(node);
+	tupDesc = ExecGetResultType(outerNode);
+
+	/* Configure the prefix sort state the first time around. */
+	if (node->prefixsort_state == NULL)
+	{
+		Tuplesortstate *prefixsort_state;
+		int			nPresortedCols = plannode->nPresortedCols;
+
+		/*
+		 * Optimize the sort by assuming the prefix columns are all equal and
+		 * thus we only need to sort by any remaining columns.
+		 */
+		prefixsort_state = tuplesort_begin_heap(tupDesc,
+												plannode->sort.numCols - nPresortedCols,
+												&(plannode->sort.sortColIdx[nPresortedCols]),
+												&(plannode->sort.sortOperators[nPresortedCols]),
+												&(plannode->sort.collations[nPresortedCols]),
+												&(plannode->sort.nullsFirst[nPresortedCols]),
+												work_mem,
+												NULL,
+												false);
+		node->prefixsort_state = prefixsort_state;
+	}
+	else
+	{
+		/* Next group of presorted data */
+		tuplesort_reset(node->prefixsort_state);
+	}
+
+	/*
+	 * If the current node has a bound, then it's reasonably likely that a
+	 * large prefix key group will benefit from bounded sort, so configure the
+	 * tuplesort to allow for that optimization.
+	 */
+	if (node->bounded)
+	{
+		SO1_printf("Setting bound on presorted prefix tuplesort to: %ld\n",
+				   node->bound - node->bound_Done);
+		tuplesort_set_bound(node->prefixsort_state,
+							node->bound - node->bound_Done);
+	}
+
+	/*
+	 * Copy as many tuples as we can (i.e., in the same prefix key group) from
+	 * the full sort state to the prefix sort state.
+	 */
+	for (;;)
+	{
+		lastTuple = node->n_fullsort_remaining - nTuples == 1;
+
+		/*
+		 * When we encounter multiple prefix key groups inside the full sort
+		 * tuplesort we have to carry over the last read tuple into the next
+		 * batch.
+		 */
+		if (firstTuple && !TupIsNull(node->transfer_tuple))
+		{
+			tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+			nTuples++;
+
+			/* The carried over tuple is our new group pivot tuple. */
+			ExecCopySlot(node->group_pivot, node->transfer_tuple);
+		}
+		else
+		{
+			tuplesort_gettupleslot(node->fullsort_state,
+								   ScanDirectionIsForward(dir),
+								   false, node->transfer_tuple, NULL);
+
+			/*
+			 * If this is our first time through the loop, then we need to
+			 * save the first tuple we get as our new group pivot.
+			 */
+			if (TupIsNull(node->group_pivot))
+				ExecCopySlot(node->group_pivot, node->transfer_tuple);
+
+			if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple))
+			{
+				tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple);
+				nTuples++;
+			}
+			else
+			{
+				/*
+				 * The tuple isn't part of the current batch so we need to
+				 * carry it over into the next batch of tuples we transfer out
+				 * of the full sort tuplesort into the presorted prefix
+				 * tuplesort.
+				 * We don't actually have to do anything special to save the
+				 * tuple since we've already loaded it into the
+				 * node->transfer_tuple slot, and, even though that slot
+				 * points to memory inside the full sort tuplesort, we can't
+				 * reset that tuplesort anyway until we've fully transferred
+				 * out its tuples, so this reference is safe.  We do need to
+				 * reset the group pivot tuple though since we've finished the
+				 * current prefix key group.
+				 */
+				ExecClearTuple(node->group_pivot);
+				break;
+			}
+		}
+
+		firstTuple = false;
+
+		/*
+		 * If we've copied all of the tuples from the full sort state into the
+		 * prefix sort state, then we don't actually know that we've yet found
+		 * the last tuple in that prefix key group until we check the next
+		 * tuple from the outer plan node, so we retain the current group
+		 * pivot tuple for the prefix key group comparison.
+		 */
+		if (lastTuple)
+			break;
+	}
+
+	/*
+	 * Track how many tuples remain in the full sort batch so that we know if
+	 * we need to sort multiple prefix key groups before processing tuples
+	 * remaining in the large single prefix key group we think we've
+	 * encountered.
+	 */
+	SO1_printf("Moving %ld tuples to presorted prefix tuplesort\n", nTuples);
+	node->n_fullsort_remaining -= nTuples;
+	SO1_printf("Setting n_fullsort_remaining to %ld\n", node->n_fullsort_remaining);
+
+	if (lastTuple)
+	{
+		/*
+		 * We've confirmed that all tuples remaining in the full sort batch
+		 * are in the same prefix key group and moved all of those tuples into
+		 * the presorted prefix tuplesort.  Now we can save our pivot
+		 * comparison tuple and continue fetching tuples from the outer
+		 * execution node to load into the presorted prefix tuplesort.
+		 */
+		ExecCopySlot(node->group_pivot, node->transfer_tuple);
+		SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n");
+		node->execution_status = INCSORT_LOADPREFIXSORT;
+
+		/*
+		 * Make sure we clear the transfer tuple slot so that next time we
+		 * encounter a large prefix key group we don't incorrectly assume we
+		 * have a tuple carried over from the previous group.
+		 */
+		ExecClearTuple(node->transfer_tuple);
+	}
+	else
+	{
+		/*
+		 * We finished a group but didn't consume all of the tuples from the
+		 * full sort state, so we'll sort this batch, let the outer node read
+		 * out all of those tuples, and then come back around to find another
+		 * batch.
+		 */
+		SO1_printf("Sorting presorted prefix tuplesort with %ld tuples\n", nTuples);
+		tuplesort_performsort(node->prefixsort_state);
+
+		INSTRUMENT_SORT_GROUP(node, prefixsort)
+
+		if (node->bounded)
+		{
+			/*
+			 * If the current node has a bound and we've already sorted n
+			 * tuples, then the functional bound remaining is (original bound
+			 * - n), so store the current number of processed tuples for use
+			 * in configuring sorting bound.
+			 */
+			SO2_printf("Changing bound_Done from %ld to %ld\n",
+					   node->bound_Done,
+					   Min(node->bound, node->bound_Done + nTuples));
+			node->bound_Done = Min(node->bound, node->bound_Done + nTuples);
+		}
+
+		SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n");
+		node->execution_status = INCSORT_READPREFIXSORT;
+	}
+}
+
+/*
+ * Sorting many small groups with tuplesort is inefficient.  In order to
+ * cope with this problem we don't start a new group until the current one
+ * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also
+ * means we can't assume small groups of tuples all have the same prefix keys.)
+ * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start
+ * looking for the new group as soon as we've met our bound to avoid fetching
+ * more tuples than we absolutely have to fetch.
+ */
+#define DEFAULT_MIN_GROUP_SIZE 32
+
+/*
+ * While we've optimized for small prefix key groups by not starting our prefix
+ * key comparisons until we've reached a minimum number of tuples, we don't want
+ * that optimization to cause us to lose out on the benefits of being able to
+ * assume a large group of tuples is fully presorted by its prefix keys.
+ * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic
+ * for determining when we believe we've encountered a large group, and, if we
+ * get to that point without finding a new prefix key group we transition to
+ * presorted prefix key mode.
+ */
+#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE)
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSort
+ *
+ *		Assuming that the outer subtree returns tuples presorted by some
+ *		prefix of the target sort columns, performs an incremental sort.
+ *
+ *		Conditions:
+ *		  -- none.
+ *
+ *		Initial States:
+ *		  -- the outer child is prepared to return the first tuple.
+ * ----------------------------------------------------------------
+ */
+static TupleTableSlot *
+ExecIncrementalSort(PlanState *pstate)
+{
+	IncrementalSortState *node = castNode(IncrementalSortState, pstate);
+	EState	   *estate;
+	ScanDirection dir;
+	Tuplesortstate *read_sortstate;
+	Tuplesortstate *fullsort_state;
+	TupleTableSlot *slot;
+	IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan;
+	PlanState  *outerNode;
+	TupleDesc	tupDesc;
+	int64		nTuples = 0;
+	int64		minGroupSize;
+
+	CHECK_FOR_INTERRUPTS();
+
+	estate = node->ss.ps.state;
+	dir = estate->es_direction;
+	fullsort_state = node->fullsort_state;
+
+	/*
+	 * If a previous iteration has sorted a batch, then we need to check to
+	 * see if there are any remaining tuples in that batch that we can return
+	 * before moving on to other execution states.
+	 */
+	if (node->execution_status == INCSORT_READFULLSORT
+		|| node->execution_status == INCSORT_READPREFIXSORT)
+	{
+		/*
+		 * Return next tuple from the current sorted group set if available.
+		 */
+		read_sortstate = node->execution_status == INCSORT_READFULLSORT ?
+			fullsort_state : node->prefixsort_state;
+		slot = node->ss.ps.ps_ResultTupleSlot;
+
+		/*
+		 * We have to populate the slot from the tuplesort before checking
+		 * outerNodeDone because it will set the slot to NULL if no more
+		 * tuples remain.  If the tuplesort is empty, but we don't have any
+		 * more tuples available for sort from the outer node, then
+		 * outerNodeDone will have been set so we'll return that now-empty
+		 * slot to the caller.
+		 */
+		if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir),
+								   false, slot, NULL) || node->outerNodeDone)
+
+			/*
+			 * Note: there isn't a good test case for the node->outerNodeDone
+			 * check directly, but we need it for any plan where the outer
+			 * node will fail when trying to fetch too many tuples.
+			 */
+			return slot;
+		else if (node->n_fullsort_remaining > 0)
+		{
+			/*
+			 * When we transition to presorted prefix mode, we might have
+			 * accumulated at least one additional prefix key group in the
+			 * full sort tuplesort.
The first call to + * switchToPresortedPrefixMode() will have pulled the first one of + * those groups out, and we've returned those tuples to the parent + * node, but if at this point we still have tuples remaining in + * the full sort state (i.e., n_fullsort_remaining > 0), then we + * need to re-execute the prefix mode transition function to pull + * out the next prefix key group. + */ + SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (%ld)\n", + node->n_fullsort_remaining); + switchToPresortedPrefixMode(pstate); + } + else + { + /* + * If we don't have any sorted tuples to read and we're not + * currently transitioning into presorted prefix sort mode, then + * it's time to start the process all over again by building a new + * group in the full sort state. + */ + SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining > 0)\n"); + node->execution_status = INCSORT_LOADFULLSORT; + } + } + + /* + * Scan the subplan in the forward direction while creating the sorted + * data. + */ + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Load tuples into the full sort state. */ + if (node->execution_status == INCSORT_LOADFULLSORT) + { + /* + * Initialize sorting structures. + */ + if (fullsort_state == NULL) + { + /* + * Initialize presorted column support structures for + * isCurrentGroup(). It's correct to do this along with the + * initial intialization for the full sort state (and not for the + * prefix sort state) since we always load the full sort state + * first. + */ + preparePresortedCols(node); + + /* + * Since we optimize small prefix key groups by accumulating a + * minimum number of tuples before sorting, we can't assume that a + * group of tuples all have the same prefix key values. Hence we + * setup the full sort tuplesort to sort by all requested sort + * keys. + */ + fullsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols, + plannode->sort.sortColIdx, + plannode->sort.sortOperators, + plannode->sort.collations, + plannode->sort.nullsFirst, + work_mem, + NULL, + false); + node->fullsort_state = fullsort_state; + } + else + { + /* Reset sort for the next batch. */ + tuplesort_reset(fullsort_state); + } + + /* + * Calculate the remaining tuples left if bounded and configure both + * bounded sort and the minimum group size accordingly. + */ + if (node->bounded) + { + int64 currentBound = node->bound - node->bound_Done; + + /* + * Bounded sort isn't likely to be a useful optimization for full + * sort mode since we limit full sort mode to a relatively small + * number of tuples and tuplesort doesn't switch over to top-n + * heap sort anyway unless it hits (2 * bound) tuples. + */ + if (currentBound < DEFAULT_MIN_GROUP_SIZE) + tuplesort_set_bound(fullsort_state, currentBound); + + minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); + } + else + minGroupSize = DEFAULT_MIN_GROUP_SIZE; + + /* + * Because we have to read the next tuple to find out that we've + * encountered a new prefix key group, on subsequent groups we have to + * carry over that extra tuple and add it to the new group's sort here + * before we read any new tuples from the outer node. 
+ */ + if (!TupIsNull(node->group_pivot)) + { + tuplesort_puttupleslot(fullsort_state, node->group_pivot); + nTuples++; + + /* + * We're in full sort mode accumulating a minimum number of tuples + * and not checking for prefix key equality yet, so we can't + * assume the group pivot tuple will reamin the same -- unless + * we're using a minimum group size of 1, in which case the pivot + * is obviously still the pviot. + */ + if (nTuples != minGroupSize) + ExecClearTuple(node->group_pivot); + } + + + /* + * Pull as many tuples from the outer node as possible given our + * current operating mode. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If the outer node can't provide us any more tuples, then we can + * sort the current group and return those tuples. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + + SO1_printf("Sorting fullsort with %ld tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort) + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + + /* Accumulate the next group of presorted tuples. */ + if (nTuples < minGroupSize) + { + /* + * If we haven't yet hit our target minimum group size, then + * we don't need to bother checking for inclusion in the + * current prefix group since at this point we'll assume that + * we'll full sort this batch to avoid a large number of very + * tiny (and thus inefficient) sorts. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + + /* + * If we've reach our minimum group size, then we need to + * store the most recent tuple as a pivot. + */ + if (nTuples == minGroupSize) + ExecCopySlot(node->group_pivot, slot); + } + else + { + /* + * If we've already accumulated enough tuples to reach our + * minimum group size, then we need to compare any additional + * tuples to our pivot tuple to see if we reach the end of + * that prefix key group. Only after we find changed prefix + * keys can we guarantee sort stability of the tuples we've + * already accumulated. + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + /* + * As long as the prefix keys match the pivot tuple then + * load the tuple into the tuplesort. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + } + else + { + /* + * Since the tuple we fetched isn't part of the current + * prefix key group we don't want to sort it as part of + * the current batch. Instead we use the group_pivot slot + * to carry it over to the next batch (even though we + * won't actually treat it as a group pivot). + */ + ExecCopySlot(node->group_pivot, slot); + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already + * sorted n tuples, then the functional bound + * remaining is (original bound - n), so store the + * current number of processed tuples for later use + * configuring the sort state's bound. + */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + /* + * Once we find changed prefix keys we can complete the + * sort and transition modes to reading out the sorted + * tuples. 
+ */ + SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", + nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort) + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + } + + /* + * Unless we've alrady transitioned modes to reading from the full + * sort state, then we assume that having read at least + * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're + * processing a large group of tuples all having equal prefix keys + * (but haven't yet found the final tuple in that prefix key + * group), so we need to transition in to presorted prefix mode. + */ + if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && + node->execution_status != INCSORT_READFULLSORT) + { + /* + * The group pivot we have stored has already been put into + * the tuplesort; we don't want to carry it over. Since we + * haven't yet found the end of the prefix key group, it might + * seem like we should keep this, but we don't actually know + * how many prefix key groups might be represented in the full + * sort state, so we'll let the mode transition function + * manage this state for us. + */ + ExecClearTuple(node->group_pivot); + + /* + * Unfortunately the tuplesort API doesn't include a way to + * retrieve tuples unless a sort has been performed, so we + * perform the sort even though we could just as easily rely + * on FIFO retrieval semantics when transferring them to the + * presorted prefix tuplesort. + */ + SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + INSTRUMENT_SORT_GROUP(node, fullsort) + + /* + * If the full sort tuplesort happened to switch into top-n + * heapsort mode then we will only be able to retrieve + * currentBound tuples (since the tuplesort will have only + * retained the top-n tuples). This is safe even though we + * haven't yet completed fetching the current prefix key group + * because the tuples we've "lost" already sorted "below" the + * retained ones, and we're already contractually guaranteed + * to not need any more than the currentBound tuples. + */ + if (tuplesort_used_bound(node->fullsort_state)) + { + int64 currentBound = node->bound - node->bound_Done; + + SO2_printf("Read %ld tuples, but setting to %ld because we used bounded sort\n", + nTuples, Min(currentBound, nTuples)); + nTuples = Min(currentBound, nTuples); + } + + SO1_printf("Setting n_fullsort_remaining to %ld and calling switchToPresortedPrefixMode()\n", + nTuples); + + /* + * We might have multiple prefix key groups in the full sort + * state, so the mode transition function needs to know the it + * needs to move from the fullsort to presorted prefix sort. + */ + node->n_fullsort_remaining = nTuples; + + /* Transition the tuples to the presorted prefix tuplesort. */ + switchToPresortedPrefixMode(pstate); + + /* + * Since we know we had tuples to move to the presorted prefix + * tuplesort, we know that unless that transition has verified + * that all tuples belonged to the same prefix key group (in + * which case we can go straight to continuing to load tuples + * into that tuplesort), we should have a tuple to return + * here. + * + * Either way, the appropriate execution status should have + * been set by switchToPresortedPrefixMode(), so we can drop + * out of the loop here and let the appropriate path kick in. 
+ */ + break; + } + } + } + + if (node->execution_status == INCSORT_LOADPREFIXSORT) + { + /* + * We only enter this state after the mode transition function has + * confirmed all remaining tuples from the full sort state have the + * same prefix and moved those tuples to the prefix sort state. That + * function has also set a group pivot tuple (which doesn't need to be + * carried over; it's already been put into the prefix sort state). + */ + Assert(!TupIsNull(node->group_pivot)); + + /* + * Read tuples from the outer node and load them into the prefix sort + * state until we encounter a tuple whose prefix keys don't match the + * current group_pivot tuple, since we can't guarantee sort stability + * until we have all tuples matching those prefix keys. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If we've exhausted tuples from the outer node we're done + * loading the prefix sort state. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + break; + } + + /* + * If the tuple's prefix keys match our pivot tuple, we're not + * done yet and can load it into the prefix sort state. If not, we + * don't want to sort it as part of the current batch. Instead we + * use the group_pivot slot to carry it over to the next batch + * (even though we won't actually treat it as a group pivot). + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + tuplesort_puttupleslot(node->prefixsort_state, slot); + nTuples++; + } + else + { + ExecCopySlot(node->group_pivot, slot); + break; + } + } + + /* + * Perform the sort and begin returning the tuples to the parent plan + * node. + */ + SO1_printf("Sorting presorted prefix tuplesort with >= %ld tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + INSTRUMENT_SORT_GROUP(node, prefixsort) + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + } + + /* Restore to user specified direction. */ + estate->es_direction = dir; + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitIncrementalSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. 
+ * ---------------------------------------------------------------- + */ +IncrementalSortState * +ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) +{ + IncrementalSortState *incrsortstate; + + SO_printf("ExecInitIncrementalSort: initializing sort node\n"); + + /* + * Incremental sort can't be used with either EXEC_FLAG_REWIND, + * EXEC_FLAG_BACKWARD or EXEC_FLAG_MARK, because we only one of many sort + * batches in the current sort state. + */ + Assert((eflags & (EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)) == 0); + + /* Initialize state structure. */ + incrsortstate = makeNode(IncrementalSortState); + incrsortstate->ss.ps.plan = (Plan *) node; + incrsortstate->ss.ps.state = estate; + incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; + + incrsortstate->execution_status = INCSORT_LOADFULLSORT; + incrsortstate->bounded = false; + incrsortstate->outerNodeDone = false; + incrsortstate->bound_Done = 0; + incrsortstate->fullsort_state = NULL; + incrsortstate->prefixsort_state = NULL; + incrsortstate->group_pivot = NULL; + incrsortstate->transfer_tuple = NULL; + incrsortstate->n_fullsort_remaining = 0; + incrsortstate->presorted_keys = NULL; + + if (incrsortstate->ss.ps.instrument != NULL) + { + IncrementalSortGroupInfo *fullsortGroupInfo = + &incrsortstate->incsort_info.fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo = + &incrsortstate->incsort_info.prefixsortGroupInfo; + + fullsortGroupInfo->groupCount = 0; + fullsortGroupInfo->maxDiskSpaceUsed = 0; + fullsortGroupInfo->totalDiskSpaceUsed = 0; + fullsortGroupInfo->maxMemorySpaceUsed = 0; + fullsortGroupInfo->totalMemorySpaceUsed = 0; + fullsortGroupInfo->sortMethods = 0; + prefixsortGroupInfo->groupCount = 0; + prefixsortGroupInfo->maxDiskSpaceUsed = 0; + prefixsortGroupInfo->totalDiskSpaceUsed = 0; + prefixsortGroupInfo->maxMemorySpaceUsed = 0; + prefixsortGroupInfo->totalMemorySpaceUsed = 0; + prefixsortGroupInfo->sortMethods = 0; + } + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * Initialize child nodes. + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple); + + /* + * Initialize return slot and type. No need to initialize projection info + * because we don't do any projections. + */ + ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple); + incrsortstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize standalone slots to store a tuple for pivot prefix keys and + * for carrying over a tuple from one batch to the next. 
+ */ + incrsortstate->group_pivot = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + incrsortstate->transfer_tuple = + MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)), + &TTSOpsMinimalTuple); + + SO_printf("ExecInitIncrementalSort: sort node initialized\n"); + + return incrsortstate; +} + +/* ---------------------------------------------------------------- + * ExecEndIncrementalSort(node) + * ---------------------------------------------------------------- + */ +void +ExecEndIncrementalSort(IncrementalSortState *node) +{ + SO_printf("ExecEndIncrementalSort: shutting down sort node\n"); + + /* clean out the scan tuple */ + ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + /* must drop stanalone tuple slots from outer node */ + ExecDropSingleTupleTableSlot(node->group_pivot); + ExecDropSingleTupleTableSlot(node->transfer_tuple); + + /* + * Release tuplesort resources. + */ + if (node->fullsort_state != NULL) + { + tuplesort_end(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_end(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * Shut down the subplan. + */ + ExecEndNode(outerPlanState(node)); + + SO_printf("ExecEndIncrementalSort: sort node shutdown\n"); +} + +void +ExecReScanIncrementalSort(IncrementalSortState *node) +{ + PlanState *outerPlan = outerPlanState(node); + + /* + * Incremental sort doesn't support efficient rescan even when paramters + * haven't changed (e.g., rewind) because unlike regular sort we don't + * store all tuples at once for the full sort. + * + * So even if EXEC_FLAG_REWIND is set we just reset all of our state and + * reexecute the sort along with the child node below us. + * + * In theory if we've only fill the full sort with one batch (and haven't + * reset it for a new batch yet) then we could efficiently rewind, but + * that seems a narrow enough case that it's not worth handling specially + * at this time. + */ + + /* must drop pointer to sort result tuple */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + + if (node->group_pivot != NULL) + ExecClearTuple(node->group_pivot); + if (node->transfer_tuple != NULL) + ExecClearTuple(node->transfer_tuple); + + node->bounded = false; + node->outerNodeDone = false; + node->n_fullsort_remaining = 0; + node->bound_Done = 0; + node->presorted_keys = NULL; + + node->execution_status = INCSORT_LOADFULLSORT; + + /* + * If we've set up either of the sort states yet, we need to reset them. + * We could end them and null out the pointers, but there's no reason to + * repay the setup cost, and because guard setting up pivot comparator + * state similarly, doing so might actually cause a leak. + */ + if (node->fullsort_state != NULL) + { + tuplesort_reset(node->fullsort_state); + node->fullsort_state = NULL; + } + if (node->prefixsort_state != NULL) + { + tuplesort_reset(node->prefixsort_state); + node->prefixsort_state = NULL; + } + + /* + * If chgParam of subnode is not null, theni the plan will be re-scanned + * by the first ExecProcNode. 
+	 */
+	if (outerPlan->chgParam == NULL)
+		ExecReScan(outerPlan);
+}
+
+/* ----------------------------------------------------------------
+ *						Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortEstimate
+ *
+ *		Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt)
+{
+	Size		size;
+
+	/* don't need this if not instrumenting or no workers */
+	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+		return;
+
+	size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo));
+	size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo));
+	shm_toc_estimate_chunk(&pcxt->estimator, size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeDSM
+ *
+ *		Initialize DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt)
+{
+	Size		size;
+
+	/* don't need this if not instrumenting or no workers */
+	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+		return;
+
+	size = offsetof(SharedIncrementalSortInfo, sinfo)
+		+ pcxt->nworkers * sizeof(IncrementalSortInfo);
+	node->shared_info = shm_toc_allocate(pcxt->toc, size);
+	/* ensure any unfilled slots will contain zeroes */
+	memset(node->shared_info, 0, size);
+	node->shared_info->num_workers = pcxt->nworkers;
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+				   node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeWorker
+ *
+ *		Attach worker to DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt)
+{
+	node->shared_info =
+		shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	node->am_worker = true;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortRetrieveInstrumentation
+ *
+ *		Transfer sort statistics from DSM to private memory.
+ * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node) +{ + Size size; + SharedIncrementalSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + node->shared_info->num_workers * sizeof(IncrementalSortInfo); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 5d1debc196..9d2bfd7ed6 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -93,7 +93,8 @@ ExecSort(PlanState *pstate) plannode->collations, plannode->nullsFirst, work_mem, - NULL, node->randomAccess); + NULL, + node->randomAccess); if (node->bounded) tuplesort_set_bound(tuplesortstate, node->bound); node->tuplesortstate = (void *) tuplesortstate; diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index eaf93c64b8..f9d86859ee 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -927,6 +927,24 @@ _copyMaterial(const Material *from) } +/* + * CopySortFields + * + * This function copies the fields of the Sort node. It is used by + * all the copy functions for classes which inherit from Sort. + */ +static void +CopySortFields(const Sort *from, Sort *newnode) +{ + CopyPlanFields((const Plan *) from, (Plan *) newnode); + + COPY_SCALAR_FIELD(numCols); + COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); +} + /* * _copySort */ @@ -938,13 +956,29 @@ _copySort(const Sort *from) /* * copy node superclass fields */ - CopyPlanFields((const Plan *) from, (Plan *) newnode); + CopySortFields(from, newnode); - COPY_SCALAR_FIELD(numCols); - COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); - COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); - COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid)); - COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); + return newnode; +} + + +/* + * _copyIncrementalSort + */ +static IncrementalSort * +_copyIncrementalSort(const IncrementalSort *from) +{ + IncrementalSort *newnode = makeNode(IncrementalSort); + + /* + * copy node superclass fields + */ + CopySortFields((const Sort *) from, (Sort *) newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(nPresortedCols); return newnode; } @@ -4898,6 +4932,9 @@ copyObjectImpl(const void *from) case T_Sort: retval = _copySort(from); break; + case T_IncrementalSort: + retval = _copyIncrementalSort(from); + break; case T_Group: retval = _copyGroup(from); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f4aecdcbcd..35ed8c0d53 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -837,10 +837,8 @@ _outMaterial(StringInfo str, const Material *node) } static void -_outSort(StringInfo str, const Sort *node) +_outSortInfo(StringInfo str, const Sort *node) { - WRITE_NODE_TYPE("SORT"); - _outPlanInfo(str, (const Plan *) node); WRITE_INT_FIELD(numCols); @@ -850,6 +848,24 @@ _outSort(StringInfo str, const Sort *node) WRITE_BOOL_ARRAY(nullsFirst, node->numCols); } +static void +_outSort(StringInfo str, const Sort *node) +{ + WRITE_NODE_TYPE("SORT"); + + _outSortInfo(str, node); +} + +static void 
+_outIncrementalSort(StringInfo str, const IncrementalSort *node) +{ + WRITE_NODE_TYPE("INCREMENTALSORT"); + + _outSortInfo(str, (const Sort *) node); + + WRITE_INT_FIELD(nPresortedCols); +} + static void _outUnique(StringInfo str, const Unique *node) { @@ -3786,6 +3802,9 @@ outNode(StringInfo str, const void *obj) case T_Sort: _outSort(str, obj); break; + case T_IncrementalSort: + _outIncrementalSort(str, obj); + break; case T_Unique: _outUnique(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index d5b23a3479..2a2f39bf04 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2150,12 +2150,13 @@ _readMaterial(void) } /* - * _readSort + * ReadCommonSort + * Assign the basic stuff of all nodes that inherit from Sort */ -static Sort * -_readSort(void) +static void +ReadCommonSort(Sort *local_node) { - READ_LOCALS(Sort); + READ_TEMP_LOCALS(); ReadCommonPlan(&local_node->plan); @@ -2164,6 +2165,32 @@ _readSort(void) READ_OID_ARRAY(sortOperators, local_node->numCols); READ_OID_ARRAY(collations, local_node->numCols); READ_BOOL_ARRAY(nullsFirst, local_node->numCols); +} + +/* + * _readSort + */ +static Sort * +_readSort(void) +{ + READ_LOCALS_NO_FIELDS(Sort); + + ReadCommonSort(local_node); + + READ_DONE(); +} + +/* + * _readIncrementalSort + */ +static IncrementalSort * +_readIncrementalSort(void) +{ + READ_LOCALS(IncrementalSort); + + ReadCommonSort(&local_node->sort); + + READ_INT_FIELD(nPresortedCols); READ_DONE(); } @@ -2801,6 +2828,8 @@ parseNodeString(void) return_value = _readMaterial(); else if (MATCH("SORT", 4)) return_value = _readSort(); + else if (MATCH("INCREMENTALSORT", 15)) + return_value = _readIncrementalSort(); else if (MATCH("GROUP", 5)) return_value = _readGroup(); else if (MATCH("AGG", 3)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 905bbe77d8..ccf46dd0aa 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3881,6 +3881,10 @@ print_path(PlannerInfo *root, Path *path, int indent) ptype = "Sort"; subpath = ((SortPath *) path)->subpath; break; + case T_IncrementalSortPath: + ptype = "IncrementalSort"; + subpath = ((SortPath *) path)->subpath; + break; case T_GroupPath: ptype = "Group"; subpath = ((GroupPath *) path)->subpath; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 9e7e57f118..0eef5d7707 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -128,6 +128,7 @@ bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; +bool enable_incrementalsort = true; bool enable_hashagg = true; bool enable_hashagg_disk = true; bool enable_groupingsets_hash_disk = false; @@ -1648,9 +1649,9 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) } /* - * cost_sort - * Determines and returns the cost of sorting a relation, including - * the cost of reading the input data. + * cost_tuplesort + * Determines and returns the cost of sorting a relation using tuplesort, + * not including the cost of reading the input data. 
* * If the total volume of data to sort is less than sort_mem, we will do * an in-memory sort, which requires no I/O and about t*log2(t) tuple @@ -1677,39 +1678,23 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) * specifying nonzero comparison_cost; typically that's used for any extra * work that has to be done to prepare the inputs to the comparison operators. * - * 'pathkeys' is a list of sort keys - * 'input_cost' is the total cost for reading the input data * 'tuples' is the number of tuples in the relation * 'width' is the average tuple width in bytes * 'comparison_cost' is the extra cost per comparison, if any * 'sort_mem' is the number of kilobytes of work memory allowed for the sort * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound - * - * NOTE: some callers currently pass NIL for pathkeys because they - * can't conveniently supply the sort keys. Since this routine doesn't - * currently do anything with pathkeys anyway, that doesn't matter... - * but if it ever does, it should react gracefully to lack of key data. - * (Actually, the thing we'd most likely be interested in is just the number - * of sort keys, which all callers *could* supply.) */ -void -cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width, - Cost comparison_cost, int sort_mem, - double limit_tuples) +static void +cost_tuplesort(Cost *startup_cost, Cost *run_cost, + double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples) { - Cost startup_cost = input_cost; - Cost run_cost = 0; double input_bytes = relation_byte_size(tuples, width); double output_bytes; double output_tuples; long sort_mem_bytes = sort_mem * 1024L; - if (!enable_sort) - startup_cost += disable_cost; - - path->rows = tuples; - /* * We want to be sure the cost of a sort is never estimated as zero, even * if passed-in tuple count is zero. Besides, mustn't do log(0)... @@ -1748,7 +1733,7 @@ cost_sort(Path *path, PlannerInfo *root, * * Assume about N log2 N comparisons */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); /* Disk costs */ @@ -1759,7 +1744,7 @@ cost_sort(Path *path, PlannerInfo *root, log_runs = 1.0; npageaccesses = 2.0 * npages * log_runs; /* Assume 3/4ths of accesses are sequential, 1/4th are not */ - startup_cost += npageaccesses * + *startup_cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); } else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) @@ -1770,12 +1755,12 @@ cost_sort(Path *path, PlannerInfo *root, * factor is a bit higher than for quicksort. Tweak it so that the * cost curve is continuous at the crossover point. */ - startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples); + *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples); } else { /* We'll use plain quicksort on all the input tuples */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); } /* @@ -1786,8 +1771,143 @@ cost_sort(Path *path, PlannerInfo *root, * here --- the upper LIMIT will pro-rate the run cost so we'd be double * counting the LIMIT otherwise. */ - run_cost += cpu_operator_cost * tuples; + *run_cost = cpu_operator_cost * tuples; +} +/* + * cost_incremental_sort + * Determines and returns the cost of sorting a relation incrementally, when + * the input path is presorted by a prefix of the pathkeys. 
+ * + * 'presorted_keys' is the number of leading pathkeys by which the input path + * is sorted. + * + * We estimate the number of groups into which the relation is divided by the + * leading pathkeys, and then calculate the cost of sorting a single group + * with tuplesort using cost_tuplesort(). + */ +void +cost_incremental_sort(Path *path, + PlannerInfo *root, List *pathkeys, int presorted_keys, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples, int width, Cost comparison_cost, int sort_mem, + double limit_tuples) +{ + Cost startup_cost = 0, + run_cost = 0, + input_run_cost = input_total_cost - input_startup_cost; + double group_tuples, + input_groups; + Cost group_startup_cost, + group_run_cost, + group_input_run_cost; + List *presortedExprs = NIL; + ListCell *l; + int i = 0; + + Assert(presorted_keys != 0); + + /* + * We want to be sure the cost of a sort is never estimated as zero, even + * if passed-in tuple count is zero. Besides, mustn't do log(0)... + */ + if (input_tuples < 2.0) + input_tuples = 2.0; + + /* Extract presorted keys as list of expressions */ + foreach(l, pathkeys) + { + PathKey *key = (PathKey *) lfirst(l); + EquivalenceMember *member = (EquivalenceMember *) + linitial(key->pk_eclass->ec_members); + + presortedExprs = lappend(presortedExprs, member->em_expr); + + i++; + if (i >= presorted_keys) + break; + } + + /* Estimate number of groups with equal presorted keys */ + input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL); + group_tuples = input_tuples / input_groups; + group_input_run_cost = input_run_cost / input_groups; + + /* + * Estimate average cost of sorting of one group where presorted keys are + * equal. Incremental sort is sensitive to distribution of tuples to the + * groups, where we're relying on quite rough assumptions. Thus, we're + * pessimistic about incremental sort performance and increase its average + * group size by half. + */ + cost_tuplesort(&group_startup_cost, &group_run_cost, + 1.5 * group_tuples, width, comparison_cost, sort_mem, + limit_tuples); + + /* + * Startup cost of incremental sort is the startup cost of its first group + * plus the cost of its input. + */ + startup_cost += group_startup_cost + + input_startup_cost + group_input_run_cost; + + /* + * After we started producing tuples from the first group, the cost of + * producing all the tuples is given by the cost to finish processing this + * group, plus the total cost to process the remaining groups, plus the + * remaining cost of input. + */ + run_cost += group_run_cost + + (group_run_cost + group_startup_cost) * (input_groups - 1) + + group_input_run_cost * (input_groups - 1); + + /* + * Incremental sort adds some overhead by itself. Firstly, it has to + * detect the sort groups. This is roughly equal to one extra copy and + * comparison per tuple. Secondly, it has to reset the tuplesort context + * for every group. + */ + run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples; + run_cost += 2.0 * cpu_tuple_cost * input_groups; + + path->rows = input_tuples; + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} + +/* + * cost_sort + * Determines and returns the cost of sorting a relation, including + * the cost of reading the input data. + * + * NOTE: some callers currently pass NIL for pathkeys because they + * can't conveniently supply the sort keys. Since this routine doesn't + * currently do anything with pathkeys anyway, that doesn't matter... 
+ * but if it ever does, it should react gracefully to lack of key data.
+ * (Actually, the thing we'd most likely be interested in is just the number
+ * of sort keys, which all callers *could* supply.)
+ */
+void
+cost_sort(Path *path, PlannerInfo *root,
+		  List *pathkeys, Cost input_cost, double tuples, int width,
+		  Cost comparison_cost, int sort_mem,
+		  double limit_tuples)
+{
+	Cost		startup_cost;
+	Cost		run_cost;
+
+	cost_tuplesort(&startup_cost, &run_cost,
+				   tuples, width,
+				   comparison_cost, sort_mem,
+				   limit_tuples);
+
+	if (!enable_sort)
+		startup_cost += disable_cost;
+
+	startup_cost += input_cost;
+
+	path->rows = tuples;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
 }
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 71b9d42c99..21e3f5a987 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -334,6 +334,60 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/*
+ * pathkeys_count_contained_in
+ *    Same as pathkeys_contained_in, but also sets length of longest
+ *    common prefix of keys1 and keys2.
+ */
+bool
+pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common)
+{
+	int			n = 0;
+	ListCell   *key1,
+			   *key2;
+
+	/*
+	 * See if we can avoid looping through both lists.  This optimization
+	 * gains us several percent in planning time in a worst-case test.
+	 */
+	if (keys1 == keys2)
+	{
+		*n_common = list_length(keys1);
+		return true;
+	}
+	else if (keys1 == NIL)
+	{
+		*n_common = 0;
+		return true;
+	}
+	else if (keys2 == NIL)
+	{
+		*n_common = 0;
+		return false;
+	}
+
+	/*
+	 * If both lists are non-empty, iterate through both to find out how many
+	 * items are shared.
+	 */
+	forboth(key1, keys1, key2, keys2)
+	{
+		PathKey    *pathkey1 = (PathKey *) lfirst(key1);
+		PathKey    *pathkey2 = (PathKey *) lfirst(key2);
+
+		if (pathkey1 != pathkey2)
+		{
+			*n_common = n;
+			return false;
+		}
+		n++;
+	}
+
+	/* If we ended with a null value, then we've processed the whole list. */
+	*n_common = n;
+	return (key1 == NULL);
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
@@ -1786,26 +1840,26 @@ right_merge_direction(PlannerInfo *root, PathKey *pathkey)
  *		Count the number of pathkeys that are useful for meeting the
  *		query's requested output ordering.
  *
- * Unlike merge pathkeys, this is an all-or-nothing affair: it does us
- * no good to order by just the first key(s) of the requested ordering.
- * So the result is always either 0 or list_length(root->query_pathkeys).
+ * Because we have the possibility of incremental sort, a prefix list of
+ * keys is potentially useful for improving the performance of the requested
+ * ordering.  Thus we return 0 if no valuable keys are found, or the number
+ * of leading keys shared by the list and the requested ordering.
  */
 static int
 pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 {
+	int			n_common_pathkeys;
+
 	if (root->query_pathkeys == NIL)
 		return 0;				/* no special ordering requested */
 
 	if (pathkeys == NIL)
 		return 0;				/* unordered path */
 
-	if (pathkeys_contained_in(root->query_pathkeys, pathkeys))
-	{
-		/* It's useful ... or at least the first N keys are */
-		return list_length(root->query_pathkeys);
-	}
+	(void) pathkeys_count_contained_in(root->query_pathkeys, pathkeys,
+									   &n_common_pathkeys);
 
-	return 0;					/* path ordering not useful */
+	return n_common_pathkeys;
 }
 
 /*
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index fc25908dc6..6d26bfbeb5 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -98,6 +98,8 @@ static Plan *create_projection_plan(PlannerInfo *root,
 									int flags);
 static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe);
 static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags);
+static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root,
+													IncrementalSortPath *best_path, int flags);
 static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path);
 static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path,
 										int flags);
@@ -244,6 +246,10 @@ static MergeJoin *make_mergejoin(List *tlist,
 static Sort *make_sort(Plan *lefttree, int numCols,
 					   AttrNumber *sortColIdx, Oid *sortOperators,
 					   Oid *collations, bool *nullsFirst);
+static IncrementalSort *make_incrementalsort(Plan *lefttree,
+											 int numCols, int nPresortedCols,
+											 AttrNumber *sortColIdx, Oid *sortOperators,
+											 Oid *collations, bool *nullsFirst);
 static Plan *prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys,
 										Relids relids,
 										const AttrNumber *reqColIdx,
@@ -258,6 +264,8 @@ static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec,
 												 Relids relids);
 static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys,
 									 Relids relids);
+static IncrementalSort *make_incrementalsort_from_pathkeys(Plan *lefttree,
+														   List *pathkeys, Relids relids, int nPresortedCols);
 static Sort *make_sort_from_groupcols(List *groupcls,
 									  AttrNumber *grpColIdx,
 									  Plan *lefttree);
@@ -460,6 +468,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 										   (SortPath *) best_path, flags);
 			break;
+		case T_IncrementalSort:
+			plan = (Plan *) create_incrementalsort_plan(root,
+														(IncrementalSortPath *) best_path,
+														flags);
+			break;
 		case T_Group:
 			plan = (Plan *) create_group_plan(root,
 											  (GroupPath *) best_path);
@@ -1994,6 +2007,32 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags)
 	return plan;
 }
 
+/*
+ * create_incrementalsort_plan
+ *
+ *	  Do the same as create_sort_plan, but create an IncrementalSort plan.
+ */
+static IncrementalSort *
+create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path,
+							int flags)
+{
+	IncrementalSort *plan;
+	Plan	   *subplan;
+
+	/* See comments in create_sort_plan() above */
+	subplan = create_plan_recurse(root, best_path->spath.subpath,
+								  flags | CP_SMALL_TLIST);
+	plan = make_incrementalsort_from_pathkeys(subplan,
+											  best_path->spath.path.pathkeys,
+											  IS_OTHER_REL(best_path->spath.subpath->parent) ?
+											  best_path->spath.path.parent->relids : NULL,
+											  best_path->nPresortedCols);
+
+	copy_generic_path_info(&plan->sort.plan, (Path *) best_path);
+
+	return plan;
+}
+
 /*
  * create_group_plan
  *
@@ -5090,6 +5129,12 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
 	Plan	   *lefttree = plan->plan.lefttree;
 	Path		sort_path;	/* dummy for result of cost_sort */
 
+	/*
+	 * This function shouldn't have to deal with IncrementalSort plans because
+	 * they are only created from corresponding Path nodes.
+ */ + Assert(IsA(plan, Sort)); + cost_sort(&sort_path, root, NIL, lefttree->total_cost, lefttree->plan_rows, @@ -5677,9 +5722,12 @@ make_sort(Plan *lefttree, int numCols, AttrNumber *sortColIdx, Oid *sortOperators, Oid *collations, bool *nullsFirst) { - Sort *node = makeNode(Sort); - Plan *plan = &node->plan; + Sort *node; + Plan *plan; + node = makeNode(Sort); + + plan = &node->plan; plan->targetlist = lefttree->targetlist; plan->qual = NIL; plan->lefttree = lefttree; @@ -5693,6 +5741,37 @@ make_sort(Plan *lefttree, int numCols, return node; } +/* + * make_incrementalsort --- basic routine to build an IncrementalSort plan node + * + * Caller must have built the sortColIdx, sortOperators, collations, and + * nullsFirst arrays already. + */ +static IncrementalSort * +make_incrementalsort(Plan *lefttree, int numCols, int nPresortedCols, + AttrNumber *sortColIdx, Oid *sortOperators, + Oid *collations, bool *nullsFirst) +{ + IncrementalSort *node; + Plan *plan; + + node = makeNode(IncrementalSort); + + plan = &node->sort.plan; + plan->targetlist = lefttree->targetlist; + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + node->nPresortedCols = nPresortedCols; + node->sort.numCols = numCols; + node->sort.sortColIdx = sortColIdx; + node->sort.sortOperators = sortOperators; + node->sort.collations = collations; + node->sort.nullsFirst = nullsFirst; + + return node; +} + /* * prepare_sort_from_pathkeys * Prepare to sort according to given pathkeys @@ -6039,6 +6118,42 @@ make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids) collations, nullsFirst); } +/* + * make_incrementalsort_from_pathkeys + * Create sort plan to sort according to given pathkeys + * + * 'lefttree' is the node which yields input tuples + * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'relids' is the set of relations required by prepare_sort_from_pathkeys() + * 'nPresortedCols' is the number of presorted columns in input tuples + */ +static IncrementalSort * +make_incrementalsort_from_pathkeys(Plan *lefttree, List *pathkeys, + Relids relids, int nPresortedCols) +{ + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* Compute sort column info, and adjust lefttree as needed */ + lefttree = prepare_sort_from_pathkeys(lefttree, pathkeys, + relids, + NULL, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &collations, + &nullsFirst); + + /* Now build the Sort node */ + return make_incrementalsort(lefttree, numsortkeys, nPresortedCols, + sortColIdx, sortOperators, + collations, nullsFirst); +} + /* * make_sort_from_sortclauses * Create sort plan to sort according to given sortclauses @@ -6774,6 +6889,7 @@ is_projection_capable_path(Path *path) case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_LockRows: diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index f52226ccec..aeb83841d7 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4924,13 +4924,16 @@ create_distinct_paths(PlannerInfo *root, * Build a new upperrel containing Paths for ORDER BY evaluation. * * All paths in the result must satisfy the ORDER BY ordering. - * The only new path we need consider is an explicit sort on the - * cheapest-total existing path. + * The only new paths we need consider are an explicit full sort + * and incremental sort on the cheapest-total existing path. 
* * input_rel: contains the source-data Paths * target: the output tlist the result Paths must emit * limit_tuples: estimated bound on the number of output tuples, * or -1 if no LIMIT or couldn't estimate + * + * XXX This only looks at sort_pathkeys. I wonder if it needs to look at the + * other pathkeys (grouping, ...) like generate_useful_gather_paths. */ static RelOptInfo * create_ordered_paths(PlannerInfo *root, @@ -4964,29 +4967,77 @@ create_ordered_paths(PlannerInfo *root, foreach(lc, input_rel->pathlist) { - Path *path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path = input_path; bool is_sorted; + int presorted_keys; - is_sorted = pathkeys_contained_in(root->sort_pathkeys, - path->pathkeys); - if (path == cheapest_input_path || is_sorted) + is_sorted = pathkeys_count_contained_in(root->sort_pathkeys, + input_path->pathkeys, &presorted_keys); + + if (is_sorted) { - if (!is_sorted) + /* Use the input path as is, but add a projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); + } + else + { + /* + * Try adding an explicit sort, but only to the cheapest total path + * since a full sort should generally add the same cost to all + * paths. + */ + if (input_path == cheapest_input_path) { - /* An explicit sort here can take advantage of LIMIT */ - path = (Path *) create_sort_path(root, - ordered_rel, - path, - root->sort_pathkeys, - limit_tuples); + /* + * Sort the cheapest input path. An explicit sort here can + * take advantage of LIMIT. + */ + sorted_path = (Path *) create_sort_path(root, + ordered_rel, + input_path, + root->sort_pathkeys, + limit_tuples); + /* Add projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); } - /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); + /* + * If incremental sort is enabled, then try it as well. Unlike with + * regular sorts, we can't just look at the cheapest path, because + * the cost of incremental sort depends on how well presorted the + * path is. Additionally incremental sort may enable a cheaper + * startup path to win out despite higher total cost. + */ + if (!enable_incrementalsort) + continue; - add_path(ordered_rel, path); + /* Likewise, if the path can't be used for incremental sort. */ + if (!presorted_keys) + continue; + + /* Also consider incremental sort. 
*/ + sorted_path = (Path *) create_incremental_sort_path(root, + ordered_rel, + input_path, + root->sort_pathkeys, + presorted_keys, + limit_tuples); + + /* Add projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); } } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 3dcded506b..2b676bf406 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -678,6 +678,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3650e8329d..b02fcb9bfe 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2688,6 +2688,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_Group: diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 8ba8122ee2..4538ed88e0 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2753,6 +2753,57 @@ create_set_projection_path(PlannerInfo *root, return pathnode; } +/* + * create_incremental_sort_path + * Creates a pathnode that represents performing an incremental sort. + * + * 'rel' is the parent relation associated with the result + * 'subpath' is the path representing the source of data + * 'pathkeys' represents the desired sort order + * 'presorted_keys' is the number of keys by which the input path is + * already sorted + * 'limit_tuples' is the estimated bound on the number of output tuples, + * or -1 if no LIMIT or couldn't estimate + */ +SortPath * +create_incremental_sort_path(PlannerInfo *root, + RelOptInfo *rel, + Path *subpath, + List *pathkeys, + int presorted_keys, + double limit_tuples) +{ + IncrementalSortPath *sort = makeNode(IncrementalSortPath); + SortPath *pathnode = &sort->spath; + + pathnode->path.pathtype = T_IncrementalSort; + pathnode->path.parent = rel; + /* Sort doesn't project, so use source path's pathtarget */ + pathnode->path.pathtarget = subpath->pathtarget; + /* For now, assume we are above any joins, so no parameterization */ + pathnode->path.param_info = NULL; + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = rel->consider_parallel && + subpath->parallel_safe; + pathnode->path.parallel_workers = subpath->parallel_workers; + pathnode->path.pathkeys = pathkeys; + + pathnode->subpath = subpath; + + cost_incremental_sort(&pathnode->path, + root, pathkeys, presorted_keys, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, /* XXX comparison_cost shouldn't be 0? */ + work_mem, limit_tuples); + + sort->nPresortedCols = presorted_keys; + + return pathnode; +} + /* * create_sort_path * Creates a pathnode that represents performing an explicit sort. 
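
To make the cost model in cost_incremental_sort() above easier to review, here is a minimal standalone sketch (not part of the patch; all names and constants are illustrative) of how the per-group tuplesort cost composes into the node's startup and total cost:

#include <stdio.h>

/*
 * Toy model of cost_incremental_sort(): n_groups equal-prefix groups, each
 * costed like a small tuplesort (group_startup/group_run stand in for the
 * output of cost_tuplesort() on one group).
 */
static void
toy_incremental_sort_cost(double input_startup, double input_run,
						  double group_startup, double group_run,
						  double n_groups, double n_tuples,
						  double cpu_tuple_cost, double comparison_cost,
						  double *startup_cost, double *total_cost)
{
	double		group_input_run = input_run / n_groups;
	double		startup;
	double		run;

	/* first tuple: read the first group's input and sort that group once */
	startup = input_startup + group_input_run + group_startup;

	/* remaining output: finish group 1, then read and sort the other groups */
	run = group_run
		+ (group_startup + group_run) * (n_groups - 1)
		+ group_input_run * (n_groups - 1);

	/* overhead: group-boundary detection plus per-group tuplesort reset */
	run += (cpu_tuple_cost + comparison_cost) * n_tuples;
	run += 2.0 * cpu_tuple_cost * n_groups;

	*startup_cost = startup;
	*total_cost = startup + run;
}

int
main(void)
{
	double		startup, total;

	/* e.g. 10000 tuples in 100 groups; arbitrary unit costs */
	toy_incremental_sort_cost(0.0, 445.0, 5.0, 0.25, 100.0, 10000.0,
							  0.01, 0.0033, &startup, &total);
	printf("startup=%g total=%g\n", startup, total);
	return 0;
}

Note how the startup cost covers only the first group, which is why incremental sort can win for LIMIT queries even when its total cost is higher than a full sort's.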
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 477af5d552..03a22d71ac 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -991,6 +991,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_incrementalsort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of incremental sort steps."), + NULL + }, + &enable_incrementalsort, + true, + NULL, NULL, NULL + }, { {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of hashed aggregation plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 91fa185053..1ae8b77306 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -360,6 +360,7 @@ #enable_parallel_append = on #enable_seqscan = on #enable_sort = on +#enable_incrementalsort = on #enable_tidscan = on #enable_partitionwise_join = off #enable_partitionwise_aggregate = off diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d02e676aa3..cc33a85731 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -125,6 +125,16 @@ #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ (state)->worker >= 0 ? 1 : 2) +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + /* GUC variables */ #ifdef TRACE_SORT bool trace_sort = false; @@ -241,6 +251,14 @@ struct Tuplesortstate int64 allowedMem; /* total memory allowed, in bytes */ int maxTapes; /* number of tapes (Knuth's T) */ int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ MemoryContext sortcontext; /* memory context holding most sort data */ MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ @@ -591,6 +609,7 @@ struct Sharedsort static Tuplesortstate *tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state, bool mergeruns); @@ -647,6 +666,8 @@ static void worker_freeze_result_tape(Tuplesortstate *state); static void worker_nomergeruns(Tuplesortstate *state); static void leader_takeover_tapes(Tuplesortstate *state); static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); /* * Special versions of qsort just for SortTuple objects. 
qsort_tuple() sorts @@ -682,8 +703,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess) { Tuplesortstate *state; + MemoryContext maincontext; MemoryContext sortcontext; - MemoryContext tuplecontext; MemoryContext oldcontext; /* See leader_takeover_tapes() remarks on randomAccess support */ @@ -691,31 +712,31 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, elog(ERROR, "random access disallowed under parallel sort"); /* - * Create a working memory context for this sort operation. All data - * needed by the sort will live inside this context. + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. */ - sortcontext = AllocSetContextCreate(CurrentMemoryContext, + maincontext = AllocSetContextCreate(CurrentMemoryContext, "TupleSort main", ALLOCSET_DEFAULT_SIZES); /* - * Caller tuple (e.g. IndexTuple) memory context. - * - * A dedicated child context used exclusively for caller passed tuples - * eases memory management. Resetting at key points reduces - * fragmentation. Note that the memtuples array of SortTuples is allocated - * in the parent context, not this context, because there is no need to - * free memtuples early. + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. */ - tuplecontext = AllocSetContextCreate(sortcontext, - "Caller tuples", - ALLOCSET_DEFAULT_SIZES); + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); /* - * Make the Tuplesortstate within the per-sort context. This way, we + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we * don't need a separate pfree() operation for it at shutdown. */ - oldcontext = MemoryContextSwitchTo(sortcontext); + oldcontext = MemoryContextSwitchTo(maincontext); state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); @@ -724,11 +745,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, pg_rusage_init(&state->ru_start); #endif - state->status = TSS_INITIAL; state->randomAccess = randomAccess; - state->bounded = false; state->tuples = true; - state->boundUsed = false; /* * workMem is forced to be at least 64KB, the current minimum valid value @@ -737,38 +755,21 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, * with very little memory. */ state->allowedMem = Max(workMem, 64) * (int64) 1024; - state->availMem = state->allowedMem; state->sortcontext = sortcontext; - state->tuplecontext = tuplecontext; - state->tapeset = NULL; - - state->memtupcount = 0; + state->maincontext = maincontext; /* * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; * see comments in grow_memtuples(). 
 	 */
-	state->memtupsize = Max(1024,
-							ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
-
-	state->growmemtuples = true;
-	state->slabAllocatorUsed = false;
-	state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
-
-	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
-
-	/* workMem must be large enough for the minimal memtuples array */
-	if (LACKMEM(state))
-		elog(ERROR, "insufficient memory allowed for sort");
-
-	state->currentRun = 0;
+	state->memtupsize = INITIAL_MEMTUPSIZE;
+	state->memtuples = NULL;
 
 	/*
-	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
-	 * inittapes(), if needed
+	 * After all of the other non-parallel-related state, we set up all of
+	 * the state needed for each batch.
 	 */
-
-	state->result_tape = -1;	/* flag that result tape has not been formed */
+	tuplesort_begin_batch(state);
 
 	/*
 	 * Initialize parallel-related state based on coordination information
@@ -802,6 +803,77 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	return state;
 }
 
+/*
+ * tuplesort_begin_batch
+ *
+ * Set up, or reset, all state needed for processing a new set of tuples with
+ * this sort state.  Called both from tuplesort_begin_common (the first time
+ * sorting with this sort state) and tuplesort_reset (for subsequent usages).
+ */
+static void
+tuplesort_begin_batch(Tuplesortstate *state)
+{
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
+
+	/*
+	 * Caller tuple (e.g. IndexTuple) memory context.
+	 *
+	 * A dedicated child context used exclusively for caller passed tuples
+	 * eases memory management.  Resetting at key points reduces
+	 * fragmentation.  Note that the memtuples array of SortTuples is
+	 * allocated in the parent context, not this context, because there is no
+	 * need to free memtuples early.
+	 */
+	state->tuplecontext = AllocSetContextCreate(state->sortcontext,
+												"Caller tuples",
+												ALLOCSET_DEFAULT_SIZES);
+
+	state->status = TSS_INITIAL;
+	state->bounded = false;
+	state->boundUsed = false;
+
+	state->availMem = state->allowedMem;
+
+	state->tapeset = NULL;
+
+	state->memtupcount = 0;
+
+	/*
+	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
+	 * see comments in grow_memtuples().
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, @@ -814,7 +886,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); AssertArg(nkeys > 0); @@ -890,7 +962,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, Assert(indexRel->rd_rel->relam == BTREE_AM_OID); - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -985,7 +1057,7 @@ tuplesort_begin_index_btree(Relation heapRel, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1063,7 +1135,7 @@ tuplesort_begin_index_hash(Relation heapRel, randomAccess); MemoryContext oldcontext; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1106,7 +1178,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, int16 typlen; bool typbyval; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1224,16 +1296,23 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound) } /* - * tuplesort_end + * tuplesort_used_bound * - * Release resources and clean up. - * - * NOTE: after calling this, any pointers returned by tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! + * Allow callers to find out if the sort state was able to use a bound. */ -void -tuplesort_end(Tuplesortstate *state) +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) { /* context swap probably not needed, but let's be safe */ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); @@ -1291,10 +1370,104 @@ tuplesort_end(Tuplesortstate *state) MemoryContextSwitchTo(oldcontext); /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. + * Free the per-sort memory context, thereby releasing all working memory. */ - MemoryContextDelete(state->sortcontext); + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. 
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage.  Be careful not to attempt to use or free such
+ * pointers afterwards!
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+	tuplesort_free(state);
+
+	/*
+	 * Free the main memory context, including the Tuplesortstate struct
+	 * itself.
+	 */
+	MemoryContextDelete(state->maincontext);
+}
+
+/*
+ * tuplesort_updatemax
+ *
+ *		Update maximum resource usage statistics.
+ */
+static void
+tuplesort_updatemax(Tuplesortstate *state)
+{
+	int64		spaceUsed;
+	bool		isSpaceDisk;
+
+	/*
+	 * Note: it might seem we should provide both memory and disk usage for a
+	 * disk-based sort.  However, the current code doesn't track memory space
+	 * accurately once we have begun to return tuples to the caller (since we
+	 * don't account for pfree's the caller is expected to do), so we cannot
+	 * rely on availMem in a disk sort.  This does not seem worth the overhead
+	 * to fix.  Is it worth creating an API for the memory context code to
+	 * tell us how much is actually used in sortcontext?
+	 */
+	if (state->tapeset)
+	{
+		isSpaceDisk = true;
+		spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
+	}
+	else
+	{
+		isSpaceDisk = false;
+		spaceUsed = state->allowedMem - state->availMem;
+	}
+
+	/*
+	 * Sort evicts data to disk when it fails to fit the data into main
+	 * memory.  This is why we assume space used on disk to be more important
+	 * for tracking resource usage than space used in memory.  Note that the
+	 * amount of space occupied by some tuple set on disk might be less than
+	 * the amount of space occupied by the same tuple set in memory due to a
+	 * more compact representation.
+	 */
+	if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
+		(isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
+	{
+		state->maxSpace = spaceUsed;
+		state->isMaxSpaceDisk = isSpaceDisk;
+		state->maxSpaceStatus = state->status;
+	}
+}
+
+/*
+ * tuplesort_reset
+ *
+ *		Reset the tuplesort.  Reset all the data in the tuplesort, but leave
+ *		the meta-information in.  After tuplesort_reset, the tuplesort is
+ *		ready to start a new sort.  This allows avoiding recreation of tuple
+ *		sort states (and saves resources) when sorting multiple small batches.
+ */
+void
+tuplesort_reset(Tuplesortstate *state)
+{
+	tuplesort_updatemax(state);
+	tuplesort_free(state);
+
+	/*
+	 * After we've freed up per-batch memory, re-setup all of the state common
+	 * to both the first batch and any subsequent batch.
+	 */
+	tuplesort_begin_batch(state);
+
+	state->lastReturnedTuple = NULL;
+	state->slabMemoryBegin = NULL;
+	state->slabMemoryEnd = NULL;
+	state->slabFreeHead = NULL;
+}
 
 /*
@@ -2591,8 +2764,7 @@ mergeruns(Tuplesortstate *state)
 	 * Reset tuple memory.  We've freed all the tuples that we previously
 	 * allocated.  We will use the slab allocator from now on.
 	 */
-	MemoryContextDelete(state->tuplecontext);
-	state->tuplecontext = NULL;
+	MemoryContextResetOnly(state->tuplecontext);
 
 	/*
 	 * We no longer need a large memtuples array.  (We will allocate a smaller
@@ -2642,7 +2814,8 @@ mergeruns(Tuplesortstate *state)
 	 * from each input tape.
 	 */
 	state->memtupsize = numInputTapes;
-	state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+	state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
+														numInputTapes * sizeof(SortTuple));
 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 
 	/*
@@ -3138,18 +3311,15 @@ tuplesort_get_stats(Tuplesortstate *state,
 	 * to fix.
Is it worth creating an API for the memory context code to * tell us how much is actually used in sortcontext? */ - if (state->tapeset) - { - stats->spaceType = SORT_SPACE_TYPE_DISK; - stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); - } - else - { - stats->spaceType = SORT_SPACE_TYPE_MEMORY; - stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; - } + tuplesort_updatemax(state); - switch (state->status) + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) { case TSS_SORTEDINMEM: if (state->boundUsed) diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h index 2e9920111f..4af6e0013d 100644 --- a/src/include/executor/execdebug.h +++ b/src/include/executor/execdebug.h @@ -86,10 +86,12 @@ #define SO_nodeDisplay(l) nodeDisplay(l) #define SO_printf(s) printf(s) #define SO1_printf(s, p) printf(s, p) +#define SO2_printf(s, p1, p2) printf(s, p1, p2) #else #define SO_nodeDisplay(l) #define SO_printf(s) #define SO1_printf(s, p) +#define SO2_printf(s, p1, p2) #endif /* EXEC_SORTDEBUG */ /* ---------------- diff --git a/src/include/executor/nodeIncrementalSort.h b/src/include/executor/nodeIncrementalSort.h new file mode 100644 index 0000000000..e62c02a4f3 --- /dev/null +++ b/src/include/executor/nodeIncrementalSort.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.h + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeIncrementalSort.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODEINCREMENTALSORT_H +#define NODEINCREMENTALSORT_H + +#include "access/parallel.h" +#include "nodes/execnodes.h" + +extern IncrementalSortState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags); +extern void ExecEndIncrementalSort(IncrementalSortState *node); +extern void ExecReScanIncrementalSort(IncrementalSortState *node); + +/* parallel instrumentation support */ +extern void ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt); +extern void ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt); +extern void ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pcxt); +extern void ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node); + +#endif /* NODEINCREMENTALSORT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0fb5d61a3f..fb490b404c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1982,6 +1982,21 @@ typedef struct MaterialState Tuplestorestate *tuplestorestate; } MaterialState; + +/* ---------------- + * When performing sorting by multiple keys, it's possible that the input + * dataset is already sorted on a prefix of those keys. We call these + * "presorted keys". + * PresortedKeyData represents information about one such key. 
+ * ---------------- + */ +typedef struct PresortedKeyData +{ + FmgrInfo flinfo; /* comparison function info */ + FunctionCallInfo fcinfo; /* comparison function call info */ + OffsetNumber attno; /* attribute number in tuple */ +} PresortedKeyData; + /* ---------------- * Shared memory container for per-worker sort information * ---------------- @@ -2010,6 +2025,71 @@ typedef struct SortState SharedSortInfo *shared_info; /* one entry per worker */ } SortState; +/* ---------------- + * Instrumentation information for IncrementalSort + * ---------------- + */ +typedef struct IncrementalSortGroupInfo +{ + int64 groupCount; + long maxDiskSpaceUsed; + long totalDiskSpaceUsed; + long maxMemorySpaceUsed; + long totalMemorySpaceUsed; + bits32 sortMethods; /* bitmask of TuplesortMethod */ +} IncrementalSortGroupInfo; + +typedef struct IncrementalSortInfo +{ + IncrementalSortGroupInfo fullsortGroupInfo; + IncrementalSortGroupInfo prefixsortGroupInfo; +} IncrementalSortInfo; + +/* ---------------- + * Shared memory container for per-worker incremental sort information + * ---------------- + */ +typedef struct SharedIncrementalSortInfo +{ + int num_workers; + IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER]; +} SharedIncrementalSortInfo; + +/* ---------------- + * IncrementalSortState information + * ---------------- + */ +typedef enum +{ + INCSORT_LOADFULLSORT, + INCSORT_LOADPREFIXSORT, + INCSORT_READFULLSORT, + INCSORT_READPREFIXSORT, +} IncrementalSortExecutionStatus; + +typedef struct IncrementalSortState +{ + ScanState ss; /* its first field is NodeTag */ + bool bounded; /* is the result set bounded? */ + int64 bound; /* if bounded, how many tuples are needed */ + bool outerNodeDone; /* finished fetching tuples from outer node */ + int64 bound_Done; /* value of bound we did the sort with */ + IncrementalSortExecutionStatus execution_status; + int64 n_fullsort_remaining; + Tuplesortstate *fullsort_state; /* private state of tuplesort.c */ + Tuplesortstate *prefixsort_state; /* private state of tuplesort.c */ + /* the keys by which the input path is already sorted */ + PresortedKeyData *presorted_keys; + + IncrementalSortInfo incsort_info; + + /* slot for pivot tuple defining values of presorted keys within group */ + TupleTableSlot *group_pivot; + TupleTableSlot *transfer_tuple; + bool am_worker; /* are we a worker? 
*/ + SharedIncrementalSortInfo *shared_info; /* one entry per worker */ +} IncrementalSortState; + /* --------------------- * GroupState information * --------------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 8a76afe8cc..50b1ba5186 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -74,6 +74,7 @@ typedef enum NodeTag T_HashJoin, T_Material, T_Sort, + T_IncrementalSort, T_Group, T_Agg, T_WindowAgg, @@ -130,6 +131,7 @@ typedef enum NodeTag T_HashJoinState, T_MaterialState, T_SortState, + T_IncrementalSortState, T_GroupState, T_AggState, T_WindowAggState, @@ -245,6 +247,7 @@ typedef enum NodeTag T_ProjectionPath, T_ProjectSetPath, T_SortPath, + T_IncrementalSortPath, T_GroupPath, T_UpperUniquePath, T_AggPath, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 469c686e3f..a6d206b25a 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1638,6 +1638,15 @@ typedef struct SortPath Path *subpath; /* path representing input source */ } SortPath; +/* + * IncrementalSortPath + */ +typedef struct IncrementalSortPath +{ + SortPath spath; + int nPresortedCols; /* number of presorted columns */ +} IncrementalSortPath; + /* * GroupPath represents grouping (of presorted input) * diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4869fe7b6d..be8ef54a1e 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -774,6 +774,16 @@ typedef struct Sort bool *nullsFirst; /* NULLS FIRST/LAST directions */ } Sort; +/* ---------------- + * incremental sort node + * ---------------- + */ +typedef struct IncrementalSort +{ + Sort sort; + int nPresortedCols; /* number of presorted columns */ +} IncrementalSort; + /* --------------- * group node - * Used for queries with GROUP BY (but no aggregates) specified. 
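
The IncrementalSort plan node above relies on the same struct-embedding convention used throughout the planner: because Sort is the first member of IncrementalSort, a pointer to the derived node can be handled by any code that expects a Sort (this is how CopySortFields, _outSortInfo, and ReadCommonSort are shared). A minimal standalone illustration, with trimmed hypothetical types rather than the real node structs:

#include <stdio.h>

/* Trimmed stand-ins for the node structs above. */
typedef struct Sort
{
	int			numCols;		/* number of sort-key columns */
} Sort;

typedef struct IncrementalSort
{
	Sort		sort;			/* embedded "superclass"; must be first */
	int			nPresortedCols; /* number of presorted columns */
} IncrementalSort;

/* Works for both node types, analogous to the shared copy/read routines. */
static int
sort_num_cols(const Sort *node)
{
	return node->numCols;
}

int
main(void)
{
	IncrementalSort incsort = {{2}, 1};

	/* &incsort.sort and (Sort *) &incsort point at the same prefix */
	printf("numCols=%d presorted=%d\n",
		   sort_num_cols(&incsort.sort), incsort.nPresortedCols);
	return 0;
}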
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 735ba09650..9710e5c0a4 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -53,6 +53,7 @@ extern PGDLLIMPORT bool enable_indexonlyscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
+extern PGDLLIMPORT bool enable_incrementalsort;
 extern PGDLLIMPORT bool enable_hashagg;
 extern PGDLLIMPORT bool enable_hashagg_disk;
 extern PGDLLIMPORT bool enable_groupingsets_hash_disk;
@@ -103,6 +104,11 @@ extern void cost_sort(Path *path, PlannerInfo *root,
 					  List *pathkeys, Cost input_cost, double tuples, int width,
 					  Cost comparison_cost, int sort_mem,
 					  double limit_tuples);
+extern void cost_incremental_sort(Path *path,
+								  PlannerInfo *root, List *pathkeys, int presorted_keys,
+								  Cost input_startup_cost, Cost input_total_cost,
+								  double input_tuples, int width, Cost comparison_cost, int sort_mem,
+								  double limit_tuples);
 extern void cost_append(AppendPath *path);
 extern void cost_merge_append(Path *path, PlannerInfo *root,
 							  List *pathkeys, int n_streams,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index e450fe112a..bcd08af753 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -184,6 +184,12 @@ extern ProjectSetPath *create_set_projection_path(PlannerInfo *root,
 												  RelOptInfo *rel,
 												  Path *subpath,
 												  PathTarget *target);
+extern SortPath *create_incremental_sort_path(PlannerInfo *root,
+											  RelOptInfo *rel,
+											  Path *subpath,
+											  List *pathkeys,
+											  int presorted_keys,
+											  double limit_tuples);
 extern SortPath *create_sort_path(PlannerInfo *root,
 								  RelOptInfo *rel,
 								  Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index c689fe8e26..ab61f306cb 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -185,6 +185,7 @@ typedef enum
 extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
 extern bool pathkeys_contained_in(List *keys1, List *keys2);
+extern bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common);
 extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
 											Relids required_outer,
 											CostSelector cost_criterion,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index a2fdd3fcd3..8d00a9e501 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -61,14 +61,17 @@ typedef struct SortCoordinateData *SortCoordinate;
  * Data structures for reporting sort statistics.  Note that
  * TuplesortInstrumentation can't contain any pointers because we
  * sometimes put it in shared memory.
+ *
+ * TuplesortMethod is used in a bitmask in Incremental Sort's shared memory
+ * instrumentation so needs to have each value be a separate bit.
*/ typedef enum { - SORT_TYPE_STILL_IN_PROGRESS = 0, - SORT_TYPE_TOP_N_HEAPSORT, - SORT_TYPE_QUICKSORT, - SORT_TYPE_EXTERNAL_SORT, - SORT_TYPE_EXTERNAL_MERGE + SORT_TYPE_STILL_IN_PROGRESS = 1 << 0, + SORT_TYPE_TOP_N_HEAPSORT = 1 << 1, + SORT_TYPE_QUICKSORT = 1 << 2, + SORT_TYPE_EXTERNAL_SORT = 1 << 3, + SORT_TYPE_EXTERNAL_MERGE = 1 << 4 } TuplesortMethod; typedef enum @@ -215,6 +218,7 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, bool randomAccess); extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); +extern bool tuplesort_used_bound(Tuplesortstate *state); extern void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot); @@ -239,6 +243,8 @@ extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, extern void tuplesort_end(Tuplesortstate *state); +extern void tuplesort_reset(Tuplesortstate *state); + extern void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats); extern const char *tuplesort_method_name(TuplesortMethod m); diff --git a/src/test/isolation/expected/drop-index-concurrently-1.out b/src/test/isolation/expected/drop-index-concurrently-1.out index 75dff56bc4..8e6adb66bb 100644 --- a/src/test/isolation/expected/drop-index-concurrently-1.out +++ b/src/test/isolation/expected/drop-index-concurrently-1.out @@ -21,7 +21,7 @@ QUERY PLAN Sort Sort Key: id, data - -> Seq Scan on test_dc + -> Index Scan using test_dc_pkey on test_dc Filter: ((data)::text = '34'::text) step select2: SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; id data diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out new file mode 100644 index 0000000000..f130c606c8 --- /dev/null +++ b/src/test/regress/expected/incremental_sort.out @@ -0,0 +1,1441 @@ +-- When we have to sort the entire table, incremental sort will +-- be slower than plain sort, so it should not be used. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + QUERY PLAN +----------------------------------- + Sort + Sort Key: tenk1.four, tenk1.ten + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(5 rows) + +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + QUERY PLAN +----------------------------------------- + Limit + -> Incremental Sort + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(7 rows) + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. 
+set work_mem to '2MB';
+explain (costs off)
+select * from (select * from tenk1 order by four) t order by four, ten;
+            QUERY PLAN
+-----------------------------------
+ Incremental Sort
+   Sort Key: tenk1.four, tenk1.ten
+   Presorted Key: tenk1.four
+   ->  Sort
+         Sort Key: tenk1.four
+         ->  Seq Scan on tenk1
+(6 rows)
+
+reset work_mem;
+create table t(a integer, b integer);
+create or replace function explain_analyze_without_memory(query text)
+returns table (out_line text) language plpgsql
+as
+$$
+declare
+  line text;
+begin
+  for line in
+    execute 'explain (analyze, costs off, summary off, timing off) ' || query
+  loop
+    out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g');
+    return next;
+  end loop;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+  elements jsonb;
+  element jsonb;
+  matching_nodes jsonb := '[]'::jsonb;
+begin
+  execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements;
+  while jsonb_array_length(elements) > 0 loop
+    element := elements->0;
+    elements := elements - 0;
+    case jsonb_typeof(element)
+    when 'array' then
+      if jsonb_array_length(element) > 0 then
+        elements := elements || element;
+      end if;
+    when 'object' then
+      if element ? 'Plan' then
+        elements := elements || jsonb_build_array(element->'Plan');
+        element := element - 'Plan';
+      else
+        if element ? 'Plans' then
+          elements := elements || jsonb_build_array(element->'Plans');
+          element := element - 'Plans';
+        end if;
+        if (element->>'Node Type')::text = 'Incremental Sort' then
+          matching_nodes := matching_nodes || element;
+        end if;
+      end if;
+    end case;
+  end loop;
+  return matching_nodes;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+  nodes jsonb := '[]'::jsonb;
+  node jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
+        node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false);
+      end loop;
+    end loop;
+    nodes := nodes || node;
+  end loop;
+  return nodes;
+end;
+$$;
+create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
+returns bool language plpgsql
+as
+$$
+declare
+  node jsonb;
+  group_stats jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      group_stats := node->group_key;
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
+          raise exception '% has invalid max space < average space', group_key;
+        end if;
+      end loop;
+    end loop;
+  end loop;
+  return true;
+end;
+$$;
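The helper functions above only normalize EXPLAIN output (memory figures vary across runs); the behavior the following tests exercise is the executor's group-at-a-time sorting. Conceptually: input arrives already ordered on the presorted key, the node buffers one run of equal prefix values, sorts just that run on the remaining keys, and emits it before reading more input. A standalone C sketch of that idea; it is illustrative only, since the real executor node also batches very small groups together and reuses its sort state (see tuplesort_reset above):

#include <stdio.h>
#include <stdlib.h>

typedef struct Row { int a; int b; } Row;

/* Comparator for the non-presorted key "b". */
static int
cmp_b(const void *x, const void *y)
{
	return ((const Row *) x)->b - ((const Row *) y)->b;
}

int
main(void)
{
	/* Input is already sorted on the presorted key "a", but not on "b". */
	Row in[] = {{1, 3}, {1, 1}, {1, 2}, {2, 9}, {2, 7}, {3, 5}};
	int n = (int) (sizeof in / sizeof in[0]);
	int start = 0;

	for (int i = 1; i <= n; i++)
	{
		/* A change in the prefix key closes the current group. */
		if (i == n || in[i].a != in[start].a)
		{
			/* Sort only this group on the remaining key "b"... */
			qsort(in + start, (size_t) (i - start), sizeof(Row), cmp_b);
			/* ...and emit it before reading any further input. */
			for (int j = start; j < i; j++)
				printf("%d | %d\n", in[j].a, in[j].b);
			start = i;
		}
	}
	return 0;
}

+-- A single large group tested around each mode transition point.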
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 
| 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 + 1 | 66 +(66 rows) + +delete from t; +-- An initial large group followed by a small group. +insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 55; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 2 | 50 + 2 | 51 + 2 | 52 + 2 | 53 + 2 | 54 + 2 | 55 +(55 rows) + +-- Test EXPLAIN ANALYZE with only a fullsort group. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_without_memory +------------------------------------------------------------------------------------------------ + Limit (actual rows=55 loops=1) + -> Incremental Sort (actual rows=55 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 2 Sort Methods: top-N heapsort, quicksort Memory: avg=NNkB peak=NNkB + -> Sort (actual rows=100 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=100 loops=1) +(9 rows) + +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); + jsonb_pretty +-------------------------------------------------- + [ + + { + + "Sort Key": [ + + "t.a", + + "t.b" + + ], + + "Node Type": "Incremental Sort", + + "Actual Rows": 55, + + "Actual Loops": 1, + + "Presorted Key": [ + + "t.a" + + ], + + "Parallel Aware": false, + + "Full-sort Groups": { + + "Group Count": 2, + + "Sort Methods Used": [ + + "top-N heapsort", + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Parent Relationship": "Outer" + + } + + ] +(1 row) + +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_inc_sort_nodes_verify_invariants +-------------------------------------------------- + t +(1 row) + +delete from t; +-- An initial small group followed by a large group. 
+insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 70; + a | b +---+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 9 | 5 + 9 | 6 + 9 | 7 + 9 | 8 + 9 | 9 + 9 | 10 + 9 | 11 + 9 | 12 + 9 | 13 + 9 | 14 + 9 | 15 + 9 | 16 + 9 | 17 + 9 | 18 + 9 | 19 + 9 | 20 + 9 | 21 + 9 | 22 + 9 | 23 + 9 | 24 + 9 | 25 + 9 | 26 + 9 | 27 + 9 | 28 + 9 | 29 + 9 | 30 + 9 | 31 + 9 | 32 + 9 | 33 + 9 | 34 + 9 | 35 + 9 | 36 + 9 | 37 + 9 | 38 + 9 | 39 + 9 | 40 + 9 | 41 + 9 | 42 + 9 | 43 + 9 | 44 + 9 | 45 + 9 | 46 + 9 | 47 + 9 | 48 + 9 | 49 + 9 | 50 + 9 | 51 + 9 | 52 + 9 | 53 + 9 | 54 + 9 | 55 + 9 | 56 + 9 | 57 + 9 | 58 + 9 | 59 + 9 | 60 + 9 | 61 + 9 | 62 + 9 | 63 + 9 | 64 + 9 | 65 + 9 | 66 + 9 | 67 + 9 | 68 + 9 | 69 + 9 | 70 +(70 rows) + +-- Test rescan. +begin; +-- We force the planner to choose a plan with incremental sort on the right side +-- of a nested loop join node. That way we trigger the rescan code path. +set local enable_hashjoin = off; +set local enable_mergejoin = off; +set local enable_material = off; +set local enable_sort = off; +explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); + QUERY PLAN +------------------------------------------------ + Nested Loop Left Join + Join Filter: (t_1.a = t.a) + -> Seq Scan on t + Filter: (a = ANY ('{1,2}'::integer[])) + -> Incremental Sort + Sort Key: t_1.a, t_1.b + Presorted Key: t_1.a + -> Sort + Sort Key: t_1.a + -> Seq Scan on t t_1 +(10 rows) + +select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); + a | b | a | b +---+---+---+--- + 1 | 1 | 1 | 1 + 2 | 2 | 2 | 2 +(2 rows) + +rollback; +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. 
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_without_memory +----------------------------------------------------------------------------------------------------------------------------------------------------- + Limit (actual rows=70 loops=1) + -> Incremental Sort (actual rows=70 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 1 Sort Method: quicksort Memory: avg=NNkB peak=NNkB Presorted Groups: 5 Sort Method: quicksort Memory: avg=NNkB peak=NNkB + -> Sort (actual rows=100 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=100 loops=1) +(9 rows) + +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); + jsonb_pretty +-------------------------------------------------- + [ + + { + + "Sort Key": [ + + "t.a", + + "t.b" + + ], + + "Node Type": "Incremental Sort", + + "Actual Rows": 70, + + "Actual Loops": 1, + + "Presorted Key": [ + + "t.a" + + ], + + "Parallel Aware": false, + + "Full-sort Groups": { + + "Group Count": 1, + + "Sort Methods Used": [ + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Presorted Groups": { + + "Group Count": 5, + + "Sort Methods Used": [ + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Parent Relationship": "Outer" + + } + + ] +(1 row) + +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_inc_sort_nodes_verify_invariants +-------------------------------------------------- + t +(1 row) + +delete from t; +-- Small groups of 10 tuples each tested around each mode transition point. 
+insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 
+ 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 + 6 | 66 +(66 rows) + +delete from t; +-- Small groups of only 1 tuple each tested around each mode transition point. +insert into t(a, b) select i, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 +(65 rows) + +explain (costs off) 
select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 + 66 | 66 +(66 rows) + +delete from t; +drop table t; +-- Incremental sort vs. parallel queries +set min_parallel_table_scan_size = '1kB'; +set min_parallel_index_scan_size = '1kB'; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +create table t (a int, b int, c int); +insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i); +create index on t (a); +analyze t; +set enable_incrementalsort = off; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + QUERY PLAN +------------------------------------------------------ + Limit + -> Sort + Sort Key: a, b, (sum(c)) + -> Finalize HashAggregate + Group Key: a, b + -> Gather + Workers Planned: 2 + -> Partial HashAggregate + Group Key: a, b + -> Parallel Seq Scan on t +(10 rows) + +set enable_incrementalsort = on; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + QUERY PLAN +------------------------------------------------------ + Limit + -> Sort + Sort Key: a, b, (sum(c)) + -> Finalize HashAggregate + Group Key: a, b + -> Gather + Workers Planned: 2 + -> Partial HashAggregate + Group Key: a, b + -> Parallel Seq Scan on t +(10 rows) + +drop table t; diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out index fb4b342261..c36970575f 100644 --- a/src/test/regress/expected/partition_aggregate.out +++ b/src/test/regress/expected/partition_aggregate.out @@ -11,6 +11,8 @@ SET enable_partitionwise_aggregate TO true; SET enable_partitionwise_join TO true; -- Disable parallel plans. SET max_parallel_workers_per_gather TO 0; +-- Disable incremental sort, which can influence selected plans due to fuzz factor. +SET enable_incrementalsort TO off; -- -- Tests for list partitioned tables. -- diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 715842b87a..a126f0ad61 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashagg | on enable_hashagg_disk | on enable_hashjoin | on + enable_incrementalsort | on enable_indexonlyscan | on enable_indexscan | on enable_material | on @@ -91,7 +92,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(19 rows) +(20 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index a98dba7b2f..a741e89616 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 incremental_sort # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 3f66e0b859..1a6821ca46 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -89,6 +89,7 @@ test: select_distinct_on test: select_implicit test: select_having test: subselect +test: incremental_sort test: union test: case test: join diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql new file mode 100644 index 0000000000..3b359efa29 --- /dev/null +++ b/src/test/regress/sql/incremental_sort.sql @@ -0,0 +1,213 @@ +-- When we have to sort the entire table, incremental sort will +-- be slower than plain sort, so it should not be used. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. +set work_mem to '2MB'; +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; +reset work_mem; + +create table t(a integer, b integer); + +create or replace function explain_analyze_without_memory(query text) +returns table (out_line text) language plpgsql +as +$$ +declare + line text; +begin + for line in + execute 'explain (analyze, costs off, summary off, timing off) ' || query + loop + out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g'); + return next; + end loop; +end; +$$; + +create or replace function explain_analyze_inc_sort_nodes(query text) +returns jsonb language plpgsql +as +$$ +declare + elements jsonb; + element jsonb; + matching_nodes jsonb := '[]'::jsonb; +begin + execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements; + while jsonb_array_length(elements) > 0 loop + element := elements->0; + elements := elements - 0; + case jsonb_typeof(element) + when 'array' then + if jsonb_array_length(element) > 0 then + elements := elements || element; + end if; + when 'object' then + if element ? 'Plan' then + elements := elements || jsonb_build_array(element->'Plan'); + element := element - 'Plan'; + else + if element ? 
'Plans' then
+          elements := elements || jsonb_build_array(element->'Plans');
+          element := element - 'Plans';
+        end if;
+        if (element->>'Node Type')::text = 'Incremental Sort' then
+          matching_nodes := matching_nodes || element;
+        end if;
+      end if;
+    end case;
+  end loop;
+  return matching_nodes;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
+returns jsonb language plpgsql
+as
+$$
+declare
+  nodes jsonb := '[]'::jsonb;
+  node jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
+        node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false);
+      end loop;
+    end loop;
+    nodes := nodes || node;
+  end loop;
+  return nodes;
+end;
+$$;
+
+create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
+returns bool language plpgsql
+as
+$$
+declare
+  node jsonb;
+  group_stats jsonb;
+  group_key text;
+  space_key text;
+begin
+  for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
+    for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop
+      group_stats := node->group_key;
+      for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
+        if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then
+          raise exception '% has invalid max space < average space', group_key;
+        end if;
+      end loop;
+    end loop;
+  end loop;
+  return true;
+end;
+$$;
+
+-- A single large group tested around each mode transition point.
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
+select * from (select * from t order by a) s order by a, b limit 31;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
+select * from (select * from t order by a) s order by a, b limit 32;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
+select * from (select * from t order by a) s order by a, b limit 33;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
+select * from (select * from t order by a) s order by a, b limit 65;
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
+select * from (select * from t order by a) s order by a, b limit 66;
+delete from t;
+
+-- An initial large group followed by a small group.
+insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i);
+explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
+select * from (select * from t order by a) s order by a, b limit 55;
+-- Test EXPLAIN ANALYZE with only a fullsort group.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); +delete from t; + +-- An initial small group followed by a large group. +insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; +select * from (select * from t order by a) s order by a, b limit 70; +-- Test rescan. +begin; +-- We force the planner to choose a plan with incremental sort on the right side +-- of a nested loop join node. That way we trigger the rescan code path. +set local enable_hashjoin = off; +set local enable_mergejoin = off; +set local enable_material = off; +set local enable_sort = off; +explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); +select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); +rollback; +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); +delete from t; + +-- Small groups of 10 tuples each tested around each mode transition point. +insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +-- Small groups of only 1 tuple each tested around each mode transition point. 
+insert into t(a, b) select i, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +drop table t; + +-- Incremental sort vs. parallel queries +set min_parallel_table_scan_size = '1kB'; +set min_parallel_index_scan_size = '1kB'; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; + +create table t (a int, b int, c int); +insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i); +create index on t (a); +analyze t; + +set enable_incrementalsort = off; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + +set enable_incrementalsort = on; +explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1; + +drop table t; diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql index ba4fed4d43..178f2079fa 100644 --- a/src/test/regress/sql/partition_aggregate.sql +++ b/src/test/regress/sql/partition_aggregate.sql @@ -12,6 +12,8 @@ SET enable_partitionwise_aggregate TO true; SET enable_partitionwise_join TO true; -- Disable parallel plans. SET max_parallel_workers_per_gather TO 0; +-- Disable incremental sort, which can influence selected plans due to fuzz factor. +SET enable_incrementalsort TO off; -- -- Tests for list partitioned tables.
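A final note on the TuplesortMethod change earlier in the patch: making each enum value a distinct bit lets the many per-group sorts inside one Incremental Sort node OR their methods into a single integer, which is how EXPLAIN can later print lines such as "Sort Methods: top-N heapsort, quicksort" in the expected output above. A standalone C sketch of that accumulation; the enum values mirror the patch, while main() and the reporting loop are illustrative:

#include <stdio.h>

/* Distinct bits, as in the patched TuplesortMethod enum. */
typedef enum
{
	SORT_TYPE_TOP_N_HEAPSORT = 1 << 1,
	SORT_TYPE_QUICKSORT = 1 << 2,
	SORT_TYPE_EXTERNAL_SORT = 1 << 3
} SortMethod;

int
main(void)
{
	unsigned int used = 0;

	/* Each per-group sort ORs its method into the shared bitmask... */
	used |= SORT_TYPE_TOP_N_HEAPSORT;
	used |= SORT_TYPE_QUICKSORT;
	used |= SORT_TYPE_QUICKSORT;	/* duplicates are absorbed */

	/* ...so the node can report every method that occurred at least once. */
	if (used & SORT_TYPE_TOP_N_HEAPSORT)
		puts("top-N heapsort");
	if (used & SORT_TYPE_QUICKSORT)
		puts("quicksort");
	if (used & SORT_TYPE_EXTERNAL_SORT)
		puts("external sort");
	return 0;
}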