diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index fdeda90f03..2f04702b6e 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1472,6 +1472,45 @@ include_dir 'conf.d' + + replacement_sort_tuples (integer) + + replacement_sort_tuples configuration parameter + + + + + When the number of tuples to be sorted is smaller than this number, + a sort will produce its first output run using replacement selection + rather than quicksort. This may be useful in memory-constrained + environments where tuples that are input into larger sort operations + have a strong physical-to-logical correlation. Note that this does + not include input tuples with an inverse + correlation. It is possible for the replacement selection algorithm + to generate one long run that requires no merging, where use of the + default strategy would result in many runs that must be merged + to produce a final sorted output. This may allow sort + operations to complete sooner. + + + The default is 150,000 tuples. Note that higher values are typically + not much more effective, and may be counter-productive, since the + priority queue is sensitive to the size of available CPU cache, whereas + the default strategy sorts runs using a cache + oblivious algorithm. This property allows the default sort + strategy to automatically and transparently make effective use + of available CPU cache. + + + Setting maintenance_work_mem to its default + value usually prevents utility command external sorts (e.g., + sorts used by CREATE INDEX to build B-Tree + indexes) from ever using replacement selection sort, unless the + input tuples are quite wide. + + + + autovacuum_work_mem (integer) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 8e4caeba0a..4917922ab5 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1432,8 +1432,8 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) * total, but we will also need to write and read each tuple once per * merge pass. We expect about ceil(logM(r)) merge passes where r is the * number of initial runs formed and M is the merge order used by tuplesort.c. - * Since the average initial run should be about twice sort_mem, we have - * disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem))) + * Since the average initial run should be about sort_mem, we have + * disk traffic = 2 * relsize * ceil(logM(p / sort_mem)) * cpu = comparison_cost * t * log2(t) * * If the sort is bounded (i.e., only the first k result tuples are needed) @@ -1509,7 +1509,7 @@ cost_sort(Path *path, PlannerInfo *root, * We'll have to use a disk-based sort of all the tuples */ double npages = ceil(input_bytes / BLCKSZ); - double nruns = (input_bytes / sort_mem_bytes) * 0.5; + double nruns = input_bytes / sort_mem_bytes; double mergeorder = tuplesort_merge_order(sort_mem_bytes); double log_runs; double npageaccesses; diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 597dab4301..5a7783b981 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -109,6 +109,7 @@ bool enableFsync = true; bool allowSystemTableMods = false; int work_mem = 1024; int maintenance_work_mem = 16384; +int replacement_sort_tuples = 150000; /* * Primary determinants of sizes of shared-memory structures. 
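The revised cost_sort() comment above captures the planner-side consequence of quicksorting runs: an initial run is now expected to be about sort_mem in size rather than roughly twice that, so the estimated run count doubles. A minimal standalone sketch of the disk-traffic arithmetic, using made-up figures and a hypothetical helper name (the real code also charges CPU comparison costs and derives the merge order from tuplesort_merge_order()):

    #include <math.h>
    #include <stdio.h>

    /*
     * Illustrative only: page accesses estimated per the revised comment,
     * disk traffic = 2 * relsize * ceil(logM(p / sort_mem)), with runs
     * assumed to be about sort_mem in size (previously 2 * sort_mem).
     */
    static double
    estimated_sort_page_accesses(double input_bytes, double sort_mem_bytes,
                                 double merge_order, double block_size)
    {
        double  npages = ceil(input_bytes / block_size);
        double  nruns = input_bytes / sort_mem_bytes;   /* was * 0.5 */
        double  log_runs;

        if (nruns > merge_order)
            log_runs = ceil(log(nruns) / log(merge_order));
        else
            log_runs = 1.0;

        /* every merge pass reads and writes each page once */
        return 2.0 * npages * log_runs;
    }

    int
    main(void)
    {
        /* e.g. 1GB of input, 64MB of sort_mem, merge order 7, 8kB blocks */
        printf("estimated page accesses: %.0f\n",
               estimated_sort_page_accesses(1024.0 * 1024 * 1024,
                                            64.0 * 1024 * 1024, 7.0, 8192.0));
        return 0;
    }

Doubling the assumed run count only changes the estimate when ceil(logM(nruns)) crosses an integer boundary, so the visible effect on plan costs is usually modest.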
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a07050017e..f7ed167d7f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1928,6 +1928,16 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"replacement_sort_tuples", PGC_USERSET, RESOURCES_MEM, + gettext_noop("Sets the maximum number of tuples to be sorted using replacement selection."), + gettext_noop("When more tuples than this are present, quicksort will be used.") + }, + &replacement_sort_tuples, + 150000, 0, INT_MAX, + NULL, NULL, NULL + }, + /* * We use the hopefully-safely-small value of 100kB as the compiled-in * default for max_stack_depth. InitializeGUCOptions will increase it if diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 8da3ff14c6..bcc86e29d2 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -125,6 +125,7 @@ # actively intend to use prepared transactions. #work_mem = 4MB # min 64kB #maintenance_work_mem = 64MB # min 1MB +#replacement_sort_tuples = 150000 # limits use of replacement selection sort #autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem #max_stack_depth = 2MB # min 100kB #dynamic_shared_memory_type = posix # the default is the first option diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d033c95e78..78c5cfc8af 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -11,14 +11,16 @@ * algorithm. * * See Knuth, volume 3, for more than you want to know about the external - * sorting algorithm. We divide the input into sorted runs using replacement - * selection, in the form of a priority tree implemented as a heap - * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase - * merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D - * are implemented by logtape.c, which avoids space wastage by recycling - * disk space as soon as each block is read from its "tape". + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H -- although that strategy is + * often avoided altogether), but that can now only happen for the first + * run. We merge the runs using polyphase merge, Knuth's Algorithm + * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by + * logtape.c, which avoids space wastage by recycling disk space as soon + * as each block is read from its "tape". + - * We do not form the initial runs using Knuth's recommended replacement + * We never form the initial runs using Knuth's recommended replacement * selection data structure (Algorithm 5.4.1R), because it uses a fixed * number of records in memory at all times. Since we are dealing with * tuples that may vary considerably in size, we want to be able to vary @@ -28,7 +30,30 @@ * Algorithm 5.4.1R, each record is stored with the run number that it * must go into, and we use (run number, key) as the ordering key for the * heap. When the run number at the top of the heap changes, we know that - * no more records of the prior run are left in the heap. + * no more records of the prior run are left in the heap. Note that there + * are in practice only ever two distinct run numbers, due to the greatly + * reduced use of replacement selection in PostgreSQL 9.6.
+ * + * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small + * customizations) is only used with the aim of producing just one run, + * thereby avoiding all merging. Only the first run can use replacement + * selection, which is why there are now only two possible valid run + * numbers, and why heapification is customized to not distinguish between + * tuples in the second run (those will be quicksorted). We generally + * prefer a simple hybrid sort-merge strategy, where runs are sorted in much + * the same way as the entire input of an internal sort is sorted (using + * qsort()). The replacement_sort_tuples GUC controls the limited remaining + * use of replacement selection for the first run. + * + * There are several reasons to favor a hybrid sort-merge strategy. + * Maintaining a priority tree/heap has poor CPU cache characteristics. + * Furthermore, the growth in main memory sizes has greatly diminished the + * value of having runs that are larger than available memory, even in the + * case where there is partially sorted input and runs can be made far + * larger by using a heap. In most cases, a single-pass merge step is all + * that is required even when runs are no larger than available memory. + * Avoiding multiple merge passes was traditionally considered to be the + * major advantage of using replacement selection. * * The approximate amount of memory allowed for any one sort operation * is specified in kilobytes by the caller (most pass work_mem). Initially, @@ -36,13 +61,12 @@ * we haven't exceeded workMem. If we reach the end of the input without * exceeding workMem, we sort the array using qsort() and subsequently return * tuples just by scanning the tuple array sequentially. If we do exceed - * workMem, we construct a heap using Algorithm H and begin to emit tuples - * into sorted runs in temporary tapes, emitting just enough tuples at each - * step to get back within the workMem limit. Whenever the run number at - * the top of the heap changes, we begin a new run with a new output tape - * (selected per Algorithm D). After the end of the input is reached, - * we dump out remaining tuples in memory into a final run (or two), - * then merge the runs using Algorithm D. + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run + * (or two, when replacement selection is still used), then merge the runs + * using Algorithm D. * * When merging runs, we use a heap containing just the frontmost tuple from * each source run; we repeatedly output the smallest tuple and insert the @@ -162,15 +186,18 @@ bool optimize_bounded_sort = true; * described above. Accordingly, "tuple" is always used in preference to * datum1 as the authoritative value for pass-by-reference cases. * - * While building initial runs, tupindex holds the tuple's run number. During - * merge passes, we re-use it to hold the input tape number that each tuple in - * the heap was read from, or to hold the index of the next tuple pre-read - * from the same tape in the case of pre-read entries. tupindex goes unused - * if the sort occurs entirely in memory. + * While building initial runs, tupindex holds the tuple's run number. 
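To make the hybrid sort-merge strategy described above concrete, here is a deliberately tiny, self-contained sketch on plain ints: each memory-sized batch is quicksorted and written out as a run, and a single merge pass then interleaves the runs. It only illustrates the shape of the strategy, not tuplesort.c itself (no tapes, no polyphase merge, no memory accounting), and BATCH simply stands in for the workMem budget:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Toy sketch of the hybrid sort-merge shape: quicksort each
     * memory-sized batch into a run, then merge the runs in one pass.
     * Plain ints and in-memory "runs" stand in for tuples and tapes.
     */
    #define BATCH 4                 /* stands in for the workMem budget */

    static int
    cmp_int(const void *a, const void *b)
    {
        int x = *(const int *) a;
        int y = *(const int *) b;

        return (x > y) - (x < y);
    }

    int
    main(void)
    {
        int     input[] = {9, 3, 7, 1, 8, 2, 6, 4, 5, 0};
        int     n = 10;
        int     runs[3][BATCH];     /* at most 3 runs for this toy input */
        int     runlen[3];
        int     pos[3] = {0, 0, 0};
        int     nruns = 0;
        int     i;

        /* run building: fill a batch, qsort it, "dump" it as one run */
        for (i = 0; i < n; i += BATCH)
        {
            int len = (n - i < BATCH) ? n - i : BATCH;

            memcpy(runs[nruns], input + i, len * sizeof(int));
            qsort(runs[nruns], len, sizeof(int), cmp_int);
            runlen[nruns++] = len;
        }

        /* single merge pass: repeatedly emit the smallest front element */
        for (i = 0; i < n; i++)
        {
            int r;
            int best = -1;

            for (r = 0; r < nruns; r++)
                if (pos[r] < runlen[r] &&
                    (best < 0 || runs[r][pos[r]] < runs[best][pos[best]]))
                    best = r;
            printf("%d ", runs[best][pos[best]++]);
        }
        putchar('\n');
        return 0;
    }

With ten inputs and a four-element "memory" this produces three quicksorted runs and one merge pass, mirroring the common case described above where a single-pass merge is all that is needed.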
+ * Historically, the run number could meaningfully distinguish many runs, but + * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement + * selection is always abandoned after the first run; no other run number + * should be represented here. During merge passes, we re-use it to hold the + * input tape number that each tuple in the heap was read from, or to hold the + * index of the next tuple pre-read from the same tape in the case of pre-read + * entries. tupindex goes unused if the sort occurs entirely in memory. */ typedef struct { - void *tuple; /* the tuple proper */ + void *tuple; /* the tuple itself */ Datum datum1; /* value of first key column */ bool isnull1; /* is first key column NULL? */ int tupindex; /* see notes above */ @@ -206,6 +233,15 @@ typedef enum #define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3) #define MERGE_BUFFER_SIZE (BLCKSZ * 32) + /* + * Run numbers, used during external sort operations. + * + * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun. + */ +#define RUN_FIRST 0 +#define HEAP_RUN_NEXT INT_MAX +#define RUN_SECOND 1 + typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, Tuplesortstate *state); @@ -292,9 +328,17 @@ struct Tuplesortstate */ bool batchUsed; + /* + * While building initial runs, this indicates if the replacement + * selection strategy is in use. When it isn't, then a simple hybrid + * sort-merge strategy is in use instead (runs are quicksorted). + */ + bool replaceActive; + /* * While building initial runs, this is the current output run number - * (starting at 0). Afterwards, it is the number of initial runs we made. + * (starting at RUN_FIRST). Afterwards, it is the number of initial + * runs we made. */ int currentRun; @@ -493,6 +537,7 @@ struct Tuplesortstate static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); static bool consider_abort_common(Tuplesortstate *state); +static bool useselection(Tuplesortstate *state); static void inittapes(Tuplesortstate *state); static void selectnewtape(Tuplesortstate *state); static void mergeruns(Tuplesortstate *state); @@ -508,8 +553,10 @@ static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen); static void mergepreread(Tuplesortstate *state); static void mergeprereadone(Tuplesortstate *state, int srcTape); static void dumptuples(Tuplesortstate *state, bool alltuples); +static void dumpbatch(Tuplesortstate *state, bool alltuples); static void make_bounded_heap(Tuplesortstate *state); static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, int tupleindex, bool checkIndex); static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex); @@ -654,7 +701,7 @@ tuplesort_begin_common(int workMem, bool randomAccess) if (LACKMEM(state)) elog(ERROR, "insufficient memory allowed for sort"); - state->currentRun = 0; + state->currentRun = RUN_FIRST; /* * maxTapes, tapeRange, and Algorithm D variables will be initialized by @@ -1566,22 +1613,61 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) /* * Insert the tuple into the heap, with run number currentRun if - * it can go into the current run, else run number currentRun+1. - * The tuple can go into the current run if it is >= the first + * it can go into the current run, else HEAP_RUN_NEXT. 
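A side effect of defining HEAP_RUN_NEXT as INT_MAX is that the existing (run number, key) heap ordering keeps next-run tuples behind every RUN_FIRST tuple without ever consulting the real comparator for them, which is what the HEAPCOMPARE change later in the patch relies on. A toy illustration follows; the types and function names are invented for the example, and the real logic is the HEAPCOMPARE macro and tuplesort_heap_insert():

    #include <limits.h>
    #include <stdio.h>

    /* toy stand-ins; the real versions are SortTuple and HEAPCOMPARE */
    #define RUN_FIRST     0
    #define HEAP_RUN_NEXT INT_MAX

    typedef struct
    {
        int key;        /* stands in for the sort key */
        int tupindex;   /* run number while building runs */
    } ToyTuple;

    static int key_compares = 0;

    static int
    toy_compare_key(const ToyTuple *a, const ToyTuple *b)
    {
        key_compares++;
        return (a->key > b->key) - (a->key < b->key);
    }

    /* run number first, key second: the (run number, key) ordering */
    static int
    toy_heap_compare(const ToyTuple *a, const ToyTuple *b)
    {
        if (a->tupindex != b->tupindex || a->tupindex == HEAP_RUN_NEXT)
            return (a->tupindex > b->tupindex) - (a->tupindex < b->tupindex);
        return toy_compare_key(a, b);
    }

    int
    main(void)
    {
        ToyTuple    first = {42, RUN_FIRST};    /* in the current run */
        ToyTuple    next = {7, HEAP_RUN_NEXT};  /* deferred to next run */
        int         cmp = toy_heap_compare(&first, &next);

        /* next-run tuple sorts last and its key is never inspected */
        printf("compare = %d, key comparisons = %d\n", cmp, key_compares);
        return 0;
    }

The point of the demonstration is the comparison count: the next-run tuple is ordered purely by tupindex, so its key is never examined while it is sifted past first-run tuples.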
The tuple + * can go into the current run if it is >= the first * not-yet-output tuple. (Actually, it could go into the current * run if it is >= the most recently output tuple ... but that * would require keeping around the tuple we last output, and it's * simplest to let writetup free each tuple as soon as it's * written.) * - * Note there will always be at least one tuple in the heap at - * this point; see dumptuples. + * Note that this only applies when: + * + * - currentRun is RUN_FIRST + * + * - Replacement selection is in use (typically it is never used). + * + * When these two conditions are not both true, all tuples are + * appended indifferently, much like the TSS_INITIAL case. + * + * There should always be room to store the incoming tuple. */ - Assert(state->memtupcount > 0); - if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) + Assert(!state->replaceActive || state->memtupcount > 0); + if (state->replaceActive && + COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) + { + Assert(state->currentRun == RUN_FIRST); + + /* + * Insert tuple into first, fully heapified run. + * + * Unlike classic replacement selection, which this module was + * previously based on, only RUN_FIRST tuples are fully + * heapified. Any second/next run tuples are appended + * indifferently. While HEAP_RUN_NEXT tuples may be sifted + * out of the way of first run tuples, COMPARETUP() will never + * be called for the run's tuples during sifting (only our + * initial COMPARETUP() call is required for the tuple, to + * determine that the tuple does not belong in RUN_FIRST). + */ tuplesort_heap_insert(state, tuple, state->currentRun, true); + } else - tuplesort_heap_insert(state, tuple, state->currentRun + 1, true); + { + /* + * Tuple was determined to not belong to heapified RUN_FIRST, + * or replacement selection not in play. Append the tuple to + * memtuples indifferently. + * + * dumptuples() does not trust that the next run's tuples are + * heapified. Anything past the first run will always be + * quicksorted even when replacement selection is initially + * used. (When it's never used, every tuple still takes this + * path.) + */ + tuple->tupindex = HEAP_RUN_NEXT; + state->memtuples[state->memtupcount++] = *tuple; + } /* * If we are over the memory limit, dump tuples till we're under. @@ -1658,18 +1744,7 @@ tuplesort_performsort(Tuplesortstate *state) * We were able to accumulate all the tuples within the allowed * amount of memory. Just qsort 'em and we're done. */ - if (state->memtupcount > 1) - { - /* Can we use the single-key sort function? */ - if (state->onlyKey != NULL) - qsort_ssup(state->memtuples, state->memtupcount, - state->onlyKey); - else - qsort_tuple(state->memtuples, - state->memtupcount, - state->comparetup, - state); - } + tuplesort_sort_memtuples(state); state->current = 0; state->eof_reached = false; state->markpos_offset = 0; @@ -2181,6 +2256,28 @@ tuplesort_merge_order(int64 allowedMem) return mOrder; } +/* + * useselection - determine algorithm to use to sort first run. + * + * It can sometimes be useful to use the replacement selection algorithm if it + * results in one large run, and there is little available workMem. See + * remarks on RUN_SECOND optimization within dumptuples(). + */ +static bool +useselection(Tuplesortstate *state) +{ + /* + * memtupsize might be noticeably higher than memtupcount here in atypical + * cases. It seems slightly preferable to not allow recent outliers to + * impact this determination. 
Note that caller's trace_sort output reports + * memtupcount instead. + */ + if (state->memtupsize <= replacement_sort_tuples) + return true; + + return false; +} + /* * inittapes - initialize for tape sorting. * @@ -2190,7 +2287,6 @@ static void inittapes(Tuplesortstate *state) { int maxTapes, - ntuples, j; int64 tapeSpace; @@ -2253,25 +2349,42 @@ inittapes(Tuplesortstate *state) state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); /* - * Convert the unsorted contents of memtuples[] into a heap. Each tuple is - * marked as belonging to run number zero. - * - * NOTE: we pass false for checkIndex since there's no point in comparing - * indexes in this step, even though we do intend the indexes to be part - * of the sort key... + * Give replacement selection a try based on user setting. There will + * be a switch to a simple hybrid sort-merge strategy after the first + * run (iff we could not output one long run). */ - ntuples = state->memtupcount; - state->memtupcount = 0; /* make the heap empty */ - for (j = 0; j < ntuples; j++) + state->replaceActive = useselection(state); + + if (state->replaceActive) { - /* Must copy source tuple to avoid possible overwrite */ - SortTuple stup = state->memtuples[j]; + /* + * Convert the unsorted contents of memtuples[] into a heap. Each + * tuple is marked as belonging to run number zero. + * + * NOTE: we pass false for checkIndex since there's no point in + * comparing indexes in this step, even though we do intend the + * indexes to be part of the sort key... + */ + int ntuples = state->memtupcount; +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "replacement selection will sort %d first run tuples", + state->memtupcount); +#endif - tuplesort_heap_insert(state, &stup, 0, false); + state->memtupcount = 0; /* make the heap empty */ + + for (j = 0; j < ntuples; j++) + { + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[j]; + + tuplesort_heap_insert(state, &stup, 0, false); + } + Assert(state->memtupcount == ntuples); } - Assert(state->memtupcount == ntuples); - state->currentRun = 0; + state->currentRun = RUN_FIRST; /* * Initialize variables of Algorithm D (step D1). @@ -2362,11 +2475,12 @@ mergeruns(Tuplesortstate *state) /* * If we produced only one initial run (quite likely if the total data - * volume is between 1X and 2X workMem), we can just use that tape as the - * finished output, rather than doing a useless merge. (This obvious - * optimization is not in Knuth's algorithm.) + * volume is between 1X and 2X workMem when replacement selection is used, + * but something we particularly count on when input is presorted), we can + * just use that tape as the finished output, rather than doing a useless + * merge. (This obvious optimization is not in Knuth's algorithm.) */ - if (state->currentRun == 1) + if (state->currentRun == RUN_SECOND) { state->result_tape = state->tp_tapenum[state->destTape]; /* must freeze and rewind the finished output tape */ @@ -3094,21 +3208,25 @@ mergeprereadone(Tuplesortstate *state, int srcTape) } /* - * dumptuples - remove tuples from heap and write to tape + * dumptuples - remove tuples from memtuples and write to tape * * This is used during initial-run building, but not during merging. * - * When alltuples = false, dump only enough tuples to get under the - * availMem limit (and leave at least one tuple in the heap in any case, - * since puttuple assumes it always has a tuple to compare to). We also - * insist there be at least one free slot in the memtuples[] array.
+ * When alltuples = false and replacement selection is still active, dump + * only enough tuples to get under the availMem limit (and leave at least + * one tuple in memtuples, since puttuple will then assume it is a heap that + * has a tuple to compare to). We always insist there be at least one free + * slot in the memtuples[] array. * - * When alltuples = true, dump everything currently in memory. - * (This case is only used at end of input data.) + * When alltuples = true, dump everything currently in memory. (This + * case is only used at end of input data, although in practice only the + * first run could fail to dump all tuples when we LACKMEM(), and only + * when replacement selection is active.) * - * If we empty the heap, close out the current run and return (this should - * only happen at end of input data). If we see that the tuple run number - * at the top of the heap has changed, start a new run. + * If, when replacement selection is active, we see that the tuple run + * number at the top of the heap has changed, start a new run. This must be + * the first run, because replacement selection is always abandoned for all + * further runs. */ static void dumptuples(Tuplesortstate *state, bool alltuples) @@ -3117,46 +3235,183 @@ dumptuples(Tuplesortstate *state, bool alltuples) (LACKMEM(state) && state->memtupcount > 1) || state->memtupcount >= state->memtupsize) { - /* - * Dump the heap's frontmost entry, and sift up to remove it from the - * heap. - */ - Assert(state->memtupcount > 0); - WRITETUP(state, state->tp_tapenum[state->destTape], - &state->memtuples[0]); - tuplesort_heap_siftup(state, true); + if (state->replaceActive) + { + /* + * Still holding out for a case favorable to replacement selection. + * Still incrementally spilling using heap. + * + * Dump the heap's frontmost entry, and sift up to remove it from + * the heap. + */ + Assert(state->memtupcount > 0); + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[0]); + tuplesort_heap_siftup(state, true); + } + else + { + /* + * Once committed to quicksorting runs, never incrementally + * spill + */ + dumpbatch(state, alltuples); + break; + } /* - * If the heap is empty *or* top run number has changed, we've - * finished the current run. + * If top run number has changed, we've finished the current run + * (this can only be the first run), and will no longer spill + * incrementally. */ if (state->memtupcount == 0 || - state->currentRun != state->memtuples[0].tupindex) + state->memtuples[0].tupindex == HEAP_RUN_NEXT) { markrunend(state, state->tp_tapenum[state->destTape]); + Assert(state->currentRun == RUN_FIRST); state->currentRun++; state->tp_runs[state->destTape]++; state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ #ifdef TRACE_SORT if (trace_sort) - elog(LOG, "finished writing%s run %d to tape %d: %s", - (state->memtupcount == 0) ? " final" : "", + elog(LOG, "finished incrementally writing %s run %d to tape %d: %s", + (state->memtupcount == 0) ? "only" : "first", state->currentRun, state->destTape, pg_rusage_show(&state->ru_start)); #endif + /* + * Done if heap is empty, which is possible when there is only one + * long run. + */ + Assert(state->currentRun == RUN_SECOND); + if (state->memtupcount == 0) + { + /* + * Replacement selection best case; no final merge required, + * because there was only one initial run (second run has no + * tuples). See RUN_SECOND case in mergeruns(). + */ + break; + } /* - * Done if heap is empty, else prepare for new run. 
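The case the remaining replacement-selection path is holding out for is the one the documentation hunk describes: input with a strong physical-to-logical correlation, where the heap can emit a single run much longer than memory and mergeruns() then takes the RUN_SECOND early exit with no merge at all. A self-contained toy simulation of classic replacement selection on ints, with a tiny fixed-size heap standing in for the workMem-sized tuple heap (illustrative only, not the tuplesort.c code):

    #include <stdio.h>

    /*
     * Toy replacement selection on ints.  A heap of HEAPSIZE elements
     * stands in for the workMem-sized tuple heap; values smaller than the
     * one just written are held back for the next run.
     */
    #define HEAPSIZE 4
    #define NINPUT   16

    static int  heap[HEAPSIZE];
    static int  heapcount = 0;

    static void
    heap_push(int v)
    {
        int i = heapcount++;

        while (i > 0 && heap[(i - 1) / 2] > v)
        {
            heap[i] = heap[(i - 1) / 2];
            i = (i - 1) / 2;
        }
        heap[i] = v;
    }

    static int
    heap_pop(void)
    {
        int top = heap[0];
        int v = heap[--heapcount];
        int i = 0;

        while (2 * i + 1 < heapcount)
        {
            int c = 2 * i + 1;

            if (c + 1 < heapcount && heap[c + 1] < heap[c])
                c++;
            if (v <= heap[c])
                break;
            heap[i] = heap[c];
            i = c;
        }
        heap[i] = v;
        return top;
    }

    int
    main(void)
    {
        int input[NINPUT], held[HEAPSIZE], nheld = 0, runs = 0, i;

        for (i = 0; i < NINPUT; i++)
            input[i] = i;           /* presorted: strong correlation */

        for (i = 0; i < HEAPSIZE; i++)
            heap_push(input[i]);

        while (heapcount > 0)
        {
            int out = heap_pop();   /* "write" smallest to current run */

            if (i < NINPUT)
            {
                if (input[i] >= out)
                    heap_push(input[i]);        /* fits in current run */
                else
                    held[nheld++] = input[i];   /* defer to next run */
                i++;
            }
            if (heapcount == 0)     /* run ends; start the next, if any */
            {
                int j;

                runs++;
                for (j = 0; j < nheld; j++)
                    heap_push(held[j]);
                nheld = 0;
            }
        }
        printf("runs produced: %d\n", runs);    /* 1 for presorted input */
        return 0;
    }

With the presorted input above it reports a single run; switching to random input makes runs come out at roughly twice the heap size on average, which is Knuth's classic expected run length and the reason this strategy once looked attractive.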
+ * Abandon replacement selection for second run (as well as any + * subsequent runs). */ - if (state->memtupcount == 0) - break; - Assert(state->currentRun == state->memtuples[0].tupindex); + state->replaceActive = false; + + /* + * First tuple of next run should not be heapified, and so will + * bear placeholder run number. In practice this must actually be + * the second run, which just became the currentRun, so we're + * clear to quicksort and dump the tuples in batch next time + * memtuples becomes full. + */ + Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT); selectnewtape(state); } } } +/* + * dumpbatch - sort and dump all memtuples, forming one run on tape + * + * Second or subsequent runs are never heapified by this module (although + * heapification still respects run number differences between the first and + * second runs), and a heap (replacement selection priority queue) is often + * avoided in the first place. + */ +static void +dumpbatch(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than + * allowing a special case where there was a superfluous + * selectnewtape() call (i.e. a call with no subsequent run actually + * written to destTape), we prefer to write out a 0 tuple run. + * + * mergepreread()/mergeprereadone() are prepared for 0 tuple runs, and + * will reliably mark the tape inactive for the merge when called from + * beginmerge(). This case is therefore similar to the case where + * mergeonerun() finds a dummy run for the tape, and so doesn't need to + * merge a run from the tape (or conceptually "merges" the dummy run, + * if you prefer). According to Knuth, Algorithm D "isn't strictly + * optimal" in its method of distribution and dummy run assignment; + * this edge case seems very unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for this + * run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in allocation patterns due to the use of batch memory. + * Fragmentation due to AllocSetFree's bucketing by size class might be + * particularly bad if this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished writing run %d to tape %d: %s", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + /* * tuplesort_rescan - rewind and replay the scan */ @@ -3315,10 +3570,15 @@ tuplesort_get_stats(Tuplesortstate *state, * * Compare two SortTuples. If checkIndex is true, use the tuple index * as the front of the sort key; otherwise, no. + * + * Note that for checkIndex callers, the heap invariant is never + * maintained beyond the first run, and so there are no COMPARETUP() + * calls needed to distinguish tuples in HEAP_RUN_NEXT. */ #define HEAPCOMPARE(tup1,tup2) \ - (checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \ + (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \ + (tup1)->tupindex == HEAP_RUN_NEXT) ? \ ((tup1)->tupindex) - ((tup2)->tupindex) : \ COMPARETUP(state, tup1, tup2)) @@ -3416,6 +3676,31 @@ sort_bounded_heap(Tuplesortstate *state) state->boundUsed = true; } +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts. Quicksort is also generally + * preferred to replacement selection for generating runs during external sort + * operations, although replacement selection is sometimes used for the first + * run. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + /* * Insert a new tuple into an empty or existing heap, maintaining the * heap invariant. Caller is responsible for ensuring there's room. @@ -3443,6 +3728,7 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, memtuples = state->memtuples; Assert(state->memtupcount < state->memtupsize); + Assert(!checkIndex || tupleindex == RUN_FIRST); CHECK_FOR_INTERRUPTS(); @@ -3475,6 +3761,7 @@ tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex) int i, n; + Assert(!checkIndex || state->currentRun == RUN_FIRST); if (--state->memtupcount <= 0) return; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9200f045a2..356fcc6eda 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -239,6 +239,7 @@ extern bool enableFsync; extern bool allowSystemTableMods; extern PGDLLIMPORT int work_mem; extern PGDLLIMPORT int maintenance_work_mem; +extern PGDLLIMPORT int replacement_sort_tuples; extern int VacuumCostPageHit; extern int VacuumCostPageMiss;
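One detail of dumpbatch() worth spelling out is the MemoryContextReset(state->tuplecontext) call after each run is written: instead of pfree'ing tuples piecemeal, the whole per-tuple context is discarded at once, which is what avoids the AllocSetFree fragmentation the comment describes. A sketch of that reset-per-batch idiom using the ordinary memory-context API of this era (backend-style code; process_all_batches() and the batch shape are hypothetical, shown only to illustrate the pattern):

    #include "postgres.h"
    #include "utils/memutils.h"

    /*
     * Sketch of the reset-per-batch idiom: do all per-batch allocations in
     * a dedicated context and throw the whole thing away between batches,
     * rather than pfree'ing piecemeal.
     */
    static void
    process_all_batches(int nbatches, int tuples_per_batch)
    {
        MemoryContext batchcontext;
        MemoryContext oldcontext;
        int         b;
        int         i;

        batchcontext = AllocSetContextCreate(CurrentMemoryContext,
                                             "batch tuples",
                                             ALLOCSET_DEFAULT_MINSIZE,
                                             ALLOCSET_DEFAULT_INITSIZE,
                                             ALLOCSET_DEFAULT_MAXSIZE);

        for (b = 0; b < nbatches; b++)
        {
            oldcontext = MemoryContextSwitchTo(batchcontext);

            /* per-batch allocations all come from batchcontext */
            for (i = 0; i < tuples_per_batch; i++)
            {
                char   *tuple = palloc(64);     /* stands in for a tuple */

                (void) tuple;                   /* ... write it out ... */
            }

            MemoryContextSwitchTo(oldcontext);

            /*
             * Discard every allocation from this batch at once; this is
             * what keeps the allocator's free lists from fragmenting when
             * allocation patterns change sharply from batch to batch.
             */
            MemoryContextReset(batchcontext);
        }

        MemoryContextDelete(batchcontext);
    }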