diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index f946c69b9b..331068d464 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -49,7 +49,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.153 2006/02/05 02:59:16 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.154 2006/02/19 05:54:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -70,10 +70,10 @@ #include "utils/selfuncs.h" #include "utils/lsyscache.h" #include "utils/syscache.h" +#include "utils/tuplesort.h" #define LOG2(x) (log(x) / 0.693147180559945) -#define LOG6(x) (log(x) / 1.79175946922805) /* * Some Paths return less than the nominal number of rows of their parent @@ -767,11 +767,10 @@ cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel) * If the total volume exceeds work_mem, we switch to a tape-style merge * algorithm. There will still be about t*log2(t) tuple comparisons in * total, but we will also need to write and read each tuple once per - * merge pass. We expect about ceil(log6(r)) merge passes where r is the - * number of initial runs formed (log6 because tuplesort.c uses six-tape - * merging). Since the average initial run should be about twice work_mem, - * we have - * disk traffic = 2 * relsize * ceil(log6(p / (2*work_mem))) + * merge pass. We expect about ceil(logM(r)) merge passes where r is the + * number of initial runs formed and M is the merge order used by tuplesort.c. + * Since the average initial run should be about twice work_mem, we have + * disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem))) * cpu = comparison_cost * t * log2(t) * * The disk traffic is assumed to be half sequential and half random @@ -824,10 +823,14 @@ cost_sort(Path *path, PlannerInfo *root, { double npages = ceil(nbytes / BLCKSZ); double nruns = (nbytes / work_mem_bytes) * 0.5; - double log_runs = ceil(LOG6(nruns)); + double mergeorder = tuplesort_merge_order(work_mem_bytes); + double log_runs; double npageaccesses; - if (log_runs < 1.0) + /* Compute logM(r) as log(r) / log(M) */ + if (nruns > mergeorder) + log_runs = ceil(log(nruns) / log(mergeorder)); + else log_runs = 1.0; npageaccesses = 2.0 * npages * log_runs; /* Assume half are sequential (cost 1), half are not */ diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 94d2c24fbb..8e2d693164 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -48,7 +48,7 @@ * each source run; we repeatedly output the smallest tuple and insert the * next tuple from its source tape (if any). When the heap empties, the merge * is complete. The basic merge algorithm thus needs very little memory --- - * only M tuples for an M-way merge, and M is at most six in the present code. + * only M tuples for an M-way merge, and M is constrained to a small number. * However, we can still make good use of our full workMem allocation by * pre-reading additional tuples from each source tape. Without prereading, * our access pattern to the temporary file would be very erratic; on average @@ -73,12 +73,25 @@ * on-the-fly as the caller repeatedly calls tuplesort_gettuple; this * saves one cycle of writing all the data out to disk and reading it in. 
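To make the planner-side change concrete: a minimal standalone sketch of the pass-count arithmetic in cost_sort(), comparing the old fixed log6 estimate with the new logM one. The 8 GB input size, the 16 MB work_mem, BLCKSZ = 8192, and the local merge_order() helper (which mirrors the tuplesort_merge_order() added below) are illustrative assumptions, not part of the patch.

#include <math.h>
#include <stdio.h>

#define BLCKSZ					8192
#define TAPE_BUFFER_OVERHEAD	(BLCKSZ * 3)
#define MERGE_BUFFER_SIZE		(BLCKSZ * 32)
#define MINTAPES				7
#define LOG6(x)  (log(x) / 1.79175946922805)

/* mirrors tuplesort_merge_order() as added by this patch */
static int
merge_order(long allowedMem)
{
	int			maxTapes = (int) ((allowedMem - TAPE_BUFFER_OVERHEAD) /
								  (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1;

	if (maxTapes < MINTAPES)
		maxTapes = MINTAPES;
	return maxTapes - 1;
}

int
main(void)
{
	double		nbytes = 8.0 * 1024 * 1024 * 1024;	/* 8 GB of sort input */
	long		work_mem_bytes = 16L * 1024 * 1024; /* work_mem = 16 MB */
	double		npages = ceil(nbytes / BLCKSZ);
	double		nruns = (nbytes / work_mem_bytes) * 0.5;	/* runs of ~2*work_mem */
	double		mergeorder = merge_order(work_mem_bytes);
	double		old_passes = ceil(LOG6(nruns));
	double		new_passes = (nruns > mergeorder) ?
		ceil(log(nruns) / log(mergeorder)) : 1.0;

	printf("%.0f runs: %.0f passes before, %.0f after; %.0f page accesses\n",
		   nruns, old_passes, new_passes, 2.0 * npages * new_passes);
	return 0;
}

With these numbers the merge order comes out near 58 instead of 6, so the same sort drops from an estimated four merge passes to two, and the disk-traffic term shrinks accordingly.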
* + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes. + * * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.57 2006/01/05 01:56:29 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.58 2006/02/19 05:54:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -120,11 +133,18 @@ typedef enum } TupSortStatus; /* - * We use a seven-tape polyphase merge, which is the "sweet spot" on the - * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2). + * Parameters for calculation of number of tapes to use --- see inittapes(). + * + * In this calculation we assume that each tape will cost us about 3 blocks + * worth of buffer space (which is an underestimate for very large data + * volumes, but it's probably close enough --- see logtape.c). + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each + * tape during a preread cycle (see discussion at top of file). */ -#define MAXTAPES 7 /* Knuth's T */ -#define TAPERANGE (MAXTAPES-1) /* Knuth's P */ +#define MINTAPES 7 /* minimum number of tapes */ +#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3) +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) /* * Private state of a Tuplesort operation. @@ -135,6 +155,8 @@ struct Tuplesortstate bool randomAccess; /* did caller request random access? */ long availMem; /* remaining memory available, in bytes */ long allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ /* @@ -179,7 +201,7 @@ struct Tuplesortstate * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS * and FINALMERGE, the tuples are organized in "heap" order per Algorithm * H. (Note that memtupcount only counts the tuples that are part of the - * heap --- during merge passes, memtuples[] entries beyond TAPERANGE are + * heap --- during merge passes, memtuples[] entries beyond tapeRange are * never in the heap and are used to hold pre-read tuples.) In state * SORTEDONTAPE, the array is not used. */ @@ -204,6 +226,11 @@ struct Tuplesortstate */ int currentRun; + /* + * Unless otherwise noted, all pointer variables below are pointers + * to arrays of length maxTapes, holding per-tape data. + */ + /* * These variables are only used during merge passes. 
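As a rough feel for the sizing rule described above (keep workMem/M large enough that each preread pulls a decent chunk from its tape), the following standalone sketch runs the same arithmetic as tuplesort_merge_order()/inittapes() over a few work_mem settings; the chosen settings, BLCKSZ = 8192, and the approximate per-tape budget column are illustrative assumptions, not part of the patch.

#include <stdio.h>

#define BLCKSZ					8192
#define TAPE_BUFFER_OVERHEAD	(BLCKSZ * 3)	/* ~24 kB per "tape drive" */
#define MERGE_BUFFER_SIZE		(BLCKSZ * 32)	/* preread target per input tape */
#define MINTAPES				7

int
main(void)
{
	long		work_mem_kb[] = {1024, 16384, 262144, 1048576};
	int			i;

	for (i = 0; i < 4; i++)
	{
		long		allowedMem = work_mem_kb[i] * 1024L;
		int			maxTapes = (int) ((allowedMem - TAPE_BUFFER_OVERHEAD) /
									  (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1;

		if (maxTapes < MINTAPES)
			maxTapes = MINTAPES;

		/* approximate preread budget per input tape, after tape buffers */
		printf("work_mem=%ldkB: %d tapes, ~%ldkB preread per input tape\n",
			   work_mem_kb[i], maxTapes,
			   (allowedMem - (long) maxTapes * TAPE_BUFFER_OVERHEAD) /
			   (maxTapes - 1) / 1024);
	}
	return 0;
}

Once work_mem is large enough to clear the MINTAPES floor, the per-input-tape figure settles near MERGE_BUFFER_SIZE (256 kB here), which is exactly the locality target the new header comment describes.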
mergeactive[i] is * true if we are reading an input run from (actual) tape number i and @@ -218,11 +245,10 @@ struct Tuplesortstate * in these lists, because memtuples[0] is part of the merge heap and is * never a pre-read tuple. */ - bool mergeactive[MAXTAPES]; /* Active input run source? */ - int mergenext[MAXTAPES]; /* first preread tuple for each source */ - int mergelast[MAXTAPES]; /* last preread tuple for each source */ - long mergeavailmem[MAXTAPES]; /* availMem for prereading - * tapes */ + bool *mergeactive; /* Active input run source? */ + int *mergenext; /* first preread tuple for each source */ + int *mergelast; /* last preread tuple for each source */ + long *mergeavailmem; /* availMem for prereading tapes */ long spacePerTape; /* actual per-tape target usage */ int mergefreelist; /* head of freelist of recycled slots */ int mergefirstfree; /* first slot never used in this merge */ @@ -234,10 +260,10 @@ struct Tuplesortstate */ int Level; /* Knuth's l */ int destTape; /* current output tape (Knuth's j, less 1) */ - int tp_fib[MAXTAPES]; /* Target Fibonacci run counts (A[]) */ - int tp_runs[MAXTAPES]; /* # of real runs on each tape */ - int tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */ - int tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ /* * These variables are used after completion of sorting to keep track of @@ -259,8 +285,8 @@ struct Tuplesortstate */ TupleDesc tupDesc; int nKeys; - ScanKey scanKeys; - SortFunctionKind *sortFnKinds; + ScanKey scanKeys; /* array of length nKeys */ + SortFunctionKind *sortFnKinds; /* array of length nKeys */ /* * These variables are specific to the IndexTuple case; they are set by @@ -448,7 +474,10 @@ tuplesort_begin_common(int workMem, bool randomAccess) state->currentRun = 0; - /* Algorithm D variables will be initialized by inittapes, if needed */ + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ state->result_tape = -1; /* flag that result tape has not been formed */ @@ -1041,6 +1070,29 @@ tuplesort_getdatum(Tuplesortstate *state, bool forward, return true; } +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * + * This is exported for use by the planner. allowedMem is in bytes. + * + * This must match the calculation in inittapes. The only reason we + * don't fold the code together is that inittapes wants to know if the + * MINTAPES limitation applies or not. + */ +int +tuplesort_merge_order(long allowedMem) +{ + int maxTapes; + + /* see inittapes for comments */ + maxTapes = (int) ((allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1; + + maxTapes = Max(maxTapes, MINTAPES); + + /* The merge order is one less than the number of tapes */ + return maxTapes - 1; +} /* * inittapes - initialize for tape sorting. @@ -1050,16 +1102,64 @@ tuplesort_getdatum(Tuplesortstate *state, bool forward, static void inittapes(Tuplesortstate *state) { - int ntuples, + int maxTapes, + ntuples, j; + /* + * Determine the number of tapes to use based on allowed memory. + * + * We need T+1 tapes to do a T-way merge, and we want MERGE_BUFFER_SIZE + * tuple workspace for each input tape of the merge. The output tape + * doesn't account for tuple workspace but it does need tape buffer space. 
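The comment that opens inittapes() pins the tape count to a concrete budget: every tape needs TAPE_BUFFER_OVERHEAD, and each of the maxTapes-1 input tapes additionally needs MERGE_BUFFER_SIZE of preread workspace. A small brute-force cross-check (BLCKSZ = 8192 assumed; purely illustrative, not part of the patch) confirms that the closed-form expression used in tuplesort_merge_order() and inittapes() picks the largest tape count that fits that budget.

#include <assert.h>
#include <stdio.h>

#define BLCKSZ					8192
#define TAPE_BUFFER_OVERHEAD	(BLCKSZ * 3)
#define MERGE_BUFFER_SIZE		(BLCKSZ * 32)

int
main(void)
{
	long		allowedMem;

	for (allowedMem = 2L * 1024 * 1024; allowedMem <= 64L * 1024 * 1024;
		 allowedMem += BLCKSZ)
	{
		/* closed form from the patch */
		int			maxTapes = (int) ((allowedMem - TAPE_BUFFER_OVERHEAD) /
									  (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1;

		/*
		 * Brute force: the largest T such that T tape buffers plus T-1
		 * preread workspaces still fit within allowedMem.
		 */
		int			T = 1;

		while ((long) (T + 1) * TAPE_BUFFER_OVERHEAD +
			   (long) T * MERGE_BUFFER_SIZE <= allowedMem)
			T++;

		assert(maxTapes == T);
	}
	printf("closed form matches the budget over the whole sweep\n");
	return 0;
}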
+ * + * Keep this code in sync with tuplesort_merge_order! + */ + maxTapes = (int) ((state->allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1; + + /* + * We will use at least MINTAPES regardless, but otherwise we decrease + * availMem to reflect the space that goes into buffers. + */ + if (maxTapes >= MINTAPES) + { + /* maxTapes is OK, adjust availMem */ + USEMEM(state, maxTapes * TAPE_BUFFER_OVERHEAD); + } + else + { + /* + * Force minimum tape count. In this path we ignore the tape buffers + * in our space calculation, to avoid driving availMem permanently + * negative if allowedMem is really tiny. (This matches the pre-8.2 + * behavior which was to ignore the tape buffers always, on the + * grounds that they were fixed-size overhead.) + */ + maxTapes = MINTAPES; + } + state->maxTapes = maxTapes; + state->tapeRange = maxTapes - 1; + #ifdef TRACE_SORT if (trace_sort) - elog(LOG, "switching to external sort: %s", - pg_rusage_show(&state->ru_start)); + elog(LOG, "switching to external sort with %d tapes: %s", + maxTapes, pg_rusage_show(&state->ru_start)); #endif - state->tapeset = LogicalTapeSetCreate(MAXTAPES); + /* + * Create the tape set and allocate the per-tape data arrays. + */ + state->tapeset = LogicalTapeSetCreate(maxTapes); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->mergenext = (int *) palloc0(maxTapes * sizeof(int)); + state->mergelast = (int *) palloc0(maxTapes * sizeof(int)); + state->mergeavailmem = (long *) palloc0(maxTapes * sizeof(long)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); /* * Allocate the memtupindex array, same size as memtuples. @@ -1087,15 +1187,15 @@ inittapes(Tuplesortstate *state) /* * Initialize variables of Algorithm D (step D1). 
*/ - for (j = 0; j < MAXTAPES; j++) + for (j = 0; j < maxTapes; j++) { state->tp_fib[j] = 1; state->tp_runs[j] = 0; state->tp_dummy[j] = 1; state->tp_tapenum[j] = j; } - state->tp_fib[TAPERANGE] = 0; - state->tp_dummy[TAPERANGE] = 0; + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; state->Level = 1; state->destTape = 0; @@ -1130,7 +1230,7 @@ selectnewtape(Tuplesortstate *state) /* Step D4: increase level */ state->Level++; a = state->tp_fib[0]; - for (j = 0; j < TAPERANGE; j++) + for (j = 0; j < state->tapeRange; j++) { state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; state->tp_fib[j] = a + state->tp_fib[j + 1]; @@ -1170,18 +1270,19 @@ mergeruns(Tuplesortstate *state) } /* End of step D2: rewind all output tapes to prepare for merging */ - for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) LogicalTapeRewind(state->tapeset, tapenum, false); for (;;) { /* Step D5: merge runs onto tape[T] until tape[P] is empty */ - while (state->tp_runs[TAPERANGE - 1] || state->tp_dummy[TAPERANGE - 1]) + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) { bool allDummy = true; bool allOneRun = true; - for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) { if (state->tp_dummy[tapenum] == 0) allDummy = false; @@ -1203,8 +1304,8 @@ mergeruns(Tuplesortstate *state) } if (allDummy) { - state->tp_dummy[TAPERANGE]++; - for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) state->tp_dummy[tapenum]--; } else @@ -1214,20 +1315,20 @@ mergeruns(Tuplesortstate *state) if (--state->Level == 0) break; /* rewind output tape T to use as new input */ - LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE], + LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange], false); /* rewind used-up input tape P, and prepare it for write pass */ - LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE - 1], + LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange - 1], true); - state->tp_runs[TAPERANGE - 1] = 0; + state->tp_runs[state->tapeRange - 1] = 0; /* * reassign tape units per step D6; note we no longer care about A[] */ - svTape = state->tp_tapenum[TAPERANGE]; - svDummy = state->tp_dummy[TAPERANGE]; - svRuns = state->tp_runs[TAPERANGE]; - for (tapenum = TAPERANGE; tapenum > 0; tapenum--) + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) { state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; @@ -1246,7 +1347,7 @@ mergeruns(Tuplesortstate *state) * output tape while rewinding it. The last iteration of step D6 would be * a waste of cycles anyway... 
*/ - state->result_tape = state->tp_tapenum[TAPERANGE]; + state->result_tape = state->tp_tapenum[state->tapeRange]; LogicalTapeFreeze(state->tapeset, state->result_tape); state->status = TSS_SORTEDONTAPE; } @@ -1260,7 +1361,7 @@ mergeruns(Tuplesortstate *state) static void mergeonerun(Tuplesortstate *state) { - int destTape = state->tp_tapenum[TAPERANGE]; + int destTape = state->tp_tapenum[state->tapeRange]; int srcTape; int tupIndex; void *tup; @@ -1313,7 +1414,7 @@ mergeonerun(Tuplesortstate *state) * output tape, and increment its count of real runs. */ markrunend(state, destTape); - state->tp_runs[TAPERANGE]++; + state->tp_runs[state->tapeRange]++; #ifdef TRACE_SORT if (trace_sort) @@ -1341,16 +1442,16 @@ beginmerge(Tuplesortstate *state) Assert(state->memtupcount == 0); /* Clear merge-pass state variables */ - memset(state->mergeactive, 0, sizeof(state->mergeactive)); - memset(state->mergenext, 0, sizeof(state->mergenext)); - memset(state->mergelast, 0, sizeof(state->mergelast)); - memset(state->mergeavailmem, 0, sizeof(state->mergeavailmem)); + memset(state->mergeactive, 0, state->maxTapes * sizeof(*state->mergeactive)); + memset(state->mergenext, 0, state->maxTapes * sizeof(*state->mergenext)); + memset(state->mergelast, 0, state->maxTapes * sizeof(*state->mergelast)); + memset(state->mergeavailmem, 0, state->maxTapes * sizeof(*state->mergeavailmem)); state->mergefreelist = 0; /* nothing in the freelist */ - state->mergefirstfree = MAXTAPES; /* first slot available for preread */ + state->mergefirstfree = state->maxTapes; /* 1st slot avail for preread */ /* Adjust run counts and mark the active tapes */ activeTapes = 0; - for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) { if (state->tp_dummy[tapenum] > 0) state->tp_dummy[tapenum]--; @@ -1370,7 +1471,7 @@ beginmerge(Tuplesortstate *state) */ Assert(activeTapes > 0); state->spacePerTape = state->availMem / activeTapes; - for (srcTape = 0; srcTape < MAXTAPES; srcTape++) + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) { if (state->mergeactive[srcTape]) state->mergeavailmem[srcTape] = state->spacePerTape; @@ -1383,7 +1484,7 @@ beginmerge(Tuplesortstate *state) mergepreread(state); /* Load the merge heap with the first tuple from each input tape */ - for (srcTape = 0; srcTape < MAXTAPES; srcTape++) + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) { int tupIndex = state->mergenext[srcTape]; void *tup; @@ -1420,7 +1521,7 @@ mergepreread(Tuplesortstate *state) long priorAvail, spaceUsed; - for (srcTape = 0; srcTape < MAXTAPES; srcTape++) + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) { if (!state->mergeactive[srcTape]) continue; @@ -1534,9 +1635,9 @@ dumptuples(Tuplesortstate *state, bool alltuples) #ifdef TRACE_SORT if (trace_sort) - elog(LOG, "finished writing%s run %d: %s", + elog(LOG, "finished writing%s run %d to tape %d: %s", (state->memtupcount == 0) ? 
" final" : "", - state->currentRun, + state->currentRun, state->destTape, pg_rusage_show(&state->ru_start)); #endif diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 3947077e12..2aaa7d9c4f 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -13,7 +13,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.17 2004/12/31 22:03:46 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.18 2006/02/19 05:54:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -67,6 +67,8 @@ extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, extern void tuplesort_end(Tuplesortstate *state); +extern int tuplesort_merge_order(long allowedMem); + /* * These routines may only be called if randomAccess was specified 'true'. * Likewise, backwards scan in gettuple/getdatum is only allowed if