Improve tuplesort.c to support variable merge order. The original coding

with fixed merge order (fixed number of "tapes") was based on obsolete
assumptions, namely that tape drives are expensive.  Since our "tapes"
are really just a couple of buffers, we can have a lot of them given
adequate workspace.  This allows reduction of the number of merge passes
with consequent savings of I/O during large sorts.

Simon Riggs with some rework by Tom Lane
This commit is contained in:
Tom Lane 2006-02-19 05:54:06 +00:00
parent 85c0eac1af
commit df700e6b40
3 changed files with 169 additions and 63 deletions

View File

@ -49,7 +49,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.153 2006/02/05 02:59:16 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.154 2006/02/19 05:54:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -70,10 +70,10 @@
#include "utils/selfuncs.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"
#define LOG2(x) (log(x) / 0.693147180559945)
#define LOG6(x) (log(x) / 1.79175946922805)
/*
* Some Paths return less than the nominal number of rows of their parent
@ -767,11 +767,10 @@ cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
* If the total volume exceeds work_mem, we switch to a tape-style merge
* algorithm. There will still be about t*log2(t) tuple comparisons in
* total, but we will also need to write and read each tuple once per
* merge pass. We expect about ceil(log6(r)) merge passes where r is the
* number of initial runs formed (log6 because tuplesort.c uses six-tape
* merging). Since the average initial run should be about twice work_mem,
* we have
* disk traffic = 2 * relsize * ceil(log6(p / (2*work_mem)))
* merge pass. We expect about ceil(logM(r)) merge passes where r is the
* number of initial runs formed and M is the merge order used by tuplesort.c.
* Since the average initial run should be about twice work_mem, we have
* disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
* cpu = comparison_cost * t * log2(t)
*
* The disk traffic is assumed to be half sequential and half random
@ -824,10 +823,14 @@ cost_sort(Path *path, PlannerInfo *root,
{
double npages = ceil(nbytes / BLCKSZ);
double nruns = (nbytes / work_mem_bytes) * 0.5;
double log_runs = ceil(LOG6(nruns));
double mergeorder = tuplesort_merge_order(work_mem_bytes);
double log_runs;
double npageaccesses;
if (log_runs < 1.0)
/* Compute logM(r) as log(r) / log(M) */
if (nruns > mergeorder)
log_runs = ceil(log(nruns) / log(mergeorder));
else
log_runs = 1.0;
npageaccesses = 2.0 * npages * log_runs;
/* Assume half are sequential (cost 1), half are not */

View File

@ -48,7 +48,7 @@
* each source run; we repeatedly output the smallest tuple and insert the
* next tuple from its source tape (if any). When the heap empties, the merge
* is complete. The basic merge algorithm thus needs very little memory ---
* only M tuples for an M-way merge, and M is at most six in the present code.
* only M tuples for an M-way merge, and M is constrained to a small number.
* However, we can still make good use of our full workMem allocation by
* pre-reading additional tuples from each source tape. Without prereading,
* our access pattern to the temporary file would be very erratic; on average
@ -73,12 +73,25 @@
* on-the-fly as the caller repeatedly calls tuplesort_gettuple; this
* saves one cycle of writing all the data out to disk and reading it in.
*
* Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
* grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
* to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that
* tape drives are expensive beasts, and in particular that there will always
* be many more runs than tape drives. In our implementation a "tape drive"
* doesn't cost much more than a few Kb of memory buffers, so we can afford
* to have lots of them. In particular, if we can have as many tape drives
* as sorted runs, we can eliminate any repeated I/O at all. In the current
* code we determine the number of tapes M on the basis of workMem: we want
* workMem/M to be large enough that we read a fair amount of data each time
* we preread from a tape, so as to maintain the locality of access described
* above. Nonetheless, with large workMem we can have many tapes.
*
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.57 2006/01/05 01:56:29 momjian Exp $
* $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.58 2006/02/19 05:54:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -120,11 +133,18 @@ typedef enum
} TupSortStatus;
/*
* We use a seven-tape polyphase merge, which is the "sweet spot" on the
* tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2).
* Parameters for calculation of number of tapes to use --- see inittapes().
*
* In this calculation we assume that each tape will cost us about 3 blocks
* worth of buffer space (which is an underestimate for very large data
* volumes, but it's probably close enough --- see logtape.c).
*
* MERGE_BUFFER_SIZE is how much data we'd like to read from each
* tape during a preread cycle (see discussion at top of file).
*/
#define MAXTAPES 7 /* Knuth's T */
#define TAPERANGE (MAXTAPES-1) /* Knuth's P */
#define MINTAPES 7 /* minimum number of tapes */
#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)
#define MERGE_BUFFER_SIZE (BLCKSZ * 32)
/*
* Private state of a Tuplesort operation.
@ -135,6 +155,8 @@ struct Tuplesortstate
bool randomAccess; /* did caller request random access? */
long availMem; /* remaining memory available, in bytes */
long allowedMem; /* total memory allowed, in bytes */
int maxTapes; /* number of tapes (Knuth's T) */
int tapeRange; /* maxTapes-1 (Knuth's P) */
LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
/*
@ -179,7 +201,7 @@ struct Tuplesortstate
* SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
* and FINALMERGE, the tuples are organized in "heap" order per Algorithm
* H. (Note that memtupcount only counts the tuples that are part of the
* heap --- during merge passes, memtuples[] entries beyond TAPERANGE are
* heap --- during merge passes, memtuples[] entries beyond tapeRange are
* never in the heap and are used to hold pre-read tuples.) In state
* SORTEDONTAPE, the array is not used.
*/
@ -204,6 +226,11 @@ struct Tuplesortstate
*/
int currentRun;
/*
* Unless otherwise noted, all pointer variables below are pointers
* to arrays of length maxTapes, holding per-tape data.
*/
/*
* These variables are only used during merge passes. mergeactive[i] is
* true if we are reading an input run from (actual) tape number i and
@ -218,11 +245,10 @@ struct Tuplesortstate
* in these lists, because memtuples[0] is part of the merge heap and is
* never a pre-read tuple.
*/
bool mergeactive[MAXTAPES]; /* Active input run source? */
int mergenext[MAXTAPES]; /* first preread tuple for each source */
int mergelast[MAXTAPES]; /* last preread tuple for each source */
long mergeavailmem[MAXTAPES]; /* availMem for prereading
* tapes */
bool *mergeactive; /* Active input run source? */
int *mergenext; /* first preread tuple for each source */
int *mergelast; /* last preread tuple for each source */
long *mergeavailmem; /* availMem for prereading tapes */
long spacePerTape; /* actual per-tape target usage */
int mergefreelist; /* head of freelist of recycled slots */
int mergefirstfree; /* first slot never used in this merge */
@ -234,10 +260,10 @@ struct Tuplesortstate
*/
int Level; /* Knuth's l */
int destTape; /* current output tape (Knuth's j, less 1) */
int tp_fib[MAXTAPES]; /* Target Fibonacci run counts (A[]) */
int tp_runs[MAXTAPES]; /* # of real runs on each tape */
int tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */
int tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */
int *tp_fib; /* Target Fibonacci run counts (A[]) */
int *tp_runs; /* # of real runs on each tape */
int *tp_dummy; /* # of dummy runs for each tape (D[]) */
int *tp_tapenum; /* Actual tape numbers (TAPE[]) */
/*
* These variables are used after completion of sorting to keep track of
@ -259,8 +285,8 @@ struct Tuplesortstate
*/
TupleDesc tupDesc;
int nKeys;
ScanKey scanKeys;
SortFunctionKind *sortFnKinds;
ScanKey scanKeys; /* array of length nKeys */
SortFunctionKind *sortFnKinds; /* array of length nKeys */
/*
* These variables are specific to the IndexTuple case; they are set by
@ -448,7 +474,10 @@ tuplesort_begin_common(int workMem, bool randomAccess)
state->currentRun = 0;
/* Algorithm D variables will be initialized by inittapes, if needed */
/*
* maxTapes, tapeRange, and Algorithm D variables will be initialized by
* inittapes(), if needed
*/
state->result_tape = -1; /* flag that result tape has not been formed */
@ -1041,6 +1070,29 @@ tuplesort_getdatum(Tuplesortstate *state, bool forward,
return true;
}
/*
* tuplesort_merge_order - report merge order we'll use for given memory
*
* This is exported for use by the planner. allowedMem is in bytes.
*
* This must match the calculation in inittapes. The only reason we
* don't fold the code together is that inittapes wants to know if the
* MINTAPES limitation applies or not.
*/
int
tuplesort_merge_order(long allowedMem)
{
int maxTapes;
/* see inittapes for comments */
maxTapes = (int) ((allowedMem - TAPE_BUFFER_OVERHEAD) /
(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1;
maxTapes = Max(maxTapes, MINTAPES);
/* The merge order is one less than the number of tapes */
return maxTapes - 1;
}
/*
* inittapes - initialize for tape sorting.
@ -1050,16 +1102,64 @@ tuplesort_getdatum(Tuplesortstate *state, bool forward,
static void
inittapes(Tuplesortstate *state)
{
int ntuples,
int maxTapes,
ntuples,
j;
/*
* Determine the number of tapes to use based on allowed memory.
*
* We need T+1 tapes to do a T-way merge, and we want MERGE_BUFFER_SIZE
* tuple workspace for each input tape of the merge. The output tape
* doesn't account for tuple workspace but it does need tape buffer space.
*
* Keep this code in sync with tuplesort_merge_order!
*/
maxTapes = (int) ((state->allowedMem - TAPE_BUFFER_OVERHEAD) /
(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD)) + 1;
/*
* We will use at least MINTAPES regardless, but otherwise we decrease
* availMem to reflect the space that goes into buffers.
*/
if (maxTapes >= MINTAPES)
{
/* maxTapes is OK, adjust availMem */
USEMEM(state, maxTapes * TAPE_BUFFER_OVERHEAD);
}
else
{
/*
* Force minimum tape count. In this path we ignore the tape buffers
* in our space calculation, to avoid driving availMem permanently
* negative if allowedMem is really tiny. (This matches the pre-8.2
* behavior which was to ignore the tape buffers always, on the
* grounds that they were fixed-size overhead.)
*/
maxTapes = MINTAPES;
}
state->maxTapes = maxTapes;
state->tapeRange = maxTapes - 1;
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "switching to external sort: %s",
pg_rusage_show(&state->ru_start));
elog(LOG, "switching to external sort with %d tapes: %s",
maxTapes, pg_rusage_show(&state->ru_start));
#endif
state->tapeset = LogicalTapeSetCreate(MAXTAPES);
/*
* Create the tape set and allocate the per-tape data arrays.
*/
state->tapeset = LogicalTapeSetCreate(maxTapes);
state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
state->mergenext = (int *) palloc0(maxTapes * sizeof(int));
state->mergelast = (int *) palloc0(maxTapes * sizeof(int));
state->mergeavailmem = (long *) palloc0(maxTapes * sizeof(long));
state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
/*
* Allocate the memtupindex array, same size as memtuples.
@ -1087,15 +1187,15 @@ inittapes(Tuplesortstate *state)
/*
* Initialize variables of Algorithm D (step D1).
*/
for (j = 0; j < MAXTAPES; j++)
for (j = 0; j < maxTapes; j++)
{
state->tp_fib[j] = 1;
state->tp_runs[j] = 0;
state->tp_dummy[j] = 1;
state->tp_tapenum[j] = j;
}
state->tp_fib[TAPERANGE] = 0;
state->tp_dummy[TAPERANGE] = 0;
state->tp_fib[state->tapeRange] = 0;
state->tp_dummy[state->tapeRange] = 0;
state->Level = 1;
state->destTape = 0;
@ -1130,7 +1230,7 @@ selectnewtape(Tuplesortstate *state)
/* Step D4: increase level */
state->Level++;
a = state->tp_fib[0];
for (j = 0; j < TAPERANGE; j++)
for (j = 0; j < state->tapeRange; j++)
{
state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
state->tp_fib[j] = a + state->tp_fib[j + 1];
@ -1170,18 +1270,19 @@ mergeruns(Tuplesortstate *state)
}
/* End of step D2: rewind all output tapes to prepare for merging */
for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
LogicalTapeRewind(state->tapeset, tapenum, false);
for (;;)
{
/* Step D5: merge runs onto tape[T] until tape[P] is empty */
while (state->tp_runs[TAPERANGE - 1] || state->tp_dummy[TAPERANGE - 1])
while (state->tp_runs[state->tapeRange - 1] ||
state->tp_dummy[state->tapeRange - 1])
{
bool allDummy = true;
bool allOneRun = true;
for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
{
if (state->tp_dummy[tapenum] == 0)
allDummy = false;
@ -1203,8 +1304,8 @@ mergeruns(Tuplesortstate *state)
}
if (allDummy)
{
state->tp_dummy[TAPERANGE]++;
for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
state->tp_dummy[state->tapeRange]++;
for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
state->tp_dummy[tapenum]--;
}
else
@ -1214,20 +1315,20 @@ mergeruns(Tuplesortstate *state)
if (--state->Level == 0)
break;
/* rewind output tape T to use as new input */
LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE],
LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange],
false);
/* rewind used-up input tape P, and prepare it for write pass */
LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE - 1],
LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange - 1],
true);
state->tp_runs[TAPERANGE - 1] = 0;
state->tp_runs[state->tapeRange - 1] = 0;
/*
* reassign tape units per step D6; note we no longer care about A[]
*/
svTape = state->tp_tapenum[TAPERANGE];
svDummy = state->tp_dummy[TAPERANGE];
svRuns = state->tp_runs[TAPERANGE];
for (tapenum = TAPERANGE; tapenum > 0; tapenum--)
svTape = state->tp_tapenum[state->tapeRange];
svDummy = state->tp_dummy[state->tapeRange];
svRuns = state->tp_runs[state->tapeRange];
for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
{
state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
@ -1246,7 +1347,7 @@ mergeruns(Tuplesortstate *state)
* output tape while rewinding it. The last iteration of step D6 would be
* a waste of cycles anyway...
*/
state->result_tape = state->tp_tapenum[TAPERANGE];
state->result_tape = state->tp_tapenum[state->tapeRange];
LogicalTapeFreeze(state->tapeset, state->result_tape);
state->status = TSS_SORTEDONTAPE;
}
@ -1260,7 +1361,7 @@ mergeruns(Tuplesortstate *state)
static void
mergeonerun(Tuplesortstate *state)
{
int destTape = state->tp_tapenum[TAPERANGE];
int destTape = state->tp_tapenum[state->tapeRange];
int srcTape;
int tupIndex;
void *tup;
@ -1313,7 +1414,7 @@ mergeonerun(Tuplesortstate *state)
* output tape, and increment its count of real runs.
*/
markrunend(state, destTape);
state->tp_runs[TAPERANGE]++;
state->tp_runs[state->tapeRange]++;
#ifdef TRACE_SORT
if (trace_sort)
@ -1341,16 +1442,16 @@ beginmerge(Tuplesortstate *state)
Assert(state->memtupcount == 0);
/* Clear merge-pass state variables */
memset(state->mergeactive, 0, sizeof(state->mergeactive));
memset(state->mergenext, 0, sizeof(state->mergenext));
memset(state->mergelast, 0, sizeof(state->mergelast));
memset(state->mergeavailmem, 0, sizeof(state->mergeavailmem));
memset(state->mergeactive, 0, state->maxTapes * sizeof(*state->mergeactive));
memset(state->mergenext, 0, state->maxTapes * sizeof(*state->mergenext));
memset(state->mergelast, 0, state->maxTapes * sizeof(*state->mergelast));
memset(state->mergeavailmem, 0, state->maxTapes * sizeof(*state->mergeavailmem));
state->mergefreelist = 0; /* nothing in the freelist */
state->mergefirstfree = MAXTAPES; /* first slot available for preread */
state->mergefirstfree = state->maxTapes; /* 1st slot avail for preread */
/* Adjust run counts and mark the active tapes */
activeTapes = 0;
for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
{
if (state->tp_dummy[tapenum] > 0)
state->tp_dummy[tapenum]--;
@ -1370,7 +1471,7 @@ beginmerge(Tuplesortstate *state)
*/
Assert(activeTapes > 0);
state->spacePerTape = state->availMem / activeTapes;
for (srcTape = 0; srcTape < MAXTAPES; srcTape++)
for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
{
if (state->mergeactive[srcTape])
state->mergeavailmem[srcTape] = state->spacePerTape;
@ -1383,7 +1484,7 @@ beginmerge(Tuplesortstate *state)
mergepreread(state);
/* Load the merge heap with the first tuple from each input tape */
for (srcTape = 0; srcTape < MAXTAPES; srcTape++)
for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
{
int tupIndex = state->mergenext[srcTape];
void *tup;
@ -1420,7 +1521,7 @@ mergepreread(Tuplesortstate *state)
long priorAvail,
spaceUsed;
for (srcTape = 0; srcTape < MAXTAPES; srcTape++)
for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
{
if (!state->mergeactive[srcTape])
continue;
@ -1534,9 +1635,9 @@ dumptuples(Tuplesortstate *state, bool alltuples)
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "finished writing%s run %d: %s",
elog(LOG, "finished writing%s run %d to tape %d: %s",
(state->memtupcount == 0) ? " final" : "",
state->currentRun,
state->currentRun, state->destTape,
pg_rusage_show(&state->ru_start));
#endif

View File

@ -13,7 +13,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.17 2004/12/31 22:03:46 pgsql Exp $
* $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.18 2006/02/19 05:54:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -67,6 +67,8 @@ extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
extern void tuplesort_end(Tuplesortstate *state);
extern int tuplesort_merge_order(long allowedMem);
/*
* These routines may only be called if randomAccess was specified 'true'.
* Likewise, backwards scan in gettuple/getdatum is only allowed if