Code review for EXPLAIN patch. Fix some typos, make it behave sanely

across multiple loops, get rid of the shaky assumption that exactly one
tuple is returned per node iteration.
This commit is contained in:
Tom Lane 2006-05-30 19:24:25 +00:00
parent 3993bd8e07
commit a18ebc5541
2 changed files with 113 additions and 76 deletions

View File

@ -7,90 +7,117 @@
* Copyright (c) 2001-2006, PostgreSQL Global Development Group * Copyright (c) 2001-2006, PostgreSQL Global Development Group
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $ * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
#include "postgres.h" #include "postgres.h"
#include <unistd.h>
#include <math.h> #include <math.h>
#include <unistd.h>
#include "executor/instrument.h" #include "executor/instrument.h"
/* This is the function that is used to determine the sampling intervals. In
* general, if the function is f(x), then for N tuples we will take on the
* order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
* number of samples that would be collected over 1,000,000 tuples.
f(x) = x => log2(N) 20
f(x) = x^(1/2) => 2 * N^(1/2) 2000
f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
/*
* As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
* by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
* (Because that requires a kernel call on most systems, it's expensive.)
*
* This macro determines the sampling interval: that is, after how many more
* iterations will we take the next time sample, given that niters iterations
* have occurred already. In general, if the function is f(x), then for N
* iterations we will take on the order of integral(1/f(x), x=0..N)
* samples. Some examples follow, with the number of samples that would be
* collected over 1,000,000 iterations:
*
* f(x) = x => log2(N) 20
* f(x) = x^(1/2) => 2 * N^(1/2) 2000
* f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
*
* I've chosen the last one as it seems to provide a good compromise between * I've chosen the last one as it seems to provide a good compromise between
* low overhead but still getting a meaningful number of samples. However, * low overhead but still getting a meaningful number of samples. However,
* not all machines have the cbrt() function so on those we substitute * not all machines have the cbrt() function so on those we substitute
* sqrt(). The difference is not very significant in the tests I made. * sqrt(). The difference is not very significant in the tests I made.
*/ *
* The actual sampling interval is randomized with the SampleFunc() value
* as the mean; this hopefully will reduce any measurement bias due to
* cyclic variation in the node execution time.
*/
#ifdef HAVE_CBRT #ifdef HAVE_CBRT
#define SampleFunc cbrt #define SampleFunc(niters) cbrt(niters)
#else #else
#define SampleFunc sqrt #define SampleFunc(niters) sqrt(niters)
#endif #endif
#define SampleInterval(niters) \
(SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
/*
* We sample at every node iteration until we've reached this threshold,
* so that nodes not called a large number of times are completely accurate.
* (Perhaps this should be a GUC variable? But beware of messing up
* CalculateSampleOverhead if value is too small.)
*/
#define SAMPLE_THRESHOLD 50 #define SAMPLE_THRESHOLD 50
/*
* Determine the sampling overhead. This only needs to be done once per
* backend (actually, probably once per postmaster would do ...)
*/
static double SampleOverhead; static double SampleOverhead;
static bool SampleOverheadCalculated; static bool SampleOverheadCalculated = false;
static void static void
CalculateSampleOverhead() CalculateSampleOverhead(void)
{ {
Instrumentation instr; Instrumentation instr;
int i; int i;
/* We want to determine the sampling overhead, to correct /*
* calculations later. This only needs to be done once per backend. * We could get a wrong result due to being interrupted by task switch.
* Is this the place? A wrong value here (due to a mistimed
* task-switch) will cause bad calculations later.
*
* To minimize the risk we do it a few times and take the lowest. * To minimize the risk we do it a few times and take the lowest.
*/ */
SampleOverhead = 1.0e6; SampleOverhead = 1.0e6;
for( i = 0; i<5; i++ ) for (i = 0; i < 5; i++)
{ {
int j; int j;
double overhead; double overhead;
memset( &instr, 0, sizeof(instr) ); memset(&instr, 0, sizeof(instr));
/*
/* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */ * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
for( j=0; j<SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ ) * so that's as far as we can test.
*/
for (j=0; j < SAMPLE_THRESHOLD; j++)
{ {
InstrStartNode( &instr ); InstrStartNode(&instr);
InstrStopNode( &instr, 1 ); InstrStopNode(&instr, 1);
} }
overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount; overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
if( overhead < SampleOverhead ) if (overhead < SampleOverhead)
SampleOverhead = overhead; SampleOverhead = overhead;
} }
SampleOverheadCalculated = true; SampleOverheadCalculated = true;
} }
/* Allocate new instrumentation structure(s) */ /* Allocate new instrumentation structure(s) */
Instrumentation * Instrumentation *
InstrAlloc(int n) InstrAlloc(int n)
{ {
Instrumentation *instr = palloc0(n * sizeof(Instrumentation)); Instrumentation *instr;
/* Calculate sampling overhead, if not done yet in this backend */
if (!SampleOverheadCalculated)
CalculateSampleOverhead();
instr = palloc0(n * sizeof(Instrumentation));
/* we don't need to do any initialization except zero 'em */ /* we don't need to do any initialization except zero 'em */
/* Calculate overhead, if not done yet */
if( !SampleOverheadCalculated )
CalculateSampleOverhead();
return instr; return instr;
} }
@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
{ {
if (INSTR_TIME_IS_ZERO(instr->starttime)) if (INSTR_TIME_IS_ZERO(instr->starttime))
{ {
/* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */ /*
if (instr->tuplecount < SAMPLE_THRESHOLD) * Always sample if not yet up to threshold, else check whether
* next threshold has been reached
*/
if (instr->itercount < SAMPLE_THRESHOLD)
instr->sampling = true; instr->sampling = true;
else else if (instr->itercount >= instr->nextsample)
{
/* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
if( instr->tuplecount > instr->nextsample )
{ {
instr->sampling = true; instr->sampling = true;
/* The doubling is so the random will average 1 over time */ instr->nextsample =
instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX; instr->itercount + SampleInterval(instr->itercount);
}
} }
if (instr->sampling) if (instr->sampling)
INSTR_TIME_SET_CURRENT(instr->starttime); INSTR_TIME_SET_CURRENT(instr->starttime);
@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
void void
InstrStopNode(Instrumentation *instr, double nTuples) InstrStopNode(Instrumentation *instr, double nTuples)
{ {
instr_time endtime; /* count the returned tuples and iterations */
/* count the returned tuples */
instr->tuplecount += nTuples; instr->tuplecount += nTuples;
instr->itercount += 1;
/* measure runtime if appropriate */
if (instr->sampling) if (instr->sampling)
{ {
instr_time endtime;
if (INSTR_TIME_IS_ZERO(instr->starttime)) if (INSTR_TIME_IS_ZERO(instr->starttime))
{ {
elog(DEBUG2, "InstrStopNode called without start"); elog(DEBUG2, "InstrStopNode called without start");
@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
#endif #endif
INSTR_TIME_SET_ZERO(instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime);
instr->samplecount += nTuples;
instr->samplecount += 1;
instr->sampling = false; instr->sampling = false;
} }
@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
if (!INSTR_TIME_IS_ZERO(instr->starttime)) if (!INSTR_TIME_IS_ZERO(instr->starttime))
elog(DEBUG2, "InstrEndLoop called on running node"); elog(DEBUG2, "InstrEndLoop called on running node");
/* Accumulate per-cycle statistics into totals */ /* Compute time spent in node */
totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
instr->startup += instr->firsttuple; /*
* If we didn't measure runtime on every iteration, then we must increase
/* Here we take into account sampling effects. Doing it naively ends * the measured total to account for the other iterations. Naively
* up assuming the sampling overhead applies to all tuples, even the * multiplying totaltime by itercount/samplecount would be wrong because
* ones we didn't measure. We've calculated an overhead, so we * it effectively assumes the sampling overhead applies to all iterations,
* subtract that for all samples we didn't measure. The first tuple * even the ones we didn't measure. (Note that what we are trying to
* is also special cased, because it usually takes longer. */ * estimate here is the actual time spent in the node, including the
* actual measurement overhead; not the time exclusive of measurement
if( instr->samplecount < instr->tuplecount ) * overhead.) We exclude the first iteration from the correction basis,
* because it often takes longer than others.
*/
if (instr->itercount > instr->samplecount)
{ {
double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1); double per_iter;
instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
+ ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
}
else
instr->total += totaltime;
per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
- SampleOverhead;
if (per_iter > 0) /* sanity check */
totaltime += per_iter * (instr->itercount - instr->samplecount);
}
/* Accumulate per-cycle statistics into totals */
instr->startup += instr->firsttuple;
instr->total += totaltime;
instr->ntuples += instr->tuplecount; instr->ntuples += instr->tuplecount;
instr->nsamples += instr->samplecount;
instr->nloops += 1; instr->nloops += 1;
/* Reset for next cycle (if any) */ /* Reset for next cycle (if any) */
instr->running = false; instr->running = false;
instr->sampling = false;
INSTR_TIME_SET_ZERO(instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime);
INSTR_TIME_SET_ZERO(instr->counter); INSTR_TIME_SET_ZERO(instr->counter);
instr->firsttuple = 0; instr->firsttuple = 0;
instr->samplecount = 0;
instr->tuplecount = 0; instr->tuplecount = 0;
instr->itercount = 0;
instr->samplecount = 0;
instr->nextsample = 0;
} }

View File

@ -6,7 +6,7 @@
* *
* Copyright (c) 2001-2006, PostgreSQL Global Development Group * Copyright (c) 2001-2006, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.14 2006/05/30 14:01:58 momjian Exp $ * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.15 2006/05/30 19:24:25 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -57,20 +57,19 @@ typedef struct Instrumentation
{ {
/* Info about current plan cycle: */ /* Info about current plan cycle: */
bool running; /* TRUE if we've completed first tuple */ bool running; /* TRUE if we've completed first tuple */
bool sampling; /* Are we sampling in current iteration? */
instr_time starttime; /* Start time of current iteration of node */ instr_time starttime; /* Start time of current iteration of node */
instr_time counter; /* Accumulated runtime for this node */ instr_time counter; /* Accumulated runtime for this node */
double firsttuple; /* Time for first tuple of this cycle */ double firsttuple; /* Time for first tuple of this cycle */
double tuplecount; /* Tuples emitted so far this cycle */ double tuplecount; /* Tuples emitted so far this cycle */
double samplecount; /* Samples collected this cycle */ double itercount; /* Plan node iterations this cycle */
double samplecount; /* Iterations in which we sampled runtime */
double nextsample; /* Next itercount to sample at */
/* Accumulated statistics across all completed cycles: */ /* Accumulated statistics across all completed cycles: */
double startup; /* Total startup time (in seconds) */ double startup; /* Total startup time (in seconds) */
double total; /* Total total time (in seconds) */ double total; /* Total total time (in seconds) */
double ntuples; /* Total tuples produced */ double ntuples; /* Total tuples produced */
double nloops; /* # of run cycles for this node */ double nloops; /* # of run cycles for this node */
double nsamples; /* # of samples taken */
/* Tracking for sampling */
bool sampling; /* Are we sampling this iteration */
double nextsample; /* The next tuplecount we're going to sample */
} Instrumentation; } Instrumentation;
extern Instrumentation *InstrAlloc(int n); extern Instrumentation *InstrAlloc(int n);