Allow per-tablespace effective_io_concurrency
Per discussion, nowadays it is possible to have tablespaces that have wildly
different I/O characteristics from others.  Setting different
effective_io_concurrency parameters for those has been measured to improve
performance.

Author: Julien Rouhaud
Reviewed by: Andres Freund

commit 1aba62ec63 (parent 665a00c9e2)

@@ -1901,7 +1901,10 @@ include_dir 'conf.d'
         </para>

         <para>
-         The default is 1 on supported systems, otherwise 0.
+         The default is 1 on supported systems, otherwise 0.  This value can
+         be overridden for tables in a particular tablespace by setting the
+         tablespace parameter of the same name (see
+         <xref linkend="sql-altertablespace">).
         </para>
        </listitem>
       </varlistentry>

@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
     <listitem>
      <para>
       A tablespace parameter to be set or reset.  Currently, the only
-      available parameters are <varname>seq_page_cost</> and
-      <varname>random_page_cost</>.  Setting either value for a particular
-      tablespace will override the planner's usual estimate of the cost of
-      reading pages from tables in that tablespace, as established by
-      the configuration parameters of the same name (see
-      <xref linkend="guc-seq-page-cost">,
-      <xref linkend="guc-random-page-cost">).  This may be useful if one
-      tablespace is located on a disk which is faster or slower than the
+      available parameters are <varname>seq_page_cost</>,
+      <varname>random_page_cost</> and <varname>effective_io_concurrency</>.
+      Setting either value for a particular tablespace will override the
+      planner's usual estimate of the cost of reading pages from tables in
+      that tablespace, as established by the configuration parameters of the
+      same name (see <xref linkend="guc-seq-page-cost">,
+      <xref linkend="guc-random-page-cost">,
+      <xref linkend="guc-effective-io-concurrency">).  This may be useful if
+      one tablespace is located on a disk which is faster or slower than the
       remainder of the I/O subsystem.
      </para>
     </listitem>

@@ -254,6 +254,19 @@ static relopt_int intRelOpts[] =
         },
         -1, 64, MAX_KILOBYTES
     },
+    {
+        {
+            "effective_io_concurrency",
+            "Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
+            RELOPT_KIND_TABLESPACE,
+            AccessExclusiveLock
+        },
+#ifdef USE_PREFETCH
+        -1, 0, MAX_IO_CONCURRENCY
+#else
+        0, 0, 0
+#endif
+    },

     /* list terminator */
     {{NULL}}
@@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
     int         numoptions;
     static const relopt_parse_elt tab[] = {
         {"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
-        {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
+        {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
+        {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
     };

     options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,

@@ -44,6 +44,7 @@
 #include "storage/predicate.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"

@@ -95,9 +96,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
      * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
      * the prefetch iterator is.  Also, node->prefetch_target tracks the
      * desired prefetch distance, which starts small and increases up to the
-     * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
-     * a lot of prefetching in a scan that stops after a few tuples because of
-     * a LIMIT.
+     * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
+     * a scan that stops after a few tuples because of a LIMIT.
      */
     if (tbm == NULL)
     {
@@ -111,7 +111,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
         node->tbmres = tbmres = NULL;

 #ifdef USE_PREFETCH
-        if (target_prefetch_pages > 0)
+        if (node->prefetch_maximum > 0)
         {
             node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
             node->prefetch_pages = 0;
@@ -188,10 +188,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
              * page/tuple, then to one after the second tuple is fetched, then
              * it doubles as later pages are fetched.
              */
-            if (node->prefetch_target >= target_prefetch_pages)
+            if (node->prefetch_target >= node->prefetch_maximum)
                  /* don't increase any further */ ;
-            else if (node->prefetch_target >= target_prefetch_pages / 2)
-                node->prefetch_target = target_prefetch_pages;
+            else if (node->prefetch_target >= node->prefetch_maximum / 2)
+                node->prefetch_target = node->prefetch_maximum;
             else if (node->prefetch_target > 0)
                 node->prefetch_target *= 2;
             else

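The hunk above ramps node->prefetch_target from 1 up to node->prefetch_maximum
by doubling.  The standalone program below is not part of the patch; it simply
replays that heuristic with an assumed prefetch_maximum of 8 to show the
resulting sequence.

    #include <stdio.h>

    int
    main(void)
    {
        int     prefetch_maximum = 8;   /* assumed value for this demo only */
        int     prefetch_target = 0;
        int     step;

        for (step = 1; step <= 6; step++)
        {
            /* same doubling heuristic as in BitmapHeapNext above */
            if (prefetch_target >= prefetch_maximum)
                 /* don't increase any further */ ;
            else if (prefetch_target >= prefetch_maximum / 2)
                prefetch_target = prefetch_maximum;
            else if (prefetch_target > 0)
                prefetch_target *= 2;
            else
                prefetch_target++;

            printf("after step %d: prefetch_target = %d\n", step, prefetch_target);
        }
        return 0;       /* prints 1, 2, 4, 8, 8, 8 */
    }
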
@@ -211,7 +211,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
              * Try to prefetch at least a few pages even before we get to the
              * second page if we don't stop reading after the first tuple.
              */
-            if (node->prefetch_target < target_prefetch_pages)
+            if (node->prefetch_target < node->prefetch_maximum)
                 node->prefetch_target++;
 #endif   /* USE_PREFETCH */
         }
@@ -539,6 +539,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 {
     BitmapHeapScanState *scanstate;
     Relation    currentRelation;
+    int         io_concurrency;

     /* check for unsupported flags */
     Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -564,6 +565,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
     scanstate->prefetch_iterator = NULL;
     scanstate->prefetch_pages = 0;
     scanstate->prefetch_target = 0;
+    /* may be updated below */
+    scanstate->prefetch_maximum = target_prefetch_pages;

     /*
      * Miscellaneous initialization
@@ -598,6 +601,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
      */
     currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

+    /*
+     * Determine the maximum for prefetch_target.  If the tablespace has a
+     * specific IO concurrency set, use that to compute the corresponding
+     * maximum value; otherwise, we already initialized to the value computed
+     * by the GUC machinery.
+     */
+    io_concurrency =
+        get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
+    if (io_concurrency != effective_io_concurrency)
+    {
+        double      maximum;
+
+        if (ComputeIoConcurrency(io_concurrency, &maximum))
+            scanstate->prefetch_maximum = rint(maximum);
+    }
+
     scanstate->ss.ss_currentRelation = currentRelation;

     /*

@@ -80,11 +80,14 @@ bool zero_damaged_pages = false;
 int         bgwriter_lru_maxpages = 100;
 double      bgwriter_lru_multiplier = 2.0;
 bool        track_io_timing = false;
+int         effective_io_concurrency = 0;

 /*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
  * ReadBuffer calls by.  This is maintained by the assign hook for
- * effective_io_concurrency.  Zero means "never prefetch".
+ * effective_io_concurrency.  Zero means "never prefetch".  This value is
+ * only used for buffers not belonging to tablespaces that have their
+ * effective_io_concurrency parameter set.
  */
 int         target_prefetch_pages = 0;

@@ -415,6 +418,64 @@ static void CheckForBufferLeaks(void);
 static int  rnode_comparator(const void *p1, const void *p2);


+/*
+ * ComputeIoConcurrency -- get the number of pages to prefetch for a given
+ *      number of spindles.
+ */
+bool
+ComputeIoConcurrency(int io_concurrency, double *target)
+{
+    double      new_prefetch_pages = 0.0;
+    int         i;
+
+    /*
+     * Make sure the io_concurrency value is within valid range; it may have
+     * been forced with a manual pg_tablespace update.
+     */
+    io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
+
+    /*----------
+     * The user-visible GUC parameter is the number of drives (spindles),
+     * which we need to translate to a number-of-pages-to-prefetch target.
+     * The target value is stashed in *extra and then assigned to the actual
+     * variable by assign_effective_io_concurrency.
+     *
+     * The expected number of prefetch pages needed to keep N drives busy is:
+     *
+     * drives |   I/O requests
+     * -------+----------------
+     *      1 |   1
+     *      2 |   2/1 + 2/2 = 3
+     *      3 |   3/1 + 3/2 + 3/3 = 5 1/2
+     *      4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+     *      n |   n * H(n)
+     *
+     * This is called the "coupon collector problem" and H(n) is called the
+     * harmonic series.  This could be approximated by n * ln(n), but for
+     * reasonable numbers of drives we might as well just compute the series.
+     *
+     * Alternatively we could set the target to the number of pages necessary
+     * so that the expected number of active spindles is some arbitrary
+     * percentage of the total.  This sounds the same but is actually slightly
+     * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
+     * that desired fraction.
+     *
+     * Experimental results show that both of these formulas aren't aggressive
+     * enough, but we don't really have any better proposals.
+     *
+     * Note that if io_concurrency = 0 (disabled), we must set target = 0.
+     *----------
+     */
+
+    for (i = 1; i <= io_concurrency; i++)
+        new_prefetch_pages += (double) io_concurrency / (double) i;
+
+    *target = new_prefetch_pages;
+
+    /* This range check shouldn't fail, but let's be paranoid */
+    return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX);
+}
+
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
  *

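For a concrete instance of the harmonic-series formula in the comment above
(using nothing beyond what the comment already states), the page target for n
spindles is

    target(n) = \sum_{i=1}^{n} \frac{n}{i} = n \cdot H(n)

so the GUC default of 1 keeps the target at 1, while io_concurrency = 4 gives
4 * (1 + 1/2 + 1/3 + 1/4) = 25/3, about 8.33, which the executor hunk above
rounds with rint() to a prefetch_maximum of 8.
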
src/backend/utils/cache/spccache.c
@@ -23,6 +23,7 @@
 #include "commands/tablespace.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "storage/bufmgr.h"
 #include "utils/catcache.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
@@ -198,3 +199,14 @@ get_tablespace_page_costs(Oid spcid,
             *spc_seq_page_cost = spc->opts->seq_page_cost;
     }
 }
+
+int
+get_tablespace_io_concurrency(Oid spcid)
+{
+    TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+    if (!spc->opts || spc->opts->effective_io_concurrency < 0)
+        return effective_io_concurrency;
+    else
+        return spc->opts->effective_io_concurrency;
+}

@@ -490,7 +490,6 @@ static int wal_block_size;
 static bool data_checksums;
 static int  wal_segment_size;
 static bool integer_datetimes;
-static int  effective_io_concurrency;
 static bool assert_enabled;

 /* should be static, but commands/variable.c needs to get at this */
@@ -2352,7 +2351,7 @@ static struct config_int ConfigureNamesInt[] =
         },
         &effective_io_concurrency,
 #ifdef USE_PREFETCH
-        1, 0, 1000,
+        1, 0, MAX_IO_CONCURRENCY,
 #else
         0, 0, 0,
 #endif
@@ -9986,47 +9985,9 @@ static bool
 check_effective_io_concurrency(int *newval, void **extra, GucSource source)
 {
 #ifdef USE_PREFETCH
-    double      new_prefetch_pages = 0.0;
-    int         i;
+    double      new_prefetch_pages;

-    /*----------
-     * The user-visible GUC parameter is the number of drives (spindles),
-     * which we need to translate to a number-of-pages-to-prefetch target.
-     * The target value is stashed in *extra and then assigned to the actual
-     * variable by assign_effective_io_concurrency.
-     *
-     * The expected number of prefetch pages needed to keep N drives busy is:
-     *
-     * drives |   I/O requests
-     * -------+----------------
-     *      1 |   1
-     *      2 |   2/1 + 2/2 = 3
-     *      3 |   3/1 + 3/2 + 3/3 = 5 1/2
-     *      4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
-     *      n |   n * H(n)
-     *
-     * This is called the "coupon collector problem" and H(n) is called the
-     * harmonic series.  This could be approximated by n * ln(n), but for
-     * reasonable numbers of drives we might as well just compute the series.
-     *
-     * Alternatively we could set the target to the number of pages necessary
-     * so that the expected number of active spindles is some arbitrary
-     * percentage of the total.  This sounds the same but is actually slightly
-     * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
-     * that desired fraction.
-     *
-     * Experimental results show that both of these formulas aren't aggressive
-     * enough, but we don't really have any better proposals.
-     *
-     * Note that if *newval = 0 (disabled), we must set target = 0.
-     *----------
-     */
-
-    for (i = 1; i <= *newval; i++)
-        new_prefetch_pages += (double) *newval / (double) i;
-
-    /* This range check shouldn't fail, but let's be paranoid */
-    if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+    if (ComputeIoConcurrency(*newval, &new_prefetch_pages))
     {
         int        *myextra = (int *) guc_malloc(ERROR, sizeof(int));

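The removed comment notes that the computed target is stashed in *extra and
applied later by assign_effective_io_concurrency, which is not part of this
diff.  As a rough sketch only, assuming the standard GUC assign-hook signature,
that hook boils down to copying the stashed value into the global default:

    extern int  target_prefetch_pages;     /* declared in storage/bufmgr.h */

    /* sketch, not taken from this diff */
    static void
    assign_effective_io_concurrency(int newval, void *extra)
    {
    #ifdef USE_PREFETCH
        /* the page target the check hook parked in *extra becomes the default */
        target_prefetch_pages = *((int *) extra);
    #endif
    }

The executor hunk earlier bypasses this global when a tablespace carries its
own setting and calls ComputeIoConcurrency() directly instead.
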
@@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end)
              pg_strcasecmp(prev_wd, "(") == 0)
     {
         static const char *const list_TABLESPACEOPTIONS[] =
-        {"seq_page_cost", "random_page_cost", NULL};
+        {"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};

         COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
     }

@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
     int32       vl_len_;        /* varlena header (do not touch directly!) */
     float8      random_page_cost;
     float8      seq_page_cost;
+    int         effective_io_concurrency;
 } TableSpaceOpts;

 extern Oid  CreateTableSpace(CreateTableSpaceStmt *stmt);

@@ -1424,7 +1424,8 @@ typedef struct BitmapIndexScanState
  *      lossy_pages        total number of lossy pages retrieved
  *      prefetch_iterator  iterator for prefetching ahead of current page
  *      prefetch_pages     # pages prefetch iterator is ahead of current
- *      prefetch_target    target prefetch distance
+ *      prefetch_target    current target prefetch distance
+ *      prefetch_maximum   maximum value for prefetch_target
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1439,6 +1440,7 @@ typedef struct BitmapHeapScanState
     TBMIterator *prefetch_iterator;
     int         prefetch_pages;
     int         prefetch_target;
+    int         prefetch_maximum;
 } BitmapHeapScanState;

 /* ----------------

@@ -58,11 +58,17 @@ extern int target_prefetch_pages;
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;

+/* in guc.c */
+extern int  effective_io_concurrency;
+
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
 extern PGDLLIMPORT Block *LocalBufferBlockPointers;
 extern PGDLLIMPORT int32 *LocalRefCount;

+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY  1000
+
 /* special block number for ReadBuffer() */
 #define P_NEW   InvalidBlockNumber      /* grow the file to get a new page */

@@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 /*
  * prototypes for functions in bufmgr.c
  */
+extern bool ComputeIoConcurrency(int io_concurrency, double *target);
 extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
                            BlockNumber blockNum);
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);

@@ -15,5 +15,6 @@

 void        get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
                               float8 *spc_seq_page_cost);
+int         get_tablespace_io_concurrency(Oid spcid);

 #endif   /* SPCCACHE_H */