Make use of statistics on index expressions. There are still some
corner cases that could stand improvement, but it does all the basic stuff. A byproduct is that the selectivity routines are no longer constrained to working on simple Vars; we might in future be able to improve the behavior for subexpressions that don't match indexes.
This commit is contained in:
parent
d372bba02d
commit
a536ed53bc
@ -49,7 +49,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.124 2004/02/03 17:34:03 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.125 2004/02/17 00:52:53 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -102,8 +102,6 @@ bool enable_mergejoin = true;
|
||||
bool enable_hashjoin = true;
|
||||
|
||||
|
||||
static Selectivity estimate_hash_bucketsize(Query *root, Var *var,
|
||||
int nbuckets);
|
||||
static bool cost_qual_eval_walker(Node *node, QualCost *total);
|
||||
static Selectivity approx_selectivity(Query *root, List *quals,
|
||||
JoinType jointype);
|
||||
@ -1152,7 +1150,7 @@ cost_hashjoin(HashPath *path, Query *root)
|
||||
/* not cached yet */
|
||||
thisbucketsize =
|
||||
estimate_hash_bucketsize(root,
|
||||
(Var *) get_rightop(restrictinfo->clause),
|
||||
get_rightop(restrictinfo->clause),
|
||||
virtualbuckets);
|
||||
restrictinfo->right_bucketsize = thisbucketsize;
|
||||
}
|
||||
@ -1168,7 +1166,7 @@ cost_hashjoin(HashPath *path, Query *root)
|
||||
/* not cached yet */
|
||||
thisbucketsize =
|
||||
estimate_hash_bucketsize(root,
|
||||
(Var *) get_leftop(restrictinfo->clause),
|
||||
get_leftop(restrictinfo->clause),
|
||||
virtualbuckets);
|
||||
restrictinfo->left_bucketsize = thisbucketsize;
|
||||
}
|
||||
@ -1249,179 +1247,6 @@ cost_hashjoin(HashPath *path, Query *root)
|
||||
path->jpath.path.total_cost = startup_cost + run_cost;
|
||||
}
|
||||
|
||||
/*
|
||||
* Estimate hash bucketsize fraction (ie, number of entries in a bucket
|
||||
* divided by total tuples in relation) if the specified Var is used
|
||||
* as a hash key.
|
||||
*
|
||||
* XXX This is really pretty bogus since we're effectively assuming that the
|
||||
* distribution of hash keys will be the same after applying restriction
|
||||
* clauses as it was in the underlying relation. However, we are not nearly
|
||||
* smart enough to figure out how the restrict clauses might change the
|
||||
* distribution, so this will have to do for now.
|
||||
*
|
||||
* We are passed the number of buckets the executor will use for the given
|
||||
* input relation. If the data were perfectly distributed, with the same
|
||||
* number of tuples going into each available bucket, then the bucketsize
|
||||
* fraction would be 1/nbuckets. But this happy state of affairs will occur
|
||||
* only if (a) there are at least nbuckets distinct data values, and (b)
|
||||
* we have a not-too-skewed data distribution. Otherwise the buckets will
|
||||
* be nonuniformly occupied. If the other relation in the join has a key
|
||||
* distribution similar to this one's, then the most-loaded buckets are
|
||||
* exactly those that will be probed most often. Therefore, the "average"
|
||||
* bucket size for costing purposes should really be taken as something close
|
||||
* to the "worst case" bucket size. We try to estimate this by adjusting the
|
||||
* fraction if there are too few distinct data values, and then scaling up
|
||||
* by the ratio of the most common value's frequency to the average frequency.
|
||||
*
|
||||
* If no statistics are available, use a default estimate of 0.1. This will
|
||||
* discourage use of a hash rather strongly if the inner relation is large,
|
||||
* which is what we want. We do not want to hash unless we know that the
|
||||
* inner rel is well-dispersed (or the alternatives seem much worse).
|
||||
*/
|
||||
static Selectivity
|
||||
estimate_hash_bucketsize(Query *root, Var *var, int nbuckets)
|
||||
{
|
||||
Oid relid;
|
||||
RelOptInfo *rel;
|
||||
HeapTuple tuple;
|
||||
Form_pg_statistic stats;
|
||||
double estfract,
|
||||
ndistinct,
|
||||
mcvfreq,
|
||||
avgfreq;
|
||||
float4 *numbers;
|
||||
int nnumbers;
|
||||
|
||||
/* Ignore any binary-compatible relabeling */
|
||||
if (var && IsA(var, RelabelType))
|
||||
var = (Var *) ((RelabelType *) var)->arg;
|
||||
|
||||
/*
|
||||
* Lookup info about var's relation and attribute; if none available,
|
||||
* return default estimate.
|
||||
*/
|
||||
if (var == NULL || !IsA(var, Var))
|
||||
return 0.1;
|
||||
|
||||
relid = getrelid(var->varno, root->rtable);
|
||||
if (relid == InvalidOid)
|
||||
return 0.1;
|
||||
|
||||
rel = find_base_rel(root, var->varno);
|
||||
|
||||
if (rel->tuples <= 0.0 || rel->rows <= 0.0)
|
||||
return 0.1; /* ensure we can divide below */
|
||||
|
||||
tuple = SearchSysCache(STATRELATT,
|
||||
ObjectIdGetDatum(relid),
|
||||
Int16GetDatum(var->varattno),
|
||||
0, 0);
|
||||
if (!HeapTupleIsValid(tuple))
|
||||
{
|
||||
/*
|
||||
* If the attribute is known unique because of an index,
|
||||
* we can treat it as well-distributed.
|
||||
*/
|
||||
if (has_unique_index(rel, var->varattno))
|
||||
return 1.0 / (double) nbuckets;
|
||||
|
||||
/*
|
||||
* Perhaps the Var is a system attribute; if so, it will have no
|
||||
* entry in pg_statistic, but we may be able to guess something
|
||||
* about its distribution anyway.
|
||||
*/
|
||||
switch (var->varattno)
|
||||
{
|
||||
case ObjectIdAttributeNumber:
|
||||
case SelfItemPointerAttributeNumber:
|
||||
/* these are unique, so buckets should be well-distributed */
|
||||
return 1.0 / (double) nbuckets;
|
||||
case TableOidAttributeNumber:
|
||||
/* hashing this is a terrible idea... */
|
||||
return 1.0;
|
||||
}
|
||||
return 0.1;
|
||||
}
|
||||
stats = (Form_pg_statistic) GETSTRUCT(tuple);
|
||||
|
||||
/*
|
||||
* Obtain number of distinct data values in raw relation.
|
||||
*/
|
||||
ndistinct = stats->stadistinct;
|
||||
if (ndistinct < 0.0)
|
||||
ndistinct = -ndistinct * rel->tuples;
|
||||
|
||||
if (ndistinct <= 0.0) /* ensure we can divide */
|
||||
{
|
||||
ReleaseSysCache(tuple);
|
||||
return 0.1;
|
||||
}
|
||||
|
||||
/* Also compute avg freq of all distinct data values in raw relation */
|
||||
avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
|
||||
|
||||
/*
|
||||
* Adjust ndistinct to account for restriction clauses. Observe we
|
||||
* are assuming that the data distribution is affected uniformly by
|
||||
* the restriction clauses!
|
||||
*
|
||||
* XXX Possibly better way, but much more expensive: multiply by
|
||||
* selectivity of rel's restriction clauses that mention the target
|
||||
* Var.
|
||||
*/
|
||||
ndistinct *= rel->rows / rel->tuples;
|
||||
|
||||
/*
|
||||
* Initial estimate of bucketsize fraction is 1/nbuckets as long as
|
||||
* the number of buckets is less than the expected number of distinct
|
||||
* values; otherwise it is 1/ndistinct.
|
||||
*/
|
||||
if (ndistinct > (double) nbuckets)
|
||||
estfract = 1.0 / (double) nbuckets;
|
||||
else
|
||||
estfract = 1.0 / ndistinct;
|
||||
|
||||
/*
|
||||
* Look up the frequency of the most common value, if available.
|
||||
*/
|
||||
mcvfreq = 0.0;
|
||||
|
||||
if (get_attstatsslot(tuple, var->vartype, var->vartypmod,
|
||||
STATISTIC_KIND_MCV, InvalidOid,
|
||||
NULL, NULL, &numbers, &nnumbers))
|
||||
{
|
||||
/*
|
||||
* The first MCV stat is for the most common value.
|
||||
*/
|
||||
if (nnumbers > 0)
|
||||
mcvfreq = numbers[0];
|
||||
free_attstatsslot(var->vartype, NULL, 0,
|
||||
numbers, nnumbers);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust estimated bucketsize upward to account for skewed
|
||||
* distribution.
|
||||
*/
|
||||
if (avgfreq > 0.0 && mcvfreq > avgfreq)
|
||||
estfract *= mcvfreq / avgfreq;
|
||||
|
||||
/*
|
||||
* Clamp bucketsize to sane range (the above adjustment could easily
|
||||
* produce an out-of-range result). We set the lower bound a little
|
||||
* above zero, since zero isn't a very sane result.
|
||||
*/
|
||||
if (estfract < 1.0e-6)
|
||||
estfract = 1.0e-6;
|
||||
else if (estfract > 1.0)
|
||||
estfract = 1.0;
|
||||
|
||||
ReleaseSysCache(tuple);
|
||||
|
||||
return (Selectivity) estfract;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* cost_qual_eval
|
||||
|
@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.54 2003/12/08 18:19:58 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/optimizer/util/relnode.c,v 1.55 2004/02/17 00:52:53 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -214,12 +214,8 @@ find_base_rel(Query *root, int relid)
|
||||
* find_join_rel
|
||||
* Returns relation entry corresponding to 'relids' (a set of RT indexes),
|
||||
* or NULL if none exists. This is for join relations.
|
||||
*
|
||||
* Note: there is probably no good reason for this to be called from
|
||||
* anywhere except build_join_rel, but keep it as a separate routine
|
||||
* just in case.
|
||||
*/
|
||||
static RelOptInfo *
|
||||
RelOptInfo *
|
||||
find_join_rel(Query *root, Relids relids)
|
||||
{
|
||||
List *joinrels;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.53 2003/11/29 22:41:07 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.54 2004/02/17 00:52:53 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -77,6 +77,7 @@ extern HashPath *create_hashjoin_path(Query *root,
|
||||
extern void build_base_rel(Query *root, int relid);
|
||||
extern RelOptInfo *build_other_rel(Query *root, int relid);
|
||||
extern RelOptInfo *find_base_rel(Query *root, int relid);
|
||||
extern RelOptInfo *find_join_rel(Query *root, Relids relids);
|
||||
extern RelOptInfo *build_join_rel(Query *root,
|
||||
Relids joinrelids,
|
||||
RelOptInfo *outer_rel,
|
||||
|
@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.16 2003/11/29 22:41:16 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.17 2004/02/17 00:52:53 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -77,6 +77,9 @@ extern void mergejoinscansel(Query *root, Node *clause,
|
||||
extern double estimate_num_groups(Query *root, List *groupExprs,
|
||||
double input_rows);
|
||||
|
||||
extern Selectivity estimate_hash_bucketsize(Query *root, Node *hashkey,
|
||||
int nbuckets);
|
||||
|
||||
extern Datum btcostestimate(PG_FUNCTION_ARGS);
|
||||
extern Datum rtcostestimate(PG_FUNCTION_ARGS);
|
||||
extern Datum hashcostestimate(PG_FUNCTION_ARGS);
|
||||
|
Loading…
Reference in New Issue
Block a user