Fix eqjoinsel() to make use of new statistics.
This commit is contained in:
parent
a001f13506
commit
73d1040bd9
@ -15,7 +15,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -940,9 +940,7 @@ Datum
|
|||||||
eqjoinsel(PG_FUNCTION_ARGS)
|
eqjoinsel(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
Query *root = (Query *) PG_GETARG_POINTER(0);
|
Query *root = (Query *) PG_GETARG_POINTER(0);
|
||||||
#ifdef NOT_USED /* see neqjoinsel() before removing me! */
|
|
||||||
Oid operator = PG_GETARG_OID(1);
|
Oid operator = PG_GETARG_OID(1);
|
||||||
#endif
|
|
||||||
List *args = (List *) PG_GETARG_POINTER(2);
|
List *args = (List *) PG_GETARG_POINTER(2);
|
||||||
Var *var1;
|
Var *var1;
|
||||||
Var *var2;
|
Var *var2;
|
||||||
@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
|
|||||||
HeapTuple statsTuple2 = NULL;
|
HeapTuple statsTuple2 = NULL;
|
||||||
Form_pg_statistic stats1 = NULL;
|
Form_pg_statistic stats1 = NULL;
|
||||||
Form_pg_statistic stats2 = NULL;
|
Form_pg_statistic stats2 = NULL;
|
||||||
double nd1,
|
double nd1 = DEFAULT_NUM_DISTINCT;
|
||||||
nd2;
|
double nd2 = DEFAULT_NUM_DISTINCT;
|
||||||
|
bool have_mcvs1 = false;
|
||||||
|
Datum *values1 = NULL;
|
||||||
|
int nvalues1 = 0;
|
||||||
|
float4 *numbers1 = NULL;
|
||||||
|
int nnumbers1 = 0;
|
||||||
|
bool have_mcvs2 = false;
|
||||||
|
Datum *values2 = NULL;
|
||||||
|
int nvalues2 = 0;
|
||||||
|
float4 *numbers2 = NULL;
|
||||||
|
int nnumbers2 = 0;
|
||||||
|
|
||||||
if (var1 == NULL)
|
if (var1 != NULL)
|
||||||
{
|
|
||||||
nd1 = DEFAULT_NUM_DISTINCT;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
/* get stats for the attribute, if available */
|
/* get stats for the attribute, if available */
|
||||||
Oid relid1 = getrelid(var1->varno, root->rtable);
|
Oid relid1 = getrelid(var1->varno, root->rtable);
|
||||||
|
|
||||||
if (relid1 == InvalidOid)
|
if (relid1 != InvalidOid)
|
||||||
nd1 = DEFAULT_NUM_DISTINCT;
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
statsTuple1 = SearchSysCache(STATRELATT,
|
statsTuple1 = SearchSysCache(STATRELATT,
|
||||||
ObjectIdGetDatum(relid1),
|
ObjectIdGetDatum(relid1),
|
||||||
Int16GetDatum(var1->varattno),
|
Int16GetDatum(var1->varattno),
|
||||||
0, 0);
|
0, 0);
|
||||||
if (HeapTupleIsValid(statsTuple1))
|
if (HeapTupleIsValid(statsTuple1))
|
||||||
|
{
|
||||||
stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
|
stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
|
||||||
|
have_mcvs1 = get_attstatsslot(statsTuple1,
|
||||||
|
var1->vartype,
|
||||||
|
var1->vartypmod,
|
||||||
|
STATISTIC_KIND_MCV,
|
||||||
|
InvalidOid,
|
||||||
|
&values1, &nvalues1,
|
||||||
|
&numbers1, &nnumbers1);
|
||||||
|
}
|
||||||
|
|
||||||
nd1 = get_att_numdistinct(root, var1, stats1);
|
nd1 = get_att_numdistinct(root, var1, stats1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (var2 == NULL)
|
if (var2 != NULL)
|
||||||
{
|
|
||||||
nd2 = DEFAULT_NUM_DISTINCT;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
/* get stats for the attribute, if available */
|
/* get stats for the attribute, if available */
|
||||||
Oid relid2 = getrelid(var2->varno, root->rtable);
|
Oid relid2 = getrelid(var2->varno, root->rtable);
|
||||||
|
|
||||||
if (relid2 == InvalidOid)
|
if (relid2 != InvalidOid)
|
||||||
nd2 = DEFAULT_NUM_DISTINCT;
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
statsTuple2 = SearchSysCache(STATRELATT,
|
statsTuple2 = SearchSysCache(STATRELATT,
|
||||||
ObjectIdGetDatum(relid2),
|
ObjectIdGetDatum(relid2),
|
||||||
Int16GetDatum(var2->varattno),
|
Int16GetDatum(var2->varattno),
|
||||||
0, 0);
|
0, 0);
|
||||||
if (HeapTupleIsValid(statsTuple2))
|
if (HeapTupleIsValid(statsTuple2))
|
||||||
|
{
|
||||||
stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
|
stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
|
||||||
|
have_mcvs2 = get_attstatsslot(statsTuple2,
|
||||||
|
var2->vartype,
|
||||||
|
var2->vartypmod,
|
||||||
|
STATISTIC_KIND_MCV,
|
||||||
|
InvalidOid,
|
||||||
|
&values2, &nvalues2,
|
||||||
|
&numbers2, &nnumbers2);
|
||||||
|
}
|
||||||
|
|
||||||
nd2 = get_att_numdistinct(root, var2, stats2);
|
nd2 = get_att_numdistinct(root, var2, stats2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
if (have_mcvs1 && have_mcvs2)
|
||||||
* Estimate the join selectivity as 1 / sqrt(nd1*nd2)
|
{
|
||||||
* (can we produce any theory for this)?
|
/*
|
||||||
*
|
* We have most-common-value lists for both relations. Run
|
||||||
* XXX possibility to do better: if both attributes have histograms
|
* through the lists to see which MCVs actually join to each
|
||||||
* then we could determine the exact join selectivity between the
|
* other with the given operator. This allows us to determine
|
||||||
* MCV sets, and only have to assume the join behavior of the non-MCV
|
* the exact join selectivity for the portion of the relations
|
||||||
* values. This could be a big win when the MCVs cover a large part
|
* represented by the MCV lists. We still have to estimate for
|
||||||
* of the population.
|
* the remaining population, but in a skewed distribution this
|
||||||
*
|
* gives us a big leg up in accuracy. For motivation see the
|
||||||
* XXX what about nulls?
|
* analysis in Y. Ioannidis and S. Christodoulakis, "On the
|
||||||
*/
|
* propagation of errors in the size of join results", Technical
|
||||||
selec = 1.0 / sqrt(nd1 * nd2);
|
* Report 1018, Computer Science Dept., University of Wisconsin,
|
||||||
if (selec > 1.0)
|
* Madison, March 1991 (available from ftp.cs.wisc.edu).
|
||||||
selec = 1.0;
|
*/
|
||||||
|
FmgrInfo eqproc;
|
||||||
|
bool *hasmatch1;
|
||||||
|
bool *hasmatch2;
|
||||||
|
double matchprodfreq,
|
||||||
|
matchfreq1,
|
||||||
|
matchfreq2,
|
||||||
|
unmatchfreq1,
|
||||||
|
unmatchfreq2,
|
||||||
|
otherfreq1,
|
||||||
|
otherfreq2,
|
||||||
|
totalsel1,
|
||||||
|
totalsel2;
|
||||||
|
int i,
|
||||||
|
nmatches;
|
||||||
|
|
||||||
|
fmgr_info(get_opcode(operator), &eqproc);
|
||||||
|
hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
|
||||||
|
memset(hasmatch1, 0, nvalues1 * sizeof(bool));
|
||||||
|
hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
|
||||||
|
memset(hasmatch2, 0, nvalues2 * sizeof(bool));
|
||||||
|
/*
|
||||||
|
* Note we assume that each MCV will match at most one member of
|
||||||
|
* the other MCV list. If the operator isn't really equality,
|
||||||
|
* there could be multiple matches --- but we don't look for them,
|
||||||
|
* both for speed and because the math wouldn't add up...
|
||||||
|
*/
|
||||||
|
matchprodfreq = 0.0;
|
||||||
|
nmatches = 0;
|
||||||
|
for (i = 0; i < nvalues1; i++)
|
||||||
|
{
|
||||||
|
int j;
|
||||||
|
|
||||||
|
for (j = 0; j < nvalues2; j++)
|
||||||
|
{
|
||||||
|
if (hasmatch2[j])
|
||||||
|
continue;
|
||||||
|
if (DatumGetBool(FunctionCall2(&eqproc,
|
||||||
|
values1[i],
|
||||||
|
values2[j])))
|
||||||
|
{
|
||||||
|
hasmatch1[i] = hasmatch2[j] = true;
|
||||||
|
matchprodfreq += numbers1[i] * numbers2[j];
|
||||||
|
nmatches++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Sum up frequencies of matched and unmatched MCVs */
|
||||||
|
matchfreq1 = unmatchfreq1 = 0.0;
|
||||||
|
for (i = 0; i < nvalues1; i++)
|
||||||
|
{
|
||||||
|
if (hasmatch1[i])
|
||||||
|
matchfreq1 += numbers1[i];
|
||||||
|
else
|
||||||
|
unmatchfreq1 += numbers1[i];
|
||||||
|
}
|
||||||
|
matchfreq2 = unmatchfreq2 = 0.0;
|
||||||
|
for (i = 0; i < nvalues2; i++)
|
||||||
|
{
|
||||||
|
if (hasmatch2[i])
|
||||||
|
matchfreq2 += numbers2[i];
|
||||||
|
else
|
||||||
|
unmatchfreq2 += numbers2[i];
|
||||||
|
}
|
||||||
|
pfree(hasmatch1);
|
||||||
|
pfree(hasmatch2);
|
||||||
|
/*
|
||||||
|
* Compute total frequency of non-null values that are not in
|
||||||
|
* the MCV lists.
|
||||||
|
*/
|
||||||
|
otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
|
||||||
|
otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
|
||||||
|
/*
|
||||||
|
* We can estimate the total selectivity from the point of view
|
||||||
|
* of relation 1 as: the known selectivity for matched MCVs, plus
|
||||||
|
* unmatched MCVs that are assumed to match against random members
|
||||||
|
* of relation 2's non-MCV population, plus non-MCV values that
|
||||||
|
* are assumed to match against random members of relation 2's
|
||||||
|
* unmatched MCVs plus non-MCV values.
|
||||||
|
*/
|
||||||
|
totalsel1 = matchprodfreq;
|
||||||
|
if (nd2 > nvalues2)
|
||||||
|
totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
|
||||||
|
if (nd2 > nmatches)
|
||||||
|
totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
|
||||||
|
(nd2 - nmatches);
|
||||||
|
/* Same estimate from the point of view of relation 2. */
|
||||||
|
totalsel2 = matchprodfreq;
|
||||||
|
if (nd1 > nvalues1)
|
||||||
|
totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
|
||||||
|
if (nd1 > nmatches)
|
||||||
|
totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
|
||||||
|
(nd1 - nmatches);
|
||||||
|
/*
|
||||||
|
* For robustness, we average the two estimates. (Can a case
|
||||||
|
* be made for taking the min or max instead?)
|
||||||
|
*/
|
||||||
|
selec = (totalsel1 + totalsel2) * 0.5;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* We do not have MCV lists for both sides. Estimate the
|
||||||
|
* join selectivity as MIN(1/nd1, 1/nd2). This is plausible
|
||||||
|
* if we assume that the values are about equally distributed:
|
||||||
|
* a given tuple of rel1 will join to either 0 or N2/nd2 rows
|
||||||
|
* of rel2, so total join rows are at most N1*N2/nd2 giving
|
||||||
|
* a join selectivity of not more than 1/nd2. By the same logic
|
||||||
|
* it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
|
||||||
|
* bound. Using the MIN() means we estimate from the point of
|
||||||
|
* view of the relation with smaller nd (since the larger nd is
|
||||||
|
* determining the MIN). It is reasonable to assume that most
|
||||||
|
* tuples in this rel will have join partners, so the bound is
|
||||||
|
* probably reasonably tight and should be taken as-is.
|
||||||
|
*
|
||||||
|
* XXX Can we be smarter if we have an MCV list for just one side?
|
||||||
|
* It seems that if we assume equal distribution for the other
|
||||||
|
* side, we end up with the same answer anyway.
|
||||||
|
*/
|
||||||
|
if (nd1 > nd2)
|
||||||
|
selec = 1.0 / nd1;
|
||||||
|
else
|
||||||
|
selec = 1.0 / nd2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (have_mcvs1)
|
||||||
|
free_attstatsslot(var1->vartype, values1, nvalues1,
|
||||||
|
numbers1, nnumbers1);
|
||||||
|
if (have_mcvs2)
|
||||||
|
free_attstatsslot(var2->vartype, values2, nvalues2,
|
||||||
|
numbers2, nnumbers2);
|
||||||
if (HeapTupleIsValid(statsTuple1))
|
if (HeapTupleIsValid(statsTuple1))
|
||||||
ReleaseSysCache(statsTuple1);
|
ReleaseSysCache(statsTuple1);
|
||||||
if (HeapTupleIsValid(statsTuple2))
|
if (HeapTupleIsValid(statsTuple2))
|
||||||
@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
|
|||||||
Datum
|
Datum
|
||||||
neqjoinsel(PG_FUNCTION_ARGS)
|
neqjoinsel(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
|
Query *root = (Query *) PG_GETARG_POINTER(0);
|
||||||
|
Oid operator = PG_GETARG_OID(1);
|
||||||
|
List *args = (List *) PG_GETARG_POINTER(2);
|
||||||
|
Oid eqop;
|
||||||
float8 result;
|
float8 result;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX we skip looking up the negator operator here because we know
|
* We want 1 - eqjoinsel() where the equality operator is the one
|
||||||
* eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
|
* associated with this != operator, that is, its negator.
|
||||||
* look, this routine will need to look more like neqsel() does.
|
|
||||||
*/
|
*/
|
||||||
result = DatumGetFloat8(eqjoinsel(fcinfo));
|
eqop = get_negator(operator);
|
||||||
|
if (eqop)
|
||||||
|
{
|
||||||
|
result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
|
||||||
|
PointerGetDatum(root),
|
||||||
|
ObjectIdGetDatum(eqop),
|
||||||
|
PointerGetDatum(args)));
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Use default selectivity (should we raise an error instead?) */
|
||||||
|
result = DEFAULT_EQ_SEL;
|
||||||
|
}
|
||||||
result = 1.0 - result;
|
result = 1.0 - result;
|
||||||
PG_RETURN_FLOAT8(result);
|
PG_RETURN_FLOAT8(result);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user