Fix eqjoinsel() to make use of new statistics.

2001-05-27 17:37:48 +00:00 · 2001-05-27 17:37:48 +00:00 · 73d1040bd9
commit 73d1040bd9
parent a001f13506
1 changed files with 200 additions and 40 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -15,7 +15,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -940,9 +940,7 @@ Datum
 eqjoinsel(PG_FUNCTION_ARGS)
 {
 	Query	   *root = (Query *) PG_GETARG_POINTER(0);
 #ifdef NOT_USED					/* see neqjoinsel() before removing me! */
 	Oid			operator = PG_GETARG_OID(1);
 #endif
 	List	   *args = (List *) PG_GETARG_POINTER(2);
 	Var		   *var1;
 	Var		   *var2;
@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
 		HeapTuple	statsTuple2 = NULL;
 		Form_pg_statistic stats1 = NULL;
 		Form_pg_statistic stats2 = NULL;
-		double		nd1,
+		double		nd1 = DEFAULT_NUM_DISTINCT;
-					nd2;
+		double		nd2 = DEFAULT_NUM_DISTINCT;
 		bool		have_mcvs1 = false;
 		Datum	   *values1 = NULL;
 		int			nvalues1 = 0;
 		float4	   *numbers1 = NULL;
 		int			nnumbers1 = 0;
 		bool		have_mcvs2 = false;
 		Datum	   *values2 = NULL;
 		int			nvalues2 = 0;
 		float4	   *numbers2 = NULL;
 		int			nnumbers2 = 0;
-		if (var1 == NULL)
+		if (var1 != NULL)
 		{
 			nd1 = DEFAULT_NUM_DISTINCT;
 		}
 		else
 		{
 			/* get stats for the attribute, if available */
 			Oid		relid1 = getrelid(var1->varno, root->rtable);
-			if (relid1 == InvalidOid)
+			if (relid1 != InvalidOid)
 				nd1 = DEFAULT_NUM_DISTINCT;
 			else
 			{
 				statsTuple1 = SearchSysCache(STATRELATT,
 											 ObjectIdGetDatum(relid1),
 											 Int16GetDatum(var1->varattno),
 											 0, 0);
 				if (HeapTupleIsValid(statsTuple1))
 				{
 					stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
 					have_mcvs1 = get_attstatsslot(statsTuple1,
 												  var1->vartype,
 												  var1->vartypmod,
 												  STATISTIC_KIND_MCV,
 												  InvalidOid,
 												  &values1, &nvalues1,
 												  &numbers1, &nnumbers1);
 				}
 				nd1 = get_att_numdistinct(root, var1, stats1);
 			}
 		}
-		if (var2 == NULL)
+		if (var2 != NULL)
 		{
 			nd2 = DEFAULT_NUM_DISTINCT;
 		}
 		else
 		{
 			/* get stats for the attribute, if available */
 			Oid		relid2 = getrelid(var2->varno, root->rtable);
-			if (relid2 == InvalidOid)
+			if (relid2 != InvalidOid)
 				nd2 = DEFAULT_NUM_DISTINCT;
 			else
 			{
 				statsTuple2 = SearchSysCache(STATRELATT,
 											 ObjectIdGetDatum(relid2),
 											 Int16GetDatum(var2->varattno),
 											 0, 0);
 				if (HeapTupleIsValid(statsTuple2))
 				{
 					stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
 					have_mcvs2 = get_attstatsslot(statsTuple2,
 												  var2->vartype,
 												  var2->vartypmod,
 												  STATISTIC_KIND_MCV,
 												  InvalidOid,
 												  &values2, &nvalues2,
 												  &numbers2, &nnumbers2);
 				}
 				nd2 = get_att_numdistinct(root, var2, stats2);
 			}
 		}
-		/*
+		if (have_mcvs1 && have_mcvs2)
-		 * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
+		{
-		 * (can we produce any theory for this)?
+			/*
-		 *
+			 * We have most-common-value lists for both relations.  Run
-		 * XXX possibility to do better: if both attributes have histograms
+			 * through the lists to see which MCVs actually join to each
-		 * then we could determine the exact join selectivity between the
+			 * other with the given operator.  This allows us to determine
-		 * MCV sets, and only have to assume the join behavior of the non-MCV
+			 * the exact join selectivity for the portion of the relations
-		 * values.  This could be a big win when the MCVs cover a large part
+			 * represented by the MCV lists.  We still have to estimate for
-		 * of the population.
+			 * the remaining population, but in a skewed distribution this
-		 *
+			 * gives us a big leg up in accuracy.  For motivation see the
-		 * XXX what about nulls?
+			 * analysis in Y. Ioannidis and S. Christodoulakis, "On the
-		 */
+			 * propagation of errors in the size of join results", Technical
-		selec = 1.0 / sqrt(nd1 * nd2);
+			 * Report 1018, Computer Science Dept., University of Wisconsin,
-		if (selec > 1.0)
+			 * Madison, March 1991 (available from ftp.cs.wisc.edu).
-			selec = 1.0;
+			 */
 			FmgrInfo	eqproc;
 			bool	   *hasmatch1;
 			bool	   *hasmatch2;
 			double		matchprodfreq,
 						matchfreq1,
 						matchfreq2,
 						unmatchfreq1,
 						unmatchfreq2,
 						otherfreq1,
 						otherfreq2,
 						totalsel1,
 						totalsel2;
 			int			i,
 						nmatches;
 			fmgr_info(get_opcode(operator), &eqproc);
 			hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
 			memset(hasmatch1, 0, nvalues1 * sizeof(bool));
 			hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
 			memset(hasmatch2, 0, nvalues2 * sizeof(bool));
 			/*
 			 * Note we assume that each MCV will match at most one member of
 			 * the other MCV list.  If the operator isn't really equality,
 			 * there could be multiple matches --- but we don't look for them,
 			 * both for speed and because the math wouldn't add up...
 			 */
 			matchprodfreq = 0.0;
 			nmatches = 0;
 			for (i = 0; i < nvalues1; i++)
 			{
 				int		j;
 				for (j = 0; j < nvalues2; j++)
 				{
 					if (hasmatch2[j])
 						continue;
 					if (DatumGetBool(FunctionCall2(&eqproc,
 												   values1[i],
 												   values2[j])))
 					{
 						hasmatch1[i] = hasmatch2[j] = true;
 						matchprodfreq += numbers1[i] * numbers2[j];
 						nmatches++;
 						break;
 					}
 				}
 			}
 			/* Sum up frequencies of matched and unmatched MCVs */
 			matchfreq1 = unmatchfreq1 = 0.0;
 			for (i = 0; i < nvalues1; i++)
 			{
 				if (hasmatch1[i])
 					matchfreq1 += numbers1[i];
 				else
 					unmatchfreq1 += numbers1[i];
 			}
 			matchfreq2 = unmatchfreq2 = 0.0;
 			for (i = 0; i < nvalues2; i++)
 			{
 				if (hasmatch2[i])
 					matchfreq2 += numbers2[i];
 				else
 					unmatchfreq2 += numbers2[i];
 			}
 			pfree(hasmatch1);
 			pfree(hasmatch2);
 			/*
 			 * Compute total frequency of non-null values that are not in
 			 * the MCV lists.
 			 */
 			otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
 			otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
 			/*
 			 * We can estimate the total selectivity from the point of view
 			 * of relation 1 as: the known selectivity for matched MCVs, plus
 			 * unmatched MCVs that are assumed to match against random members
 			 * of relation 2's non-MCV population, plus non-MCV values that
 			 * are assumed to match against random members of relation 2's
 			 * unmatched MCVs plus non-MCV values.
 			 */
 			totalsel1 = matchprodfreq;
 			if (nd2 > nvalues2)
 				totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
 			if (nd2 > nmatches)
 				totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
 					(nd2 - nmatches);
 			/* Same estimate from the point of view of relation 2. */
 			totalsel2 = matchprodfreq;
 			if (nd1 > nvalues1)
 				totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
 			if (nd1 > nmatches)
 				totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
 					(nd1 - nmatches);
 			/*
 			 * For robustness, we average the two estimates.  (Can a case
 			 * be made for taking the min or max instead?)
 			 */
 			selec = (totalsel1 + totalsel2) * 0.5;
 		}
 		else
 		{
 			/*
 			 * We do not have MCV lists for both sides.  Estimate the
 			 * join selectivity as MIN(1/nd1, 1/nd2).  This is plausible
 			 * if we assume that the values are about equally distributed:
 			 * a given tuple of rel1 will join to either 0 or N2/nd2 rows
 			 * of rel2, so total join rows are at most N1*N2/nd2 giving
 			 * a join selectivity of not more than 1/nd2.  By the same logic
 			 * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
 			 * bound.  Using the MIN() means we estimate from the point of
 			 * view of the relation with smaller nd (since the larger nd is
 			 * determining the MIN).  It is reasonable to assume that most
 			 * tuples in this rel will have join partners, so the bound is
 			 * probably reasonably tight and should be taken as-is.
 			 *
 			 * XXX Can we be smarter if we have an MCV list for just one side?
 			 * It seems that if we assume equal distribution for the other
 			 * side, we end up with the same answer anyway.
 			 */
 			if (nd1 > nd2)
 				selec = 1.0 / nd1;
 			else
 				selec = 1.0 / nd2;
 		}
 		if (have_mcvs1)
 			free_attstatsslot(var1->vartype, values1, nvalues1,
 							  numbers1, nnumbers1);
 		if (have_mcvs2)
 			free_attstatsslot(var2->vartype, values2, nvalues2,
 							  numbers2, nnumbers2);
 		if (HeapTupleIsValid(statsTuple1))
 			ReleaseSysCache(statsTuple1);
 		if (HeapTupleIsValid(statsTuple2))
@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
 Datum
 neqjoinsel(PG_FUNCTION_ARGS)
 {
 	Query	   *root = (Query *) PG_GETARG_POINTER(0);
 	Oid			operator = PG_GETARG_OID(1);
 	List	   *args = (List *) PG_GETARG_POINTER(2);
 	Oid			eqop;
 	float8		result;
 	/*
-	 * XXX we skip looking up the negator operator here because we know
+	 * We want 1 - eqjoinsel() where the equality operator is the one
-	 * eqjoinsel() won't look at it anyway.  If eqjoinsel() ever does
+	 * associated with this != operator, that is, its negator.
 	 * look, this routine will need to look more like neqsel() does.
 	 */
-	result = DatumGetFloat8(eqjoinsel(fcinfo));
+	eqop = get_negator(operator);
 	if (eqop)
 	{
 		result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
 											 PointerGetDatum(root),
 											 ObjectIdGetDatum(eqop),
 											 PointerGetDatum(args)));
 	}
 	else
 	{
 		/* Use default selectivity (should we raise an error instead?) */
 		result = DEFAULT_EQ_SEL;
 	}
 	result = 1.0 - result;
 	PG_RETURN_FLOAT8(result);
 }