diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a6555e91b7..cc2a9a1b6c 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3237,15 +3237,15 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
  *    restriction selectivity of the equality in the next step.
  * 4. For Vars within a single source rel, we multiply together the numbers
  *    of values, clamp to the number of rows in the rel (divided by 10 if
- *    more than one Var), and then multiply by the selectivity of the
- *    restriction clauses for that rel.  When there's more than one Var,
- *    the initial product is probably too high (it's the worst case) but
- *    clamping to a fraction of the rel's rows seems to be a helpful
- *    heuristic for not letting the estimate get out of hand.  (The factor
- *    of 10 is derived from pre-Postgres-7.4 practice.)  Multiplying
- *    by the restriction selectivity is effectively assuming that the
- *    restriction clauses are independent of the grouping, which is a crummy
- *    assumption, but it's hard to do better.
+ *    more than one Var), and then multiply by a factor based on the
+ *    selectivity of the restriction clauses for that rel.  When there's
+ *    more than one Var, the initial product is probably too high (it's the
+ *    worst case) but clamping to a fraction of the rel's rows seems to be a
+ *    helpful heuristic for not letting the estimate get out of hand.  (The
+ *    factor of 10 is derived from pre-Postgres-7.4 practice.)  The factor
+ *    we multiply by to adjust for the restriction selectivity assumes that
+ *    the restriction clauses are independent of the grouping, which may not
+ *    be a valid assumption, but it's hard to do better.
  * 5. If there are Vars from multiple rels, we repeat step 4 for each such
  *    rel, and multiply the results together.
  * Note that rels not containing grouped Vars are ignored completely, as are
@@ -3439,9 +3439,51 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 			reldistinct = clamp;
 
 			/*
-			 * Multiply by restriction selectivity.
+			 * Update the estimate based on the restriction selectivity,
+			 * guarding against division by zero when reldistinct is zero.
+			 * Also skip this if we know that we are returning all rows.
 			 */
-			reldistinct *= rel->rows / rel->tuples;
+			if (reldistinct > 0 && rel->rows < rel->tuples)
+			{
+				/*
+				 * Given a table containing N rows with n distinct values in a
+				 * uniform distribution, if we select p rows at random then
+				 * the expected number of distinct values selected is
+				 *
+				 * n * (1 - product((N-N/n-i)/(N-i), i=0..p-1))
+				 *
+				 * = n * (1 - (N-N/n)! / (N-N/n-p)! * (N-p)! / N!)
+				 *
+				 * See "Approximating block accesses in database
+				 * organizations", S. B. Yao, Communications of the ACM,
+				 * Volume 20 Issue 4, April 1977 Pages 260-261.
+				 *
+				 * Alternatively, re-arranging the terms from the factorials,
+				 * this may be written as
+				 *
+				 * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1))
+				 *
+				 * This form of the formula is more efficient to compute in
+				 * the common case where p is larger than N/n.  Additionally,
+				 * as pointed out by Dell'Era, if i << N for all terms in the
+				 * product, it can be approximated by
+				 *
+				 * n * (1 - ((N-p)/N)^(N/n))
+				 *
+				 * See "Expected distinct values when selecting from a bag
+				 * without replacement", Alberto Dell'Era,
+				 * http://www.adellera.it/investigations/distinct_balls/.
+				 *
+				 * The condition i << N is equivalent to n >> 1, so this is a
+				 * good approximation when the number of distinct values in
+				 * the table is large.  It turns out that this formula also
+				 * works well even when n is small.
+				 */
+				reldistinct *=
+					(1 - pow((rel->tuples - rel->rows) / rel->tuples,
+							 rel->tuples / reldistinct));
+			}
+			reldistinct = clamp_row_est(reldistinct);
 
 			/*
 			 * Update estimate of total distinct groups.
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index de64ca7ec7..0fc93d9d72 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -807,27 +807,24 @@ select * from int4_tbl where
 explain (verbose, costs off)
 select * from int4_tbl o where (f1, f1) in
   (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
-                              QUERY PLAN                               
------------------------------------------------------------------------
- Hash Join
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Hash Semi Join
   Output: o.f1
   Hash Cond: (o.f1 = "ANY_subquery".f1)
   ->  Seq Scan on public.int4_tbl o
         Output: o.f1
   ->  Hash
         Output: "ANY_subquery".f1, "ANY_subquery".g
-        ->  HashAggregate
+        ->  Subquery Scan on "ANY_subquery"
               Output: "ANY_subquery".f1, "ANY_subquery".g
-              Group Key: "ANY_subquery".f1, "ANY_subquery".g
-              ->  Subquery Scan on "ANY_subquery"
-                    Output: "ANY_subquery".f1, "ANY_subquery".g
-                    Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
-                    ->  HashAggregate
-                          Output: i.f1, (generate_series(1, 2) / 10)
-                          Group Key: i.f1
-                          ->  Seq Scan on public.int4_tbl i
-                                Output: i.f1
-(18 rows)
+              Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
+              ->  HashAggregate
+                    Output: i.f1, (generate_series(1, 2) / 10)
+                    Group Key: i.f1
+                    ->  Seq Scan on public.int4_tbl i
+                          Output: i.f1
+(15 rows)
 select * from int4_tbl o where (f1, f1) in
   (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
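
For anyone who wants to sanity-check the new estimator outside the planner, here is a minimal standalone C sketch (not part of the patch) comparing the Dell'Era approximation applied above against the exact product form quoted in the patch comment.  The helper names approx_distinct and exact_distinct and the sample values are illustrative only; N, n and p play the roles of rel->tuples, reldistinct and rel->rows, and the exact form assumes p <= N with the n values uniformly distributed over the N rows, as in the comment.

	/*
	 * Standalone sketch (not part of the patch): compare the Dell'Era
	 * approximation  n * (1 - ((N-p)/N)^(N/n))  with the exact form
	 * n * (1 - product((N-p-i)/(N-i), i=0..N/n-1)).  Build with -lm.
	 */
	#include <math.h>
	#include <stdio.h>

	/* Approximate form: n * (1 - ((N-p)/N)^(N/n)) */
	static double
	approx_distinct(double N, double n, double p)
	{
		return n * (1.0 - pow((N - p) / N, N / n));
	}

	/* Exact form: n * (1 - product((N-p-i)/(N-i), i=0..N/n-1)) */
	static double
	exact_distinct(double N, double n, double p)
	{
		double		prod = 1.0;
		double		i;

		for (i = 0; i < N / n; i++)
			prod *= (N - p - i) / (N - i);
		return n * (1.0 - prod);
	}

	int
	main(void)
	{
		double		N = 1000000.0;	/* rows in the table (rel->tuples) */
		double		n = 500.0;		/* distinct values (reldistinct) */
		double		p = 1000.0;		/* rows selected (rel->rows) */

		printf("approx: %f\n", approx_distinct(N, n, p));
		printf("exact:  %f\n", exact_distinct(N, n, p));
		return 0;
	}

With these arbitrary inputs both forms agree closely (roughly 432 of the 500 distinct values are expected in the 1000-row sample), whereas the replaced line reldistinct *= rel->rows / rel->tuples would have computed 500 * 1000/1000000 = 0.5, a severe underestimate of the number of distinct values surviving the restriction.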