diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 8a5f07c957..b0c65650ee 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2072,6 +2072,12 @@ compute_distinct_stats(VacAttrStatsP stats, * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * + * In this calculation, we consider only non-nulls. We used to + * include rows with null values in the n and N counts, but that + * leads to inaccurate answers in columns with many nulls, and + * it's intuitively bogus anyway considering the desired result is + * the number of distinct non-null values. + * * We assume (not very reliably!) that all the multiply-occurring * values are reflected in the final track[] list, and the other * nonnull values all appeared but once. (XXX this usually @@ -2081,21 +2087,22 @@ compute_distinct_stats(VacAttrStatsP stats, */ int f1 = nonnull_cnt - summultiple; int d = f1 + nmultiple; - double numer, - denom, - stadistinct; + double n = samplerows - null_cnt; + double N = totalrows * (1.0 - stats->stanullfrac); + double stadistinct; - numer = (double) samplerows *(double) d; + /* N == 0 shouldn't happen, but just in case ... */ + if (N > 0) + stadistinct = (n * d) / ((n - f1) + f1 * n / N); + else + stadistinct = 0; - denom = (double) (samplerows - f1) + - (double) f1 *(double) samplerows / totalrows; - - stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ - if (stadistinct < (double) d) - stadistinct = (double) d; - if (stadistinct > totalrows) - stadistinct = totalrows; + if (stadistinct < d) + stadistinct = d; + if (stadistinct > N) + stadistinct = N; + /* And round to integer */ stats->stadistinct = floor(stadistinct + 0.5); } @@ -2425,26 +2432,33 @@ compute_scalar_stats(VacAttrStatsP stats, * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * + * In this calculation, we consider only non-nulls. We used to + * include rows with null values in the n and N counts, but that + * leads to inaccurate answers in columns with many nulls, and + * it's intuitively bogus anyway considering the desired result is + * the number of distinct non-null values. + * * Overwidth values are assumed to have been distinct. *---------- */ int f1 = ndistinct - nmultiple + toowide_cnt; int d = f1 + nmultiple; - double numer, - denom, - stadistinct; + double n = samplerows - null_cnt; + double N = totalrows * (1.0 - stats->stanullfrac); + double stadistinct; - numer = (double) samplerows *(double) d; + /* N == 0 shouldn't happen, but just in case ... */ + if (N > 0) + stadistinct = (n * d) / ((n - f1) + f1 * n / N); + else + stadistinct = 0; - denom = (double) (samplerows - f1) + - (double) f1 *(double) samplerows / totalrows; - - stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ - if (stadistinct < (double) d) - stadistinct = (double) d; - if (stadistinct > totalrows) - stadistinct = totalrows; + if (stadistinct < d) + stadistinct = d; + if (stadistinct > N) + stadistinct = N; + /* And round to integer */ stats->stadistinct = floor(stadistinct + 0.5); }