diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 1283de0334..cf8c8164b7 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2055,7 +2055,11 @@ compute_distinct_stats(VacAttrStatsP stats, /* * Our track list includes every value in the sample, and every * value appeared more than once. Assume the column has just - * these values. + * these values. (This case is meant to address columns with + * small, fixed sets of possible values, such as boolean or enum + * columns. If there are any values that appear just once in the + * sample, including too-wide values, we should assume that the + * column is not of that kind.) */ stats->stadistinct = track_cnt; } @@ -2123,6 +2127,16 @@ compute_distinct_stats(VacAttrStatsP stats, * significantly more common than the (estimated) average. We set the * threshold rather arbitrarily at 25% more than average, with at * least 2 instances in the sample. + * + * Note: the first of these cases is meant to address columns with + * small, fixed sets of possible values, such as boolean or enum + * columns. If we can *completely* represent the column population by + * an MCV list that will fit into the stats target, then we should do + * so and thus provide the planner with complete information. But if + * the MCV list is not complete, it's generally worth being more + * selective, and not just filling it all the way up to the stats + * target. So for an incomplete list, we try to take only MCVs that + * are significantly more common than average. */ if (track_cnt < track_max && toowide_cnt == 0 && stats->stadistinct > 0 && @@ -2416,7 +2430,11 @@ compute_scalar_stats(VacAttrStatsP stats, { /* * Every value in the sample appeared more than once. Assume the - * column has just these values. + * column has just these values. (This case is meant to address + * columns with small, fixed sets of possible values, such as + * boolean or enum columns. 
If there are any values that appear + * just once in the sample, including too-wide values, we should + * assume that the column is not of that kind.) */ stats->stadistinct = ndistinct; } @@ -2485,6 +2503,16 @@ compute_scalar_stats(VacAttrStatsP stats, * emit duplicate histogram bin boundaries. (We might end up with * duplicate histogram entries anyway, if the distribution is skewed; * but we prefer to treat such values as MCVs if at all possible.) + * + * Note: the first of these cases is meant to address columns with + * small, fixed sets of possible values, such as boolean or enum + * columns. If we can *completely* represent the column population by + * an MCV list that will fit into the stats target, then we should do + * so and thus provide the planner with complete information. But if + * the MCV list is not complete, it's generally worth being more + * selective, and not just filling it all the way up to the stats + * target. So for an incomplete list, we try to take only MCVs that + * are significantly more common than average. */ if (track_cnt == ndistinct && toowide_cnt == 0 && stats->stadistinct > 0 &&