Mark expressions nullable by grouping sets

When generating window_pathkeys, distinct_pathkeys, or sort_pathkeys,
we failed to realize that the grouping/ordering expressions might be
nullable by grouping sets.  As a result, we may incorrectly deem that
the PathKeys are redundant by EquivalenceClass processing and thus
remove them from the pathkeys list.  That would lead to wrong results
in some cases.

To fix this issue, we mark the grouping expressions nullable by
grouping sets if that is the case.  If the grouping expression is a
Var or PlaceHolderVar or constructed from those, we can just add the
RT index of the RTE_GROUP RTE to the existing nullingrels field(s);
otherwise we have to add a PlaceHolderVar to carry on the nullingrel
bit.

However, we have to manually remove this nullingrel bit from
expressions in various cases where these expressions are logically
below the grouping step, such as when we generate groupClause pathkeys
for grouping sets, or when we generate PathTarget for initial input to
grouping nodes.

Furthermore, in set_upper_references, the targetlist and quals of an
Agg node should have nullingrels that include the effects of the
grouping step, ie they will have nullingrels equal to the input
Vars/PHVs' nullingrels plus the nullingrel bit that references the
grouping RTE.  In order to perform exact nullingrels matches, we also
need to manually remove this nullingrel bit.

Bump catversion because this changes the querytree produced by the
parser.

Thanks to Tom Lane for the idea to invent a new kind of RTE.

Per reports from Geoff Winkless, Tobias Wendorff, Richard Guo from
various threads.

Author: Richard Guo
Reviewed-by: Ashutosh Bapat, Sutou Kouhei
Discussion: https://postgr.es/m/CAMbWs4_dp7e7oTwaiZeBX8+P1rXw4ThkZxh1QG81rhu9Z47VsQ@mail.gmail.com
This commit is contained in:
Richard Guo 2024-09-10 12:36:48 +09:00
parent 247dea89f7
commit f5050f795a
11 changed files with 411 additions and 33 deletions

View File

@ -726,6 +726,10 @@ get_eclass_for_sort_expr(PlannerInfo *root,
{
RelOptInfo *rel = root->simple_rel_array[i];
/* ignore the RTE_GROUP RTE */
if (i == root->group_rtindex)
continue;
if (rel == NULL) /* must be an outer join */
{
Assert(bms_is_member(i, root->outer_join_rels));
@ -1087,6 +1091,10 @@ generate_base_implied_equalities(PlannerInfo *root)
{
RelOptInfo *rel = root->simple_rel_array[i];
/* ignore the RTE_GROUP RTE */
if (i == root->group_rtindex)
continue;
if (rel == NULL) /* must be an outer join */
{
Assert(bms_is_member(i, root->outer_join_rels));
@ -3354,6 +3362,10 @@ get_eclass_indexes_for_relids(PlannerInfo *root, Relids relids)
{
RelOptInfo *rel = root->simple_rel_array[i];
/* ignore the RTE_GROUP RTE */
if (i == root->group_rtindex)
continue;
if (rel == NULL) /* must be an outer join */
{
Assert(bms_is_member(i, root->outer_join_rels));

View File

@ -25,6 +25,7 @@
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "partitioning/partbounds.h"
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
/* Consider reordering of GROUP BY keys? */
@ -1341,6 +1342,7 @@ make_pathkeys_for_sortclauses(PlannerInfo *root,
&sortclauses,
tlist,
false,
false,
&sortable,
false);
/* It's caller error if not all clauses were sortable */
@ -1359,6 +1361,9 @@ make_pathkeys_for_sortclauses(PlannerInfo *root,
* give rise to redundant pathkeys are removed from the sortclauses list
* (which therefore must be pass-by-reference in this version).
*
* If remove_group_rtindex is true, then we need to remove the RT index of the
* grouping step from the sort expressions before we make PathKeys for them.
*
* *sortable is set to true if all the sort clauses are in fact sortable.
* If any are not, they are ignored except for setting *sortable false.
* (In that case, the output pathkey list isn't really useful. However,
@ -1375,6 +1380,7 @@ make_pathkeys_for_sortclauses_extended(PlannerInfo *root,
List **sortclauses,
List *tlist,
bool remove_redundant,
bool remove_group_rtindex,
bool *sortable,
bool set_ec_sortref)
{
@ -1394,6 +1400,14 @@ make_pathkeys_for_sortclauses_extended(PlannerInfo *root,
*sortable = false;
continue;
}
if (remove_group_rtindex)
{
Assert(root->group_rtindex > 0);
sortkey = (Expr *)
remove_nulling_relids((Node *) sortkey,
bms_make_singleton(root->group_rtindex),
NULL);
}
pathkey = make_pathkey_from_sortop(root,
sortkey,
sortcl->sortop,

View File

@ -1328,6 +1328,10 @@ mark_rels_nulled_by_join(PlannerInfo *root, Index ojrelid,
{
RelOptInfo *rel = root->simple_rel_array[relid];
/* ignore the RTE_GROUP RTE */
if (relid == root->group_rtindex)
continue;
if (rel == NULL) /* must be an outer join */
{
Assert(bms_is_member(relid, root->outer_join_rels));

View File

@ -58,6 +58,7 @@
#include "parser/parse_relation.h"
#include "parser/parsetree.h"
#include "partitioning/partdesc.h"
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/selfuncs.h"
@ -3506,9 +3507,23 @@ standard_qp_callback(PlannerInfo *root, void *extra)
if (grouping_is_sortable(groupClause))
{
root->group_pathkeys = make_pathkeys_for_sortclauses(root,
groupClause,
tlist);
bool sortable;
/*
* The groupClause is logically below the grouping step. So if
* there is an RTE entry for the grouping step, we need to remove
* its RT index from the sort expressions before we make PathKeys
* for them.
*/
root->group_pathkeys =
make_pathkeys_for_sortclauses_extended(root,
&groupClause,
tlist,
false,
parse->hasGroupRTE,
&sortable,
false);
Assert(sortable);
root->num_groupby_pathkeys = list_length(root->group_pathkeys);
}
else
@ -3538,6 +3553,7 @@ standard_qp_callback(PlannerInfo *root, void *extra)
&root->processed_groupClause,
tlist,
true,
false,
&sortable,
true);
if (!sortable)
@ -3589,6 +3605,7 @@ standard_qp_callback(PlannerInfo *root, void *extra)
&root->processed_distinctClause,
tlist,
true,
false,
&sortable,
false);
if (!sortable)
@ -3616,6 +3633,7 @@ standard_qp_callback(PlannerInfo *root, void *extra)
&groupClauses,
tlist,
false,
false,
&sortable,
false);
if (!sortable)
@ -5526,7 +5544,19 @@ make_group_input_target(PlannerInfo *root, PathTarget *final_target)
{
/*
* It's a grouping column, so add it to the input target as-is.
*
* Note that the target is logically below the grouping step. So
* with grouping sets we need to remove the RT index of the
* grouping step if there is any from the target expression.
*/
if (parse->hasGroupRTE && parse->groupingSets != NIL)
{
Assert(root->group_rtindex > 0);
expr = (Expr *)
remove_nulling_relids((Node *) expr,
bms_make_singleton(root->group_rtindex),
NULL);
}
add_column_to_pathtarget(input_target, expr, sgref);
}
else
@ -5554,11 +5584,23 @@ make_group_input_target(PlannerInfo *root, PathTarget *final_target)
* includes Vars used in resjunk items, so we are covering the needs of
* ORDER BY and window specifications. Vars used within Aggrefs and
* WindowFuncs will be pulled out here, too.
*
* Note that the target is logically below the grouping step. So with
* grouping sets we need to remove the RT index of the grouping step if
* there is any from the non-group Vars.
*/
non_group_vars = pull_var_clause((Node *) non_group_cols,
PVC_RECURSE_AGGREGATES |
PVC_RECURSE_WINDOWFUNCS |
PVC_INCLUDE_PLACEHOLDERS);
if (parse->hasGroupRTE && parse->groupingSets != NIL)
{
Assert(root->group_rtindex > 0);
non_group_vars = (List *)
remove_nulling_relids((Node *) non_group_vars,
bms_make_singleton(root->group_rtindex),
NULL);
}
add_new_columns_to_pathtarget(input_target, non_group_vars);
/* clean up cruft */
@ -6207,6 +6249,7 @@ make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc,
&wc->partitionClause,
tlist,
true,
false,
&sortable,
false);

View File

@ -26,6 +26,7 @@
#include "optimizer/subselect.h"
#include "optimizer/tlist.h"
#include "parser/parse_relation.h"
#include "rewrite/rewriteManip.h"
#include "tcop/utility.h"
#include "utils/syscache.h"
@ -2426,6 +2427,28 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset)
subplan_itlist = build_tlist_index(subplan->targetlist);
/*
* If it's a grouping node with grouping sets, any Vars and PHVs appearing
* in the targetlist and quals should have nullingrels that include the
* effects of the grouping step, ie they will have nullingrels equal to
* the input Vars/PHVs' nullingrels plus the RT index of the grouping
* step. In order to perform exact nullingrels matches, we remove the RT
* index of the grouping step first.
*/
if (IsA(plan, Agg) &&
root->group_rtindex > 0 &&
((Agg *) plan)->groupingSets)
{
plan->targetlist = (List *)
remove_nulling_relids((Node *) plan->targetlist,
bms_make_singleton(root->group_rtindex),
NULL);
plan->qual = (List *)
remove_nulling_relids((Node *) plan->qual,
bms_make_singleton(root->group_rtindex),
NULL);
}
output_targetlist = NIL;
foreach(l, plan->targetlist)
{

View File

@ -22,6 +22,7 @@
#include "access/sysattr.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
#include "optimizer/placeholder.h"
#include "optimizer/prep.h"
@ -83,6 +84,8 @@ static Node *flatten_join_alias_vars_mutator(Node *node,
flatten_join_alias_vars_context *context);
static Node *flatten_group_exprs_mutator(Node *node,
flatten_join_alias_vars_context *context);
static Node *mark_nullable_by_grouping(PlannerInfo *root, Node *newnode,
Var *oldvar);
static Node *add_nullingrels_if_needed(PlannerInfo *root, Node *newnode,
Var *oldvar);
static bool is_standard_join_alias_expression(Node *newnode, Var *oldvar);
@ -909,6 +912,18 @@ flatten_join_alias_vars_mutator(Node *node,
* flatten_group_exprs
* Replace Vars that reference GROUP outputs with the underlying grouping
* expressions.
*
* We have to preserve any varnullingrels info attached to the group Vars we're
* replacing. If the replacement expression is a Var or PlaceHolderVar or
* constructed from those, we can just add the varnullingrels bits to the
* existing nullingrels field(s); otherwise we have to add a PlaceHolderVar
* wrapper.
*
* NOTE: this is also used by ruleutils.c, to deparse one query parsetree back
* to source text. For that use-case, root will be NULL, which is why we have
* to pass the Query separately. We need the root itself only for preserving
* varnullingrels. We can avoid preserving varnullingrels in the ruleutils.c's
* usage because it does not make any difference to the deparsed source text.
*/
Node *
flatten_group_exprs(PlannerInfo *root, Query *query, Node *node)
@ -973,7 +988,8 @@ flatten_group_exprs_mutator(Node *node,
if (context->possible_sublink && !context->inserted_sublink)
context->inserted_sublink = checkExprHasSubLink(newvar);
return newvar;
/* Lastly, add any varnullingrels to the replacement expression */
return mark_nullable_by_grouping(context->root, newvar, var);
}
if (IsA(node, Aggref))
@ -1040,6 +1056,76 @@ flatten_group_exprs_mutator(Node *node,
(void *) context);
}
/*
* Add oldvar's varnullingrels, if any, to a flattened grouping expression.
* The newnode has been copied, so we can modify it freely.
*/
static Node *
mark_nullable_by_grouping(PlannerInfo *root, Node *newnode, Var *oldvar)
{
Relids relids;
if (root == NULL)
return newnode;
if (oldvar->varnullingrels == NULL)
return newnode; /* nothing to do */
Assert(bms_equal(oldvar->varnullingrels,
bms_make_singleton(root->group_rtindex)));
relids = pull_varnos_of_level(root, newnode, oldvar->varlevelsup);
if (!bms_is_empty(relids))
{
/*
* If the newnode is not variable-free, we set the nullingrels of Vars
* or PHVs that are contained in the expression. This is not really
* 'correct' in theory, because it is the whole expression that can be
* nullable by grouping sets, not its individual vars. But it works
* in practice, because what we need is that the expression can be
* somehow distinguished from the same expression in ECs, and marking
* its vars is sufficient for this purpose.
*/
newnode = add_nulling_relids(newnode,
relids,
oldvar->varnullingrels);
}
else /* variable-free? */
{
/*
* If the newnode is variable-free and does not contain volatile
* functions or set-returning functions, it can be treated as a member
* of EC that is redundant. So wrap it in a new PlaceHolderVar to
* carry the nullingrels. Otherwise we do not bother to make any
* changes.
*
* Aggregate functions and window functions are not allowed in
* grouping expressions.
*/
Assert(!contain_agg_clause(newnode));
Assert(!contain_window_function(newnode));
if (!contain_volatile_functions(newnode) &&
!expression_returns_set(newnode))
{
PlaceHolderVar *newphv;
Relids phrels;
phrels = get_relids_in_jointree((Node *) root->parse->jointree,
true, false);
Assert(!bms_is_empty(phrels));
newphv = make_placeholder_expr(root, (Expr *) newnode, phrels);
/* newphv has zero phlevelsup and NULL phnullingrels; fix it */
newphv->phlevelsup = oldvar->varlevelsup;
newphv->phnullingrels = bms_copy(oldvar->varnullingrels);
newnode = (Node *) newphv;
}
}
return newnode;
}
/*
* Add oldvar's varnullingrels, if any, to a flattened join alias expression.
* The newnode has been copied, so we can modify it freely.

View File

@ -1333,9 +1333,6 @@ substitute_grouped_columns_mutator(Node *node,
if (node == NULL)
return NULL;
if (IsA(node, Const) ||
IsA(node, Param))
return node; /* constants are always acceptable */
if (IsA(node, Aggref))
{
@ -1409,6 +1406,16 @@ substitute_grouped_columns_mutator(Node *node,
}
}
/*
* Constants are always acceptable. We have to do this after we checked
* the subexpression as a whole for a match, because it is possible that
* we have GROUP BY items that are constants, and the constants would
* become not so constant after the grouping step.
*/
if (IsA(node, Const) ||
IsA(node, Param))
return node;
/*
* If we have an ungrouped Var of the original query level, we have a
* failure. Vars below the original query level are not a problem, and

View File

@ -57,6 +57,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202409101
#define CATALOG_VERSION_NO 202409102
#endif

View File

@ -240,6 +240,7 @@ extern List *make_pathkeys_for_sortclauses_extended(PlannerInfo *root,
List **sortclauses,
List *tlist,
bool remove_redundant,
bool remove_group_rtindex,
bool *sortable,
bool set_ec_sortref);
extern void initialize_mergeclause_eclasses(PlannerInfo *root,

View File

@ -442,19 +442,22 @@ select * from (
group by grouping sets(1, 2)
) ss
where x = 1 and q1 = 123;
QUERY PLAN
--------------------------------------------
QUERY PLAN
--------------------------------------------------
Subquery Scan on ss
Output: ss.x, ss.q1, ss.sum
Filter: ((ss.x = 1) AND (ss.q1 = 123))
-> GroupAggregate
Output: (1), i1.q1, sum(i1.q2)
Group Key: 1
Group Key: (1)
Sort Key: i1.q1
Group Key: i1.q1
-> Seq Scan on public.int8_tbl i1
Output: 1, i1.q1, i1.q2
(10 rows)
-> Sort
Output: (1), i1.q1, i1.q2
Sort Key: (1)
-> Seq Scan on public.int8_tbl i1
Output: 1, i1.q1, i1.q2
(13 rows)
select * from (
select 1 as x, q1, sum(q2)
@ -736,15 +739,18 @@ select a, b, sum(v.x)
-- Test reordering of grouping sets
explain (costs off)
select * from gstest1 group by grouping sets((a,b,v),(v)) order by v,b,a;
QUERY PLAN
------------------------------------------------------------------------------
GroupAggregate
Group Key: "*VALUES*".column3, "*VALUES*".column2, "*VALUES*".column1
Group Key: "*VALUES*".column3
-> Sort
Sort Key: "*VALUES*".column3, "*VALUES*".column2, "*VALUES*".column1
-> Values Scan on "*VALUES*"
(6 rows)
QUERY PLAN
------------------------------------------------------------------------------------
Incremental Sort
Sort Key: "*VALUES*".column3, "*VALUES*".column2, "*VALUES*".column1
Presorted Key: "*VALUES*".column3
-> GroupAggregate
Group Key: "*VALUES*".column3, "*VALUES*".column2, "*VALUES*".column1
Group Key: "*VALUES*".column3
-> Sort
Sort Key: "*VALUES*".column3, "*VALUES*".column2, "*VALUES*".column1
-> Values Scan on "*VALUES*"
(9 rows)
-- Agg level check. This query should error out.
select (select grouping(a,b) from gstest2) from gstest2 group by a,b;
@ -816,16 +822,18 @@ select a,count(*) from gstest2 group by rollup(a) having a is distinct from 1 or
explain (costs off)
select a,count(*) from gstest2 group by rollup(a) having a is distinct from 1 order by a;
QUERY PLAN
----------------------------------
GroupAggregate
Group Key: a
Group Key: ()
Filter: (a IS DISTINCT FROM 1)
-> Sort
Sort Key: a
-> Seq Scan on gstest2
(7 rows)
QUERY PLAN
----------------------------------------
Sort
Sort Key: a
-> GroupAggregate
Group Key: a
Group Key: ()
Filter: (a IS DISTINCT FROM 1)
-> Sort
Sort Key: a
-> Seq Scan on gstest2
(9 rows)
select v.c, (select count(*) from gstest2 group by () having v.c)
from (values (false),(true)) v(c) order by v.c;
@ -2288,4 +2296,137 @@ select not a from (values(true)) t(a) group by rollup(not a) having not not a;
f
(1 row)
-- test handling of expressions nullable by grouping sets
explain (costs off)
select distinct on (a, b) a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b;
QUERY PLAN
----------------------------------------------------------------
Unique
-> Sort
Sort Key: "*VALUES*".column1, "*VALUES*".column2
-> HashAggregate
Hash Key: "*VALUES*".column1, "*VALUES*".column2
Hash Key: "*VALUES*".column1
-> Values Scan on "*VALUES*"
Filter: (column1 = column2)
(8 rows)
select distinct on (a, b) a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b;
a | b
---+---
1 | 1
1 |
2 | 2
2 |
(4 rows)
explain (costs off)
select distinct on (a, b+1) a, b+1
from (values (1, 0), (2, 1)) as t (a, b) where a = b+1
group by grouping sets((a, b+1), (a))
order by a, b+1;
QUERY PLAN
----------------------------------------------------------------------
Unique
-> Sort
Sort Key: "*VALUES*".column1, (("*VALUES*".column2 + 1))
-> HashAggregate
Hash Key: "*VALUES*".column1, ("*VALUES*".column2 + 1)
Hash Key: "*VALUES*".column1
-> Values Scan on "*VALUES*"
Filter: (column1 = (column2 + 1))
(8 rows)
select distinct on (a, b+1) a, b+1
from (values (1, 0), (2, 1)) as t (a, b) where a = b+1
group by grouping sets((a, b+1), (a))
order by a, b+1;
a | ?column?
---+----------
1 | 1
1 |
2 | 2
2 |
(4 rows)
explain (costs off)
select a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b nulls first;
QUERY PLAN
----------------------------------------------------------------
Sort
Sort Key: "*VALUES*".column1, "*VALUES*".column2 NULLS FIRST
-> HashAggregate
Hash Key: "*VALUES*".column1, "*VALUES*".column2
Hash Key: "*VALUES*".column1
-> Values Scan on "*VALUES*"
Filter: (column1 = column2)
(7 rows)
select a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b nulls first;
a | b
---+---
1 |
1 | 1
2 |
2 | 2
(4 rows)
explain (costs off)
select 1 as one group by rollup(one) order by one nulls first;
QUERY PLAN
-----------------------------
Sort
Sort Key: (1) NULLS FIRST
-> MixedAggregate
Hash Key: 1
Group Key: ()
-> Result
(6 rows)
select 1 as one group by rollup(one) order by one nulls first;
one
-----
1
(2 rows)
explain (costs off)
select a, b, row_number() over (order by a, b nulls first)
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a));
QUERY PLAN
----------------------------------------------------------------------
WindowAgg
-> Sort
Sort Key: "*VALUES*".column1, "*VALUES*".column2 NULLS FIRST
-> HashAggregate
Hash Key: "*VALUES*".column1, "*VALUES*".column2
Hash Key: "*VALUES*".column1
-> Values Scan on "*VALUES*"
Filter: (column1 = column2)
(8 rows)
select a, b, row_number() over (order by a, b nulls first)
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a));
a | b | row_number
---+---+------------
1 | | 1
1 | 1 | 2
2 | | 3
2 | 2 | 4
(4 rows)
-- end

View File

@ -640,4 +640,51 @@ explain (costs off)
select not a from (values(true)) t(a) group by rollup(not a) having not not a;
select not a from (values(true)) t(a) group by rollup(not a) having not not a;
-- test handling of expressions nullable by grouping sets
explain (costs off)
select distinct on (a, b) a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b;
select distinct on (a, b) a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b;
explain (costs off)
select distinct on (a, b+1) a, b+1
from (values (1, 0), (2, 1)) as t (a, b) where a = b+1
group by grouping sets((a, b+1), (a))
order by a, b+1;
select distinct on (a, b+1) a, b+1
from (values (1, 0), (2, 1)) as t (a, b) where a = b+1
group by grouping sets((a, b+1), (a))
order by a, b+1;
explain (costs off)
select a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b nulls first;
select a, b
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a))
order by a, b nulls first;
explain (costs off)
select 1 as one group by rollup(one) order by one nulls first;
select 1 as one group by rollup(one) order by one nulls first;
explain (costs off)
select a, b, row_number() over (order by a, b nulls first)
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a));
select a, b, row_number() over (order by a, b nulls first)
from (values (1, 1), (2, 2)) as t (a, b) where a = b
group by grouping sets((a, b), (a));
-- end