(cloudberry) 07/39: fix redundant columns of mutlistage-agg plan (#16080)

yjhjstz Mon, 27 Jan 2025 04:32:38 -0800

This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


commit c0fcd9924fe79830d5e033053572be1e881d49a6
Author: chaotian <[email protected]>
AuthorDate: Tue Oct 17 17:05:03 2023 +0800

    fix redundant columns of mutlistage-agg plan (#16080)
    
    Partial Agg emitted redundant columns in targetlist especially on 
multi-stage agg scenario
    (upstream didn't have this concern since they don't have mutlistage-agg 
path).
    Now we use `pull_var_clause()` for pull-up vars, aggref and placeholdervar 
of the top
    targetlist, but there are missed cases -- targetentry exists in group 
clause -- we should
    keep them as group clause rather than pulling up it. For implementing it, 
`deconstruct_expr()`
    works for what we mentioned above, not pull-up `Exprs` anymore if it is a 
group clause.
---
 src/backend/optimizer/plan/planner.c               | 114 ++++++++++++++++++---
 src/test/regress/expected/bfv_aggregate.out        |  78 ++++++++++++++
 .../regress/expected/bfv_aggregate_optimizer.out   |  76 ++++++++++++++
 src/test/regress/sql/bfv_aggregate.sql             |   9 ++
 4 files changed, 265 insertions(+), 12 deletions(-)

diff --git a/src/backend/optimizer/plan/planner.c 
b/src/backend/optimizer/plan/planner.c
index 2a175b7c3b..16efdb2ed8 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -161,6 +161,12 @@ typedef struct
        AggStrategy strat;
 } split_rollup_data;
 
+typedef struct
+{
+       PathTarget *partial_target;
+       List *grps_tlist;
+} deconstruct_expr_context;
+
 /* Local functions */
 static Node *preprocess_expression(PlannerInfo *root, Node *expr, int kind);
 void preprocess_qual_conditions(PlannerInfo *root, Node *jtnode);
@@ -5612,6 +5618,93 @@ create_scatter_path(PlannerInfo *root, List 
*scatterClause, Path *path)
        return path;
 }
 
+/*
+ * Function: deconstruct_expr_walker
+ *
+ * Work for deconstruct_expr.
+ */
+static bool
+deconstruct_expr_walker(Node *node, deconstruct_expr_context *ctx)
+{
+       ListCell *lc;
+
+       if (node == NULL)
+       {
+               return false;
+       }
+       else if (IsA(node, Var))
+       {
+               if (((Var *) node)->varlevelsup != 0)
+                       elog(ERROR, "Upper-level Var found where not expected");
+
+               add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node);
+               return false;
+       }
+       else if (IsA(node, PlaceHolderVar))
+       {
+               if (((PlaceHolderVar *) node)->phlevelsup != 0)
+                       elog(ERROR, "Upper-level PlaceHolderVar found where not 
expected");
+
+               add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node);
+               return false;
+       }
+       else if (IsA(node, Aggref))
+       {
+               if (((Aggref *) node)->agglevelsup != 0)
+                       elog(ERROR, "Upper-level Aggref found where not 
expected");
+
+               add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node);
+               return false;
+       }
+       else if (IsA(node, GroupId))
+       {
+               if (((GroupId *) node)->agglevelsup != 0)
+                       elog(ERROR, "Upper-level GROUP_ID found where not 
expected");
+
+               add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node);
+               return false;
+       }
+       else if (IsA(node, GroupingFunc))
+       {
+               if (((GroupingFunc *) node)->agglevelsup != 0)
+                       elog(ERROR, "Upper-level GROUPING found where not 
expected");
+
+               add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node);
+               return false;
+       }
+       else
+       {
+               foreach(lc, ctx->grps_tlist)
+               {
+                       Expr *grp_expr = (Expr *)lfirst(lc);
+
+                       /* just return if node equal to group column */
+                       if (equal(node, grp_expr))
+                       {
+                               return false;
+                       }
+               }
+       }
+
+       return expression_tree_walker(node, deconstruct_expr_walker, (void *) 
ctx);
+}
+
+/*
+ * Function: deconstruct_expr
+ *
+ * Prepare an expression for execution within 2-stage aggregation.
+ * This involves adding targets as needed to the target list of the
+ * first (partial) aggregation.
+ */
+static bool
+deconstruct_expr(Expr *expr, PathTarget *partial_target, List *grps_tlist)
+{
+       deconstruct_expr_context ctx;
+       ctx.partial_target = partial_target;
+       ctx.grps_tlist = grps_tlist;
+
+       return deconstruct_expr_walker((Node *) expr, &ctx);
+}
 
 /*
  * make_group_input_target
@@ -5734,15 +5827,13 @@ make_partial_grouping_target(PlannerInfo *root,
 {
        Query      *parse = root->parse;
        PathTarget *partial_target;
-       List       *non_group_cols;
-       List       *non_group_exprs;
-       int                     i;
+       List       *non_group_cols = NULL;
+       List       *grps_tlist = NULL;
+       int                     i = 0;
        ListCell   *lc;
 
        partial_target = create_empty_pathtarget();
-       non_group_cols = NIL;
 
-       i = 0;
        foreach(lc, grouping_target->exprs)
        {
                Expr       *expr = (Expr *) lfirst(lc);
@@ -5756,6 +5847,7 @@ make_partial_grouping_target(PlannerInfo *root,
                         * (This allows the upper agg step to repeat the 
grouping calcs.)
                         */
                        add_column_to_pathtarget(partial_target, expr, sgref);
+                       grps_tlist = lappend(grps_tlist, expr);
                }
                else
                {
@@ -5782,12 +5874,11 @@ make_partial_grouping_target(PlannerInfo *root,
         * be present already.)  Note this includes Vars used in resjunk items, 
so
         * we are covering the needs of ORDER BY and window specifications.
         */
-       non_group_exprs = pull_var_clause((Node *) non_group_cols,
-                                                                         
PVC_INCLUDE_AGGREGATES |
-                                                                         
PVC_RECURSE_WINDOWFUNCS |
-                                                                         
PVC_INCLUDE_PLACEHOLDERS);
-
-       add_new_columns_to_pathtarget(partial_target, non_group_exprs);
+       foreach(lc, non_group_cols)
+       {
+               Expr *expr = (Expr *) lfirst(lc);
+               deconstruct_expr(expr, partial_target, grps_tlist);
+       }
 
        /*
         * Adjust Aggrefs to put them in partial mode.  At this point all 
Aggrefs
@@ -5817,7 +5908,6 @@ make_partial_grouping_target(PlannerInfo *root,
        }
 
        /* clean up cruft */
-       list_free(non_group_exprs);
        list_free(non_group_cols);
 
        /* XXX this causes some redundant cost calculation ... */
diff --git a/src/test/regress/expected/bfv_aggregate.out 
b/src/test/regress/expected/bfv_aggregate.out
index b60276e0a5..9d0c9750f6 100644
--- a/src/test/regress/expected/bfv_aggregate.out
+++ b/src/test/regress/expected/bfv_aggregate.out
@@ -1914,6 +1914,84 @@ explain select g%10 as c1, sum(g::numeric)as c2, 
count(*) as c3 from generate_se
 (4 rows)
 
 reset optimizer_force_multistage_agg;
+-- Eliminate unuseful columns of targetlist in multistage-agg
+create table ex1(a int, b int, c int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table ex2(a int, b int, c int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into ex1 select i,i,i from generate_series(1, 10) i;
+insert into ex2 select i,i,i from generate_series(1, 10) i;
+explain (verbose on, costs off) select ex2.b/2, sum(ex1.a) from ex1, (select 
a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2;
+                                 QUERY PLAN                                 
+----------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: ((COALESCE(ex2.b, 1) / 2)), (sum(ex1.a))
+   ->  Finalize HashAggregate
+         Output: ((COALESCE(ex2.b, 1) / 2)), sum(ex1.a)
+         Group Key: ((COALESCE(ex2.b, 1) / 2))
+         ->  Redistribute Motion 3:3  (slice2; segments: 3)
+               Output: ((COALESCE(ex2.b, 1) / 2)), (PARTIAL sum(ex1.a))
+               Hash Key: ((COALESCE(ex2.b, 1) / 2))
+               ->  Partial HashAggregate
+                     Output: ((COALESCE(ex2.b, 1) / 2)), PARTIAL sum(ex1.a)
+                     Group Key: (COALESCE(ex2.b, 1) / 2)
+                     ->  Hash Join
+                           Output: (COALESCE(ex2.b, 1) / 2), ex1.a
+                           Hash Cond: (ex1.a = ex2.a)
+                           ->  Seq Scan on bfv_aggregate.ex1
+                                 Output: ex1.a, ex1.b, ex1.c
+                           ->  Hash
+                                 Output: ex2.b, ex2.a
+                                 ->  Seq Scan on bfv_aggregate.ex2
+                                       Output: ex2.b, ex2.a
+ Settings: optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(22 rows)
+
+select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 
where ex1.a = ex2.a group by ex2.b/2;
+ ?column? | sum 
+----------+-----
+        4 |  17
+        2 |   9
+        3 |  13
+        1 |   5
+        0 |   1
+        5 |  10
+(6 rows)
+
+explain (verbose on, costs off) SELECT b/2, sum(b) * (b/2) FROM ex1  GROUP BY 
b/2;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: ((b / 2)), ((sum(b) * ((b / 2))))
+   ->  Finalize HashAggregate
+         Output: ((b / 2)), (sum(b) * ((b / 2)))
+         Group Key: ((ex1.b / 2))
+         ->  Redistribute Motion 3:3  (slice2; segments: 3)
+               Output: ((b / 2)), (PARTIAL sum(b))
+               Hash Key: ((b / 2))
+               ->  Partial HashAggregate
+                     Output: ((b / 2)), PARTIAL sum(b)
+                     Group Key: (ex1.b / 2)
+                     ->  Seq Scan on bfv_aggregate.ex1
+                           Output: (b / 2), b
+ Settings: optimizer = 'off'
+ Optimizer: Postgres query optimizer
+(15 rows)
+
+SELECT b/2, sum(b) * (b/2) FROM ex1  GROUP BY b/2;
+ ?column? | ?column? 
+----------+----------
+        5 |       50
+        4 |       68
+        2 |       18
+        3 |       39
+        1 |        5
+        0 |        0
+(6 rows)
+
 -- CLEANUP
 set client_min_messages='warning';
 drop schema bfv_aggregate cascade;
diff --git a/src/test/regress/expected/bfv_aggregate_optimizer.out 
b/src/test/regress/expected/bfv_aggregate_optimizer.out
index 06efa5f6f6..03722b84bb 100644
--- a/src/test/regress/expected/bfv_aggregate_optimizer.out
+++ b/src/test/regress/expected/bfv_aggregate_optimizer.out
@@ -1939,6 +1939,82 @@ explain select g%10 as c1, sum(g::numeric)as c2, 
count(*) as c3 from generate_se
 (4 rows)
 
 reset optimizer_force_multistage_agg;
+-- Eliminate unuseful columns of targetlist in multistage-agg
+create table ex1(a int, b int, c int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table ex2(a int, b int, c int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Cloudberry Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into ex1 select i,i,i from generate_series(1, 10) i;
+insert into ex2 select i,i,i from generate_series(1, 10) i;
+explain (verbose on, costs off) select ex2.b/2, sum(ex1.a) from ex1, (select 
a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: (((COALESCE(ex2.b, 1)) / 2)), (sum(ex1.a))
+   ->  GroupAggregate
+         Output: (((COALESCE(ex2.b, 1)) / 2)), sum(ex1.a)
+         Group Key: (((COALESCE(ex2.b, 1)) / 2))
+         ->  Sort
+               Output: ex1.a, (((COALESCE(ex2.b, 1)) / 2))
+               Sort Key: (((COALESCE(ex2.b, 1)) / 2))
+               ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                     Output: ex1.a, (((COALESCE(ex2.b, 1)) / 2))
+                     Hash Key: (((COALESCE(ex2.b, 1)) / 2))
+                     ->  Hash Join
+                           Output: ex1.a, ((COALESCE(ex2.b, 1)) / 2)
+                           Hash Cond: (ex2.a = ex1.a)
+                           ->  Seq Scan on bfv_aggregate.ex2
+                                 Output: COALESCE(ex2.b, 1), ex2.a
+                           ->  Hash
+                                 Output: ex1.a
+                                 ->  Seq Scan on bfv_aggregate.ex1
+                                       Output: ex1.a
+ Optimizer: Pivotal Optimizer (GPORCA)
+(21 rows)
+
+select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 
where ex1.a = ex2.a group by ex2.b/2;
+ ?column? | sum 
+----------+-----
+        5 |  10
+        2 |   9
+        3 |  13
+        4 |  17
+        0 |   1
+        1 |   5
+(6 rows)
+
+explain (verbose on, costs off) SELECT b/2, sum(b) * (b/2) FROM ex1  GROUP BY 
b/2;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Output: ((b / 2)), ((sum(b) * ((b / 2))))
+   ->  GroupAggregate
+         Output: ((b / 2)), (sum(b) * ((b / 2)))
+         Group Key: ((ex1.b / 2))
+         ->  Sort
+               Output: b, ((b / 2))
+               Sort Key: ((ex1.b / 2))
+               ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                     Output: b, ((b / 2))
+                     Hash Key: ((b / 2))
+                     ->  Seq Scan on bfv_aggregate.ex1
+                           Output: b, (b / 2)
+ Optimizer: Pivotal Optimizer (GPORCA)
+(14 rows)
+
+SELECT b/2, sum(b) * (b/2) FROM ex1  GROUP BY b/2;
+ ?column? | ?column? 
+----------+----------
+        5 |       50
+        2 |       18
+        3 |       39
+        4 |       68
+        0 |        0
+        1 |        5
+(6 rows)
+
 -- CLEANUP
 set client_min_messages='warning';
 drop schema bfv_aggregate cascade;
diff --git a/src/test/regress/sql/bfv_aggregate.sql 
b/src/test/regress/sql/bfv_aggregate.sql
index 87b512418c..e7c922f48a 100644
--- a/src/test/regress/sql/bfv_aggregate.sql
+++ b/src/test/regress/sql/bfv_aggregate.sql
@@ -1489,6 +1489,15 @@ explain select count(*) from generate_series(0, 100) g;
 explain select g%10 as c1, sum(g::numeric)as c2, count(*) as c3 from 
generate_series(1, 99) g group by g%10;
 
 reset optimizer_force_multistage_agg;
+-- Eliminate unuseful columns of targetlist in multistage-agg
+create table ex1(a int, b int, c int);
+create table ex2(a int, b int, c int);
+insert into ex1 select i,i,i from generate_series(1, 10) i;
+insert into ex2 select i,i,i from generate_series(1, 10) i;
+explain (verbose on, costs off) select ex2.b/2, sum(ex1.a) from ex1, (select 
a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2;
+select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 
where ex1.a = ex2.a group by ex2.b/2;
+explain (verbose on, costs off) SELECT b/2, sum(b) * (b/2) FROM ex1  GROUP BY 
b/2;
+SELECT b/2, sum(b) * (b/2) FROM ex1  GROUP BY b/2;
 
 -- CLEANUP
 set client_min_messages='warning';


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) 07/39: fix redundant columns of mutlistage-agg plan (#16080)

Reply via email to