This is an automated email from the ASF dual-hosted git repository. yjhjstz pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit c0fcd9924fe79830d5e033053572be1e881d49a6 Author: chaotian <[email protected]> AuthorDate: Tue Oct 17 17:05:03 2023 +0800 fix redundant columns of mutlistage-agg plan (#16080) Partial Agg emitted redundant columns in targetlist especially on multi-stage agg scenario (upstream didn't have this concern since they don't have mutlistage-agg path). Now we use `pull_var_clause()` for pull-up vars, aggref and placeholdervar of the top targetlist, but there are missed cases -- targetentry exists in group clause -- we should keep them as group clause rather than pulling up it. For implementing it, `deconstruct_expr()` works for what we mentioned above, not pull-up `Exprs` anymore if it is a group clause. --- src/backend/optimizer/plan/planner.c | 114 ++++++++++++++++++--- src/test/regress/expected/bfv_aggregate.out | 78 ++++++++++++++ .../regress/expected/bfv_aggregate_optimizer.out | 76 ++++++++++++++ src/test/regress/sql/bfv_aggregate.sql | 9 ++ 4 files changed, 265 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 2a175b7c3b..16efdb2ed8 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -161,6 +161,12 @@ typedef struct AggStrategy strat; } split_rollup_data; +typedef struct +{ + PathTarget *partial_target; + List *grps_tlist; +} deconstruct_expr_context; + /* Local functions */ static Node *preprocess_expression(PlannerInfo *root, Node *expr, int kind); void preprocess_qual_conditions(PlannerInfo *root, Node *jtnode); @@ -5612,6 +5618,93 @@ create_scatter_path(PlannerInfo *root, List *scatterClause, Path *path) return path; } +/* + * Function: deconstruct_expr_walker + * + * Work for deconstruct_expr. + */ +static bool +deconstruct_expr_walker(Node *node, deconstruct_expr_context *ctx) +{ + ListCell *lc; + + if (node == NULL) + { + return false; + } + else if (IsA(node, Var)) + { + if (((Var *) node)->varlevelsup != 0) + elog(ERROR, "Upper-level Var found where not expected"); + + add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node); + return false; + } + else if (IsA(node, PlaceHolderVar)) + { + if (((PlaceHolderVar *) node)->phlevelsup != 0) + elog(ERROR, "Upper-level PlaceHolderVar found where not expected"); + + add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node); + return false; + } + else if (IsA(node, Aggref)) + { + if (((Aggref *) node)->agglevelsup != 0) + elog(ERROR, "Upper-level Aggref found where not expected"); + + add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node); + return false; + } + else if (IsA(node, GroupId)) + { + if (((GroupId *) node)->agglevelsup != 0) + elog(ERROR, "Upper-level GROUP_ID found where not expected"); + + add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node); + return false; + } + else if (IsA(node, GroupingFunc)) + { + if (((GroupingFunc *) node)->agglevelsup != 0) + elog(ERROR, "Upper-level GROUPING found where not expected"); + + add_new_column_to_pathtarget(ctx->partial_target, (Expr *)node); + return false; + } + else + { + foreach(lc, ctx->grps_tlist) + { + Expr *grp_expr = (Expr *)lfirst(lc); + + /* just return if node equal to group column */ + if (equal(node, grp_expr)) + { + return false; + } + } + } + + return expression_tree_walker(node, deconstruct_expr_walker, (void *) ctx); +} + +/* + * Function: deconstruct_expr + * + * Prepare an expression for execution within 2-stage aggregation. + * This involves adding targets as needed to the target list of the + * first (partial) aggregation. + */ +static bool +deconstruct_expr(Expr *expr, PathTarget *partial_target, List *grps_tlist) +{ + deconstruct_expr_context ctx; + ctx.partial_target = partial_target; + ctx.grps_tlist = grps_tlist; + + return deconstruct_expr_walker((Node *) expr, &ctx); +} /* * make_group_input_target @@ -5734,15 +5827,13 @@ make_partial_grouping_target(PlannerInfo *root, { Query *parse = root->parse; PathTarget *partial_target; - List *non_group_cols; - List *non_group_exprs; - int i; + List *non_group_cols = NULL; + List *grps_tlist = NULL; + int i = 0; ListCell *lc; partial_target = create_empty_pathtarget(); - non_group_cols = NIL; - i = 0; foreach(lc, grouping_target->exprs) { Expr *expr = (Expr *) lfirst(lc); @@ -5756,6 +5847,7 @@ make_partial_grouping_target(PlannerInfo *root, * (This allows the upper agg step to repeat the grouping calcs.) */ add_column_to_pathtarget(partial_target, expr, sgref); + grps_tlist = lappend(grps_tlist, expr); } else { @@ -5782,12 +5874,11 @@ make_partial_grouping_target(PlannerInfo *root, * be present already.) Note this includes Vars used in resjunk items, so * we are covering the needs of ORDER BY and window specifications. */ - non_group_exprs = pull_var_clause((Node *) non_group_cols, - PVC_INCLUDE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - - add_new_columns_to_pathtarget(partial_target, non_group_exprs); + foreach(lc, non_group_cols) + { + Expr *expr = (Expr *) lfirst(lc); + deconstruct_expr(expr, partial_target, grps_tlist); + } /* * Adjust Aggrefs to put them in partial mode. At this point all Aggrefs @@ -5817,7 +5908,6 @@ make_partial_grouping_target(PlannerInfo *root, } /* clean up cruft */ - list_free(non_group_exprs); list_free(non_group_cols); /* XXX this causes some redundant cost calculation ... */ diff --git a/src/test/regress/expected/bfv_aggregate.out b/src/test/regress/expected/bfv_aggregate.out index b60276e0a5..9d0c9750f6 100644 --- a/src/test/regress/expected/bfv_aggregate.out +++ b/src/test/regress/expected/bfv_aggregate.out @@ -1914,6 +1914,84 @@ explain select g%10 as c1, sum(g::numeric)as c2, count(*) as c3 from generate_se (4 rows) reset optimizer_force_multistage_agg; +-- Eliminate unuseful columns of targetlist in multistage-agg +create table ex1(a int, b int, c int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table ex2(a int, b int, c int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into ex1 select i,i,i from generate_series(1, 10) i; +insert into ex2 select i,i,i from generate_series(1, 10) i; +explain (verbose on, costs off) select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2; + QUERY PLAN +---------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Output: ((COALESCE(ex2.b, 1) / 2)), (sum(ex1.a)) + -> Finalize HashAggregate + Output: ((COALESCE(ex2.b, 1) / 2)), sum(ex1.a) + Group Key: ((COALESCE(ex2.b, 1) / 2)) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Output: ((COALESCE(ex2.b, 1) / 2)), (PARTIAL sum(ex1.a)) + Hash Key: ((COALESCE(ex2.b, 1) / 2)) + -> Partial HashAggregate + Output: ((COALESCE(ex2.b, 1) / 2)), PARTIAL sum(ex1.a) + Group Key: (COALESCE(ex2.b, 1) / 2) + -> Hash Join + Output: (COALESCE(ex2.b, 1) / 2), ex1.a + Hash Cond: (ex1.a = ex2.a) + -> Seq Scan on bfv_aggregate.ex1 + Output: ex1.a, ex1.b, ex1.c + -> Hash + Output: ex2.b, ex2.a + -> Seq Scan on bfv_aggregate.ex2 + Output: ex2.b, ex2.a + Settings: optimizer = 'off' + Optimizer: Postgres query optimizer +(22 rows) + +select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2; + ?column? | sum +----------+----- + 4 | 17 + 2 | 9 + 3 | 13 + 1 | 5 + 0 | 1 + 5 | 10 +(6 rows) + +explain (verbose on, costs off) SELECT b/2, sum(b) * (b/2) FROM ex1 GROUP BY b/2; + QUERY PLAN +------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + Output: ((b / 2)), ((sum(b) * ((b / 2)))) + -> Finalize HashAggregate + Output: ((b / 2)), (sum(b) * ((b / 2))) + Group Key: ((ex1.b / 2)) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Output: ((b / 2)), (PARTIAL sum(b)) + Hash Key: ((b / 2)) + -> Partial HashAggregate + Output: ((b / 2)), PARTIAL sum(b) + Group Key: (ex1.b / 2) + -> Seq Scan on bfv_aggregate.ex1 + Output: (b / 2), b + Settings: optimizer = 'off' + Optimizer: Postgres query optimizer +(15 rows) + +SELECT b/2, sum(b) * (b/2) FROM ex1 GROUP BY b/2; + ?column? | ?column? +----------+---------- + 5 | 50 + 4 | 68 + 2 | 18 + 3 | 39 + 1 | 5 + 0 | 0 +(6 rows) + -- CLEANUP set client_min_messages='warning'; drop schema bfv_aggregate cascade; diff --git a/src/test/regress/expected/bfv_aggregate_optimizer.out b/src/test/regress/expected/bfv_aggregate_optimizer.out index 06efa5f6f6..03722b84bb 100644 --- a/src/test/regress/expected/bfv_aggregate_optimizer.out +++ b/src/test/regress/expected/bfv_aggregate_optimizer.out @@ -1939,6 +1939,82 @@ explain select g%10 as c1, sum(g::numeric)as c2, count(*) as c3 from generate_se (4 rows) reset optimizer_force_multistage_agg; +-- Eliminate unuseful columns of targetlist in multistage-agg +create table ex1(a int, b int, c int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table ex2(a int, b int, c int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into ex1 select i,i,i from generate_series(1, 10) i; +insert into ex2 select i,i,i from generate_series(1, 10) i; +explain (verbose on, costs off) select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2; + QUERY PLAN +--------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Output: (((COALESCE(ex2.b, 1)) / 2)), (sum(ex1.a)) + -> GroupAggregate + Output: (((COALESCE(ex2.b, 1)) / 2)), sum(ex1.a) + Group Key: (((COALESCE(ex2.b, 1)) / 2)) + -> Sort + Output: ex1.a, (((COALESCE(ex2.b, 1)) / 2)) + Sort Key: (((COALESCE(ex2.b, 1)) / 2)) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Output: ex1.a, (((COALESCE(ex2.b, 1)) / 2)) + Hash Key: (((COALESCE(ex2.b, 1)) / 2)) + -> Hash Join + Output: ex1.a, ((COALESCE(ex2.b, 1)) / 2) + Hash Cond: (ex2.a = ex1.a) + -> Seq Scan on bfv_aggregate.ex2 + Output: COALESCE(ex2.b, 1), ex2.a + -> Hash + Output: ex1.a + -> Seq Scan on bfv_aggregate.ex1 + Output: ex1.a + Optimizer: Pivotal Optimizer (GPORCA) +(21 rows) + +select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2; + ?column? | sum +----------+----- + 5 | 10 + 2 | 9 + 3 | 13 + 4 | 17 + 0 | 1 + 1 | 5 +(6 rows) + +explain (verbose on, costs off) SELECT b/2, sum(b) * (b/2) FROM ex1 GROUP BY b/2; + QUERY PLAN +------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + Output: ((b / 2)), ((sum(b) * ((b / 2)))) + -> GroupAggregate + Output: ((b / 2)), (sum(b) * ((b / 2))) + Group Key: ((ex1.b / 2)) + -> Sort + Output: b, ((b / 2)) + Sort Key: ((ex1.b / 2)) + -> Redistribute Motion 3:3 (slice2; segments: 3) + Output: b, ((b / 2)) + Hash Key: ((b / 2)) + -> Seq Scan on bfv_aggregate.ex1 + Output: b, (b / 2) + Optimizer: Pivotal Optimizer (GPORCA) +(14 rows) + +SELECT b/2, sum(b) * (b/2) FROM ex1 GROUP BY b/2; + ?column? | ?column? +----------+---------- + 5 | 50 + 2 | 18 + 3 | 39 + 4 | 68 + 0 | 0 + 1 | 5 +(6 rows) + -- CLEANUP set client_min_messages='warning'; drop schema bfv_aggregate cascade; diff --git a/src/test/regress/sql/bfv_aggregate.sql b/src/test/regress/sql/bfv_aggregate.sql index 87b512418c..e7c922f48a 100644 --- a/src/test/regress/sql/bfv_aggregate.sql +++ b/src/test/regress/sql/bfv_aggregate.sql @@ -1489,6 +1489,15 @@ explain select count(*) from generate_series(0, 100) g; explain select g%10 as c1, sum(g::numeric)as c2, count(*) as c3 from generate_series(1, 99) g group by g%10; reset optimizer_force_multistage_agg; +-- Eliminate unuseful columns of targetlist in multistage-agg +create table ex1(a int, b int, c int); +create table ex2(a int, b int, c int); +insert into ex1 select i,i,i from generate_series(1, 10) i; +insert into ex2 select i,i,i from generate_series(1, 10) i; +explain (verbose on, costs off) select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2; +select ex2.b/2, sum(ex1.a) from ex1, (select a, coalesce(b, 1) b from ex2) ex2 where ex1.a = ex2.a group by ex2.b/2; +explain (verbose on, costs off) SELECT b/2, sum(b) * (b/2) FROM ex1 GROUP BY b/2; +SELECT b/2, sum(b) * (b/2) FROM ex1 GROUP BY b/2; -- CLEANUP set client_min_messages='warning'; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
