This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 745b89a8e83f8f4ebcbf19dd8b65b47db4c2e7f2
Author: Zhenghua Lyu <[email protected]>
AuthorDate: Mon Jun 5 14:23:32 2023 +0000

    Refactor cdbpullup_missingVarWalker.
    
    Previously the API cdbpullup_missingVarWalker() is used to check if an
    expression can be represented by a targetlist. There are two problems
    for the previous design:
      * the API return bool type, the walker help function also return
        bool, but the walker's return value is to control
        expression_tree_walker's termination, it is not good to use
        walker's return value as the API's return value.
      * how to handle the case if there are no Vars in the node expression.
    
    This commit fixes the above two issues:
      * use context to store result needed by users, not the walker's
        return value
      * if there are no vars in the node expression, take this case as
        missing (cannot represented via targetlist).
    
    Fix Github Issue #15079.
---
 src/backend/cdb/cdbpullup.c                        | 112 +++++++++++++++------
 src/test/regress/expected/bfv_subquery.out         |  10 ++
 .../regress/expected/bfv_subquery_optimizer.out    |  10 ++
 src/test/regress/sql/bfv_subquery.sql              |   5 +
 4 files changed, 104 insertions(+), 33 deletions(-)

diff --git a/src/backend/cdb/cdbpullup.c b/src/backend/cdb/cdbpullup.c
index c6f243ae08..ae5945d0b6 100644
--- a/src/backend/cdb/cdbpullup.c
+++ b/src/backend/cdb/cdbpullup.c
@@ -27,9 +27,18 @@
 #include "cdb/cdbpullup.h"             /* me */
 
 
-static bool cdbpullup_missingVarWalker(Expr *node, void *targetlist);
+static bool  cdbpullup_missingVarCheck(Expr *node, List *targetlist);
+static List *cdbpullup_getall_vars(Node *node);
+static bool  cdbpullup_getall_vars_walker(Node *node, void *context);
+static bool  cdbpullup_missingVarCheckWalker(Node *node, void *context);
 static Expr *cdbpullup_make_expr(Index varno, AttrNumber varattno, Expr 
*oldexpr, bool modifyOld);
 
+
+typedef struct MissingVarContext
+{
+       List   *vars;
+       List   *findvars;
+} MissingVarContext;
 /*
  * cdbpullup_expr
  *
@@ -317,10 +326,8 @@ cdbpullup_findEclassInTargetList(EquivalenceClass *eclass, 
List *targetlist,
 
                /* Return this item if all referenced Vars are in targetlist. */
                if (!IsA(key, Var) &&
-                       !cdbpullup_missingVarWalker(key, targetlist))
-               {
+                       !cdbpullup_missingVarCheck(key, targetlist))
                        return key;
-               }
        }
 
        return NULL;
@@ -414,46 +421,85 @@ cdbpullup_make_expr(Index varno, AttrNumber varattno, 
Expr *oldexpr, bool modify
        }
 }                                                              /* 
cdbpullup_make_var */
 
-
 /*
- * cdbpullup_missingVarWalker
- *
- * Returns true if some Var in expr is not in targetlist.
- * 'targetlist' is either a List of TargetEntry, or a plain List of Expr.
+ * cdbpullup_missingVarCheck
+ *   Given an expression and a targetlist, check if the expression
+ *   can be represented by the exprs in the targetlist.
  *
- * NB:  A Var in the expr is considered as matching a Var in the targetlist
- * without regard for whether or not there is a RelabelType node atop the
- * targetlist Var.
+ *   This function return true when Expr node cannot be represented
+ *   based on targetlist (which matches the name that we have missing vars).
  *
+ *   The implementaion is as below:
+ *     1. build the list of all Vars in the Expr node, it is a set (created
+ *        by list_append_unique_ptr in the helper function 
cdbpullup_getall_vars)
+ *     2. walk the targetlist to build a set (created by 
list_append_unique_ptr)
+ *        for all Vars in targetlist that is member of Varset built in 1st 
step,
+ *        here membership test is using list_member which call equal to test.
+ *     3. if the size of the 2nd step's var set is smaller than the 1st step's,
+ *        it means the targetlist misses some vars.
  */
 static bool
-cdbpullup_missingVarWalker(Expr *node, void *targetlist)
+cdbpullup_missingVarCheck(Expr *node, List *targetlist)
 {
-       if (!node)
+       List             *vars    = NIL;
+       MissingVarContext context;
+       bool              missing;
+
+       vars = cdbpullup_getall_vars((Node *) node);
+       if (list_length(vars) == 0)
+               return true;
+
+       context.vars = vars;
+       context.findvars = NIL;
+       (void) expression_tree_walker((Node *) targetlist,
+                                                                 
cdbpullup_missingVarCheckWalker,
+                                                                 (void *) 
&context);
+       missing = (list_length(context.findvars) < list_length(vars));
+       list_free(vars);
+       list_free(context.findvars);
+       return missing;
+}
+
+static List *
+cdbpullup_getall_vars(Node *node)
+{
+       List *vars = NIL;
+
+       (void) expression_tree_walker(node, cdbpullup_getall_vars_walker, (void 
*) &vars);
+       return vars;
+}
+
+static bool
+cdbpullup_getall_vars_walker(Node *node, void *context)
+{
+       List **vars = context;
+
+       if (node == NULL)
                return false;
 
-       /*
-        * Should also consider PlaceHolderVar in the targetlist.
-        * See github issue: https://github.com/greenplum-db/gpdb/issues/10315
-        */
-       if (IsA(node, Var) || IsA(node, PlaceHolderVar))
+       if (IsA(node, Var))
        {
-               if (!targetlist)
-                       return true;
+               *vars = list_append_unique_ptr(*vars, node);
+               return false;
+       }
 
-               /* is targetlist a List of TargetEntry? */
-               if (IsA(linitial(targetlist), TargetEntry))
-               {
-                       if (tlist_member_ignore_relabel(node, (List *) 
targetlist))
-                               return false;   /* this Var ok - go on to check 
rest of expr */
-               }
+       return expression_tree_walker(node, cdbpullup_getall_vars_walker, (void 
*) vars);
+}
+
+static bool
+cdbpullup_missingVarCheckWalker(Node *node, void *context)
+{
+       MissingVarContext *ctx = context;
 
-               /* targetlist must be a List of Expr */
-               else if (list_member((List *) targetlist, node))
-                       return false;           /* this Var ok - go on to check 
rest of expr */
+       if (node == NULL)
+               return false;
 
-               return true;                    /* Var is not in targetlist - 
quit the walk */
+       if (IsA(node, Var))
+       {
+               if (list_member(ctx->vars, node))
+                       ctx->findvars = list_append_unique_ptr(ctx->findvars, 
node);
+               return false;
        }
 
-       return expression_tree_walker((Node *) node, 
cdbpullup_missingVarWalker, targetlist);
-}                                                              /* 
cdbpullup_missingVarWalker */
+       return expression_tree_walker(node, cdbpullup_missingVarCheckWalker, 
(void *) ctx);
+}
diff --git a/src/test/regress/expected/bfv_subquery.out 
b/src/test/regress/expected/bfv_subquery.out
index 741211b3b3..ff9f04adc2 100644
--- a/src/test/regress/expected/bfv_subquery.out
+++ b/src/test/regress/expected/bfv_subquery.out
@@ -555,3 +555,13 @@ limit 15;
  2 |   102
 (15 rows)
 
+-- test case for Github Issue 15079
+create table if not exists issue_15079(c0 real, c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c0' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+select * from (select bool_or(issue_15079.c1) from issue_15079 group by 
random()) as result;
+ bool_or 
+---------
+(0 rows)
+
+drop table issue_15079;
diff --git a/src/test/regress/expected/bfv_subquery_optimizer.out 
b/src/test/regress/expected/bfv_subquery_optimizer.out
index f89dd20935..db858dd038 100644
--- a/src/test/regress/expected/bfv_subquery_optimizer.out
+++ b/src/test/regress/expected/bfv_subquery_optimizer.out
@@ -550,3 +550,13 @@ limit 15;
  2 |   102
 (15 rows)
 
+-- test case for Github Issue 15079
+create table if not exists issue_15079(c0 real, c1 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c0' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+select * from (select bool_or(issue_15079.c1) from issue_15079 group by 
random()) as result;
+ bool_or 
+---------
+(0 rows)
+
+drop table issue_15079;
diff --git a/src/test/regress/sql/bfv_subquery.sql 
b/src/test/regress/sql/bfv_subquery.sql
index e03fcad15b..3e968e3e74 100644
--- a/src/test/regress/sql/bfv_subquery.sql
+++ b/src/test/regress/sql/bfv_subquery.sql
@@ -325,3 +325,8 @@ select with_test2.* from with_test2
 where value < all (select total from my_group_sum where my_group_sum.i = 
with_test2.i)
 order by 1,2
 limit 15;
+
+-- test case for Github Issue 15079
+create table if not exists issue_15079(c0 real, c1 boolean);
+select * from (select bool_or(issue_15079.c1) from issue_15079 group by 
random()) as result;
+drop table issue_15079;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to