This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ae847b  IMPALA-10382: fix invalid outer join simplification
4ae847b is described below

commit 4ae847bf94e0a1e07860c9c7aa3f0dfcdf548fac
Author: xqhe <hexianqing...@126.com>
AuthorDate: Tue Dec 8 16:46:13 2020 +0800

    IMPALA-10382: fix invalid outer join simplification
    
    When set ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = true, the planner
    will simplify outer joins if the predicate with case expr or conditional
    function on both sides of outer join.
    However, the predicate maybe not null-rejecting, if simplify the outer
    join, the result is incorrect. E.g. t1.b > coalesce(t1.c, t2.c) can
    return true if t2.c is null, so it is not null-rejecting predicate
    for t2.
    
    The fix is simply to support the case that the predicate with two
    operands and the operator is one of (=, !=, >, <, >=, <=),
    1. one of the operands or
    2. if the operand is arithmetic expression and one of the children
    does not contain conditional builtin function or case expr and has
    tuple id in outer joined tuples.
    E.g. t1.b > coalesce(t2.c, t1.c) or t1.b + coalesce(t2.c, t1.c) >
    coalesce(t2.c, t1.c) is null-rejecting predicate for t1.
    
    Testing:
    * Add new plan tests in outer-to-inner-joins.test
    * Add new query tests to verify the correctness on transformation
    
    Change-Id: I84a3812f4212fa823f3d1ced6e12f2df05aedb2b
    Reviewed-on: http://gerrit.cloudera.org:8080/16845
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
---
 .../java/org/apache/impala/analysis/Analyzer.java  |  51 +++++--
 .../queries/PlannerTest/outer-to-inner-joins.test  | 156 +++++++++++++++++++++
 .../queries/QueryTest/outer-to-inner-joins.test    |  27 ++++
 3 files changed, 223 insertions(+), 11 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java 
b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index 8a73e07..a4633c8 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -59,6 +59,7 @@ import org.apache.impala.catalog.TableLoadingException;
 import org.apache.impala.catalog.Type;
 import org.apache.impala.catalog.local.LocalKuduTable;
 import org.apache.impala.common.AnalysisException;
+import org.apache.impala.common.Id;
 import org.apache.impala.common.IdGenerator;
 import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.InternalException;
@@ -3326,25 +3327,53 @@ public class Analyzer {
     }
 
     // Simply assume that a conjunct contains a UDF, is distinct from/ is not 
distinct
-    // from operator or nondeterministic buitin functions, it is not 
null-rejecting
-    // predicate.
+    // from operator, nondeterministic buitin functions or is null operator, 
it is not
+    // null-rejecting predicate.
     if 
(e.contains(Predicates.or(Expr.IS_DISTINCT_FROM_OR_NOT_DISTINCT_PREDICATE,
         Expr.IS_NONDETERMINISTIC_BUILTIN_FN_PREDICATE,
-        Expr.IS_UDF_PREDICATE))) {
+        Expr.IS_UDF_PREDICATE, Expr.IS_IS_NULL_PREDICATE))) {
       return true;
     }
 
-    // For conditional function, is null expr or case expr, if the tuple id of 
the expr
-    // is a subset of 'tupleIds', the result may have null value
+    // Predicate contains conditional function, case expr may not 
null-rejecting.
     List<Expr> maybeNullableExprs = new ArrayList<>();
     e.collectAll(Predicates.or(Expr.IS_CONDITIONAL_BUILTIN_FN_PREDICATE,
-        Expr.IS_IS_NULL_PREDICATE, Expr.IS_CASE_EXPR_PREDICATE), 
maybeNullableExprs);
-    for (Expr expr : maybeNullableExprs) {
-      List<TupleId> tids = new ArrayList<>();
-      expr.getIds(tids, null);
-      if (tupleIds.containsAll(tids)) {
-        return true;
+        Expr.IS_CASE_EXPR_PREDICATE), maybeNullableExprs);
+    if (!maybeNullableExprs.isEmpty()) {
+      if (!Expr.IS_BINARY_PREDICATE.apply(e)) return true;
+      // For t1 left join t2 on t1.a = t2.a where t2.b > coalesce(t1.c, t2.c) 
can
+      // simplify to an inner join. Simply support the case that one child 
does not
+      // contain conditional builtin function or case expr and has tuple id in 
outer
+      // joined tuples.
+      for (Expr operand : e.getChildren()) {
+        if (operand instanceof ArithmeticExpr) {
+          // 't1.id + coalesce(t1.c, t2.c) > coalesce(t2.c, t1.c)' is 
null-rejecting
+          // predicate for t1
+          for (Expr expr : operand.getChildren()) {
+            if (noConditionalBuiltinFnOrCaseExpr(expr, tupleIds)) return false;
+          }
+        } else {
+          if (noConditionalBuiltinFnOrCaseExpr(operand, tupleIds)) return 
false;
+        }
       }
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * If the 'e' does not contain conditional builtin function or case expr and 
has
+   * tupleId in 'tupleIds', return true, return false otherwise.
+   */
+  private boolean noConditionalBuiltinFnOrCaseExpr(Expr e, List<TupleId> 
tupleIds) {
+    List<Expr> nullableExprs = new ArrayList<>();
+    e.collectAll(Predicates.or(Expr.IS_CONDITIONAL_BUILTIN_FN_PREDICATE,
+        Expr.IS_CASE_EXPR_PREDICATE), nullableExprs);
+    if (!nullableExprs.isEmpty()) return false;
+    List<TupleId> tids = new ArrayList<>();
+    e.getIds(tids, null);
+    if (TupleId.intersect(tupleIds, new HashSet<>(tids))) {
+      return true;
     }
     return false;
   }
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
index a90b925..a1714e9 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
@@ -866,4 +866,160 @@ PLAN-ROOT SINK
    predicates: ws.ws_net_paid > 0, ws.ws_net_profit > 1, ws.ws_quantity > 0
    runtime filters: RF000 -> ws.ws_item_sk, RF001 -> ws.ws_order_number, RF004 
-> ws_sold_date_sk
    row-size=32B cardinality=71.94K
+====
+# IMPALA-10382: outer join should not be simplified if the predicate with case 
expr or
+# conditional builtin functions on both sides of outer join is not NULL 
filtering.
+# t1.tinyint_col >= coalesce(t1.int_col, t2.int_col) is not null-rejecting 
predicate for t2
+SELECT t1.int_col
+FROM functional.alltypestiny t1
+LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+  LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col
+WHERE t1.tinyint_col >= coalesce(t1.int_col, t2.int_col);
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.int_col = t3.int_col
+|  row-size=14B cardinality=7.14M
+|
+|--02:SCAN HDFS [functional.alltypes t3]
+|     HDFS partitions=24/24 files=24 size=478.45KB
+|     row-size=4B cardinality=7.30K
+|
+03:HASH JOIN [RIGHT OUTER JOIN]
+|  hash predicates: t2.tinyint_col = t1.tinyint_col
+|  other predicates: t1.tinyint_col >= coalesce(t1.int_col, t2.int_col)
+|  runtime filters: RF000 <- t1.tinyint_col
+|  row-size=10B cardinality=9.78K
+|
+|--00:SCAN HDFS [functional.alltypestiny t1]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=5B cardinality=8
+|
+01:SCAN HDFS [functional.alltypesagg t2]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   runtime filters: RF000 -> t2.tinyint_col
+   row-size=5B cardinality=11.00K
+====
+# IMPALA-10382: outer join should not be simplified if the predicate with case 
expr or
+# conditional builtin functions on both sides of outer join is not NULL 
filtering.
+# t2.tinyint_col >= coalesce(t1.int_col, t2.int_col) is null-rejecting 
predicate for t2
+SELECT t1.int_col
+FROM functional.alltypestiny t1
+LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+  LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col
+WHERE t2.tinyint_col >= coalesce(t1.int_col, t2.int_col);
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.int_col = t3.int_col
+|  row-size=14B cardinality=7.14M
+|
+|--02:SCAN HDFS [functional.alltypes t3]
+|     HDFS partitions=24/24 files=24 size=478.45KB
+|     row-size=4B cardinality=7.30K
+|
+03:HASH JOIN [INNER JOIN]
+|  hash predicates: t2.tinyint_col = t1.tinyint_col
+|  other predicates: t2.tinyint_col >= coalesce(t1.int_col, t2.int_col)
+|  runtime filters: RF000 <- t1.tinyint_col
+|  row-size=10B cardinality=9.78K
+|
+|--00:SCAN HDFS [functional.alltypestiny t1]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=5B cardinality=8
+|
+01:SCAN HDFS [functional.alltypesagg t2]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   runtime filters: RF000 -> t2.tinyint_col
+   row-size=5B cardinality=11.00K
+====
+# IMPALA-10382: outer join should not be simplified if the predicate with case 
expr or
+# conditional builtin functions on both sides of outer join is not NULL 
filtering.
+# The predicate is only null-rejecting for t2, so convert t1 full join t2 to 
t1 right join t2
+SELECT t1.int_col
+FROM functional.alltypestiny t1
+    FULL JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+WHERE t2.tinyint_col + CASE
+    WHEN t1.int_col IS NOT NULL THEN t1.int_col
+    ELSE t2.int_col
+END >= CASE
+    WHEN t1.int_col IS NOT NULL THEN t1.int_col
+    ELSE t2.int_col
+END
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t2.tinyint_col = t1.tinyint_col
+|  other predicates: t2.tinyint_col + CASE WHEN t1.int_col IS NOT NULL THEN 
t1.int_col ELSE t2.int_col END >= CASE WHEN t1.int_col IS NOT NULL THEN 
t1.int_col ELSE t2.int_col END
+|  row-size=10B cardinality=11.00K
+|
+|--00:SCAN HDFS [functional.alltypestiny t1]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=5B cardinality=8
+|
+01:SCAN HDFS [functional.alltypesagg t2]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   row-size=5B cardinality=11.00K
+====
+# IMPALA-10382: outer join should not be simplified if the predicate with case 
expr or
+# conditional builtin functions on both sides of outer join is not NULL 
filtering.
+SELECT t1.int_col
+FROM functional.alltypestiny t1
+LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+  LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col
+WHERE t2.tinyint_col + coalesce(t2.int_col, t1.int_col) >= 
coalesce(t1.int_col, t2.int_col);
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.int_col = t3.int_col
+|  row-size=14B cardinality=7.14M
+|
+|--02:SCAN HDFS [functional.alltypes t3]
+|     HDFS partitions=24/24 files=24 size=478.45KB
+|     row-size=4B cardinality=7.30K
+|
+03:HASH JOIN [INNER JOIN]
+|  hash predicates: t2.tinyint_col = t1.tinyint_col
+|  other predicates: t2.tinyint_col + coalesce(t2.int_col, t1.int_col) >= 
coalesce(t1.int_col, t2.int_col)
+|  runtime filters: RF000 <- t1.tinyint_col
+|  row-size=10B cardinality=9.78K
+|
+|--00:SCAN HDFS [functional.alltypestiny t1]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=5B cardinality=8
+|
+01:SCAN HDFS [functional.alltypesagg t2]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   runtime filters: RF000 -> t2.tinyint_col
+   row-size=5B cardinality=11.00K
+====
+# IMPALA-10382: outer join should not be simplified if the predicate with case 
expr or
+# conditional builtin functions on both sides of outer join is not NULL 
filtering.
+# t2.tinyint_col + t1.int_col + coalesce(t2.int_col, t1.int_col) >= 
coalesce(t1.int_col, t2.int_col)
+# is null-rejecting for t1 and t2
+SELECT t1.int_col
+FROM functional.alltypestiny t1
+  FULL JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+WHERE t2.tinyint_col + t1.int_col + coalesce(t2.int_col, t1.int_col) >= 
coalesce(t1.int_col, t2.int_col);
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [INNER JOIN]
+|  hash predicates: t2.tinyint_col = t1.tinyint_col
+|  other predicates: t2.tinyint_col + t1.int_col + coalesce(t2.int_col, 
t1.int_col) >= coalesce(t1.int_col, t2.int_col)
+|  runtime filters: RF000 <- t1.tinyint_col
+|  row-size=10B cardinality=9.78K
+|
+|--00:SCAN HDFS [functional.alltypestiny t1]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=5B cardinality=8
+|
+01:SCAN HDFS [functional.alltypesagg t2]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   runtime filters: RF000 -> t2.tinyint_col
+   row-size=5B cardinality=11.00K
 ====
\ No newline at end of file
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
 
b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
index 0516721..f7d2933 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
@@ -255,4 +255,31 @@ where t1.id = t2.pos and t2.item = 2
 2
 ---- TYPES
 bigint
+====
+---- QUERY
+SELECT count(*)
+FROM functional.alltypestiny t1
+LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+  LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col
+WHERE t1.tinyint_col >= coalesce(t1.int_col, t2.int_col)
+---- RESULTS
+2922920
+---- TYPES
+bigint
+====
+---- QUERY
+SELECT count(*)
+FROM functional.alltypestiny t1
+    FULL JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col
+WHERE t2.tinyint_col + CASE
+    WHEN t1.int_col IS NOT NULL THEN t1.int_col
+    ELSE t2.int_col
+END >= CASE
+    WHEN t1.int_col IS NOT NULL THEN t1.int_col
+    ELSE t2.int_col
+END
+---- RESULTS
+12000
+---- TYPES
+bigint
 ====
\ No newline at end of file

Reply via email to