This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new f684ed7  IMPALA-10252: fix invalid runtime filters for outer joins
f684ed7 is described below

commit f684ed72c541fa04dc1841a1aab83a7c9847f1a2
Author: Tim Armstrong <tarmstr...@cloudera.com>
AuthorDate: Wed Oct 21 13:59:25 2020 -0700

    IMPALA-10252: fix invalid runtime filters for outer joins
    
    The planner generates runtime filters for non-join conjuncts
    assigned to LEFT OUTER and FULL OUTER JOIN nodes. This is
    correct in many cases where NULLs stemming from unmatched rows
    would result in the predicate evaluating to false. E.g.
    x = y is always false if y is NULL.
    
    However, it is incorrect if the NULL returned from the unmatched
    row can result in the predicate evaluating to true. E.g.
    x = isnull(y, 1) can return true even if y is NULL.
    
    The fix is to detect cases when the source expression from the
    left input of the join returns non-NULL for null inputs and then
    skip generating the filter.
    
    Examples of expressions that may be affected by this change are
    COALESCE and ISNULL.
    
    Testing:
    Added regression tests:
    * Planner tests for LEFT OUTER and FULL OUTER where the runtime
      filter was incorrectly generated before this patch.
    * Enabled end-to-end test that was previously failing.
    * Added a new runtime filter test that will execute on both
      Parquet and Kudu (which are subtly different because of nullability of
      slots).
    
    Ran exhaustive tests.
    
    Change-Id: I507af1cc8df15bca21e0d8555019997812087261
    Reviewed-on: http://gerrit.cloudera.org:8080/16622
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 .../java/org/apache/impala/analysis/Analyzer.java  | 22 +++++++-
 .../impala/planner/RuntimeFilterGenerator.java     | 31 ++++++++++++
 .../PlannerTest/runtime-filter-propagation.test    | 59 +++++++++++++++++++++-
 .../queries/QueryTest/runtime_filters.test         | 33 ++++++++++++
 .../queries/QueryTest/subquery.test                |  6 +--
 tests/query_test/test_queries.py                   |  2 -
 6 files changed, 144 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java 
b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index a5b1e36..8a73e07 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -2301,6 +2301,25 @@ public class Analyzer {
    * returns false otherwise. Throws if backend expression evaluation fails.
    */
   public boolean isTrueWithNullSlots(Expr p) throws InternalException {
+    Expr nullTuplePred = substituteNullSlots(p);
+    return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx());
+  }
+
+  /**
+   * Try to evaluate 'p' with all NULL slots into a literal.
+   * @return null if it could not be evaluated successfully, the literal 
otherwise.
+   * @throws AnalysisException
+   */
+  public LiteralExpr evalWithNullSlots(Expr p) throws AnalysisException {
+    Expr nullTuplePred = substituteNullSlots(p);
+    return LiteralExpr.createBounded(
+            nullTuplePred, getQueryCtx(), StringLiteral.MAX_STRING_LEN);
+  }
+
+  /**
+   * Replace all the SlotRefs in 'p' with null literals
+   */
+  private Expr substituteNullSlots(Expr p) {
     // Construct predicate with all SlotRefs substituted by NullLiterals.
     List<SlotRef> slotRefs = new ArrayList<>();
     p.collect(Predicates.instanceOf(SlotRef.class), slotRefs);
@@ -2313,8 +2332,7 @@ public class Analyzer {
         // function signature as in the original predicate.
         nullSmap.put(slotRef.clone(), NullLiteral.create(slotRef.getType()));
     }
-    Expr nullTuplePred = p.substitute(nullSmap, this, false);
-    return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx());
+    return p.substitute(nullSmap, this, false);
   }
 
   public TupleId getTupleId(SlotId slotId) {
diff --git 
a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java 
b/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
index 8356d78..1422129 100644
--- a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
+++ b/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
@@ -35,6 +35,8 @@ import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.ExprSubstitutionMap;
 import org.apache.impala.analysis.FunctionCallExpr;
 import org.apache.impala.analysis.IsNullPredicate;
+import org.apache.impala.analysis.LiteralExpr;
+import org.apache.impala.analysis.NullLiteral;
 import org.apache.impala.analysis.Predicate;
 import org.apache.impala.analysis.SlotDescriptor;
 import org.apache.impala.analysis.SlotId;
@@ -368,6 +370,35 @@ public final class RuntimeFilterGenerator {
       Preconditions.checkNotNull(targetSlots);
       if (targetSlots.isEmpty()) return null;
 
+      if (filterSrcNode.getJoinOp().isLeftOuterJoin() ||
+              filterSrcNode.getJoinOp().isFullOuterJoin()) {
+        try {
+          LiteralExpr nullSlotVal = analyzer.evalWithNullSlots(srcExpr);
+          if (!(nullSlotVal instanceof NullLiteral)) {
+            // IMPALA-10252: if the source expression returns a non-NULL value 
for a NULL
+            // input then it is not safe to generate a runtime filter from a 
LEFT OUTER
+            // JOIN or FULL OUTER JOIN because the generated filter would be 
missing a
+            // possible non-NULL value of the expression.
+            // E.g. If the source expression is zeroifnull(y), column y has 
values
+            // [1, 2, 3], and y comes from the right input of a left outer 
join , then
+            // the generated runtime filter would only contain the values [1, 
2, 3] but
+            // not the value zeroifnull(NULL) = 0 that would be present for an 
unmatched
+            // row. For now we avoid the problem by skipping this filter. In 
future we
+            // could generate valid runtime filters for these join types by 
adding
+            // backend support to calculate and insert the missing value.
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Skipping runtime filter because source expression 
returns " +
+                      "non-null after null substitution: " + srcExpr.toSql());
+            }
+            return null;
+          }
+        } catch (AnalysisException e) {
+          LOG.warn("Skipping runtime filter because analysis after null 
substitution " +
+              "failed: " + srcExpr.toSql(), e);
+          return null;
+        }
+      }
+
       if (LOG.isTraceEnabled()) {
         LOG.trace("Generating runtime filter from predicate " + joinPredicate);
       }
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
index 6292d80..050000d 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
@@ -1266,7 +1266,7 @@ PLAN-ROOT SINK
 |     row-size=32B cardinality=1
 |
 00:SCAN HDFS [tpch_nested_parquet.customer c]
-   HDFS partitions=1/1 files=4 size=289.02MB
+   HDFS partitions=1/1 files=4 size=289.08MB
    row-size=32B cardinality=150.00K
 ====
 # Two-way join query where the build side is optimized into an empty set
@@ -1817,3 +1817,60 @@ PLAN-ROOT SINK
    HDFS partitions=4/4 files=4 size=460B
    row-size=4B cardinality=8
 ====
+# IMPALA-10252: runtime filters should not be generated from left and full 
outer joins
+# when expressions are not null-preserving, e.g. zeroifnull() which returns a 
non-NULL
+# value even when the input is NULL.
+# This tests Parquet + LEFT OUTER JOIN
+SELECT s.id, s.int_col % 2, v.id, zeroifnull(v.id + 1)
+FROM functional_parquet.alltypessmall s LEFT OUTER JOIN (
+    SELECT id, int_col
+    FROM functional_parquet.alltypestiny t) v ON s.id = v.id
+WHERE s.int_col % 2 = zeroifnull(v.id + 1)
+ORDER BY s.id ASC
+LIMIT 10
+---- PLAN
+PLAN-ROOT SINK
+|
+03:TOP-N [LIMIT=10]
+|  order by: id ASC
+|  row-size=12B cardinality=10
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: s.id = id
+|  other predicates: s.int_col % 2 = zeroifnull(id + 1)
+|  row-size=12B cardinality=940
+|
+|--01:SCAN HDFS [functional_parquet.alltypestiny t]
+|     HDFS partitions=4/4 files=4 size=11.92KB
+|     row-size=4B cardinality=758
+|
+00:SCAN HDFS [functional_parquet.alltypessmall s]
+   HDFS partitions=4/4 files=4 size=14.78KB
+   row-size=8B cardinality=940
+====
+# IMPALA-10252: same as above but Kudu + FULL OUTER JOIN
+SELECT s.id, s.int_col % 2, v.id, zeroifnull(v.id + 1)
+FROM functional_kudu.alltypessmall s FULL OUTER JOIN (
+    SELECT id, int_col
+    FROM functional_kudu.alltypestiny t) v ON s.id = v.id
+WHERE s.int_col % 2 = zeroifnull(v.id + 1)
+ORDER BY s.id ASC
+LIMIT 10
+---- PLAN
+PLAN-ROOT SINK
+|
+03:TOP-N [LIMIT=10]
+|  order by: id ASC
+|  row-size=12B cardinality=10
+|
+02:HASH JOIN [FULL OUTER JOIN]
+|  hash predicates: s.id = id
+|  other predicates: s.int_col % 2 = zeroifnull(id + 1)
+|  row-size=12B cardinality=108
+|
+|--01:SCAN KUDU [functional_kudu.alltypestiny t]
+|     row-size=4B cardinality=8
+|
+00:SCAN KUDU [functional_kudu.alltypessmall s]
+   row-size=8B cardinality=100
+====
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test 
b/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
index e99e06e..b8bbba9 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
@@ -386,3 +386,36 @@ where a.int_col = b.int_col and a.int_col = c.smallint_col 
* 2 and c.id < 100
 ---- RESULTS
 26645000
 ====
+---- QUERY
+####################################################
+# Test case 17: IMPALA-10252: non-null-preserving expressions in the WHERE 
clause
+# that are placed at an outer join must be handled correctly. These predicates 
are
+# eligible for runtime filters, but are not safe because the 
non-null-preserving
+# predicate - e.g. zeroifnull() - would require special handling to insert that
+# value - zeroifnull(NULL) in this case - into the runtime filter.
+# In this example, we use alltypestiny.id in the expression because it is 
non-nullable
+# on Kudu tables and allows us to test that special case.
+####################################################
+set RUNTIME_FILTER_WAIT_TIME_MS=$RUNTIME_FILTER_WAIT_TIME_MS;
+set RUNTIME_FILTER_MODE=GLOBAL;
+SELECT s.id, s.int_col % 2, v.id, zeroifnull(v.id + 1)
+FROM alltypessmall s LEFT OUTER JOIN (
+    SELECT id, int_col
+    FROM alltypestiny t) v ON s.id = v.id
+WHERE s.int_col % 2 = zeroifnull(v.id + 1)
+ORDER BY s.id ASC
+LIMIT 10
+---- TYPES
+INT, INT, INT, BIGINT
+---- RESULTS
+8,0,NULL,0
+10,0,NULL,0
+12,0,NULL,0
+14,0,NULL,0
+16,0,NULL,0
+18,0,NULL,0
+20,0,NULL,0
+22,0,NULL,0
+24,0,NULL,0
+25,0,NULL,0
+====
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/subquery.test 
b/testdata/workloads/functional-query/queries/QueryTest/subquery.test
index 40f2ec3..6d3bcc1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/subquery.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/subquery.test
@@ -387,11 +387,9 @@ INT, INT, INT, INT
 ====
 ---- QUERY
 # Correlated aggregate subquery
-# Remove functional database qualification in the query below after 
IMPALA-10252 is
-# addressed.
 select id, int_col, year, month
-from functional.alltypessmall s
-where s.int_col = (select count(*) from functional.alltypestiny t where s.id = 
t.id)
+from alltypessmall s
+where s.int_col = (select count(*) from alltypestiny t where s.id = t.id)
 order by id
 ---- RESULTS
 1,1,2009,1
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index ed956b8..249ee20 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -146,8 +146,6 @@ class TestQueries(ImpalaTestSuite):
     self.run_test_case('QueryTest/inline-view-limit', vector)
 
   def test_subquery(self, vector):
-    if vector.get_value('table_format').file_format == 'parquet':
-     pytest.xfail("IMPALA-10252: Query returns less number of rows.")
     if vector.get_value('table_format').file_format == 'hbase':
         pytest.xfail("Table alltypesagg is populated differently in database "
                 "functional and functional_hbase: there are nulls in column "

Reply via email to