This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push: new 4ae847b IMPALA-10382: fix invalid outer join simplification 4ae847b is described below commit 4ae847bf94e0a1e07860c9c7aa3f0dfcdf548fac Author: xqhe <hexianqing...@126.com> AuthorDate: Tue Dec 8 16:46:13 2020 +0800 IMPALA-10382: fix invalid outer join simplification When set ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = true, the planner will simplify outer joins if the predicate with case expr or conditional function on both sides of outer join. However, the predicate maybe not null-rejecting, if simplify the outer join, the result is incorrect. E.g. t1.b > coalesce(t1.c, t2.c) can return true if t2.c is null, so it is not null-rejecting predicate for t2. The fix is simply to support the case that the predicate with two operands and the operator is one of (=, !=, >, <, >=, <=), 1. one of the operands or 2. if the operand is arithmetic expression and one of the children does not contain conditional builtin function or case expr and has tuple id in outer joined tuples. E.g. t1.b > coalesce(t2.c, t1.c) or t1.b + coalesce(t2.c, t1.c) > coalesce(t2.c, t1.c) is null-rejecting predicate for t1. Testing: * Add new plan tests in outer-to-inner-joins.test * Add new query tests to verify the correctness on transformation Change-Id: I84a3812f4212fa823f3d1ced6e12f2df05aedb2b Reviewed-on: http://gerrit.cloudera.org:8080/16845 Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> --- .../java/org/apache/impala/analysis/Analyzer.java | 51 +++++-- .../queries/PlannerTest/outer-to-inner-joins.test | 156 +++++++++++++++++++++ .../queries/QueryTest/outer-to-inner-joins.test | 27 ++++ 3 files changed, 223 insertions(+), 11 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index 8a73e07..a4633c8 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -59,6 +59,7 @@ import org.apache.impala.catalog.TableLoadingException; import org.apache.impala.catalog.Type; import org.apache.impala.catalog.local.LocalKuduTable; import org.apache.impala.common.AnalysisException; +import org.apache.impala.common.Id; import org.apache.impala.common.IdGenerator; import org.apache.impala.common.ImpalaException; import org.apache.impala.common.InternalException; @@ -3326,25 +3327,53 @@ public class Analyzer { } // Simply assume that a conjunct contains a UDF, is distinct from/ is not distinct - // from operator or nondeterministic buitin functions, it is not null-rejecting - // predicate. + // from operator, nondeterministic buitin functions or is null operator, it is not + // null-rejecting predicate. if (e.contains(Predicates.or(Expr.IS_DISTINCT_FROM_OR_NOT_DISTINCT_PREDICATE, Expr.IS_NONDETERMINISTIC_BUILTIN_FN_PREDICATE, - Expr.IS_UDF_PREDICATE))) { + Expr.IS_UDF_PREDICATE, Expr.IS_IS_NULL_PREDICATE))) { return true; } - // For conditional function, is null expr or case expr, if the tuple id of the expr - // is a subset of 'tupleIds', the result may have null value + // Predicate contains conditional function, case expr may not null-rejecting. List<Expr> maybeNullableExprs = new ArrayList<>(); e.collectAll(Predicates.or(Expr.IS_CONDITIONAL_BUILTIN_FN_PREDICATE, - Expr.IS_IS_NULL_PREDICATE, Expr.IS_CASE_EXPR_PREDICATE), maybeNullableExprs); - for (Expr expr : maybeNullableExprs) { - List<TupleId> tids = new ArrayList<>(); - expr.getIds(tids, null); - if (tupleIds.containsAll(tids)) { - return true; + Expr.IS_CASE_EXPR_PREDICATE), maybeNullableExprs); + if (!maybeNullableExprs.isEmpty()) { + if (!Expr.IS_BINARY_PREDICATE.apply(e)) return true; + // For t1 left join t2 on t1.a = t2.a where t2.b > coalesce(t1.c, t2.c) can + // simplify to an inner join. Simply support the case that one child does not + // contain conditional builtin function or case expr and has tuple id in outer + // joined tuples. + for (Expr operand : e.getChildren()) { + if (operand instanceof ArithmeticExpr) { + // 't1.id + coalesce(t1.c, t2.c) > coalesce(t2.c, t1.c)' is null-rejecting + // predicate for t1 + for (Expr expr : operand.getChildren()) { + if (noConditionalBuiltinFnOrCaseExpr(expr, tupleIds)) return false; + } + } else { + if (noConditionalBuiltinFnOrCaseExpr(operand, tupleIds)) return false; + } } + return true; + } + return false; + } + + /** + * If the 'e' does not contain conditional builtin function or case expr and has + * tupleId in 'tupleIds', return true, return false otherwise. + */ + private boolean noConditionalBuiltinFnOrCaseExpr(Expr e, List<TupleId> tupleIds) { + List<Expr> nullableExprs = new ArrayList<>(); + e.collectAll(Predicates.or(Expr.IS_CONDITIONAL_BUILTIN_FN_PREDICATE, + Expr.IS_CASE_EXPR_PREDICATE), nullableExprs); + if (!nullableExprs.isEmpty()) return false; + List<TupleId> tids = new ArrayList<>(); + e.getIds(tids, null); + if (TupleId.intersect(tupleIds, new HashSet<>(tids))) { + return true; } return false; } diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test index a90b925..a1714e9 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test @@ -866,4 +866,160 @@ PLAN-ROOT SINK predicates: ws.ws_net_paid > 0, ws.ws_net_profit > 1, ws.ws_quantity > 0 runtime filters: RF000 -> ws.ws_item_sk, RF001 -> ws.ws_order_number, RF004 -> ws_sold_date_sk row-size=32B cardinality=71.94K +==== +# IMPALA-10382: outer join should not be simplified if the predicate with case expr or +# conditional builtin functions on both sides of outer join is not NULL filtering. +# t1.tinyint_col >= coalesce(t1.int_col, t2.int_col) is not null-rejecting predicate for t2 +SELECT t1.int_col +FROM functional.alltypestiny t1 +LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col + LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col +WHERE t1.tinyint_col >= coalesce(t1.int_col, t2.int_col); +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t1.int_col = t3.int_col +| row-size=14B cardinality=7.14M +| +|--02:SCAN HDFS [functional.alltypes t3] +| HDFS partitions=24/24 files=24 size=478.45KB +| row-size=4B cardinality=7.30K +| +03:HASH JOIN [RIGHT OUTER JOIN] +| hash predicates: t2.tinyint_col = t1.tinyint_col +| other predicates: t1.tinyint_col >= coalesce(t1.int_col, t2.int_col) +| runtime filters: RF000 <- t1.tinyint_col +| row-size=10B cardinality=9.78K +| +|--00:SCAN HDFS [functional.alltypestiny t1] +| HDFS partitions=4/4 files=4 size=460B +| row-size=5B cardinality=8 +| +01:SCAN HDFS [functional.alltypesagg t2] + HDFS partitions=11/11 files=11 size=814.73KB + runtime filters: RF000 -> t2.tinyint_col + row-size=5B cardinality=11.00K +==== +# IMPALA-10382: outer join should not be simplified if the predicate with case expr or +# conditional builtin functions on both sides of outer join is not NULL filtering. +# t2.tinyint_col >= coalesce(t1.int_col, t2.int_col) is null-rejecting predicate for t2 +SELECT t1.int_col +FROM functional.alltypestiny t1 +LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col + LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col +WHERE t2.tinyint_col >= coalesce(t1.int_col, t2.int_col); +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t1.int_col = t3.int_col +| row-size=14B cardinality=7.14M +| +|--02:SCAN HDFS [functional.alltypes t3] +| HDFS partitions=24/24 files=24 size=478.45KB +| row-size=4B cardinality=7.30K +| +03:HASH JOIN [INNER JOIN] +| hash predicates: t2.tinyint_col = t1.tinyint_col +| other predicates: t2.tinyint_col >= coalesce(t1.int_col, t2.int_col) +| runtime filters: RF000 <- t1.tinyint_col +| row-size=10B cardinality=9.78K +| +|--00:SCAN HDFS [functional.alltypestiny t1] +| HDFS partitions=4/4 files=4 size=460B +| row-size=5B cardinality=8 +| +01:SCAN HDFS [functional.alltypesagg t2] + HDFS partitions=11/11 files=11 size=814.73KB + runtime filters: RF000 -> t2.tinyint_col + row-size=5B cardinality=11.00K +==== +# IMPALA-10382: outer join should not be simplified if the predicate with case expr or +# conditional builtin functions on both sides of outer join is not NULL filtering. +# The predicate is only null-rejecting for t2, so convert t1 full join t2 to t1 right join t2 +SELECT t1.int_col +FROM functional.alltypestiny t1 + FULL JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col +WHERE t2.tinyint_col + CASE + WHEN t1.int_col IS NOT NULL THEN t1.int_col + ELSE t2.int_col +END >= CASE + WHEN t1.int_col IS NOT NULL THEN t1.int_col + ELSE t2.int_col +END +---- PLAN +PLAN-ROOT SINK +| +02:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t2.tinyint_col = t1.tinyint_col +| other predicates: t2.tinyint_col + CASE WHEN t1.int_col IS NOT NULL THEN t1.int_col ELSE t2.int_col END >= CASE WHEN t1.int_col IS NOT NULL THEN t1.int_col ELSE t2.int_col END +| row-size=10B cardinality=11.00K +| +|--00:SCAN HDFS [functional.alltypestiny t1] +| HDFS partitions=4/4 files=4 size=460B +| row-size=5B cardinality=8 +| +01:SCAN HDFS [functional.alltypesagg t2] + HDFS partitions=11/11 files=11 size=814.73KB + row-size=5B cardinality=11.00K +==== +# IMPALA-10382: outer join should not be simplified if the predicate with case expr or +# conditional builtin functions on both sides of outer join is not NULL filtering. +SELECT t1.int_col +FROM functional.alltypestiny t1 +LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col + LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col +WHERE t2.tinyint_col + coalesce(t2.int_col, t1.int_col) >= coalesce(t1.int_col, t2.int_col); +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: t1.int_col = t3.int_col +| row-size=14B cardinality=7.14M +| +|--02:SCAN HDFS [functional.alltypes t3] +| HDFS partitions=24/24 files=24 size=478.45KB +| row-size=4B cardinality=7.30K +| +03:HASH JOIN [INNER JOIN] +| hash predicates: t2.tinyint_col = t1.tinyint_col +| other predicates: t2.tinyint_col + coalesce(t2.int_col, t1.int_col) >= coalesce(t1.int_col, t2.int_col) +| runtime filters: RF000 <- t1.tinyint_col +| row-size=10B cardinality=9.78K +| +|--00:SCAN HDFS [functional.alltypestiny t1] +| HDFS partitions=4/4 files=4 size=460B +| row-size=5B cardinality=8 +| +01:SCAN HDFS [functional.alltypesagg t2] + HDFS partitions=11/11 files=11 size=814.73KB + runtime filters: RF000 -> t2.tinyint_col + row-size=5B cardinality=11.00K +==== +# IMPALA-10382: outer join should not be simplified if the predicate with case expr or +# conditional builtin functions on both sides of outer join is not NULL filtering. +# t2.tinyint_col + t1.int_col + coalesce(t2.int_col, t1.int_col) >= coalesce(t1.int_col, t2.int_col) +# is null-rejecting for t1 and t2 +SELECT t1.int_col +FROM functional.alltypestiny t1 + FULL JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col +WHERE t2.tinyint_col + t1.int_col + coalesce(t2.int_col, t1.int_col) >= coalesce(t1.int_col, t2.int_col); +---- PLAN +PLAN-ROOT SINK +| +02:HASH JOIN [INNER JOIN] +| hash predicates: t2.tinyint_col = t1.tinyint_col +| other predicates: t2.tinyint_col + t1.int_col + coalesce(t2.int_col, t1.int_col) >= coalesce(t1.int_col, t2.int_col) +| runtime filters: RF000 <- t1.tinyint_col +| row-size=10B cardinality=9.78K +| +|--00:SCAN HDFS [functional.alltypestiny t1] +| HDFS partitions=4/4 files=4 size=460B +| row-size=5B cardinality=8 +| +01:SCAN HDFS [functional.alltypesagg t2] + HDFS partitions=11/11 files=11 size=814.73KB + runtime filters: RF000 -> t2.tinyint_col + row-size=5B cardinality=11.00K ==== \ No newline at end of file diff --git a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test index 0516721..f7d2933 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test +++ b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test @@ -255,4 +255,31 @@ where t1.id = t2.pos and t2.item = 2 2 ---- TYPES bigint +==== +---- QUERY +SELECT count(*) +FROM functional.alltypestiny t1 +LEFT JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col + LEFT JOIN functional.alltypes t3 ON t1.int_col = t3.int_col +WHERE t1.tinyint_col >= coalesce(t1.int_col, t2.int_col) +---- RESULTS +2922920 +---- TYPES +bigint +==== +---- QUERY +SELECT count(*) +FROM functional.alltypestiny t1 + FULL JOIN functional.alltypesagg t2 ON t1.tinyint_col = t2.tinyint_col +WHERE t2.tinyint_col + CASE + WHEN t1.int_col IS NOT NULL THEN t1.int_col + ELSE t2.int_col +END >= CASE + WHEN t1.int_col IS NOT NULL THEN t1.int_col + ELSE t2.int_col +END +---- RESULTS +12000 +---- TYPES +bigint ==== \ No newline at end of file