This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 59d462ad3e0 HIVE-26135: Invalid Anti join conversion may cause missing results (#3205) (Zoltan Haindrich reviewed by Krisztian Kasa) 59d462ad3e0 is described below commit 59d462ad3e023352ffbd57b4f2446e497a421252 Author: Zoltan Haindrich <k...@rxd.hu> AuthorDate: Tue Apr 26 17:22:49 2022 +0200 HIVE-26135: Invalid Anti join conversion may cause missing results (#3205) (Zoltan Haindrich reviewed by Krisztian Kasa) --- .../hive/ql/optimizer/calcite/HiveCalciteUtil.java | 18 +- .../calcite/rules/HiveAntiSemiJoinRule.java | 23 +- .../queries/clientpositive/antijoin_conversion.q | 22 ++ .../clientpositive/llap/antijoin_conversion.q.out | 280 +++++++++++++++++++++ 4 files changed, 339 insertions(+), 4 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java index d925f159fba..160bfb86f6c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java @@ -630,7 +630,7 @@ public class HiveCalciteUtil { } }; - public static ImmutableList<RexNode> getPredsNotPushedAlready(RelNode inp, List<RexNode> predsToPushDown) { + public static ImmutableList<RexNode> getPredsNotPushedAlready(RelNode inp, List<RexNode> predsToPushDown) { return getPredsNotPushedAlready(Sets.<String>newHashSet(), inp, predsToPushDown); } @@ -1238,6 +1238,22 @@ public class HiveCalciteUtil { return false; } + public static boolean hasAllExpressionsFromRightSide(RelNode joinRel, List<RexNode> expressions) { + List<RelDataTypeField> joinFields = joinRel.getRowType().getFieldList(); + int nTotalFields = joinFields.size(); + List<RelDataTypeField> leftFields = (joinRel.getInputs().get(0)).getRowType().getFieldList(); + int nFieldsLeft = leftFields.size(); + ImmutableBitSet rightBitmap = ImmutableBitSet.range(nFieldsLeft, nTotalFields); + + for (RexNode node : expressions) { + ImmutableBitSet inputBits = RelOptUtil.InputFinder.bits(node); + if (!rightBitmap.contains(inputBits)) { + return false; + } + } + return true; + } + /** * Extracts inputs referenced by aggregate operator. */ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java index 14a64c3d75c..3697ec2c4aa 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java @@ -20,12 +20,15 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.rules; import org.apache.calcite.plan.RelOptRule; import org.apache.calcite.plan.RelOptRuleCall; import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.plan.Strong; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Filter; import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexVisitorImpl; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; @@ -36,8 +39,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; +import java.util.concurrent.atomic.AtomicBoolean; /** * Planner rule that converts a join plus filter to anti join. @@ -136,7 +138,8 @@ public class HiveAntiSemiJoinRule extends RelOptRule { for (RexNode filterNode : aboveFilters) { if (filterNode.getKind() == SqlKind.IS_NULL) { // Null filter from right side table can be removed and its a pre-condition for anti join conversion. - if (HiveCalciteUtil.hasAnyExpressionFromRightSide(join, Collections.singletonList(filterNode))) { + if (HiveCalciteUtil.hasAllExpressionsFromRightSide(join, Collections.singletonList(filterNode)) + && isStrong(((RexCall) filterNode).getOperands().get(0))) { hasNullFilterOnRightSide = true; } else { filterList.add(filterNode); @@ -157,4 +160,18 @@ public class HiveAntiSemiJoinRule extends RelOptRule { } return filterList; } + + private boolean isStrong(RexNode rexNode) { + AtomicBoolean hasCast = new AtomicBoolean(false); + rexNode.accept(new RexVisitorImpl<Void>(true) { + @Override + public Void visitCall(RexCall call) { + if (call.getKind() == SqlKind.CAST) { + hasCast.set(true); + } + return super.visitCall(call); + } + }); + return !hasCast.get() && Strong.isStrong(rexNode); + } } diff --git a/ql/src/test/queries/clientpositive/antijoin_conversion.q b/ql/src/test/queries/clientpositive/antijoin_conversion.q new file mode 100644 index 00000000000..70cb8b39d5a --- /dev/null +++ b/ql/src/test/queries/clientpositive/antijoin_conversion.q @@ -0,0 +1,22 @@ + +drop table if exists t; +drop table if exists n; + +create table t(a string) stored as orc; +create table n(a string) stored as orc; + +insert into t values ('a'),('1'),('2'),(null); +insert into n values ('a'),('b'),('1'),('3'),(null); + +explain +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; +select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; + +set hive.auto.convert.anti.join=false; +explain +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; +select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; + + diff --git a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out new file mode 100644 index 00000000000..583a06994ae --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out @@ -0,0 +1,280 @@ +PREHOOK: query: drop table if exists t +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists t +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists n +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists n +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table t(a string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t +POSTHOOK: query: create table t(a string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t +PREHOOK: query: create table n(a string) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@n +POSTHOOK: query: create table n(a string) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@n +PREHOOK: query: insert into t values ('a'),('1'),('2'),(null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values ('a'),('1'),('2'),(null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.a SCRIPT [] +PREHOOK: query: insert into n values ('a'),('b'),('1'),('3'),(null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@n +POSTHOOK: query: insert into n values ('a'),('b'),('1'),('3'),(null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@n +POSTHOOK: Lineage: n.a SCRIPT [] +PREHOOK: query: explain +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +PREHOOK: type: QUERY +PREHOOK: Input: default@n +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: explain +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: n + Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: t + filterExpr: a is not null (type: boolean) + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 680 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: UDFToFloat(_col1) is null (type: boolean) + Statistics: Num rows: 2 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +PREHOOK: type: QUERY +PREHOOK: Input: default@n +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +POSTHOOK: Input: default@t +#### A masked pattern was here #### +3 +a +b +NULL +PREHOOK: query: select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +PREHOOK: type: QUERY +PREHOOK: Input: default@n +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +POSTHOOK: Input: default@t +#### A masked pattern was here #### +NULL +PREHOOK: query: explain +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +PREHOOK: type: QUERY +PREHOOK: Input: default@n +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: explain +select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: n + Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: t + filterExpr: a is not null (type: boolean) + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: a is not null (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 680 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: UDFToFloat(_col1) is null (type: boolean) + Statistics: Num rows: 2 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +PREHOOK: type: QUERY +PREHOOK: Input: default@n +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +POSTHOOK: Input: default@t +#### A masked pattern was here #### +3 +a +b +NULL +PREHOOK: query: select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +PREHOOK: type: QUERY +PREHOOK: Input: default@n +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@n +POSTHOOK: Input: default@t +#### A masked pattern was here #### +NULL