This is an automated email from the ASF dual-hosted git repository.

kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 59d462ad3e0 HIVE-26135: Invalid Anti join conversion may cause missing 
results (#3205) (Zoltan Haindrich reviewed by Krisztian Kasa)
59d462ad3e0 is described below

commit 59d462ad3e023352ffbd57b4f2446e497a421252
Author: Zoltan Haindrich <k...@rxd.hu>
AuthorDate: Tue Apr 26 17:22:49 2022 +0200

    HIVE-26135: Invalid Anti join conversion may cause missing results (#3205) 
(Zoltan Haindrich reviewed by Krisztian Kasa)
---
 .../hive/ql/optimizer/calcite/HiveCalciteUtil.java |  18 +-
 .../calcite/rules/HiveAntiSemiJoinRule.java        |  23 +-
 .../queries/clientpositive/antijoin_conversion.q   |  22 ++
 .../clientpositive/llap/antijoin_conversion.q.out  | 280 +++++++++++++++++++++
 4 files changed, 339 insertions(+), 4 deletions(-)

diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
index d925f159fba..160bfb86f6c 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
@@ -630,7 +630,7 @@ public class HiveCalciteUtil {
                                                               }
                                                             };
 
-  public static ImmutableList<RexNode> getPredsNotPushedAlready(RelNode inp, 
List<RexNode> predsToPushDown) {   
+  public static ImmutableList<RexNode> getPredsNotPushedAlready(RelNode inp, 
List<RexNode> predsToPushDown) {
     return getPredsNotPushedAlready(Sets.<String>newHashSet(), inp, 
predsToPushDown);
   }
 
@@ -1238,6 +1238,22 @@ public class HiveCalciteUtil {
     return false;
   }
 
+  public static boolean hasAllExpressionsFromRightSide(RelNode joinRel, 
List<RexNode> expressions) {
+    List<RelDataTypeField> joinFields = joinRel.getRowType().getFieldList();
+    int nTotalFields = joinFields.size();
+    List<RelDataTypeField> leftFields = 
(joinRel.getInputs().get(0)).getRowType().getFieldList();
+    int nFieldsLeft = leftFields.size();
+    ImmutableBitSet rightBitmap = ImmutableBitSet.range(nFieldsLeft, 
nTotalFields);
+
+    for (RexNode node : expressions) {
+      ImmutableBitSet inputBits = RelOptUtil.InputFinder.bits(node);
+      if (!rightBitmap.contains(inputBits)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   /**
    * Extracts inputs referenced by aggregate operator.
    */
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
index 14a64c3d75c..3697ec2c4aa 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
@@ -20,12 +20,15 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
 import org.apache.calcite.plan.RelOptRule;
 import org.apache.calcite.plan.RelOptRuleCall;
 import org.apache.calcite.plan.RelOptUtil;
+import org.apache.calcite.plan.Strong;
 import org.apache.calcite.rel.RelNode;
 import org.apache.calcite.rel.core.Filter;
 import org.apache.calcite.rel.core.Join;
 import org.apache.calcite.rel.core.JoinRelType;
 import org.apache.calcite.rel.core.Project;
+import org.apache.calcite.rex.RexCall;
 import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexVisitorImpl;
 import org.apache.calcite.sql.SqlKind;
 import org.apache.calcite.sql.fun.SqlStdOperatorTable;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
@@ -36,8 +39,7 @@ import org.slf4j.LoggerFactory;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 /**
  * Planner rule that converts a join plus filter to anti join.
@@ -136,7 +138,8 @@ public class HiveAntiSemiJoinRule extends RelOptRule {
     for (RexNode filterNode : aboveFilters) {
       if (filterNode.getKind() == SqlKind.IS_NULL) {
         // Null filter from right side table can be removed and its a 
pre-condition for anti join conversion.
-        if (HiveCalciteUtil.hasAnyExpressionFromRightSide(join, 
Collections.singletonList(filterNode))) {
+        if (HiveCalciteUtil.hasAllExpressionsFromRightSide(join, 
Collections.singletonList(filterNode))
+            && isStrong(((RexCall) filterNode).getOperands().get(0))) {
           hasNullFilterOnRightSide = true;
         } else {
           filterList.add(filterNode);
@@ -157,4 +160,18 @@ public class HiveAntiSemiJoinRule extends RelOptRule {
     }
     return filterList;
   }
+
+  private boolean isStrong(RexNode rexNode) {
+    AtomicBoolean hasCast = new AtomicBoolean(false);
+    rexNode.accept(new RexVisitorImpl<Void>(true) {
+      @Override
+      public Void visitCall(RexCall call) {
+        if (call.getKind() == SqlKind.CAST) {
+          hasCast.set(true);
+        }
+        return super.visitCall(call);
+      }
+    });
+    return !hasCast.get() && Strong.isStrong(rexNode);
+  }
 }
diff --git a/ql/src/test/queries/clientpositive/antijoin_conversion.q 
b/ql/src/test/queries/clientpositive/antijoin_conversion.q
new file mode 100644
index 00000000000..70cb8b39d5a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/antijoin_conversion.q
@@ -0,0 +1,22 @@
+
+drop table if exists t;
+drop table if exists n;
+
+create table t(a string) stored as orc;
+create table n(a string) stored as orc;
+
+insert into t values ('a'),('1'),('2'),(null);
+insert into n values ('a'),('b'),('1'),('3'),(null);
+
+explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null;
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null;
+select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null;
+
+set hive.auto.convert.anti.join=false;
+explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null;
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null;
+select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null;
+
+
diff --git a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out 
b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
new file mode 100644
index 00000000000..583a06994ae
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
@@ -0,0 +1,280 @@
+PREHOOK: query: drop table if exists t
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists t
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists n
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists n
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table t(a string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t
+POSTHOOK: query: create table t(a string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t
+PREHOOK: query: create table n(a string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@n
+POSTHOOK: query: create table n(a string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@n
+PREHOOK: query: insert into t values ('a'),('1'),('2'),(null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t
+POSTHOOK: query: insert into t values ('a'),('1'),('2'),(null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t
+POSTHOOK: Lineage: t.a SCRIPT []
+PREHOOK: query: insert into n values ('a'),('b'),('1'),('3'),(null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@n
+POSTHOOK: query: insert into n values ('a'),('b'),('1'),('3'),(null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@n
+POSTHOOK: Lineage: n.a SCRIPT []
+PREHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: n
+                  Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: a (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 5 Data size: 425 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      null sort order: z
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 5 Data size: 425 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 3 
+            Map Operator Tree:
+                TableScan
+                  alias: t
+                  filterExpr: a is not null (type: boolean)
+                  Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Filter Operator
+                    predicate: a is not null (type: boolean)
+                    Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: a (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 5 Data size: 680 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Filter Operator
+                  predicate: UDFToFloat(_col1) is null (type: boolean)
+                  Statistics: Num rows: 2 Data size: 340 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: _col0 (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 2 Data size: 170 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 2 Data size: 170 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select n.* from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select n.* from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+3
+a
+b
+NULL
+PREHOOK: query: select assert_true(count(1)=4) from n left outer join t on 
(n.a=t.a) where cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select assert_true(count(1)=4) from n left outer join t on 
(n.a=t.a) where cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+NULL
+PREHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is 
null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: n
+                  Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: a (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 5 Data size: 425 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      null sort order: z
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 5 Data size: 425 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 3 
+            Map Operator Tree:
+                TableScan
+                  alias: t
+                  filterExpr: a is not null (type: boolean)
+                  Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Filter Operator
+                    predicate: a is not null (type: boolean)
+                    Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: a (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 5 Data size: 680 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Filter Operator
+                  predicate: UDFToFloat(_col1) is null (type: boolean)
+                  Statistics: Num rows: 2 Data size: 340 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: _col0 (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 2 Data size: 170 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 2 Data size: 170 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select n.* from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select n.* from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+3
+a
+b
+NULL
+PREHOOK: query: select assert_true(count(1)=4) from n left outer join t on 
(n.a=t.a) where cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select assert_true(count(1)=4) from n left outer join t on 
(n.a=t.a) where cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+NULL

Reply via email to