This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new f9a969c87ba HIVE-29175: Wrong Anti join conversion leading to missing
results (#6055)
f9a969c87ba is described below
commit f9a969c87bac441cba07143f457e8a46c8dbe56e
Author: Dayakar M <[email protected]>
AuthorDate: Mon Sep 8 12:20:15 2025 +0530
HIVE-29175: Wrong Anti join conversion leading to missing results (#6055)
---
.../hive/ql/optimizer/calcite/HiveCalciteUtil.java | 22 ++++
.../calcite/rules/HiveAntiSemiJoinRule.java | 5 +
.../queries/clientpositive/antijoin_conversion.q | 29 ++++++
.../clientpositive/llap/antijoin_conversion.q.out | 112 +++++++++++++++++++++
4 files changed, 168 insertions(+)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
index a19a6c74ec3..a7305326c00 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
@@ -1272,6 +1272,28 @@ public static boolean
hasAllExpressionsFromRightSide(RelNode joinRel, List<RexNo
return true;
}
+ /**
+ * Checks the operands in the join conditions are only from left side.
+ *
+ * @param joinRel Join node
+ * @return true if the join condition operands are from right and left side,
false otherwise.
+ */
+ public static boolean checkIfJoinConditionOnlyUsesLeftOperands(Join joinRel)
{
+ RexNode condition = joinRel.getCondition();
+ RelNode leftRel = joinRel.getLeft();
+ int leftFieldCount = leftRel.getRowType().getFieldCount();
+ ImmutableBitSet leftBitmap = ImmutableBitSet.range(leftFieldCount);
+ List<RexNode> conditions = RelOptUtil.conjunctions(condition);
+ for (RexNode cond : conditions) {
+ ImmutableBitSet condBitmap = RelOptUtil.InputFinder.bits(cond);
+ // here condition becomes true if both the operands are from left table
+ if (leftBitmap.contains(condBitmap)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
/**
* Extracts inputs referenced by aggregate operator.
*/
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
index 3697ec2c4aa..3c4c2ab693c 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
@@ -97,6 +97,11 @@ protected void perform(RelOptRuleCall call, Project project,
Filter filter, Join
return;
}
+ // if one of the operand from join condition is not from right table then
no need to convert to anti join.
+ if (HiveCalciteUtil.checkIfJoinConditionOnlyUsesLeftOperands(join)) {
+ return;
+ }
+
LOG.debug("Matched HiveAntiJoinRule");
// Build anti join with same left, right child and condition as original
left outer join.
diff --git a/ql/src/test/queries/clientpositive/antijoin_conversion.q
b/ql/src/test/queries/clientpositive/antijoin_conversion.q
index 70cb8b39d5a..71e5a097aed 100644
--- a/ql/src/test/queries/clientpositive/antijoin_conversion.q
+++ b/ql/src/test/queries/clientpositive/antijoin_conversion.q
@@ -20,3 +20,32 @@ select n.* from n left outer join t on (n.a=t.a) where
cast(t.a as float) is nul
select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null;
+create table tab1 (col1 int, col2 int);
+create table tab2 (col1 int, col2 int);
+
+insert into tab1 values (123, 1000), (456, 1000), (123, 5000);
+
+insert into tab2 values (123, 1000), (456, 1000), (788, 1000);
+
+select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null;
+
+set hive.auto.convert.anti.join=true;
+
+explain CBO select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null;
+
+select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null;
diff --git a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
index 6e2cae9f0cf..c25c8427a27 100644
--- a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
+++ b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
@@ -282,3 +282,115 @@ POSTHOOK: Input: default@n
POSTHOOK: Input: default@t
#### A masked pattern was here ####
NULL
+PREHOOK: query: create table tab1 (col1 int, col2 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tab1
+POSTHOOK: query: create table tab1 (col1 int, col2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tab1
+PREHOOK: query: create table tab2 (col1 int, col2 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tab2
+POSTHOOK: query: create table tab2 (col1 int, col2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tab2
+PREHOOK: query: insert into tab1 values (123, 1000), (456, 1000), (123, 5000)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tab1
+POSTHOOK: query: insert into tab1 values (123, 1000), (456, 1000), (123, 5000)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tab1
+POSTHOOK: Lineage: tab1.col1 SCRIPT []
+POSTHOOK: Lineage: tab1.col2 SCRIPT []
+PREHOOK: query: insert into tab2 values (123, 1000), (456, 1000), (788, 1000)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tab2
+POSTHOOK: query: insert into tab2 values (123, 1000), (456, 1000), (788, 1000)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tab2
+POSTHOOK: Lineage: tab2.col1 SCRIPT []
+POSTHOOK: Lineage: tab2.col2 SCRIPT []
+PREHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab1
+PREHOOK: Input: default@tab2
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab1
+POSTHOOK: Input: default@tab2
+#### A masked pattern was here ####
+123 5000
+PREHOOK: query: explain CBO select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab1
+PREHOOK: Input: default@tab2
+#### A masked pattern was here ####
+POSTHOOK: query: explain CBO select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab1
+POSTHOOK: Input: default@tab2
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(col1=[$0], col2=[$1])
+ HiveFilter(condition=[IS NULL($4)])
+ HiveJoin(condition=[AND(=($4, $0), =($3, $1))], joinType=[left],
algorithm=[none], cost=[not available])
+ HiveJoin(condition=[AND(=($2, $0), =($3, $1))], joinType=[left],
algorithm=[none], cost=[not available])
+ HiveProject(col1=[$0], col2=[$1])
+ HiveTableScan(table=[[default, tab1]], table:alias=[t1])
+ HiveProject(col1=[$0], col2=[$1])
+ HiveFilter(condition=[AND(IS NOT NULL($0), IS NOT NULL($1))])
+ HiveTableScan(table=[[default, tab2]], table:alias=[t2])
+ HiveProject(col1=[$0])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, tab1]], table:alias=[t3])
+
+PREHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab1
+PREHOOK: Input: default@tab2
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab1
+POSTHOOK: Input: default@tab2
+#### A masked pattern was here ####
+123 5000