[
https://issues.apache.org/jira/browse/HIVE-29688?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18092673#comment-18092673
]
Stamatis Zampetakis commented on HIVE-29688:
--------------------------------------------
Although the exception appears in {{ASTConverter}} the problem occurs much
earlier in the decorrelation phase. The {{HiveRelDecorrelator}} fails to remove
the correlations from the query and leaves {{LogicalCorrelate}} operators in
the plan. In various places, the compiler assumes that {{LogicalCorrelate}}
operators are fully removed so in cases that this assumption is violated
different errors may arise.
The following query using the part table from TPC-H dataset fails with a
slightly different error:
{code:sql}
SELECT x.p_partkey
FROM part x
WHERE EXISTS (SELECT 1
FROM part y
WHERE x.p_name = y.p_name
AND y.p_brand IN (SELECT 'Brand#32'));
{code}
{noformat}
java.lang.AssertionError
at
org.apache.hadoop.hive.ql.parse.type.TypeCheckProcFactory$DefaultExprProcessor.getXpathOrFuncExprNodeDesc(TypeCheckProcFactory.java:881)
at
org.apache.hadoop.hive.ql.parse.type.TypeCheckProcFactory$DefaultExprProcessor.process(TypeCheckProcFactory.java:1481)
at
org.apache.hadoop.hive.ql.lib.CostLessRuleDispatcher.dispatch(CostLessRuleDispatcher.java:66)
at
org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.dispatchAndReturn(DefaultGraphWalker.java:105)
at
org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.dispatch(DefaultGraphWalker.java:89)
at
org.apache.hadoop.hive.ql.lib.ExpressionWalker.walk(ExpressionWalker.java:101)
at
org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.startWalking(DefaultGraphWalker.java:120)
at
org.apache.hadoop.hive.ql.parse.type.TypeCheckProcFactory.genExprNode(TypeCheckProcFactory.java:231)
at
org.apache.hadoop.hive.ql.parse.type.ExprNodeTypeCheck.genExprNode(ExprNodeTypeCheck.java:49)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genAllExprNodeDesc(SemanticAnalyzer.java:13659)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(SemanticAnalyzer.java:13614)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(SemanticAnalyzer.java:13582)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genFilterPlan(SemanticAnalyzer.java:3793)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genFilterPlan(SemanticAnalyzer.java:3773)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(SemanticAnalyzer.java:11508)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12495)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12348)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12379)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12348)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12379)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:12361)
at
org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:638)
at
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:13230)
at
org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:476)
at
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:359)
at
org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:187)
at
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:359)
at org.apache.hadoop.hive.ql.Compiler.analyze(Compiler.java:224)
at org.apache.hadoop.hive.ql.Compiler.compile(Compiler.java:109)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:499)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:451)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:415)
at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:409)
at
org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:126)
at
org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:234)
at
org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:259)
at org.apache.hadoop.hive.cli.CliDriver.processCmd1(CliDriver.java:203)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:129)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:430)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:358)
{noformat}
The underlying cause is the same; the decorrelator cannot remove all
correlations from the plan leading to an invalid state. The plan before and
after decorrelation is shown below.
h3. Before decorrelation
{noformat}
2026-06-30T07:02:18,465 DEBUG [687b2cb5-a5af-4b03-907a-bf8a56026dd8 main]
parse.CalcitePlanner: Plan after removing subquery:
HiveProject(p_partkey=[$0])
LogicalCorrelate(correlation=[$cor1], joinType=[semi], requiredColumns=[{1}])
HiveTableScan(table=[[default, part]], table:alias=[x])
HiveProject(literalTrue=[true])
HiveFilter(condition=[=($cor1.p_name, $1)])
HiveSemiJoin(condition=[=($3, $13)], joinType=[semi])
HiveTableScan(table=[[default, part]], table:alias=[y])
HiveAggregate(group=[{0}])
HiveProject(_o__c0=[_UTF-16LE'Brand#32':VARCHAR(2147483647)
CHARACTER SET "UTF-16LE"])
HiveTableScan(table=[[_dummy_database, _dummy_table]],
table:alias=[_dummy_table])
{noformat}
h3. After decorrelation
{noformat}
2026-06-30T07:02:18,480 DEBUG [687b2cb5-a5af-4b03-907a-bf8a56026dd8 main]
parse.CalcitePlanner: Plan after decorrelation:
HiveProject(p_partkey=[$0])
LogicalCorrelate(correlation=[$cor1], joinType=[semi], requiredColumns=[{1}])
HiveTableScan(table=[[default, part]], table:alias=[x])
HiveProject(literalTrue=[true])
HiveSemiJoin(condition=[=($3, $13)], joinType=[semi])
HiveFilter(condition=[=($cor1.p_name, $1)])
HiveTableScan(table=[[default, part]], table:alias=[y])
HiveAggregate(group=[{0}])
HiveProject(_o__c0=[_UTF-16LE'Brand#32':VARCHAR(2147483647) CHARACTER
SET "UTF-16LE"])
HiveTableScan(table=[[_dummy_database, _dummy_table]],
table:alias=[_dummy_table])
{noformat}
Observe that even though the decorrelator run the {{LogicalCorrelate}} was not
removed because the {{HiveRelDecorrelator}} cannot handle correlated semijoins.
The {{=($cor1.p_name, $1)}} condition below the {{HiveSemiJoin}} is essentially
blocking the decorrelation.
Note that before starting the decorrelation phase the correlated condition is
*above* the semijoin so the subtree rooted at {{HiveSemiJoin}} is not
correlated. During a [pre-processing phase of the
decorrelation|https://github.com/apache/hive/blame/fc877ff2130cad5d8db364ffa129e3e717674449/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRelDecorrelator.java#L241]
the {{HiveFilterJoinRule}} is triggered and pushes the correlated condition
below the semijoin and makes the whole subtree impossible to decorrelate.
The correlation pushdown goes against the usual decorrelation principle where
correlations are pulled upwards till they are eliminated completely and
shouldn’t happen in the first place. The proposal in PR#6572 modifies the
{{HiveFilterJoinRule}} to prevent the pushdown of correlated conditions and
avoids the decorrelation dead-end. This is the same fix applied by CALCITE-7319
in the baseline rule for a slightly different use-case.
> IndexOutOfBoundsException when WHERE clause contains IN/EXISTS subqueries AND
> correlated conditions
> ---------------------------------------------------------------------------------------------------
>
> Key: HIVE-29688
> URL: https://issues.apache.org/jira/browse/HIVE-29688
> Project: Hive
> Issue Type: Bug
> Reporter: Thomas Rebele
> Assignee: Thomas Rebele
> Priority: Major
> Labels: pull-request-available
>
> The following q file test fails with an exception:
> {code:java}
> drop table if exists `table1`;
> CREATE EXTERNAL TABLE `table1`(
> `f1` string,
> `f2` string,
> `f3` string,
> `f4` string,
> `f5` string,
> `f6` string)
> ;
> SELECT 1
> FROM table1 a
> WHERE a.f4 IN ('1', '2')
> AND EXISTS (
> SELECT 1
> FROM table1 b
> WHERE a.f6 = b.f1 AND b.f3 IN (SELECT 1)
> )
> ;
> {code}
> Steps to reproduce:
> {code}
> mvn clean install -DskipTests -Denforcer.skip=true -T 1C
> mvn test -pl ql,itests/qtest,itests/test-serde,itests/util -Pitests
> -Dtest=TestMiniLlapLocalCliDriver -Dqfile=decorrelate-semi-join.q
> -Dtest.output.overwrite=true -Denforcer.skip=true
> {code}
> The exception was:
> {code:java}
> java.lang.IndexOutOfBoundsException: Index 3 out of bounds for length 1
> at
> java.base/jdk.internal.util.Preconditions.outOfBounds(Preconditions.java:100)
> at
> java.base/jdk.internal.util.Preconditions.outOfBoundsCheckIndex(Preconditions.java:106)
> at
> java.base/jdk.internal.util.Preconditions.checkIndex(Preconditions.java:302)
> at java.base/java.util.Objects.checkIndex(Objects.java:385)
> at java.base/java.util.ArrayList.get(ArrayList.java:427)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:853)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:808)
> at org.apache.calcite.rex.RexInputRef.accept(RexInputRef.java:113)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:1107)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:808)
> at org.apache.calcite.rex.RexCall.accept(RexCall.java:189)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:283)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:136)
> at
> org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:605)
> at
> org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:13230)
> at
> org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:476)
> at
> org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:359)
> at org.apache.hadoop.hive.ql.Compiler.analyze(Compiler.java:224)
> at org.apache.hadoop.hive.ql.Compiler.compile(Compiler.java:109)
> at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:499)
> at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:451)
> at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:415)
> at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:409)
> at
> org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:126)
> at
> org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:234)
> at
> org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:259)
> at org.apache.hadoop.hive.cli.CliDriver.processCmd1(CliDriver.java:203)
> at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:129)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:430)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:358)
> at
> org.apache.hadoop.hive.ql.QTestUtil.executeClientInternal(QTestUtil.java:790)
> at org.apache.hadoop.hive.ql.QTestUtil.executeClient(QTestUtil.java:760)
> at
> org.apache.hadoop.hive.cli.control.CoreCliDriver.runTest(CoreCliDriver.java:115)
> at
> org.apache.hadoop.hive.cli.control.CliAdapter.runTest(CliAdapter.java:139)
> at
> org.apache.hadoop.hive.cli.TestMiniLlapLocalCliDriver.testCliDriver(TestMiniLlapLocalCliDriver.java:62)
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)