[ https://issues.apache.org/jira/browse/SPARK-24341?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Juliusz Sompolski updated SPARK-24341: -------------------------------------- Description: Ran on master: {code} drop table if exists juleka; drop table if exists julekb; create table juleka (a integer, b integer); create table julekb (na integer, nb integer); insert into juleka values (1,1); insert into julekb values (1,1); select * from juleka where (a, b) not in (select (na, nb) from julekb); {code} Results in: {code} java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: Cannot compare types "int" and "org.apache.spark.sql.catalyst.InternalRow" at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:299) at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:286) at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) at com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135) at com.google.common.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2344) at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2316) at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2278) at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2193) at com.google.common.cache.LocalCache.get(LocalCache.java:3932) at com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3936) at com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4806) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.compile(CodeGenerator.scala:1415) at org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate$.create(GeneratePredicate.scala:92) at org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate$.generate(GeneratePredicate.scala:46) at 
org.apache.spark.sql.execution.SparkPlan.newPredicate(SparkPlan.scala:380) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.org$apache$spark$sql$execution$joins$BroadcastNestedLoopJoinExec$$boundCondition$lzycompute(BroadcastNestedLoopJoinExec.scala:99) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.org$apache$spark$sql$execution$joins$BroadcastNestedLoopJoinExec$$boundCondition(BroadcastNestedLoopJoinExec.scala:97) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2$$anonfun$apply$3.apply(BroadcastNestedLoopJoinExec.scala:203) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2$$anonfun$apply$3.apply(BroadcastNestedLoopJoinExec.scala:203) at scala.collection.IndexedSeqOptimized$class.prefixLengthImpl(IndexedSeqOptimized.scala:38) at scala.collection.IndexedSeqOptimized$class.exists(IndexedSeqOptimized.scala:46) at scala.collection.mutable.ArrayOps$ofRef.exists(ArrayOps.scala:186) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2.apply(BroadcastNestedLoopJoinExec.scala:203) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2.apply(BroadcastNestedLoopJoinExec.scala:202) at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:463) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389) at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:49) at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:126) at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:125) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:111) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:349) at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: Cannot compare types "int" and "org.apache.spark.sql.catalyst.InternalRow" at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:1466) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1531) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1528) at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3522) at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2315) ... 
30 more {code} Looks like an invalid expression is introduced in RewritePredicateSubquery: {code} === Applying Rule org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery === !Filter NOT named_struct(a, a#83, b, b#84) IN (list#74 []) 'Join LeftAnti, ((a#83 = named_struct(na, na, nb, nb)#87) || isnull((a#83 = named_struct(na, na, nb, nb)#87))) !: +- Project [named_struct(na, na#85, nb, nb#86) AS named_struct(na, na, nb, nb)#87] :- HiveTableRelation `default`.`juleka`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#83, b#84] !: +- HiveTableRelation `default`.`julekb`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [na#85, nb#86] +- Project [named_struct(na, na#85, nb, nb#86) AS named_struct(na, na, nb, nb)#87] !+- HiveTableRelation `default`.`juleka`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#83, b#84] +- HiveTableRelation `default`.`julekb`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [na#85, nb#86] {code} It works when I run {code} select * from juleka where (a, b) not in (select na, nb from julekb); {code} so the error comes from tupling the columns in the subquery. 
was: Ran on Shared Autoscaling on dogfood: {code} drop table if exists juleka; drop table if exists julekb; create table juleka (a integer, b integer); create table julekb (na integer, nb integer); insert into juleka values (1,1); insert into julekb values (1,1); select * from juleka where (a, b) not in (select (na, nb) from julekb); {code} Results in: {code} java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: Cannot compare types "int" and "org.apache.spark.sql.catalyst.InternalRow" at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:299) at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:286) at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) at com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135) at com.google.common.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2344) at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2316) at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2278) at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2193) at com.google.common.cache.LocalCache.get(LocalCache.java:3932) at com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3936) at com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4806) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.compile(CodeGenerator.scala:1415) at org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate$.create(GeneratePredicate.scala:92) at org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate$.generate(GeneratePredicate.scala:46) at org.apache.spark.sql.execution.SparkPlan.newPredicate(SparkPlan.scala:380) at 
org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.org$apache$spark$sql$execution$joins$BroadcastNestedLoopJoinExec$$boundCondition$lzycompute(BroadcastNestedLoopJoinExec.scala:99) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.org$apache$spark$sql$execution$joins$BroadcastNestedLoopJoinExec$$boundCondition(BroadcastNestedLoopJoinExec.scala:97) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2$$anonfun$apply$3.apply(BroadcastNestedLoopJoinExec.scala:203) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2$$anonfun$apply$3.apply(BroadcastNestedLoopJoinExec.scala:203) at scala.collection.IndexedSeqOptimized$class.prefixLengthImpl(IndexedSeqOptimized.scala:38) at scala.collection.IndexedSeqOptimized$class.exists(IndexedSeqOptimized.scala:46) at scala.collection.mutable.ArrayOps$ofRef.exists(ArrayOps.scala:186) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2.apply(BroadcastNestedLoopJoinExec.scala:203) at org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2.apply(BroadcastNestedLoopJoinExec.scala:202) at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:463) at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389) at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:49) at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:126) at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:125) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:111) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:349) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748) Caused by: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 27, Column 29: Cannot compare types "int" and "org.apache.spark.sql.catalyst.InternalRow" at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:1466) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1531) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1528) at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3522) at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2315) ... 30 more {code} Looks like invalid expression is introduced in RewritePredicateSubquery: {code} === Applying Rule org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery === !Filter NOT named_struct(a, a#83, b, b#84) IN (list#74 []) 'Join LeftAnti, ((a#83 = named_struct(na, na, nb, nb)#87) || isnull((a#83 = named_struct(na, na, nb, nb)#87))) !: +- Project [named_struct(na, na#85, nb, nb#86) AS named_struct(na, na, nb, nb)#87] :- HiveTableRelation `default`.`juleka`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#83, b#84] !: +- HiveTableRelation `default`.`julekb`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [na#85, nb#86] +- Project [named_struct(na, na#85, nb, nb#86) AS named_struct(na, na, nb, nb)#87] !+- HiveTableRelation `default`.`juleka`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#83, b#84] +- HiveTableRelation `default`.`julekb`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [na#85, nb#86] {code} It work when I run {code} select * from juleka where (a, b) not in (select 
na, nb from julekb); {code} so the error comes from tupling the columns in the subquery. > Codegen compile error from predicate subquery > --------------------------------------------- > > Key: SPARK-24341 > URL: https://issues.apache.org/jira/browse/SPARK-24341 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.3.1 > Reporter: Juliusz Sompolski > Priority: Minor > > Ran on master: > {code} > drop table if exists juleka; > drop table if exists julekb; > create table juleka (a integer, b integer); > create table julekb (na integer, nb integer); > insert into juleka values (1,1); > insert into julekb values (1,1); > select * from juleka where (a, b) not in (select (na, nb) from julekb); > {code} > Results in: > {code} > java.util.concurrent.ExecutionException: > org.codehaus.commons.compiler.CompileException: File 'generated.java', Line > 27, Column 29: failed to compile: > org.codehaus.commons.compiler.CompileException: File 'generated.java', Line > 27, Column 29: Cannot compare types "int" and > "org.apache.spark.sql.catalyst.InternalRow" > at > com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:299) > at > com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:286) > at > com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) > at > com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135) > at > com.google.common.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2344) > at > com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2316) > at > com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2278) > at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2193) > at com.google.common.cache.LocalCache.get(LocalCache.java:3932) > at com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3936) > at > 
com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4806) > at > org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.compile(CodeGenerator.scala:1415) > at > org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate$.create(GeneratePredicate.scala:92) > at > org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate$.generate(GeneratePredicate.scala:46) > at > org.apache.spark.sql.execution.SparkPlan.newPredicate(SparkPlan.scala:380) > at > org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.org$apache$spark$sql$execution$joins$BroadcastNestedLoopJoinExec$$boundCondition$lzycompute(BroadcastNestedLoopJoinExec.scala:99) > at > org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec.org$apache$spark$sql$execution$joins$BroadcastNestedLoopJoinExec$$boundCondition(BroadcastNestedLoopJoinExec.scala:97) > at > org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2$$anonfun$apply$3.apply(BroadcastNestedLoopJoinExec.scala:203) > at > org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2$$anonfun$apply$3.apply(BroadcastNestedLoopJoinExec.scala:203) > at > scala.collection.IndexedSeqOptimized$class.prefixLengthImpl(IndexedSeqOptimized.scala:38) > at > scala.collection.IndexedSeqOptimized$class.exists(IndexedSeqOptimized.scala:46) > at scala.collection.mutable.ArrayOps$ofRef.exists(ArrayOps.scala:186) > at > org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2.apply(BroadcastNestedLoopJoinExec.scala:203) > at > org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec$$anonfun$4$$anonfun$apply$2.apply(BroadcastNestedLoopJoinExec.scala:202) > at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:463) > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389) > at > 
org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:49) > at > org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:126) > at > org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:125) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) > at org.apache.spark.scheduler.Task.run(Task.scala:111) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:349) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > Caused by: org.codehaus.commons.compiler.CompileException: File > 'generated.java', Line 27, Column 29: failed to compile: > org.codehaus.commons.compiler.CompileException: File 'generated.java', Line > 27, Column 29: Cannot compare types "int" and > "org.apache.spark.sql.catalyst.InternalRow" > at > org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:1466) > at > org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1531) > at > org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:1528) > at > com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3522) > at > com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2315) > ... 
30 more > {code} > Looks like an invalid expression is introduced in RewritePredicateSubquery: > {code} > === Applying Rule > org.apache.spark.sql.catalyst.optimizer.RewritePredicateSubquery === > !Filter NOT named_struct(a, a#83, b, b#84) IN (list#74 []) > 'Join LeftAnti, ((a#83 = > named_struct(na, na, nb, nb)#87) || isnull((a#83 = named_struct(na, na, nb, > nb)#87))) > !: +- Project [named_struct(na, na#85, nb, nb#86) AS named_struct(na, na, > nb, nb)#87] :- HiveTableRelation > `default`.`juleka`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, > [a#83, b#84] > !: +- HiveTableRelation `default`.`julekb`, > org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [na#85, nb#86] +- > Project [named_struct(na, na#85, nb, nb#86) AS named_struct(na, na, nb, > nb)#87] > !+- HiveTableRelation `default`.`juleka`, > org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#83, b#84] > +- HiveTableRelation `default`.`julekb`, > org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [na#85, nb#86] > {code} > It works when I run > {code} > select * from juleka where (a, b) not in (select na, nb from julekb); > {code} > so the error comes from tupling the columns in the subquery. -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org