Kapil Singh created SPARK-11447: ----------------------------------- Summary: Null comparison requires type information but type extraction fails for complex types Key: SPARK-11447 URL: https://issues.apache.org/jira/browse/SPARK-11447 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 1.5.1 Reporter: Kapil Singh
When comparing a Column to a null literal, the comparison works only if the type of the null literal matches the type of the Column it is being compared to. Example Scala code (can be run from the Spark shell): import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.apache.spark.sql.catalyst.expressions._ val inputRowsData = Seq(Seq("abc"),Seq(null),Seq("xyz")) val inputRows = for(seq <- inputRowsData) yield Row.fromSeq(seq) val dfSchema = StructType(Seq(StructField("column", StringType, true))) val df = sqlContext.createDataFrame(sc.makeRDD(inputRows), dfSchema) //DOESN'T WORK val filteredDF = df.filter(df("column") <=> (new Column(Literal(null)))) //WORKS val filteredDF = df.filter(df("column") <=> (new Column(Literal.create(null, SparkleFunctions.dataType(df("column")))))) Why should type information be required for a null comparison? And if it is required, it is not always possible to extract type information from complex types (e.g. StructType). The following Scala code (can be run from the Spark shell) throws org.apache.spark.sql.catalyst.analysis.UnresolvedException: import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.apache.spark.sql.catalyst.expressions._ val inputRowsData = Seq(Seq(Row.fromSeq(Seq("abc", "def"))),Seq(Row.fromSeq(Seq(null, "123"))),Seq(Row.fromSeq(Seq("ghi", "jkl")))) val inputRows = for(seq <- inputRowsData) yield Row.fromSeq(seq) val dfSchema = StructType(Seq(StructField("column", StructType(Seq(StructField("p1", StringType, true), StructField("p2", StringType, true))), true))) val df = sqlContext.createDataFrame(sc.makeRDD(inputRows), dfSchema) val filteredDF = df.filter(df("column")("p1") <=> (new Column(Literal.create(null, SparkleFunctions.dataType(df("column")("p1")))))) org.apache.spark.sql.catalyst.analysis.UnresolvedException: Invalid call to dataType on unresolved object, tree: column#0[p1] at org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue.dataType(unresolved.scala:243) at org.apache.spark.sql.ArithmeticFunctions$class.dataType(ArithmeticFunctions.scala:76) at 
org.apache.spark.sql.SparkleFunctions$.dataType(SparkleFunctions.scala:14) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:38) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:43) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:45) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:47) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:49) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:51) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:53) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:55) at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:57) at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:59) at $iwC$$iwC$$iwC$$iwC.<init>(<console>:61) at $iwC$$iwC$$iwC.<init>(<console>:63) at $iwC$$iwC.<init>(<console>:65) at $iwC.<init>(<console>:67) at <init>(<console>:69) at .<init>(<console>:73) at .<clinit>(<console>) at .<init>(<console>:7) at .<clinit>(<console>) at $print(<console>) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:606) at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065) at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1340) at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840) at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871) at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819) at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857) at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902) at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814) at 
org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657) at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665) at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670) at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997) at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945) at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945) at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135) at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945) at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059) at org.apache.spark.repl.Main$.main(Main.scala:31) at org.apache.spark.repl.Main.main(Main.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:606) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:672) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org