[ 
https://issues.apache.org/jira/browse/SPARK-11447?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14987096#comment-14987096
 ] 

Kapil Singh commented on SPARK-11447:
-------------------------------------

On second look, I seem to have identified the issue. Take a look at lines 
283-286 here:
https://github.com/apache/spark/blob/a01cbf5daac148f39cd97299780f542abc41d1e9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala

If one of the types in a BinaryComparison is StringType and the other is NullType, 
then during analyzed plan computation this forces a cast of the StringType side to 
DoubleType. Later, while enforcing this cast (lines 340-343 in 
https://github.com/apache/spark/blob/a01cbf5daac148f39cd97299780f542abc41d1e9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala),
 the conversion to double fails if the string is not actually a number, and a default 
null value is assigned to the result. This manifests as the null comparison evaluating 
to true for all string values. 

> Null comparison requires type information but type extraction fails for 
> complex types
> -------------------------------------------------------------------------------------
>
>                 Key: SPARK-11447
>                 URL: https://issues.apache.org/jira/browse/SPARK-11447
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 1.5.1
>            Reporter: Kapil Singh
>
> While comparing a Column to a null literal, the comparison works only if the type of 
> the null literal matches the type of the Column it's being compared to. Example Scala 
> code (can be run from the Spark shell):
> import org.apache.spark.sql._
> import org.apache.spark.sql.types._
> import org.apache.spark.sql.catalyst.expressions._
> val inputRowsData = Seq(Seq("abc"),Seq(null),Seq("xyz"))
> val inputRows = for(seq <- inputRowsData) yield Row.fromSeq(seq)
> val dfSchema = StructType(Seq(StructField("column", StringType, true)))
> val df = sqlContext.createDataFrame(sc.makeRDD(inputRows), dfSchema)
> //DOESN'T WORK
> val filteredDF = df.filter(df("column") <=> (new Column(Literal(null))))
> //WORKS
> val filteredDF = df.filter(df("column") <=> (new Column(Literal.create(null, 
> SparkleFunctions.dataType(df("column"))))))
> Why should type information be required for a null comparison? Even if it's 
> required, it's not always possible to extract type information from complex 
> types (e.g. StructType). The following Scala code (which can be run from the 
> Spark shell) throws org.apache.spark.sql.catalyst.analysis.UnresolvedException:
> import org.apache.spark.sql._
> import org.apache.spark.sql.types._
> import org.apache.spark.sql.catalyst.expressions._
> val inputRowsData = Seq(Seq(Row.fromSeq(Seq("abc", 
> "def"))),Seq(Row.fromSeq(Seq(null, "123"))),Seq(Row.fromSeq(Seq("ghi", 
> "jkl"))))
> val inputRows = for(seq <- inputRowsData) yield Row.fromSeq(seq)
> val dfSchema = StructType(Seq(StructField("column", 
> StructType(Seq(StructField("p1", StringType, true), StructField("p2", 
> StringType, true))), true)))
> val filteredDF = df.filter(df("column")("p1") <=> (new 
> Column(Literal.create(null, SparkleFunctions.dataType(df("column")("p1"))))))
> org.apache.spark.sql.catalyst.analysis.UnresolvedException: Invalid call to 
> dataType on unresolved object, tree: column#0[p1]
>       at 
> org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue.dataType(unresolved.scala:243)
>       at 
> org.apache.spark.sql.ArithmeticFunctions$class.dataType(ArithmeticFunctions.scala:76)
>       at 
> org.apache.spark.sql.SparkleFunctions$.dataType(SparkleFunctions.scala:14)
>       at 
> $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:38)
>       at 
> $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:43)
>       at 
> $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:45)
>       at 
> $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:47)
>       at 
> $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:49)
>       at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:51)
>       at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:53)
>       at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:55)
>       at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:57)
>       at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:59)
>       at $iwC$$iwC$$iwC$$iwC.<init>(<console>:61)
>       at $iwC$$iwC$$iwC.<init>(<console>:63)
>       at $iwC$$iwC.<init>(<console>:65)
>       at $iwC.<init>(<console>:67)
>       at <init>(<console>:69)
>       at .<init>(<console>:73)
>       at .<clinit>(<console>)
>       at .<init>(<console>:7)
>       at .<clinit>(<console>)
>       at $print(<console>)
>       at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>       at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>       at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>       at java.lang.reflect.Method.invoke(Method.java:606)
>       at 
> org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
>       at 
> org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1340)
>       at 
> org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
>       at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
>       at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
>       at 
> org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
>       at 
> org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
>       at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
>       at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
>       at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
>       at 
> org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
>       at 
> org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
>       at 
> org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
>       at 
> org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
>       at 
> scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
>       at 
> org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
>       at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
>       at org.apache.spark.repl.Main$.main(Main.scala:31)
>       at org.apache.spark.repl.Main.main(Main.scala)
>       at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>       at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>       at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>       at java.lang.reflect.Method.invoke(Method.java:606)
>       at 
> org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:672)
>       at 
> org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>       at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>       at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>       at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to