I want to filter a DataFrame based on a Date column.
If the DataFrame is constructed from a Scala case class, the filter works (whether the value is compared as a String or as a Date). But if the DataFrame is generated by applying a schema to an RDD, it doesn't work. Below are the exception and the test code. Do you have any idea what causes the error? Thank you very much!
================exception=================
java.lang.ClassCastException: java.sql.Date cannot be cast to java.lang.Integer
  at scala.runtime.BoxesRunTime.unboxToInt(BoxesRunTime.java:106)
  at org.apache.spark.sql.catalyst.expressions.Cast$$anonfun$castToString$2$$anonfun$apply$6.apply(Cast.scala:116)
  at org.apache.spark.sql.catalyst.expressions.Cast.org$apache$spark$sql$catalyst$expressions$Cast$$buildCast(Cast.scala:111)
  at org.apache.spark.sql.catalyst.expressions.Cast$$anonfun$castToString$2.apply(Cast.scala:116)
  at org.apache.spark.sql.catalyst.expressions.Cast.eval(Cast.scala:426)
  at org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual.eval(predicates.scala:305)
  at org.apache.spark.sql.catalyst.expressions.InterpretedPredicate$$anonfun$apply$1.apply(predicates.scala:30)
  at org.apache.spark.sql.catalyst.expressions.InterpretedPredicate$$anonfun$apply$1.apply(predicates.scala:30)
  at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:390)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
================code=================
val conf = new SparkConf().setAppName("DFTest").setMaster("local[*]")
val sc = new SparkContext(conf)
val sqlCtx = new HiveContext(sc)
import sqlCtx.implicits._
case class Test(dt: java.sql.Date)
val df = sc.makeRDD(Seq(Test(new java.sql.Date(115,4,7)))).toDF
var r = df.filter("dt >= '2015-05-06'")
r.explain(true)
r.show
println("======")
var r2 = df.filter("dt >= cast('2015-05-06' as DATE)")
r2.explain(true)
r2.show
println("======")
// "df2" doesn't filter correctly!!
val rdd2 = sc.makeRDD(Seq((Row(new java.sql.Date(115,4,7)))))
val schema = StructType(Array(StructField("dt", DateType, false)))
val df2 = sqlCtx.applySchema(rdd2, schema)
r = df2.filter("dt >= '2015-05-06'")
r.explain(true)
r.show
println("======")
r2 = df2.filter("dt >= cast('2015-05-06' as DATE)")
r2.explain(true)
r2.show