[ 
https://issues.apache.org/jira/browse/SPARK-17354?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Amit Baghel updated SPARK-17354:
--------------------------------
    Description: 
The Hive database has one table with a column of type Date. Running a select 
query on it using Spark 2.0.0 SQL and calling the show() function on the 
resulting DF throws a ClassCastException. The same code works fine on 
Spark 1.6.2. Please see the sample code below.

{code}
import java.util.Calendar
val now = Calendar.getInstance().getTime()
case class Order(id : Int, customer : String, city : String, pdate : 
java.sql.Date)
val orders = Seq(
      Order(1, "John S", "San Mateo", new java.sql.Date(now.getTime)),
      Order(2, "John D", "Redwood City", new java.sql.Date(now.getTime))
          )       
orders.toDF.createOrReplaceTempView("orders1")

spark.sql("CREATE TABLE IF NOT EXISTS order(id INT, customer String,city String)PARTITIONED BY (pdate DATE)STORED AS PARQUETFILE")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
spark.sql("INSERT INTO TABLE order PARTITION(pdate) SELECT * FROM orders1")
spark.sql("SELECT * FROM order").show()
{code}  

Exception details

{code}
16/09/01 10:30:07 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 6)
java.lang.ClassCastException: java.lang.Integer cannot be cast to java.sql.Date
        at 
org.apache.spark.sql.execution.vectorized.ColumnVectorUtils.populate(ColumnVectorUtils.java:89)
        at 
org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initBatch(VectorizedParquetRecordReader.java:185)
        at 
org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initBatch(VectorizedParquetRecordReader.java:204)
        at 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:362)
        at 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
        at 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
        at 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown
 Source)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown
 Source)
        at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at 
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
        at 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
        at 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
        at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
        at org.apache.spark.scheduler.Task.run(Task.scala:85)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
{code} 

Expected output 

{code} 
+---+--------+------------+----------+
| id|customer|        city|     pdate|
+---+--------+------------+----------+
|  1|  John S|   San Mateo|2016-09-01|
|  2|  John D|Redwood City|2016-09-01|
+---+--------+------------+----------+
{code} 

Workaround for Spark 2.0.0

Setting spark.sql.parquet.enableVectorizedReader=false before calling the 
show() method on the DF returns the expected result.

{code} 
spark.sql("set spark.sql.parquet.enableVectorizedReader=false")
{code} 





  was:
Hive database has one table with column type Date. While running select query 
using Spark 2.0.0 SQL and calling show() function on DF throws 
ClassCastException. Same code is working fine on Spark 1.6.2. Please see the 
sample code below.

{code}

import java.util.Calendar
val now = Calendar.getInstance().getTime()
case class Order(id : Int, customer : String, city : String, pdate : 
java.sql.Date)
val orders = Seq(
      Order(1, "John S", "San Mateo", new java.sql.Date(now.getTime)),
      Order(2, "John D", "Redwood City", new java.sql.Date(now.getTime))
          )       
orders.toDF.createOrReplaceTempView("orders1")

spark.sql("CREATE TABLE IF NOT EXISTS order(id INT, customer String,city String)PARTITIONED BY (pdate DATE)STORED AS PARQUETFILE")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
spark.sql("INSERT INTO TABLE order PARTITION(pdate) SELECT * FROM orders1")
spark.sql("SELECT * FROM order").show()

{code}  

Exception details

{code}
 
16/09/01 10:30:07 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 6)
java.lang.ClassCastException: java.lang.Integer cannot be cast to java.sql.Date
        at 
org.apache.spark.sql.execution.vectorized.ColumnVectorUtils.populate(ColumnVectorUtils.java:89)
        at 
org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initBatch(VectorizedParquetRecordReader.java:185)
        at 
org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initBatch(VectorizedParquetRecordReader.java:204)
        at 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:362)
        at 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
        at 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
        at 
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown
 Source)
        at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown
 Source)
        at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at 
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
        at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
        at 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
        at 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
        at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
        at org.apache.spark.scheduler.Task.run(Task.scala:85)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

{code} 

Expected output 

{code} 
+---+--------+------------+----------+
| id|customer|        city|     pdate|
+---+--------+------------+----------+
|  1|  John S|   San Mateo|2016-09-01|
|  2|  John D|Redwood City|2016-09-01|
+---+--------+------------+----------+

{code} 

Workaround for Spark 2.0.0

Setting enableVectorizedReader=false before show() method on DF returns 
expected result.

{code} 

spark.sql("set spark.sql.parquet.enableVectorizedReader=false")

{code} 






> java.lang.ClassCastException: java.lang.Integer cannot be cast to 
> java.sql.Date
> -------------------------------------------------------------------------------
>
>                 Key: SPARK-17354
>                 URL: https://issues.apache.org/jira/browse/SPARK-17354
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.0.0
>            Reporter: Amit Baghel
>            Priority: Minor
>
> Hive database has one table with column type Date. While running select query 
> using Spark 2.0.0 SQL and calling show() function on DF throws 
> ClassCastException. Same code is working fine on Spark 1.6.2. Please see the 
> sample code below.
> {code}
> import java.util.Calendar
> val now = Calendar.getInstance().getTime()
> case class Order(id : Int, customer : String, city : String, pdate : 
> java.sql.Date)
> val orders = Seq(
>       Order(1, "John S", "San Mateo", new java.sql.Date(now.getTime)),
>       Order(2, "John D", "Redwood City", new java.sql.Date(now.getTime))
>         )       
> orders.toDF.createOrReplaceTempView("orders1")
> spark.sql("CREATE TABLE IF NOT EXISTS order(id INT, customer String,city 
> String)PARTITIONED BY (pdate DATE)STORED AS PARQUETFILE")
> spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
> spark.sql("INSERT INTO TABLE order PARTITION(pdate) SELECT * FROM orders1")
> spark.sql("SELECT * FROM order").show()
> {code}  
> Exception details
> {code}
> 16/09/01 10:30:07 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 6)
> java.lang.ClassCastException: java.lang.Integer cannot be cast to 
> java.sql.Date
>       at 
> org.apache.spark.sql.execution.vectorized.ColumnVectorUtils.populate(ColumnVectorUtils.java:89)
>       at 
> org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initBatch(VectorizedParquetRecordReader.java:185)
>       at 
> org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initBatch(VectorizedParquetRecordReader.java:204)
>       at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:362)
>       at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339)
>       at 
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
>       at 
> org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
>       at 
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown
>  Source)
>       at 
> org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown
>  Source)
>       at 
> org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
>       at 
> org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
>       at 
> org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
>       at 
> org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
>       at 
> org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
>       at 
> org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:784)
>       at 
> org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>       at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>       at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
>       at org.apache.spark.scheduler.Task.run(Task.scala:85)
>       at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>       at java.lang.Thread.run(Thread.java:745)
> {code} 
> Expected output 
> {code} 
> +---+--------+------------+----------+
> | id|customer|        city|     pdate|
> +---+--------+------------+----------+
> |  1|  John S|   San Mateo|2016-09-01|
> |  2|  John D|Redwood City|2016-09-01|
> +---+--------+------------+----------+
> {code} 
> Workaround for Spark 2.0.0
> Setting enableVectorizedReader=false before show() method on DF returns 
> expected result.
> {code} 
> spark.sql("set spark.sql.parquet.enableVectorizedReader=false")
> {code} 



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to