java.lang.ClassCastException: optional binary element (UTF8) is not a group
My code works fine with JSON input format (Spark 1.6 on Amazon EMR, emr-5.0.0). I tried the Parquet format. Works fine for English data. When I tried the Parquet format with some Japanese language text, I am getting this weird stack-trace: *Caused by: java.lang.ClassCastException: optional binary element (UTF8) is not a group* /at org.apache.parquet.schema.Type.asGroupType(Type.java:202) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at org.apache.spark.sql.types.StructType.map(StructType.scala:95) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at org.apache.spark.sql.types.StructType.map(StructType.scala:95) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67) at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168) at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192) at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) at
java.lang.ClassCastException: optional binary element (UTF8) is not a group
Dear All, My code works fine with JSON input data. When I tried the Parquet data format, it worked for English data. For Japanese text, I am getting the below stack-trace. Pls help! Caused by: java.lang.ClassCastException: optional binary element (UTF8) is not a group at org.apache.parquet.schema.Type.asGroupType(Type.java:202) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetListType(ParquetReadSupport.scala:207) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:122) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at org.apache.spark.sql.types.StructType.map(StructType.scala:95) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroup(ParquetReadSupport.scala:252) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.org$apache$spark$sql$execution$datasources$parquet$ParquetReadSupport$$clipParquetType(ParquetReadSupport.scala:131) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1$$anonfun$apply$1.apply(ParquetReadSupport.scala:272) at scala.Option.map(Option.scala:146) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:272) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$$anonfun$clipParquetGroupFields$1.apply(ParquetReadSupport.scala:269) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at org.apache.spark.sql.types.StructType.map(StructType.scala:95) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetGroupFields(ParquetReadSupport.scala:269) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport$.clipParquetSchema(ParquetReadSupport.scala:111) at org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.init(ParquetReadSupport.scala:67) at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:168) at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:192) at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:377) at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReader$1.apply(ParquetFileFormat.scala:339) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) at