[ https://issues.apache.org/jira/browse/PARQUET-893?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Cheng Lian updated PARQUET-893:
-------------------------------

    Description: 

The following Spark snippet reproduces this issue with Spark 2.1 (with parquet-mr 1.8.1) and Spark 2.2-SNAPSHOT (with parquet-mr 1.8.2):

{code}
import org.apache.spark.sql.types._

val path = "/tmp/parquet-test"

case class Inner(f00: Int)
case class Outer(f0: Inner, f1: Int)

val df = Seq(Outer(Inner(1), 1)).toDF()
df.printSchema()
// root
//  |-- f0: struct (nullable = true)
//  |    |-- f00: integer (nullable = false)
//  |-- f1: integer (nullable = false)

df.write.mode("overwrite").parquet(path)

val requestedSchema =
  new StructType().
    add("f0", new StructType().
      // This nested field name differs from the original one
      add("f01", IntegerType)).
    add("f1", IntegerType)

println(requestedSchema.treeString)
// root
//  |-- f0: struct (nullable = true)
//  |    |-- f01: integer (nullable = true)
//  |-- f1: integer (nullable = true)

spark.read.schema(requestedSchema).parquet(path).show()
{code}

In the above snippet, {{requestedSchema}} is compatible with the schema of the written Parquet file, but the following exception is thrown:

{noformat}
org.apache.parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file file:/tmp/parquet-test/part-00007-d2b0bec1-7be5-4b51-8d53-3642680bc9c2.snappy.parquet
	at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:243)
	at org.apache.parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:227)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:184)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IndexOutOfBoundsException: Index: 0, Size: 0
	at java.util.ArrayList.rangeCheck(ArrayList.java:653)
	at java.util.ArrayList.get(ArrayList.java:429)
	at org.apache.parquet.io.GroupColumnIO.getFirst(GroupColumnIO.java:102)
	at org.apache.parquet.io.GroupColumnIO.getFirst(GroupColumnIO.java:102)
	at org.apache.parquet.io.PrimitiveColumnIO.getFirst(PrimitiveColumnIO.java:102)
	at org.apache.parquet.io.PrimitiveColumnIO.isFirst(PrimitiveColumnIO.java:97)
	at org.apache.parquet.io.RecordReaderImplementation.<init>(RecordReaderImplementation.java:277)
	at org.apache.parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:135)
	at org.apache.parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:101)
	at org.apache.parquet.filter2.compat.FilterCompat$NoOpFilter.accept(FilterCompat.java:154)
	at org.apache.parquet.io.MessageColumnIO.getRecordReader(MessageColumnIO.java:101)
	at org.apache.parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:140)
	at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:214)
	... 21 more
{noformat}

According to this stack trace, it seems that {{GroupColumnIO.getFirst()}} [doesn't check for empty groups|https://github.com/apache/parquet-mr/blob/apache-parquet-1.8.2/parquet-column/src/main/java/org/apache/parquet/io/GroupColumnIO.java#L103] properly.

I haven't tried parquet-mr 1.9.0, but it probably suffers from the same issue.
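To see where the empty group comes from without going through Spark, here is a hypothetical, standalone sketch at the parquet-column level (the class name {{EmptyGroupDemo}} and the schema strings are mine, derived from the Spark schemas above, and this assumes {{ColumnIOFactory}} builds the column I/O tree the same way the Spark read path does): the requested field {{f0.f01}} is optional and has no counterpart in the file schema, so projection silently leaves group {{f0}} with no children.

{code}
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Hypothetical demo class (not part of this report): shows the empty group
// produced when the requested schema renames a nested field.
public class EmptyGroupDemo {
  public static void main(String[] args) {
    // Parquet schema corresponding to the DataFrame written above.
    MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message spark_schema {\n" +
        "  optional group f0 { required int32 f00; }\n" +
        "  required int32 f1;\n" +
        "}");

    // Requested schema: f0.f01 does not exist in the file schema.
    MessageType requestedSchema = MessageTypeParser.parseMessageType(
        "message spark_schema {\n" +
        "  optional group f0 { optional int32 f01; }\n" +
        "  optional int32 f1;\n" +
        "}");

    // The missing optional field is skipped during projection, leaving
    // group f0 in the column I/O tree with zero children.
    MessageColumnIO columnIO =
        new ColumnIOFactory().getColumnIO(requestedSchema, fileSchema);

    // Only f1 is materialized as a leaf column; f0 is an empty group.
    System.out.println(columnIO.getLeaves().size()); // expected: 1
  }
}
{code}

Constructing a record reader over this tree, as {{InternalParquetRecordReader.checkRead()}} does, is what then trips {{GroupColumnIO.getFirst()}} on the empty group.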
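For reference, the method at the linked line dereferences the first child unconditionally. Below is a minimal sketch of a defensive variant; the guard, the exception choice, and the message are my own illustration rather than a proposed patch, and a real fix might instead prevent empty groups from being built during projection:

{code}
// GroupColumnIO.getFirst() in parquet-mr 1.8.2 (per the link above) boils down to:
//
//   @Override
//   PrimitiveColumnIO getFirst() {
//     return children.get(0).getFirst();
//   }
//
// which throws IndexOutOfBoundsException when children is empty. A sketch of
// an explicit check (hypothetical, for illustration only):
@Override
PrimitiveColumnIO getFirst() {
  if (children.isEmpty()) {
    throw new InvalidRecordException(
        "Group '" + getType().getName() + "' in the requested schema matches no columns in the file");
  }
  return children.get(0).getFirst();
}
{code}

{{InvalidRecordException}} already lives in {{org.apache.parquet.io}}, so such a check would at least turn the opaque {{IndexOutOfBoundsException}} into an error that points at the schema mismatch.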
> GroupColumnIO.getFirst() doesn't check for empty groups
> --------------------------------------------------------
>
>                 Key: PARQUET-893
>                 URL: https://issues.apache.org/jira/browse/PARQUET-893
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-mr
>    Affects Versions: 1.8.1
>            Reporter: Cheng Lian
>
> add("f1", IntegerType) > println(requestedSchema.treeString) > // root > // |-- f0: struct (nullable = true) > // | |-- f01: integer (nullable = true) > // |-- f1: integer (nullable = true) > spark.read.schema(requestedSchema).parquet(path).show() > {code} > In the above snippet, {{requestedSchema}} is compatible with the schema of > the written Parquet file, but the following exception is thrown: > {noformat} > org.apache.parquet.io.ParquetDecodingException: Can not read value at 0 in > block -1 in file > file:/tmp/parquet-test/part-00007-d2b0bec1-7be5-4b51-8d53-3642680bc9c2.snappy.parquet > at > org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:243) > at > org.apache.parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:227) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:184) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:826) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) > at org.apache.spark.scheduler.Task.run(Task.scala:99) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > Caused by: java.lang.IndexOutOfBoundsException: Index: 0, Size: 0 > at java.util.ArrayList.rangeCheck(ArrayList.java:653) > at java.util.ArrayList.get(ArrayList.java:429) > at > org.apache.parquet.io.GroupColumnIO.getFirst(GroupColumnIO.java:102) > at > org.apache.parquet.io.GroupColumnIO.getFirst(GroupColumnIO.java:102) > at > org.apache.parquet.io.PrimitiveColumnIO.getFirst(PrimitiveColumnIO.java:102) > at > org.apache.parquet.io.PrimitiveColumnIO.isFirst(PrimitiveColumnIO.java:97) > at > org.apache.parquet.io.RecordReaderImplementation.<init>(RecordReaderImplementation.java:277) > at > org.apache.parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:135) > at > org.apache.parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:101) > at > org.apache.parquet.filter2.compat.FilterCompat$NoOpFilter.accept(FilterCompat.java:154) > at > org.apache.parquet.io.MessageColumnIO.getRecordReader(MessageColumnIO.java:101) > at > 
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)