[ https://issues.apache.org/jira/browse/SPARK-39830?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
dzcxzl updated SPARK-39830:
---------------------------
    Description:

We can add a UT to test the scenario after the [ORC-1205|https://issues.apache.org/jira/browse/ORC-1205] release.

bin/spark-shell
{code:java}
spark.sql("set orc.stripe.size=10240")
spark.sql("set orc.rows.between.memory.checks=1")
spark.sql("set spark.sql.orc.columnarWriterBatchSize=1")

val df = spark.range(1, 1 + 512, 1, 1).map { i =>
  if (i == 1) {
    (i, Array.fill[Byte](5 * 1024 * 1024)('X'))
  } else {
    (i, Array.fill[Byte](1)('X'))
  }
}.toDF("c1", "c2")

df.write.format("orc").save("file:///tmp/test_table_orc_t1")
spark.sql("create external table test_table_orc_t1 (c1 string, c2 binary) location 'file:///tmp/test_table_orc_t1' stored as orc")
spark.sql("select * from test_table_orc_t1").show()
{code}
Querying this table fails with the exception below: the ORC file stores c1 as bigint while the table schema declares it as string, so the scan goes through ORC's type-promotion readers (ConvertTreeReaderFactory), which hit the ArrayIndexOutOfBoundsException that ORC-1205 fixes.
{code:java}
java.lang.ArrayIndexOutOfBoundsException: 1
  at org.apache.orc.impl.TreeReaderFactory$TreeReader.nextVector(TreeReaderFactory.java:387)
  at org.apache.orc.impl.TreeReaderFactory$LongTreeReader.nextVector(TreeReaderFactory.java:740)
  at org.apache.orc.impl.ConvertTreeReaderFactory$StringGroupFromAnyIntegerTreeReader.nextVector(ConvertTreeReaderFactory.java:1069)
  at org.apache.orc.impl.reader.tree.StructBatchReader.readBatchColumn(StructBatchReader.java:65)
  at org.apache.orc.impl.reader.tree.StructBatchReader.nextBatchForLevel(StructBatchReader.java:100)
  at org.apache.orc.impl.reader.tree.StructBatchReader.nextBatch(StructBatchReader.java:77)
  at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1371)
  at org.apache.orc.mapreduce.OrcMapreduceRecordReader.ensureBatch(OrcMapreduceRecordReader.java:84)
  at org.apache.orc.mapreduce.OrcMapreduceRecordReader.nextKeyValue(OrcMapreduceRecordReader.java:102)
  at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
{code}
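Once Spark picks up an ORC release containing the ORC-1205 fix, the repro above translates fairly directly into a regression test. A minimal sketch follows, assuming a Hive-enabled suite (for {{CREATE EXTERNAL TABLE ... STORED AS ORC}}) and the standard {{withSQLConf}} / {{withTempPath}} / {{withTable}} / {{checkAnswer}} test helpers; the test name, table name, and expected answer are illustrative, not the actual patch:
{code:java}
// Sketch only: suite placement and naming are assumptions, not the SPARK-39830 patch.
test("SPARK-39830: read ORC table that requires type promotion") {
  import testImplicits._
  withSQLConf(
      "orc.stripe.size" -> "10240",
      "orc.rows.between.memory.checks" -> "1",
      "spark.sql.orc.columnarWriterBatchSize" -> "1") {
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      // The 5 MB first row, the tiny stripe size, and per-row memory checks
      // make the writer flush many small stripes, exercising batch boundaries.
      val df = spark.range(1, 1 + 512, 1, 1).map { i =>
        if (i == 1) (i, Array.fill[Byte](5 * 1024 * 1024)('X'))
        else (i, Array.fill[Byte](1)('X'))
      }.toDF("c1", "c2")
      df.write.format("orc").save(path)

      withTable("test_table_orc_t1") {
        // c1 is written as bigint but declared string here, so the scan must
        // go through ORC's ConvertTreeReaderFactory (type promotion).
        sql(
          s"""CREATE EXTERNAL TABLE test_table_orc_t1 (c1 string, c2 binary)
             |STORED AS ORC LOCATION '$path'""".stripMargin)
        // Reading c1 threw ArrayIndexOutOfBoundsException before ORC-1205.
        checkAnswer(
          sql("SELECT c1 FROM test_table_orc_t1"),
          (1 to 512).map(i => Row(i.toString)))
      }
    }
  }
}
{code}
Selecting {{c1}} rather than {{count(*)}} matters here: it forces every batch through the string-from-integer conversion reader that appears in the stack trace above.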
> Reading ORC table that requires type promotion may throw AIOOBE
> ---------------------------------------------------------------
>
>                 Key: SPARK-39830
>                 URL: https://issues.apache.org/jira/browse/SPARK-39830
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.3.0
>            Reporter: dzcxzl
>            Priority: Trivial