[ https://issues.apache.org/jira/browse/SPARK-34897?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Yuming Wang updated SPARK-34897:
--------------------------------
    Description: 
How to reproduce this issue:
{code:scala}
spark.sql(
  """
    |CREATE TABLE `t1` (
    |  `_col0` INT,
    |  `_col1` STRING,
    |  `_col2` STRUCT<`c1`: STRING, `c2`: STRING, `c3`: STRING, `c4`: BIGINT>,
    |  `_col3` STRING)
    |USING orc
    |PARTITIONED BY (_col3)
    |""".stripMargin)

spark.sql("INSERT INTO `t1` values(1, '2', null, '2021-02-01')")
spark.sql("SELECT _col2.c1, _col0 FROM `t1` WHERE _col3 = '2021-02-01'").show
{code}
Error message:
{noformat}
java.lang.AssertionError: assertion failed: The given data schema struct<_col0:int,_col2:struct<c1:string>> has less fields than the actual ORC physical schema, no idea which columns were dropped, fail to read. Try to disable
	at scala.Predef$.assert(Predef.scala:223)
	at org.apache.spark.sql.execution.datasources.orc.OrcUtils$.requestedColumnIds(OrcUtils.scala:159)
	at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$3(OrcFileFormat.scala:180)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2620)
	at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:178)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:117)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:165)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:94)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:756)
{noformat}

  was:
How to reproduce this issue:
{code:scala}
spark.sql(
  """
    |CREATE TABLE `t1` (
    |  `_col0` INT,
    |  `_col1` STRING,
    |  `_col2` STRUCT<`c1`: STRING, `c2`: STRING, `c3`: STRING, `c4`: BIGINT>,
    |  `_col3` STRING)
    |USING orc
    |PARTITIONED BY (_col3)
    |""".stripMargin)

spark.sql("INSERT INTO `t1` values(1, '2', null, '2021-02-01')")
spark.sql("SELECT _col2.c1, _col0 FROM `t1` WHERE _col3 = '2021-02-01'").show
{code}


> The given data schema has less fields than the actual ORC physical schema
> --------------------------------------------------------------------------
>
>                 Key: SPARK-34897
>                 URL: https://issues.apache.org/jira/browse/SPARK-34897
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.0.2, 3.2.0, 3.1.1
>            Reporter: Yuming Wang
>            Priority: Major
>
> How to reproduce this issue:
> {code:scala}
> spark.sql(
>   """
>     |CREATE TABLE `t1` (
>     |  `_col0` INT,
>     |  `_col1` STRING,
>     |  `_col2` STRUCT<`c1`: STRING, `c2`: STRING, `c3`: STRING, `c4`: BIGINT>,
>     |  `_col3` STRING)
>     |USING orc
>     |PARTITIONED BY (_col3)
>     |""".stripMargin)
>
> spark.sql("INSERT INTO `t1` values(1, '2', null, '2021-02-01')")
> spark.sql("SELECT _col2.c1, _col0 FROM `t1` WHERE _col3 = '2021-02-01'").show
> {code}
> Error message:
> {noformat}
> java.lang.AssertionError: assertion failed: The given data schema struct<_col0:int,_col2:struct<c1:string>> has less fields than the actual ORC physical schema, no idea which columns were dropped, fail to read. Try to disable
> 	at scala.Predef$.assert(Predef.scala:223)
> 	at org.apache.spark.sql.execution.datasources.orc.OrcUtils$.requestedColumnIds(OrcUtils.scala:159)
> 	at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$3(OrcFileFormat.scala:180)
> 	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2620)
> 	at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:178)
> 	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:117)
> 	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:165)
> 	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:94)
> 	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
> 	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
> 	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
> 	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:756)
> {noformat}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org
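One possible reading of the failure: the data schema shown in the assertion, {{struct<_col0:int,_col2:struct<c1:string>>}}, is what remains after nested column pruning for {{SELECT _col2.c1, _col0}}, while the ORC data files of {{t1}} physically contain three columns ({{_col0}}, {{_col1}}, {{_col2}}; the partition column {{_col3}} is not stored in the files). Because every physical column name happens to match the Hive-style {{_colN}} placeholder pattern, {{OrcUtils.requestedColumnIds}} appears to fall back to positional column mapping, which requires the data schema to have at least as many fields as the physical schema. The simplified Scala sketch below (not the actual Spark source; the field names and pruned schema are taken from the reproduction above) illustrates the condition that seems to trip:

{code:scala}
// Simplified sketch of the failing check; not the real OrcUtils code.
object OrcColumnIdSketch {
  def main(args: Array[String]): Unit = {
    // Physical schema of the ORC data files written for `t1`
    // (the partition column `_col3` lives in the directory path, not the files).
    val orcFieldNames = Seq("_col0", "_col1", "_col2")

    // Data schema after nested column pruning for "SELECT _col2.c1, _col0":
    // struct<_col0:int,_col2:struct<c1:string>>
    val prunedDataSchemaFields = Seq("_col0", "_col2")

    // All physical names look like Hive-generated "_colN" placeholders, so the
    // reader assumes positional mapping, which only works when the data schema
    // covers every physical column.
    if (orcFieldNames.forall(_.startsWith("_col")) &&
        orcFieldNames.length > prunedDataSchemaFields.length) {
      println("assertion failed: The given data schema has less fields than the " +
        "actual ORC physical schema, no idea which columns were dropped, fail to read.")
    }
  }
}
{code}

If this reading is correct, an untested workaround could be to turn off nested schema pruning ({{spark.sql.optimizer.nestedSchemaPruning.enabled=false}}) so that the full {{_col2}} struct stays in the data schema and the field counts line up again; this is a guess based on the error message above, not a confirmed fix.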