This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d81e55e1ff9 [SPARK-41858][SQL] Fix ORC reader perf regression due to DEFAULT value feature d81e55e1ff9 is described below commit d81e55e1ff998c624fa80c5660d7724701b4df23 Author: Dongjoon Hyun <dongjoon@apache.org> AuthorDate: Tue Jan 3 10:40:44 2023 -0800 [SPARK-41858][SQL] Fix ORC reader perf regression due to DEFAULT value feature ### What changes were proposed in this pull request? This PR is a partial and logical revert of SPARK-39862, https://github.com/apache/spark/pull/37280, to fix the huge ORC reader perf regression (3x slower). SPARK-39862 should propose a fix without perf regression. ### Why are the changes needed? During Apache Spark 3.4.0 preparation, SPARK-41782 identified a perf regression. - https://github.com/apache/spark/pull/39301#discussion_r1059239575 ### Does this PR introduce _any_ user-facing change? After this PR, the regression is removed. However, the bug of DEFAULT value feature will remain. This should be handled separately. ### How was this patch tested? Pass the CI. Closes #39362 from dongjoon-hyun/SPARK-41858. 
Authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../execution/datasources/orc/OrcDeserializer.scala | 21 +++++++++++---------- .../org/apache/spark/sql/sources/InsertSuite.scala | 9 +++++++-- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala index 5276f5c6d7b..5b207a04ada 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala @@ -57,14 +57,7 @@ class OrcDeserializer( } else { new RowUpdater(resultRow) } - val writer: (Int, WritableComparable[_]) => Unit = - (ordinal, value) => - if (value == null) { - rowUpdater.setNullAt(ordinal) - } else { - val writerFunc = newWriter(f.dataType, rowUpdater) - writerFunc(ordinal, value) - } + val writer = newWriter(f.dataType, rowUpdater) (value: WritableComparable[_]) => writer(index, value) } }.toArray @@ -75,7 +68,11 @@ class OrcDeserializer( while (targetColumnIndex < fieldWriters.length) { if (fieldWriters(targetColumnIndex) != null) { val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex)) - fieldWriters(targetColumnIndex)(value) + if (value == null) { + resultRow.setNullAt(targetColumnIndex) + } else { + fieldWriters(targetColumnIndex)(value) + } } targetColumnIndex += 1 } @@ -88,7 +85,11 @@ class OrcDeserializer( while (targetColumnIndex < fieldWriters.length) { if (fieldWriters(targetColumnIndex) != null) { val value = orcValues(requestedColIds(targetColumnIndex)) - fieldWriters(targetColumnIndex)(value) + if (value == null) { + resultRow.setNullAt(targetColumnIndex) + } else { + fieldWriters(targetColumnIndex)(value) + } } targetColumnIndex += 1 } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index dd37c93871e..7c4a39d6ff4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -1679,7 +1679,8 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { Config( None), Config( - Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false")))), + Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false"), + insertNullsToStorage = false))), TestCase( dataSource = "parquet", Seq( @@ -1943,7 +1944,11 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { Row(Seq(Row(1, 2)), Seq(Map(false -> "def", true -> "jkl"))), Seq(Map(true -> "xyz"))), Row(2, - null, + if (config.dataSource != "orc") { + null + } else { + Row(Seq(Row(1, 2)), Seq(Map(false -> "def", true -> "jkl"))) + }, Seq(Map(true -> "xyz"))), Row(3, Row(Seq(Row(3, 4)), Seq(Map(false -> "mno", true -> "pqr"))), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org