This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new d81e55e1ff9 [SPARK-41858][SQL] Fix ORC reader perf regression due to DEFAULT value feature
d81e55e1ff9 is described below

commit d81e55e1ff998c624fa80c5660d7724701b4df23
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Tue Jan 3 10:40:44 2023 -0800

    [SPARK-41858][SQL] Fix ORC reader perf regression due to DEFAULT value feature
    
    ### What changes were proposed in this pull request?
    
    This PR is a partial, logical revert of SPARK-39862 (https://github.com/apache/spark/pull/37280) to fix a severe ORC reader perf regression (3x slower).
    
    SPARK-39862 should later propose a fix that does not reintroduce this perf regression.
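
    For context, the essence of the regression: the reverted code rebuilt the per-column writer inside the per-value closure, so `newWriter` ran once for every non-null value instead of once per column. Below is a minimal sketch of the two patterns, using hypothetical names rather than the actual `OrcDeserializer` internals:

    ```scala
    // Hypothetical stand-in names; this is not Spark code, just the hoisting pattern.
    object WriterHoistingSketch {
      // Assume building a writer is comparatively expensive, as newWriter(...) is.
      def newWriter(col: Int): Long => Unit = {
        val state = new Array[Long](1) // placeholder for real conversion state
        value => state(0) = value + col
      }

      def main(args: Array[String]): Unit = {
        val rows = 1000000
        val cols = 10

        // Before this PR: a writer was effectively rebuilt for every non-null value.
        var r = 0
        while (r < rows) {
          var c = 0
          while (c < cols) {
            newWriter(c)(r.toLong) // per-value construction in the hot loop
            c += 1
          }
          r += 1
        }

        // After this PR: writers are built once per column and reused for every row.
        val writers = Array.tabulate(cols)(newWriter)
        r = 0
        while (r < rows) {
          var c = 0
          while (c < cols) {
            writers(c)(r.toLong)
            c += 1
          }
          r += 1
        }
      }
    }
    ```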
    
    ### Why are the changes needed?
    
    During Apache Spark 3.4.0 preparation, SPARK-41782 identified a perf regression.
    - https://github.com/apache/spark/pull/39301#discussion_r1059239575
    
    ### Does this PR introduce _any_ user-facing change?
    
    After this PR, the regression is removed. However, the bug in the DEFAULT value feature remains; it should be handled separately.
    
    ### How was this patch tested?
    
    Pass the CI.
    
    Closes #39362 from dongjoon-hyun/SPARK-41858.
    
    Authored-by: Dongjoon Hyun <dongj...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../execution/datasources/orc/OrcDeserializer.scala | 21 +++++++++++----------
 .../org/apache/spark/sql/sources/InsertSuite.scala  |  9 +++++++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
index 5276f5c6d7b..5b207a04ada 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
@@ -57,14 +57,7 @@ class OrcDeserializer(
             } else {
               new RowUpdater(resultRow)
             }
-          val writer: (Int, WritableComparable[_]) => Unit =
-            (ordinal, value) =>
-              if (value == null) {
-                rowUpdater.setNullAt(ordinal)
-              } else {
-                val writerFunc = newWriter(f.dataType, rowUpdater)
-                writerFunc(ordinal, value)
-              }
+          val writer = newWriter(f.dataType, rowUpdater)
           (value: WritableComparable[_]) => writer(index, value)
         }
       }.toArray
@@ -75,7 +68,11 @@ class OrcDeserializer(
     while (targetColumnIndex < fieldWriters.length) {
       if (fieldWriters(targetColumnIndex) != null) {
         val value = orcStruct.getFieldValue(requestedColIds(targetColumnIndex))
-        fieldWriters(targetColumnIndex)(value)
+        if (value == null) {
+          resultRow.setNullAt(targetColumnIndex)
+        } else {
+          fieldWriters(targetColumnIndex)(value)
+        }
       }
       targetColumnIndex += 1
     }
@@ -88,7 +85,11 @@ class OrcDeserializer(
     while (targetColumnIndex < fieldWriters.length) {
       if (fieldWriters(targetColumnIndex) != null) {
         val value = orcValues(requestedColIds(targetColumnIndex))
-        fieldWriters(targetColumnIndex)(value)
+        if (value == null) {
+          resultRow.setNullAt(targetColumnIndex)
+        } else {
+          fieldWriters(targetColumnIndex)(value)
+        }
       }
       targetColumnIndex += 1
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
index dd37c93871e..7c4a39d6ff4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala
@@ -1679,7 +1679,8 @@ class InsertSuite extends DataSourceTest with SharedSparkSession {
           Config(
             None),
           Config(
-            Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false")))),
+            Some(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false"),
+            insertNullsToStorage = false))),
       TestCase(
         dataSource = "parquet",
         Seq(
@@ -1943,7 +1944,11 @@ class InsertSuite extends DataSourceTest with SharedSparkSession {
               Row(Seq(Row(1, 2)), Seq(Map(false -> "def", true -> "jkl"))),
               Seq(Map(true -> "xyz"))),
             Row(2,
-              null,
+              if (config.dataSource != "orc") {
+                null
+              } else {
+                Row(Seq(Row(1, 2)), Seq(Map(false -> "def", true -> "jkl")))
+              },
               Seq(Map(true -> "xyz"))),
             Row(3,
               Row(Seq(Row(3, 4)), Seq(Map(false -> "mno", true -> "pqr"))),
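
For reference, a condensed sketch of the deserialization loop shape this patch restores (simplified types and hypothetical helper names, not the actual Spark APIs): the per-column writer is resolved once up front, and only the null check remains inside the row loop.

```scala
// Hypothetical stand-ins for illustration; see OrcDeserializer.scala for the real code.
object DeserializeLoopSketch {
  type FieldWriter = Any => Unit

  def deserializeRow(
      fieldWriters: Array[FieldWriter],   // built once per column, outside the hot loop
      fetchValue: Int => Any,             // reads the ORC value for a requested column
      setNullAt: Int => Unit): Unit = {   // marks the output column as null
    var col = 0
    while (col < fieldWriters.length) {
      if (fieldWriters(col) != null) {    // a null writer means the column was not requested
        val value = fetchValue(col)
        if (value == null) setNullAt(col) // null check hoisted into the loop (this PR)
        else fieldWriters(col)(value)     // reuse the precomputed writer
      }
      col += 1
    }
  }
}
```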


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
