(incubator-gluten) branch main updated: [VL] Disable Parquet metadata validation by default due to performance regression (#11233)

hongze Mon, 01 Dec 2025 05:23:00 -0800

This is an automated email from the ASF dual-hosted git repository.

hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new c5a15a5a35 [VL] Disable Parquet metadata validation by default due to performance regression (#11233)
c5a15a5a35 is described below

commit c5a15a5a353d9675d522d5b00347d28bf5bc2534
Author: Hongze Zhang <[email protected]>
AuthorDate: Mon Dec 1 13:22:40 2025 +0000

    [VL] Disable Parquet metadata validation by default due to performance regression (#11233)
---
 .../spark/sql/delta/test/DeltaSQLCommandTest.scala |  1 +
 .../apache/gluten/utils/ParquetMetadataUtils.scala |  5 +++-
 docs/Configuration.md                              |  7 +++--
 .../org/apache/gluten/config/GlutenConfig.scala    | 35 +++++++++++++++-------
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala b/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
index 53adfbf4a4..8d1f87089b 100644
--- a/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
+++ b/backends-velox/src-delta33/test/scala/org/apache/spark/sql/delta/test/DeltaSQLCommandTest.scala
@@ -51,6 +51,7 @@ trait DeltaSQLCommandTest extends SharedSparkSession {
       .set("spark.unsafe.exceptionOnMemoryLeak", "true")
       .set(VeloxDeltaConfig.ENABLE_NATIVE_WRITE.key, "true")
       .set("spark.databricks.delta.snapshotPartitions", "2")
+      .set("spark.gluten.sql.fallbackUnexpectedMetadataParquet", "true")
   }
 }
 // spotless:on
diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
index b533c029e4..d35a1cdb74 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
@@ -95,6 +95,7 @@ object ParquetMetadataUtils {
       fileLimit: Int
   ): Option[String] = {
     val isEncryptionValidationEnabled = 
GlutenConfig.get.parquetEncryptionValidationEnabled
+    val isMetadataValidationEnabled = 
GlutenConfig.get.parquetMetadataValidationEnabled
     val filesIterator: RemoteIterator[LocatedFileStatus] = fs.listFiles(path, 
true)
     var checkedFileCount = 0
     while (filesIterator.hasNext && checkedFileCount < fileLimit) {
@@ -107,7 +108,9 @@ object ParquetMetadataUtils {
       ) {
         return Some("Encrypted Parquet file detected.")
       }
-      if (isTimezoneFoundInMetadata(fileStatus, conf, parquetOptions)) {
+      if (
+        isMetadataValidationEnabled && isTimezoneFoundInMetadata(fileStatus, 
conf, parquetOptions)
+      ) {
         return Some("Legacy timezone found.")
       }
     }
diff --git a/docs/Configuration.md b/docs/Configuration.md
index da718718a0..9521f47478 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -115,10 +115,11 @@ nav_order: 15
 | spark.gluten.sql.columnarToRowMemoryThreshold                      | 64MB    
          |
 | spark.gluten.sql.countDistinctWithoutExpand                        | false   
          | Convert Count Distinct to a UDAF called count_distinct to prevent 
SparkPlanner converting it to Expand+Count. WARNING: When enabled, count 
distinct queries will fail to fallback!!!                                       
                                                                                
                                                                    |
 | spark.gluten.sql.extendedColumnPruning.enabled                     | true    
          | Do extended nested column pruning for cases ignored by vanilla 
Spark.                                                                          
                                                                                
                                                                                
                                                                |
-| spark.gluten.sql.fallbackEncryptedParquet                          | false   
          | If enabled, gluten will not offload scan when encrypted parquet 
files are detected                                                              
                                                                                
                                                                                
                                                               |
-| spark.gluten.sql.fallbackEncryptedParquet.limit                    | 
&lt;undefined&gt; | If supplied, `limit` number of files will be checked to 
determine encryption and falling back java scan. Defaulted to 
spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit.                       
                                                                                
                                                                                
         |
+| spark.gluten.sql.fallbackEncryptedParquet                          | 
&lt;undefined&gt; | If enabled, Gluten will not offload scan when encrypted 
parquet files are detected. Defaulted to 
spark.gluten.sql.fallbackUnexpectedMetadataParquet.                             
                                                                                
                                                                                
                              |
+| spark.gluten.sql.fallbackEncryptedParquet.limit                    | 
&lt;undefined&gt; | If supplied, `limit` number of files will be checked to 
determine encryption and falling back to java scan. Defaulted to 
spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit.                       
                                                                                
                                                                                
      |
 | spark.gluten.sql.fallbackRegexpExpressions                         | false   
          | If true, fall back all regexp expressions. There are a few 
incompatible cases between RE2 (used by native engine) and java.util.regex 
(used by Spark). User should enable this property if their incompatibility is 
intolerable.                                                                    
                                                                           |
-| spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit           | 10      
          | If supplied, metadata of `limit` number of Parquet files will be 
checked to determine whether to fall back java scan                             
                                                                                
                                                                                
                                                              |
+| spark.gluten.sql.fallbackUnexpectedMetadataParquet                 | false   
          | If enabled, Gluten will not offload scan when unexpected metadata 
is detected.                                                                    
                                                                                
                                                                                
                                                             |
+| spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit           | 10      
          | If supplied, metadata of `limit` number of Parquet files will be 
checked to determine whether to fall back to java scan.                         
                                                                                
                                                                                
                                                              |
 | spark.gluten.sql.injectNativePlanStringToExplain                   | false   
          | When true, Gluten will inject native plan tree to Spark's explain 
output.                                                                         
                                                                                
                                                                                
                                                             |
 | spark.gluten.sql.mergeTwoPhasesAggregate.enabled                   | true    
          | Whether to merge two phases aggregate if there are no other 
operators between them.                                                         
                                                                                
                                                                                
                                                                   |
 | spark.gluten.sql.native.arrow.reader.enabled                       | false   
          | This is config to specify whether to enable the native columnar csv 
reader                                                                          
                                                                                
                                                                                
                                                           |
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 1e530d56fa..9896bd7cd1 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -355,8 +355,6 @@ class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) {
 
   def enableHdfsViewfs: Boolean = getConf(HDFS_VIEWFS_ENABLED)
 
-  def parquetEncryptionValidationEnabled: Boolean = 
getConf(ENCRYPTED_PARQUET_FALLBACK_ENABLED)
-
   def enableAutoAdjustStageResourceProfile: Boolean =
     getConf(AUTO_ADJUST_STAGE_RESOURCE_PROFILE_ENABLED)
 
@@ -369,10 +367,19 @@ class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) {
   def autoAdjustStageFallenNodeThreshold: Double =
     getConf(AUTO_ADJUST_STAGE_RESOURCES_FALLEN_NODE_RATIO_THRESHOLD)
 
+  def parquetMetadataValidationEnabled: Boolean = {
+    getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED)
+  }
+
   def parquetMetadataFallbackFileLimit: Int = {
     getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT)
   }
 
+  def parquetEncryptionValidationEnabled: Boolean = {
+    getConf(ENCRYPTED_PARQUET_FALLBACK_ENABLED)
+      .getOrElse(getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED))
+  }
+
   def parquetEncryptionValidationFileLimit: Int = {
     getConf(PARQUET_ENCRYPTED_FALLBACK_FILE_LIMIT).getOrElse(
       getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT))
@@ -1525,12 +1532,6 @@ object GlutenConfig extends ConfigRegistry {
       .booleanConf
       .createWithDefault(false)
 
-  val ENCRYPTED_PARQUET_FALLBACK_ENABLED =
-    buildConf("spark.gluten.sql.fallbackEncryptedParquet")
-      .doc("If enabled, gluten will not offload scan when encrypted parquet 
files are detected")
-      .booleanConf
-      .createWithDefault(false)
-
   val AUTO_ADJUST_STAGE_RESOURCE_PROFILE_ENABLED =
     buildConf("spark.gluten.auto.adjustStageResource.enabled")
       .experimental()
@@ -1561,19 +1562,33 @@ object GlutenConfig extends ConfigRegistry {
       .doubleConf
       .createWithDefault(0.5d)
 
+  val PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED =
+    buildConf("spark.gluten.sql.fallbackUnexpectedMetadataParquet")
+      .doc("If enabled, Gluten will not offload scan when unexpected metadata 
is detected.")
+      .booleanConf
+      .createWithDefault(false)
+
   val PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT =
     buildConf("spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit")
       .doc("If supplied, metadata of `limit` number of Parquet files will be 
checked to" +
-        " determine whether to fall back java scan")
+        " determine whether to fall back to java scan.")
       .intConf
       .checkValue(_ > 0, s"must be positive.")
       .createWithDefault(10)
 
+  val ENCRYPTED_PARQUET_FALLBACK_ENABLED =
+    buildConf("spark.gluten.sql.fallbackEncryptedParquet")
+      .doc(
+        "If enabled, Gluten will not offload scan when encrypted parquet files 
are" +
+          " detected. Defaulted to " + 
s"${PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED.key}.")
+      .booleanConf
+      .createOptional
+
   val PARQUET_ENCRYPTED_FALLBACK_FILE_LIMIT =
     buildConf("spark.gluten.sql.fallbackEncryptedParquet.limit")
       .doc(
         "If supplied, `limit` number of files will be checked to determine 
encryption " +
-          s"and falling back java scan. Defaulted to " +
+          s"and falling back to java scan. Defaulted to " +
           s"${PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT.key}.")
       .intConf
       .checkValue(_ > 0, s"must be positive.")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [VL] Disable Parquet metadata validation by default due to performance regression (#11233)

Reply via email to