This is an automated email from the ASF dual-hosted git repository.

felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 2e90570bcb [VL] Minor fix, rename dictionarygess to footerestimate 
(#10411)
2e90570bcb is described below

commit 2e90570bcbb5fedf860704ec8f44de162a615559
Author: BInwei Yang <[email protected]>
AuthorDate: Wed Aug 13 21:14:22 2025 -0700

    [VL] Minor fix, rename dictionarygess to footerestimate (#10411)
    
    To align with Velox's naming. Added deprecation information to avoid breaking 
current use cases.
---
 .../gluten/backendsapi/velox/VeloxListenerApi.scala  | 15 +++++++++++++++
 .../scala/org/apache/gluten/config/VeloxConfig.scala | 20 +++++++++++++++++---
 cpp/velox/config/VeloxConfig.h                       |  1 +
 cpp/velox/utils/ConfigExtractor.cc                   |  3 ++-
 docs/velox-configuration.md                          |  7 ++++---
 .../org/apache/gluten/config/GlutenCoreConfig.scala  |  4 ++++
 .../org/apache/gluten/config/GlutenConfig.scala      |  4 ++++
 7 files changed, 47 insertions(+), 7 deletions(-)

diff --git 
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
 
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
index fe10f97f85..5468ad2c56 100644
--- 
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
+++ 
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala
@@ -68,6 +68,15 @@ class VeloxListenerApi extends ListenerApi with Logging {
           s"${COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.key} should be enabled 
together.")
     }
 
+    if (
+      conf.get(COLUMNAR_VELOX_CACHE_ENABLED) &&
+      !conf.get(GlutenConfig.GLUTEN_SOFT_AFFINITY_ENABLED)
+    ) {
+      logWarning(
+        s"It's recommened to enable 
${GlutenConfig.GLUTEN_SOFT_AFFINITY_ENABLED.key} when " +
+          s"${COLUMNAR_VELOX_CACHE_ENABLED.key} is set to get better 
locality.")
+    }
+
     if (conf.get(COLUMNAR_VELOX_CACHE_ENABLED) && conf.get(LOAD_QUANTUM) > 8 * 
1024 * 1024) {
       throw new IllegalArgumentException(
         s"Velox currently only support up to 8MB load quantum size " +
@@ -75,6 +84,12 @@ class VeloxListenerApi extends ListenerApi with Logging {
           s"User can set ${LOAD_QUANTUM.key} <= 8MB skip this error.")
     }
 
+    if (conf.contains(DIRECTORY_SIZE_GUESS.key)) {
+      logWarning(
+        s"${DIRECTORY_SIZE_GUESS.key} is Deprecated " +
+          s"replacing it with ${FOOTER_ESTIMATED_SIZE.key} instead.")
+    }
+
     // Generate HDFS client configurations.
     HdfsConfGenerator.addHdfsClientToSparkWorkDirectory(sc)
 
diff --git 
a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala 
b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
index fd451d1ffd..52bde8044d 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala
@@ -24,6 +24,10 @@ import org.apache.spark.sql.internal.SQLConf
 import java.util.Locale
 import java.util.concurrent.TimeUnit
 
+/*
+ * Note: Gluten configuration.md is automatically generated from this code.
+ * Make sure to run dev/gen_all_config_docs.sh after making changes to this 
file.
+ */
 class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
   import VeloxConfig._
 
@@ -100,7 +104,8 @@ object VeloxConfig {
   val COLUMNAR_VELOX_CACHE_ENABLED =
     buildStaticConf("spark.gluten.sql.columnar.backend.velox.cacheEnabled")
       .internal()
-      .doc("Enable Velox cache, default off")
+      .doc("Enable Velox cache, default off. It's recommended to enable" +
+        "soft-affinity as well when enable velox cache.")
       .booleanConf
       .createWithDefault(false)
 
@@ -479,14 +484,23 @@ object VeloxConfig {
   val DIRECTORY_SIZE_GUESS =
     
buildStaticConf("spark.gluten.sql.columnar.backend.velox.directorySizeGuess")
       .internal()
-      .doc("Set the directory size guess for velox file scan")
+      .doc("Deprecated, rename to 
spark.gluten.sql.columnar.backend.velox.footerEstimatedSize")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefaultString("32KB")
+
+  val FOOTER_ESTIMATED_SIZE =
+    
buildStaticConf("spark.gluten.sql.columnar.backend.velox.footerEstimatedSize")
+      .internal()
+      .doc("Set the footer estimated size for velox file scan, " +
+        "refer to Velox's footer-estimated-size")
       .bytesConf(ByteUnit.BYTE)
       .createWithDefaultString("32KB")
 
   val FILE_PRELOAD_THRESHOLD =
     
buildStaticConf("spark.gluten.sql.columnar.backend.velox.filePreloadThreshold")
       .internal()
-      .doc("Set the file preload threshold for velox file scan")
+      .doc("Set the file preload threshold for velox file scan, " +
+        "refer to Velox's file-preload-threshold")
       .bytesConf(ByteUnit.BYTE)
       .createWithDefaultString("1MB")
 
diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h
index 7573bf0ab8..e38253a3fb 100644
--- a/cpp/velox/config/VeloxConfig.h
+++ b/cpp/velox/config/VeloxConfig.h
@@ -132,6 +132,7 @@ const bool kVeloxFileHandleCacheEnabledDefault = false;
 
 /* configs for file read in velox*/
 const std::string kDirectorySizeGuess = 
"spark.gluten.sql.columnar.backend.velox.directorySizeGuess";
+const std::string kFooterEstimatedSize = 
"spark.gluten.sql.columnar.backend.velox.footerEstimatedSize";
 const std::string kFilePreloadThreshold = 
"spark.gluten.sql.columnar.backend.velox.filePreloadThreshold";
 const std::string kPrefetchRowGroups = 
"spark.gluten.sql.columnar.backend.velox.prefetchRowGroups";
 const std::string kLoadQuantum = 
"spark.gluten.sql.columnar.backend.velox.loadQuantum";
diff --git a/cpp/velox/utils/ConfigExtractor.cc 
b/cpp/velox/utils/ConfigExtractor.cc
index 4e06deccd5..6802470ca2 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -226,8 +226,9 @@ std::shared_ptr<facebook::velox::config::ConfigBase> 
getHiveConfig(
       conf->get<std::string>(kPrefetchRowGroups, "1");
   hiveConfMap[facebook::velox::connector::hive::HiveConfig::kLoadQuantum] =
       conf->get<std::string>(kLoadQuantum, "268435456"); // 256M
+  auto footerEstimatedSize = conf->get<std::string>(kDirectorySizeGuess, 
"32768"); // 32K
   
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFooterEstimatedSize] 
=
-      conf->get<std::string>(kDirectorySizeGuess, "32768"); // 32K
+      conf->get<std::string>(kFooterEstimatedSize, footerEstimatedSize); // 32K
   
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kFilePreloadThreshold]
 =
       conf->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M
 
diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md
index b7712c00a9..d6fdb7ca2a 100644
--- a/docs/velox-configuration.md
+++ b/docs/velox-configuration.md
@@ -19,16 +19,17 @@ nav_order: 16
 | spark.gluten.sql.columnar.backend.velox.bloomFilter.expectedNumItems         
    | 1000000           | The default number of expected items for the velox 
bloomfilter: 'spark.bloom_filter.expected_num_items'                            
                                                                                
                                                                                
                                                                                
                 [...]
 | spark.gluten.sql.columnar.backend.velox.bloomFilter.maxNumBits               
    | 4194304           | The max number of bits to use for the velox bloom 
filter: 'spark.bloom_filter.max_num_bits'                                       
                                                                                
                                                                                
                                                                                
                  [...]
 | spark.gluten.sql.columnar.backend.velox.bloomFilter.numBits                  
    | 8388608           | The default number of bits to use for the velox bloom 
filter: 'spark.bloom_filter.num_bits'                                           
                                                                                
                                                                                
                                                                                
              [...]
-| spark.gluten.sql.columnar.backend.velox.cacheEnabled                         
    | false             | Enable Velox cache, default off                       
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| spark.gluten.sql.columnar.backend.velox.cacheEnabled                         
    | false             | Enable Velox cache, default off. It's recommended to 
enablesoft-affinity as well when enable velox cache.                            
                                                                                
                                                                                
                                                                                
               [...]
 | spark.gluten.sql.columnar.backend.velox.cachePrefetchMinPct                  
    | 0                 | Set prefetch cache min pct for velox file scan        
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
 | spark.gluten.sql.columnar.backend.velox.checkUsageLeak                       
    | true              | Enable check memory usage leak.                       
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
-| spark.gluten.sql.columnar.backend.velox.directorySizeGuess                   
    | 32KB              | Set the directory size guess for velox file scan      
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| spark.gluten.sql.columnar.backend.velox.directorySizeGuess                   
    | 32KB              | Deprecated, rename to 
spark.gluten.sql.columnar.backend.velox.footerEstimatedSize                     
                                                                                
                                                                                
                                                                                
                                              [...]
 | spark.gluten.sql.columnar.backend.velox.enableSystemExceptionStacktrace      
    | true              | Enable the stacktrace for system type of 
VeloxException                                                                  
                                                                                
                                                                                
                                                                                
                           [...]
 | spark.gluten.sql.columnar.backend.velox.enableUserExceptionStacktrace        
    | true              | Enable the stacktrace for user type of VeloxException 
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
 | spark.gluten.sql.columnar.backend.velox.fileHandleCacheEnabled               
    | false             | Disables caching if false. File handle cache should 
be disabled if files are mutable, i.e. file content may change while file path 
stays the same.                                                                 
                                                                                
                                                                                
                 [...]
-| spark.gluten.sql.columnar.backend.velox.filePreloadThreshold                 
    | 1MB               | Set the file preload threshold for velox file scan    
                                                                                
                                                                                
                                                                                
                                                                                
              [...]
+| spark.gluten.sql.columnar.backend.velox.filePreloadThreshold                 
    | 1MB               | Set the file preload threshold for velox file scan, 
refer to Velox's file-preload-threshold                                         
                                                                                
                                                                                
                                                                                
                [...]
 | spark.gluten.sql.columnar.backend.velox.floatingPointMode                    
    | loose             | Config used to control the tolerance of floating 
point operations alignment with Spark. When the mode is set to strict, flushing 
is disabled for sum(float/double)and avg(float/double). When set to loose, 
flushing will be enabled.                                                       
                                                                                
                        [...]
 | spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation          
    | true              | Enable flushable aggregation. If true, Gluten will 
try converting regular aggregation into Velox's flushable aggregation when 
applicable. A flushable aggregation could emit intermediate result at anytime 
when memory is full / data reduction ratio is low.                              
                                                                                
                        [...]
+| spark.gluten.sql.columnar.backend.velox.footerEstimatedSize                  
    | 32KB              | Set the footer estimated size for velox file scan, 
refer to Velox's footer-estimated-size                                          
                                                                                
                                                                                
                                                                                
                 [...]
 | spark.gluten.sql.columnar.backend.velox.glogSeverityLevel                    
    | 1                 | Set glog severity level in Velox backend, same as 
FLAGS_minloglevel.                                                              
                                                                                
                                                                                
                                                                                
                  [...]
 | spark.gluten.sql.columnar.backend.velox.glogVerboseLevel                     
    | 0                 | Set glog verbose level in Velox backend, same as 
FLAGS_v.                                                                        
                                                                                
                                                                                
                                                                                
                   [...]
 | spark.gluten.sql.columnar.backend.velox.loadQuantum                          
    | 256MB             | Set the load quantum for velox file scan, recommend 
to use the default value (256MB) for performance consideration. If Velox cache 
is enabled, it can be 8MB at most.                                              
                                                                                
                                                                                
                 [...]
diff --git 
a/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala 
b/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
index b3112f2990..89bad5e52f 100644
--- a/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
+++ b/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala
@@ -58,6 +58,10 @@ class GlutenCoreConfig(conf: SQLConf) extends Logging {
     getConf(DYNAMIC_OFFHEAP_SIZING_MEMORY_FRACTION)
 }
 
+/*
+ * Note: Gluten configuration.md is automatically generated from this code.
+ * Make sure to run dev/gen_all_config_docs.sh after making changes to this 
file.
+ */
 object GlutenCoreConfig {
   def buildConf(key: String): ConfigBuilder = ConfigBuilder(key)
 
diff --git 
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala 
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 60e469f055..e81d6b0664 100644
--- 
a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ 
b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -50,6 +50,10 @@ case object RssSortShuffleWriterType extends 
ShuffleWriterType {
   override val name: String = ReservedKeys.GLUTEN_RSS_SORT_SHUFFLE_WRITER
 }
 
+/*
+ * Note: Gluten configuration.md is automatically generated from this code.
+ * Make sure to run dev/gen_all_config_docs.sh after making changes to this 
file.
+ */
 class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) {
   import GlutenConfig._
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to