This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 9ed4c6f56f feat(parquet): precompute `offset_index_disabled` at 
build-time (#9724)
9ed4c6f56f is described below

commit 9ed4c6f56f8cffb016aeacb69cfdb1d763debf75
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Thu Apr 16 07:28:30 2026 -0400

    feat(parquet): precompute `offset_index_disabled` at build-time (#9724)
    
    # Which issue does this PR close?
    
    - Depends on #9723
    - Contributes to #9722
    
    # Rationale for this change
    
    `WriterProperties::offset_index_disabled()` checked whether any column
    in the `column_properties` HashMap has page-level statistics enabled,
    scanning the entire map on every call. This method is called from
    `GenericColumnWriter::new` — once per column per row group. With N
    columns each having per-column properties, this resulted in quadratic
    HashMap iterations during row group construction.
    
    # What changes are included in this PR?
    
    Move the scan into `WriterPropertiesBuilder::build()` so it runs once at
    construction time.
    
    Benchmark results (vs baseline in #9723):
    
    ```
      writer_overhead/1000_cols/per_column_props     2.44 ms  (was   3.25 ms, 
−25%)
      writer_overhead/5000_cols/per_column_props    13.28 ms  (was  47.45 ms, 
−72%)
      writer_overhead/10000_cols/per_column_props   27.97 ms  (was 197.97 ms, 
−86%)
    ```
    
    Scaling now linear.
    
    # Are these changes tested?
    
    All tests passing.
    
    # Are there any user-facing changes?
    
    None.
    
    Signed-off-by: Hippolyte Barraud <[email protected]>
    Co-authored-by: Andrew Lamb <[email protected]>
---
 parquet/src/file/properties.rs | 55 ++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 65630cfed2..2f7a16a32d 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -181,6 +181,23 @@ pub enum BloomFilterPosition {
 /// Reference counted writer properties.
 pub type WriterPropertiesPtr = Arc<WriterProperties>;
 
+/// Resolved state of [`WriterPropertiesBuilder::set_offset_index_disabled`].
+///
+/// When a user disables offset indexes but page-level statistics are enabled,
+/// the setting is overridden (offset indexes remain enabled). This enum
+/// preserves the user's original intent so that a round-trip through
+/// `WriterPropertiesBuilder` does not lose it.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum OffsetIndexSetting {
+    /// Offset indexes are enabled (the default).
+    Enabled,
+    /// User disabled offset indexes and no page-level statistics override it.
+    Disabled,
+    /// User disabled offset indexes, but page-level statistics require them,
+    /// so they remain enabled.
+    DisabledOverridden,
+}
+
 /// Configuration settings for writing parquet files.
 ///
 /// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change 
settings.
@@ -224,7 +241,7 @@ pub struct WriterProperties {
     bloom_filter_position: BloomFilterPosition,
     writer_version: WriterVersion,
     created_by: String,
-    offset_index_disabled: bool,
+    offset_index_setting: OffsetIndexSetting,
     pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
     default_column_properties: ColumnProperties,
     column_properties: HashMap<ColumnPath, ColumnProperties>,
@@ -374,18 +391,7 @@ impl WriterProperties {
     ///
     /// For more details see 
[`WriterPropertiesBuilder::set_offset_index_disabled`]
     pub fn offset_index_disabled(&self) -> bool {
-        // If page statistics are to be collected, then do not disable the 
offset indexes.
-        let default_page_stats_enabled =
-            self.default_column_properties.statistics_enabled() == 
Some(EnabledStatistics::Page);
-        let column_page_stats_enabled = self
-            .column_properties
-            .iter()
-            .any(|path_props| path_props.1.statistics_enabled() == 
Some(EnabledStatistics::Page));
-        if default_page_stats_enabled || column_page_stats_enabled {
-            return false;
-        }
-
-        self.offset_index_disabled
+        matches!(self.offset_index_setting, OffsetIndexSetting::Disabled)
     }
 
     /// Returns `key_value_metadata` KeyValue pairs.
@@ -593,6 +599,22 @@ impl Default for WriterPropertiesBuilder {
 impl WriterPropertiesBuilder {
     /// Finalizes the configuration and returns immutable writer properties 
struct.
     pub fn build(self) -> WriterProperties {
+        // Pre-compute offset_index_setting
+        let offset_index_setting = if self.offset_index_disabled {
+            let default_page_stats_enabled = 
self.default_column_properties.statistics_enabled()
+                == Some(EnabledStatistics::Page);
+            let column_page_stats_enabled = 
self.column_properties.iter().any(|path_props| {
+                path_props.1.statistics_enabled() == 
Some(EnabledStatistics::Page)
+            });
+            if default_page_stats_enabled || column_page_stats_enabled {
+                OffsetIndexSetting::DisabledOverridden
+            } else {
+                OffsetIndexSetting::Disabled
+            }
+        } else {
+            OffsetIndexSetting::Enabled
+        };
+
         // Resolve bloom filter NDV for columns where it wasn't explicitly set:
         // default to max_row_group_row_count so the filter is never 
undersized.
         let default_ndv = self
@@ -613,7 +635,7 @@ impl WriterPropertiesBuilder {
             bloom_filter_position: self.bloom_filter_position,
             writer_version: self.writer_version,
             created_by: self.created_by,
-            offset_index_disabled: self.offset_index_disabled,
+            offset_index_setting,
             key_value_metadata: self.key_value_metadata,
             default_column_properties,
             column_properties,
@@ -1148,7 +1170,10 @@ impl From<WriterProperties> for WriterPropertiesBuilder {
             bloom_filter_position: props.bloom_filter_position,
             writer_version: props.writer_version,
             created_by: props.created_by,
-            offset_index_disabled: props.offset_index_disabled,
+            offset_index_disabled: !matches!(
+                props.offset_index_setting,
+                OffsetIndexSetting::Enabled
+            ),
             key_value_metadata: props.key_value_metadata,
             default_column_properties: props.default_column_properties,
             column_properties: props.column_properties,

Reply via email to