This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 9ed4c6f56f feat(parquet): precompute `offset_index_disabled` at
build-time (#9724)
9ed4c6f56f is described below
commit 9ed4c6f56f8cffb016aeacb69cfdb1d763debf75
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Thu Apr 16 07:28:30 2026 -0400
feat(parquet): precompute `offset_index_disabled` at build-time (#9724)
# Which issue does this PR close?
- Depends on #9723
- Contributes to #9722
# Rationale for this change
`WriterProperties::offset_index_disabled()` checked whether any column
in the `column_properties` HashMap has page-level statistics enabled,
scanning the entire map on every call. This method is called from
`GenericColumnWriter::new` — once per column per row group. With N
columns each having per-column properties, this resulted in quadratic
HashMap iterations during row group construction.
# What changes are included in this PR?
Move the scan into `WriterPropertiesBuilder::build()` so it runs once at
construction time.
Benchmark results (vs baseline in #9723):
```
writer_overhead/1000_cols/per_column_props 2.44 ms (was 3.25 ms,
−25%)
writer_overhead/5000_cols/per_column_props 13.28 ms (was 47.45 ms,
−72%)
writer_overhead/10000_cols/per_column_props 27.97 ms (was 197.97 ms,
−86%)
```
Scaling now linear.
# Are these changes tested?
All tests passing.
# Are there any user-facing changes?
None.
Signed-off-by: Hippolyte Barraud <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/file/properties.rs | 55 ++++++++++++++++++++++++++++++------------
1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 65630cfed2..2f7a16a32d 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -181,6 +181,23 @@ pub enum BloomFilterPosition {
/// Reference counted writer properties.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
+/// Resolved state of [`WriterPropertiesBuilder::set_offset_index_disabled`].
+///
+/// When a user disables offset indexes but page-level statistics are enabled,
+/// the setting is overridden (offset indexes remain enabled). This enum
+/// preserves the user's original intent so that a round-trip through
+/// `WriterPropertiesBuilder` does not lose it.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum OffsetIndexSetting {
+ /// Offset indexes are enabled (the default).
+ Enabled,
+ /// User disabled offset indexes and no page-level statistics override it.
+ Disabled,
+ /// User disabled offset indexes, but page-level statistics require them,
+ /// so they remain enabled.
+ DisabledOverridden,
+}
+
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change
settings.
@@ -224,7 +241,7 @@ pub struct WriterProperties {
bloom_filter_position: BloomFilterPosition,
writer_version: WriterVersion,
created_by: String,
- offset_index_disabled: bool,
+ offset_index_setting: OffsetIndexSetting,
pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
default_column_properties: ColumnProperties,
column_properties: HashMap<ColumnPath, ColumnProperties>,
@@ -374,18 +391,7 @@ impl WriterProperties {
///
/// For more details see
[`WriterPropertiesBuilder::set_offset_index_disabled`]
pub fn offset_index_disabled(&self) -> bool {
- // If page statistics are to be collected, then do not disable the
offset indexes.
- let default_page_stats_enabled =
- self.default_column_properties.statistics_enabled() ==
Some(EnabledStatistics::Page);
- let column_page_stats_enabled = self
- .column_properties
- .iter()
- .any(|path_props| path_props.1.statistics_enabled() ==
Some(EnabledStatistics::Page));
- if default_page_stats_enabled || column_page_stats_enabled {
- return false;
- }
-
- self.offset_index_disabled
+ matches!(self.offset_index_setting, OffsetIndexSetting::Disabled)
}
/// Returns `key_value_metadata` KeyValue pairs.
@@ -593,6 +599,22 @@ impl Default for WriterPropertiesBuilder {
impl WriterPropertiesBuilder {
/// Finalizes the configuration and returns immutable writer properties
struct.
pub fn build(self) -> WriterProperties {
+ // Pre-compute offset_index_setting
+ let offset_index_setting = if self.offset_index_disabled {
+ let default_page_stats_enabled =
self.default_column_properties.statistics_enabled()
+ == Some(EnabledStatistics::Page);
+ let column_page_stats_enabled =
self.column_properties.iter().any(|path_props| {
+ path_props.1.statistics_enabled() ==
Some(EnabledStatistics::Page)
+ });
+ if default_page_stats_enabled || column_page_stats_enabled {
+ OffsetIndexSetting::DisabledOverridden
+ } else {
+ OffsetIndexSetting::Disabled
+ }
+ } else {
+ OffsetIndexSetting::Enabled
+ };
+
// Resolve bloom filter NDV for columns where it wasn't explicitly set:
// default to max_row_group_row_count so the filter is never
undersized.
let default_ndv = self
@@ -613,7 +635,7 @@ impl WriterPropertiesBuilder {
bloom_filter_position: self.bloom_filter_position,
writer_version: self.writer_version,
created_by: self.created_by,
- offset_index_disabled: self.offset_index_disabled,
+ offset_index_setting,
key_value_metadata: self.key_value_metadata,
default_column_properties,
column_properties,
@@ -1148,7 +1170,10 @@ impl From<WriterProperties> for WriterPropertiesBuilder {
bloom_filter_position: props.bloom_filter_position,
writer_version: props.writer_version,
created_by: props.created_by,
- offset_index_disabled: props.offset_index_disabled,
+ offset_index_disabled: !matches!(
+ props.offset_index_setting,
+ OffsetIndexSetting::Enabled
+ ),
key_value_metadata: props.key_value_metadata,
default_column_properties: props.default_column_properties,
column_properties: props.column_properties,