This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new aa3c9d3ecc feat(parquet): add BloomFilterPropertiesBuilder (#9877)
aa3c9d3ecc is described below
commit aa3c9d3ecc35891793d962015fd2cdff708ac63f
Author: Yu-Chuan Hung <[email protected]>
AuthorDate: Fri May 8 02:03:15 2026 +0800
feat(parquet): add BloomFilterPropertiesBuilder (#9877)
# Which issue does this PR close?
- Closes #9667.
# Rationale for this change
No builder exists for `BloomFilterProperties`, so callers write
`BloomFilterProperties { fpp, ndv }` literals — pinning field layout to
the API and skipping fpp validation. `WriterPropertiesBuilder` also has
no setter that takes a built `BloomFilterProperties`.
# What changes are included in this PR?
- `BloomFilterPropertiesBuilder` (`with_fpp`, `with_max_ndv`, `build`,
`try_build`). Two entry points per discussion in the issue:
`BloomFilterProperties::builder()` and
`BloomFilterPropertiesBuilder::new()`.
- `WriterPropertiesBuilder::set_bloom_filter_properties` + per-column
variant. NDV from the passed-in struct is honoured (no row-group-size
override). For dynamic NDV, keep using `set_bloom_filter_enabled` /
`set_bloom_filter_fpp`.
- Renamed `set_bloom_filter_ndv` → `set_bloom_filter_max_ndv` (also
per-column). Old names are `#[deprecated(since = "59.0.0")]` aliases.
# Are these changes tested?
Yes — 10 new unit tests + a doc-test.
# Are there any user-facing changes?
Additive only. `set_bloom_filter_ndv` (and per-column) emit a
deprecation warning pointing to `_max_ndv`.
---
parquet/src/arrow/arrow_writer/mod.rs | 2 +-
parquet/src/bin/parquet-rewrite.rs | 3 +-
parquet/src/file/properties.rs | 353 ++++++++++++++++++++++++++++++++--
3 files changed, 341 insertions(+), 17 deletions(-)
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index d1f1a3cffe..04a7983914 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -2798,7 +2798,7 @@ mod tests {
.set_bloom_filter_enabled(bloom_filter)
.set_bloom_filter_position(bloom_filter_position);
if let Some(ndv) = bloom_filter_ndv {
- builder = builder.set_bloom_filter_ndv(ndv);
+ builder = builder.set_bloom_filter_max_ndv(ndv);
}
let props = builder.build();
diff --git a/parquet/src/bin/parquet-rewrite.rs
b/parquet/src/bin/parquet-rewrite.rs
index 7637f26d77..2428c49141 100644
--- a/parquet/src/bin/parquet-rewrite.rs
+++ b/parquet/src/bin/parquet-rewrite.rs
@@ -384,7 +384,8 @@ fn main() {
writer_properties_builder =
writer_properties_builder.set_bloom_filter_fpp(value);
}
if let Some(value) = args.bloom_filter_ndv {
- writer_properties_builder =
writer_properties_builder.set_bloom_filter_ndv(value);
+ writer_properties_builder =
+ writer_properties_builder.set_bloom_filter_max_ndv(value);
}
if let Some(value) = args.bloom_filter_position {
writer_properties_builder =
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 1ab8b5c710..816973f83c 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -20,6 +20,7 @@ use crate::basic::{Compression, Encoding};
use crate::compression::{CodecOptions, CodecOptionsBuilder};
#[cfg(feature = "encryption")]
use crate::encryption::encrypt::FileEncryptionProperties;
+use crate::errors::{ParquetError, Result};
use crate::file::metadata::{KeyValue, SortingColumn};
use crate::schema::types::ColumnPath;
use std::str::FromStr;
@@ -1119,10 +1120,10 @@ impl WriterPropertiesBuilder {
/// * If the bloom filter is enabled previously then it is a no-op.
///
/// * If the bloom filter is not enabled, default values for ndv and fpp
- /// value are used used. See [`set_bloom_filter_ndv`] and
+ /// value are used used. See [`set_bloom_filter_max_ndv`] and
/// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
///
- /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
+ /// [`set_bloom_filter_max_ndv`]: Self::set_bloom_filter_max_ndv
/// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
self.default_column_properties
@@ -1154,11 +1155,17 @@ impl WriterPropertiesBuilder {
/// been called.
///
/// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
- pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
+ pub fn set_bloom_filter_max_ndv(mut self, value: u64) -> Self {
self.default_column_properties.set_bloom_filter_ndv(value);
self
}
+ /// Deprecated alias for [`Self::set_bloom_filter_max_ndv`].
+ #[deprecated(since = "59.0.0", note = "Use `set_bloom_filter_max_ndv`
instead")]
+ pub fn set_bloom_filter_ndv(self, value: u64) -> Self {
+ self.set_bloom_filter_max_ndv(value)
+ }
+
// ----------------------------------------------------------------------
// Setters for a specific column
@@ -1257,10 +1264,11 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets the number of distinct values for bloom filter for a specific
column.
+ /// Sets the maximum expected number of distinct values for bloom filter
for
+ /// a specific column.
///
- /// Takes precedence over [`Self::set_bloom_filter_ndv`].
- pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64)
-> Self {
+ /// Takes precedence over [`Self::set_bloom_filter_max_ndv`].
+ pub fn set_column_bloom_filter_max_ndv(mut self, col: ColumnPath, value:
u64) -> Self {
self.get_mut_props(col).set_bloom_filter_ndv(value);
self
}
@@ -1280,6 +1288,41 @@ impl WriterPropertiesBuilder {
.set_data_page_v2_compression_ratio_threshold(value);
self
}
+
+ /// Deprecated alias for [`Self::set_column_bloom_filter_max_ndv`].
+ #[deprecated(
+ since = "59.0.0",
+ note = "Use `set_column_bloom_filter_max_ndv` instead"
+ )]
+ pub fn set_column_bloom_filter_ndv(self, col: ColumnPath, value: u64) ->
Self {
+ self.set_column_bloom_filter_max_ndv(col, value)
+ }
+
+ /// Sets the [`BloomFilterProperties`] for all columns, implicitly enabling
+ /// the bloom filter.
+ ///
+ /// Both `fpp` and `ndv` from `value` are treated as explicit and will not
+ /// be overridden by the build-time row-group-size NDV fallback. For
+ /// dynamic NDV sizing (resolved to `max_row_group_row_count` at build
+ /// time), use [`Self::set_bloom_filter_enabled`] or
+ /// [`Self::set_bloom_filter_fpp`] instead.
+ pub fn set_bloom_filter_properties(mut self, value: BloomFilterProperties)
-> Self {
+ self.default_column_properties
+ .set_bloom_filter_properties(value);
+ self
+ }
+
+ /// Sets the [`BloomFilterProperties`] for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_bloom_filter_properties`].
+ pub fn set_column_bloom_filter_properties(
+ mut self,
+ col: ColumnPath,
+ value: BloomFilterProperties,
+ ) -> Self {
+ self.get_mut_props(col).set_bloom_filter_properties(value);
+ self
+ }
}
impl From<WriterProperties> for WriterPropertiesBuilder {
@@ -1368,6 +1411,29 @@ impl Default for EnabledStatistics {
/// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details
on the
/// folding algorithm.
///
+/// # Example
+///
+/// ```rust
+/// # use parquet::{
+/// # file::properties::{BloomFilterProperties, WriterProperties},
+/// # schema::types::ColumnPath,
+/// # };
+/// // Build a BloomFilterProperties via the builder, then apply it to one
column.
+/// let bf = BloomFilterProperties::builder()
+/// .with_fpp(0.01)
+/// .with_max_ndv(10_000)
+/// .build();
+///
+/// let props = WriterProperties::builder()
+/// .set_column_bloom_filter_properties(ColumnPath::from("user_id"),
bf.clone())
+/// .build();
+///
+/// assert_eq!(
+/// props.bloom_filter_properties(&ColumnPath::from("user_id")),
+/// Some(&bf)
+/// );
+/// ```
+///
/// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
@@ -1384,7 +1450,7 @@ pub struct BloomFilterProperties {
pub fpp: f64,
/// Maximum expected number of distinct values. Defaults to
[`DEFAULT_BLOOM_FILTER_NDV`].
///
- /// You should set this value by calling
[`WriterPropertiesBuilder::set_bloom_filter_ndv`].
+ /// You should set this value by calling
[`WriterPropertiesBuilder::set_bloom_filter_max_ndv`].
///
/// When not explicitly set via the builder, this defaults to
/// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count)
(resolved at
@@ -1401,7 +1467,7 @@ pub struct BloomFilterProperties {
/// of the number of distinct values in a row group, it is recommended to
set this value explicitly
/// rather than relying on the default dynamic sizing based on
`max_row_group_row_count`.
/// If you do set this value explicitly it is probably best to set it for
each column
- /// individually via
[`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
+ /// individually via
[`WriterPropertiesBuilder::set_column_bloom_filter_max_ndv`] rather than
globally,
/// since different columns may have different numbers of distinct values.
pub ndv: u64,
}
@@ -1415,6 +1481,93 @@ impl Default for BloomFilterProperties {
}
}
+impl BloomFilterProperties {
+ /// Returns a new [`BloomFilterPropertiesBuilder`] for constructing
+ /// [`BloomFilterProperties`] with custom values.
+ pub fn builder() -> BloomFilterPropertiesBuilder {
+ BloomFilterPropertiesBuilder::new()
+ }
+}
+
+/// Builder for [`BloomFilterProperties`].
+///
+/// Use [`BloomFilterProperties::builder`] or
[`BloomFilterPropertiesBuilder::new`]
+/// as the entry point.
+#[derive(Debug, Clone, Default)]
+pub struct BloomFilterPropertiesBuilder {
+ fpp: Option<f64>,
+ ndv: Option<u64>,
+}
+
+impl BloomFilterPropertiesBuilder {
+ /// Returns a new builder with no fields set.
+ ///
+ /// Equivalent to [`BloomFilterProperties::builder`].
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Sets the target false positive probability.
+ ///
+ /// The value must be in `(0.0, 1.0)` exclusively; this is validated at
+ /// build time by [`Self::build`] / [`Self::try_build`]. When unset, the
+ /// default is `0.05` (5%, see [`DEFAULT_BLOOM_FILTER_FPP`]).
+ pub fn with_fpp(mut self, fpp: f64) -> Self {
+ self.fpp = Some(fpp);
+ self
+ }
+
+ /// Sets the maximum expected number of distinct values used to size the
+ /// bloom filter before folding.
+ ///
+ /// When unset, the default is `1_048_576` (see
[`DEFAULT_BLOOM_FILTER_NDV`]),
+ /// which at the default fpp of 5% reserves roughly 1 MiB per column for
the
+ /// filter bitset, derived as follows:
+ ///
+ /// ```text
+ /// ndv = 1,048,576, fpp = 0.05
+ /// 0.05^(1/8) ≈ 0.6877
+ /// 1 - 0.6877 ≈ 0.3123
+ /// ln(0.3123) ≈ -1.164
+ /// num_bits = -8 * 1,048,576 / -1.164 ≈ 7,206,000 bits
+ /// ≈ 900,750 bytes (~900 KB)
+ /// next_power_of_two(900 KB) = 1 MiB (= 1,048,576 bytes)
+ /// ```
+ pub fn with_max_ndv(mut self, ndv: u64) -> Self {
+ self.ndv = Some(ndv);
+ self
+ }
+
+ /// Builds [`BloomFilterProperties`].
+ ///
+ /// Panics if the configured `fpp` is not in `(0.0, 1.0)` exclusive.
+ /// Use [`Self::try_build`] for a non-panicking alternative.
+ pub fn build(self) -> BloomFilterProperties {
+ self.try_build().unwrap_or_else(|e| panic!("{e}"))
+ }
+
+ /// Builds [`BloomFilterProperties`], returning an error instead of
+ /// panicking when the configured `fpp` is not in `(0.0, 1.0)` exclusive.
+ pub fn try_build(self) -> Result<BloomFilterProperties> {
+ let fpp = self.fpp.unwrap_or(DEFAULT_BLOOM_FILTER_FPP);
+ validate_bloom_filter_fpp(fpp).map_err(ParquetError::General)?;
+ let ndv = self.ndv.unwrap_or(DEFAULT_BLOOM_FILTER_NDV);
+ Ok(BloomFilterProperties { fpp, ndv })
+ }
+}
+
+/// Single source of truth for the bloom filter fpp range check, shared by
+/// [`ColumnProperties::set_bloom_filter_fpp`] (panic path) and
+/// [`BloomFilterPropertiesBuilder::try_build`] (Result path).
+fn validate_bloom_filter_fpp(fpp: f64) -> std::result::Result<(), String> {
+ if !(fpp > 0.0 && fpp < 1.0) {
+ return Err(format!(
+ "fpp must be between 0.0 and 1.0 exclusive, got {fpp}"
+ ));
+ }
+ Ok(())
+}
+
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for
this column,
@@ -1500,11 +1653,9 @@ impl ColumnProperties {
///
/// Panics if the `value` is not between 0 and 1 exclusive
fn set_bloom_filter_fpp(&mut self, value: f64) {
- assert!(
- value > 0. && value < 1.0,
- "fpp must be between 0 and 1 exclusive, got {value}"
- );
-
+ if let Err(msg) = validate_bloom_filter_fpp(value) {
+ panic!("{msg}");
+ }
self.bloom_filter_properties
.get_or_insert_with(Default::default)
.fpp = value;
@@ -1519,6 +1670,17 @@ impl ColumnProperties {
self.bloom_filter_ndv_is_set = true;
}
+ /// Sets the bloom filter properties for this column from a fully-built
+ /// [`BloomFilterProperties`], implicitly enabling the bloom filter.
+ ///
+ /// Both `fpp` and `ndv` from `value` are treated as explicit, so the
+ /// build-time row-group-size NDV fallback in
+ /// [`WriterPropertiesBuilder::build`] will not override them.
+ fn set_bloom_filter_properties(&mut self, value: BloomFilterProperties) {
+ self.bloom_filter_properties = Some(value);
+ self.bloom_filter_ndv_is_set = true;
+ }
+
/// Sets the Data Page v2 compression ratio threshold for this column.
///
/// # Panics
@@ -1839,7 +2001,7 @@ mod tests {
.set_column_dictionary_enabled(ColumnPath::from("col"), true)
.set_column_statistics_enabled(ColumnPath::from("col"),
EnabledStatistics::Chunk)
.set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
- .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
+ .set_column_bloom_filter_max_ndv(ColumnPath::from("col"), 100_u64)
.set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
.build();
@@ -1961,7 +2123,7 @@ mod tests {
);
assert_eq!(
WriterProperties::builder()
- .set_bloom_filter_ndv(100)
+ .set_bloom_filter_max_ndv(100)
.build()
.bloom_filter_properties(&ColumnPath::from("col")),
Some(&BloomFilterProperties {
@@ -2009,6 +2171,30 @@ mod tests {
.build();
}
+ #[test]
+ #[allow(deprecated)]
+ fn test_writer_properties_deprecated_bloom_filter_ndv_setters_still_work()
{
+ let col = ColumnPath::from("col");
+ let props = WriterProperties::builder()
+ .set_bloom_filter_ndv(100)
+ .set_column_bloom_filter_ndv(col.clone(), 200)
+ .build();
+ assert_eq!(
+ props.bloom_filter_properties(&ColumnPath::from("other")),
+ Some(&BloomFilterProperties {
+ fpp: DEFAULT_BLOOM_FILTER_FPP,
+ ndv: 100,
+ })
+ );
+ assert_eq!(
+ props.bloom_filter_properties(&col),
+ Some(&BloomFilterProperties {
+ fpp: DEFAULT_BLOOM_FILTER_FPP,
+ ndv: 200,
+ })
+ );
+ }
+
#[test]
fn test_writer_properties_column_dictionary_page_size_limit() {
let props = WriterProperties::builder()
@@ -2125,4 +2311,141 @@ mod tests {
assert_eq!(custom, custom);
assert_ne!(opts, custom);
}
+
+ #[test]
+ fn test_bloom_filter_builder_default() {
+ let props = BloomFilterProperties::builder().build();
+ assert_eq!(props.fpp, DEFAULT_BLOOM_FILTER_FPP);
+ assert_eq!(props.ndv, DEFAULT_BLOOM_FILTER_NDV);
+ assert_eq!(props, BloomFilterProperties::default());
+ assert_eq!(
+ BloomFilterPropertiesBuilder::new().build(),
+ BloomFilterProperties::default()
+ );
+ }
+
+ #[test]
+ fn test_bloom_filter_builder_explicit_fpp() {
+ let props = BloomFilterProperties::builder().with_fpp(0.01).build();
+ assert_eq!(props.fpp, 0.01);
+ assert_eq!(props.ndv, DEFAULT_BLOOM_FILTER_NDV);
+ }
+
+ #[test]
+ fn test_bloom_filter_builder_explicit_ndv() {
+ let props =
BloomFilterProperties::builder().with_max_ndv(1000).build();
+ assert_eq!(props.fpp, DEFAULT_BLOOM_FILTER_FPP);
+ assert_eq!(props.ndv, 1000);
+ }
+
+ #[test]
+ fn test_bloom_filter_builder_validates_fpp() {
+ for wrong_val in [0.0_f64, 1.0, -0.5, 2.0] {
+ let result = std::panic::catch_unwind(|| {
+ BloomFilterProperties::builder().with_fpp(wrong_val).build()
+ });
+ assert!(
+ result.is_err(),
+ "with_fpp({wrong_val}).build() should reject value outside (0,
1)"
+ );
+ }
+ }
+
+ #[test]
+ fn test_bloom_filter_builder_try_build_validates_fpp() {
+ for wrong_val in [0.0_f64, 1.0, -0.5, 2.0] {
+ let result = BloomFilterProperties::builder()
+ .with_fpp(wrong_val)
+ .try_build();
+ assert!(
+ result.is_err(),
+ "try_build() should return Err for fpp outside (0, 1)"
+ );
+ }
+
+ let ok = BloomFilterProperties::builder()
+ .with_fpp(0.01)
+ .with_max_ndv(1000)
+ .try_build()
+ .expect("valid fpp should yield Ok");
+ assert_eq!(ok.fpp, 0.01);
+ assert_eq!(ok.ndv, 1000);
+ }
+
+ #[test]
+ fn test_column_specific_implicit_ndv_uses_row_group_size() {
+ let custom_row_group_size: usize = 7777;
+ let col = ColumnPath::from("col");
+ let props = WriterProperties::builder()
+ .set_max_row_group_row_count(Some(custom_row_group_size))
+ .set_column_bloom_filter_enabled(col.clone(), true)
+ .build();
+ let bf = props
+ .bloom_filter_properties(&col)
+ .expect("bloom filter should be enabled for col");
+
+ assert_eq!(bf.ndv, custom_row_group_size as u64);
+ assert_eq!(bf.fpp, DEFAULT_BLOOM_FILTER_FPP);
+ }
+
+ #[test]
+ fn test_set_bloom_filter_properties_applied_globally() {
+ let bf = BloomFilterProperties::builder()
+ .with_fpp(0.01)
+ .with_max_ndv(500)
+ .build();
+ let props = WriterProperties::builder()
+ .set_bloom_filter_properties(bf.clone())
+ .build();
+
+ assert_eq!(
+ props.bloom_filter_properties(&ColumnPath::from("a")),
+ Some(&bf),
+ );
+ assert_eq!(
+ props.bloom_filter_properties(&ColumnPath::from("b")),
+ Some(&bf),
+ );
+ }
+
+ #[test]
+ fn test_set_column_bloom_filter_properties_overrides_global() {
+ let global = BloomFilterProperties::builder()
+ .with_fpp(0.01)
+ .with_max_ndv(500)
+ .build();
+ let tailored = BloomFilterProperties::builder()
+ .with_fpp(0.02)
+ .with_max_ndv(1000)
+ .build();
+
+ let col = ColumnPath::from("col");
+ let props = WriterProperties::builder()
+ .set_bloom_filter_properties(global.clone())
+ .set_column_bloom_filter_properties(col.clone(), tailored.clone())
+ .build();
+
+ assert_eq!(props.bloom_filter_properties(&col), Some(&tailored));
+ assert_eq!(
+ props.bloom_filter_properties(&ColumnPath::from("other")),
+ Some(&global)
+ );
+ }
+
+ #[test]
+ fn test_set_bloom_filter_properties_preserve_explicit_ndv() {
+ let bf = BloomFilterProperties::builder().with_max_ndv(42).build();
+ let props = WriterProperties::builder()
+ .set_max_row_group_row_count(Some(99_999))
+ .set_bloom_filter_properties(bf)
+ .build();
+ let result = props
+ .bloom_filter_properties(&ColumnPath::from("col"))
+ .expect("bloom filter should be enabled");
+
+ assert_eq!(
+ result.ndv, 42,
+ "explicit ndv must not be overridden by row-group-size fallback"
+ );
+ }
}