This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch add-bloom-filter-3
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 777b0dc6f7d4a08af896772893071681c9d17b21 Author: Jiayu Liu <ji...@hey.com> AuthorDate: Tue Nov 15 20:53:32 2022 +0800 add column setter --- parquet/Cargo.toml | 1 + parquet/src/file/properties.rs | 102 +++++++++++++++++++++++++++++++++++------ 2 files changed, 89 insertions(+), 14 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index fc7c8218a..72baaf338 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -58,6 +58,7 @@ futures = { version = "0.3", default-features = false, features = ["std"], optio tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] } hashbrown = { version = "0.13", default-features = false } twox-hash = { version = "1.6", optional = true } +paste = "1.0" [dev-dependencies] base64 = { version = "0.13", default-features = false, features = ["std"] } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index cf821df21..c0e789ca1 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -248,6 +248,15 @@ impl WriterProperties { .or_else(|| self.default_column_properties.max_statistics_size()) .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) } + + /// Returns `true` if bloom filter is enabled for a column. + pub fn bloom_filter_enabled(&self, col: &ColumnPath) -> bool { + self.column_properties + .get(col) + .and_then(|c| c.bloom_filter_enabled()) + .or_else(|| self.default_column_properties.bloom_filter_enabled()) + .unwrap_or(false) + } } /// Writer properties builder. @@ -264,6 +273,16 @@ pub struct WriterPropertiesBuilder { column_properties: HashMap<ColumnPath, ColumnProperties>, } +macro_rules! def_per_col_setter { + ($field:ident, $field_type:expr) => { + // The macro will expand into the contents of this block. + pub fn concat_idents!(set_, $field)(mut self, value: $field_type) -> Self { + self.$field = value; + self + } + }; +} + impl WriterPropertiesBuilder { /// Returns default state of the builder. 
fn with_defaults() -> Self { @@ -276,7 +295,7 @@ impl WriterPropertiesBuilder { writer_version: DEFAULT_WRITER_VERSION, created_by: DEFAULT_CREATED_BY.to_string(), key_value_metadata: None, - default_column_properties: ColumnProperties::new(), + default_column_properties: Default::default(), column_properties: HashMap::new(), } } @@ -306,6 +325,8 @@ impl WriterPropertiesBuilder { self } + def_per_col_setter!(writer_version, WriterVersion); + /// Sets best effort maximum size of a data page in bytes. /// /// Note: this is a best effort limit based on value of @@ -423,7 +444,7 @@ impl WriterPropertiesBuilder { fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { self.column_properties .entry(col) - .or_insert_with(ColumnProperties::new) + .or_insert_with(Default::default) } /// Sets encoding for a column. @@ -476,6 +497,17 @@ impl WriterPropertiesBuilder { self.get_mut_props(col).set_max_statistics_size(value); self } + + /// Sets bloom filter enabled for a column. + /// Takes precedence over globally defined settings. + pub fn set_column_bloom_filter_enabled( + mut self, + col: ColumnPath, + value: bool, + ) -> Self { + self.get_mut_props(col).set_bloom_filter_enabled(value); + self + } } /// Controls the level of statistics to be computed by the writer @@ -499,27 +531,24 @@ impl Default for EnabledStatistics { /// /// If a field is `None`, it means that no specific value has been set for this column, /// so some subsequent or default value must be used. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Default, PartialEq)] struct ColumnProperties { encoding: Option<Encoding>, codec: Option<Compression>, dictionary_enabled: Option<bool>, statistics_enabled: Option<EnabledStatistics>, max_statistics_size: Option<usize>, + /// bloom filter enabled + bloom_filter_enabled: Option<bool>, + /// bloom filter expected number of distinct values + bloom_filter_ndv: Option<u64>, + /// bloom filter false positive probability + bloom_filter_fpp: Option<f64>, + /// bloom filter max number of bytes + bloom_filter_max_bytes: Option<u32>, } impl ColumnProperties { - /// Initialise column properties with default values. - fn new() -> Self { - Self { - encoding: None, - codec: None, - dictionary_enabled: None, - statistics_enabled: None, - max_statistics_size: None, - } - } - /// Sets encoding for this column. /// /// If dictionary is not enabled, this is treated as a primary encoding for a column. @@ -556,6 +585,26 @@ impl ColumnProperties { self.max_statistics_size = Some(value); } + /// Sets bloom filter enabled + fn set_bloom_filter_enabled(&mut self, enabled: bool) { + self.bloom_filter_enabled = Some(enabled); + } + + /// Sets bloom filter max size in bytes + fn set_bloom_filter_max_size(&mut self, value: u32) { + self.bloom_filter_max_bytes = Some(value); + } + + /// Sets bloom filter expected number of distinct values + fn set_bloom_filter_ndv(&mut self, value: u64) { + self.bloom_filter_ndv = Some(value); + } + + /// Sets bloom filter false positive probability + fn set_bloom_filter_fpp(&mut self, value: f64) { + self.bloom_filter_fpp = Some(value); + } + /// Returns optional encoding for this column. fn encoding(&self) -> Option<Encoding> { self.encoding @@ -583,6 +632,30 @@ impl ColumnProperties { fn max_statistics_size(&self) -> Option<usize> { self.max_statistics_size } + + /// Returns `Some(true)` if bloom filter is enabled for this column, if disabled then + /// returns `Some(false)`. 
If result is `None`, then no setting has been provided. + fn bloom_filter_enabled(&self) -> Option<bool> { + self.bloom_filter_enabled + } + + /// Returns `Some(u32)` if bloom filter max size in bytes is set for this column, + /// if not set then returns `None`. + fn bloom_filter_max_bytes(&self) -> Option<u32> { + self.bloom_filter_max_bytes + } + + /// Returns `Some(u64)` if bloom filter number of distinct values is set for this column, + /// if not set then returns `None`. + fn bloom_filter_ndv(&self) -> Option<u64> { + self.bloom_filter_ndv + } + + /// Returns `Some(f64)` if bloom filter false positive probability is set for this column, + /// if not set then returns `None`. + fn bloom_filter_fpp(&self) -> Option<f64> { + self.bloom_filter_fpp + } } /// Reference counted reader properties. @@ -685,6 +758,7 @@ mod tests { props.max_statistics_size(&ColumnPath::from("col")), DEFAULT_MAX_STATISTICS_SIZE ); + assert_eq!(props.bloom_filter_enabled(&ColumnPath::from("col")), false); } #[test]