This is an automated email from the ASF dual-hosted git repository. jiayuliu pushed a commit to branch add-bloom-filter-3 in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 63fa6434aca28e9f646bf6840334fdf6fe0abedc Author: Jiayu Liu <ji...@hey.com> AuthorDate: Tue Nov 15 21:23:02 2022 +0800 add writer properties --- parquet/src/file/properties.rs | 148 ++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 68 deletions(-) diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index c0e789ca1..c62bfe0bc 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -64,6 +64,7 @@ //! .build(); //! ``` +use paste::paste; use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; @@ -81,6 +82,9 @@ const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); +const DEFAULT_BLOOM_FILTER_ENABLED: bool = false; +const DEFAULT_BLOOM_FILTER_MAX_BYTES: u32 = 1024 * 1024; +const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.01; /// Parquet writer version. /// @@ -123,6 +127,26 @@ pub struct WriterProperties { column_properties: HashMap<ColumnPath, ColumnProperties>, } +macro_rules! def_col_property_getter { + ($field:ident, $field_type:ty) => { + pub fn $field(&self, col: &ColumnPath) -> Option<$field_type> { + self.column_properties + .get(col) + .and_then(|c| c.$field()) + .or_else(|| self.default_column_properties.$field()) + } + }; + ($field:ident, $field_type:ty, $default_val:expr) => { + pub fn $field(&self, col: &ColumnPath) -> $field_type { + self.column_properties + .get(col) + .and_then(|c| c.$field()) + .or_else(|| self.default_column_properties.$field()) + .unwrap_or($default_val) + } + }; +} + impl WriterProperties { /// Returns builder for writer properties with default values. pub fn builder() -> WriterPropertiesBuilder { @@ -249,14 +273,10 @@ impl WriterProperties { .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) } - /// Returns `true` if bloom filter is enabled for a column. - pub fn bloom_filter_enabled(&self, col: &ColumnPath) -> bool { - self.column_properties - .get(col) - .and_then(|c| c.bloom_filter_enabled()) - .or_else(|| self.default_column_properties.bloom_filter_enabled()) - .unwrap_or(false) - } + def_col_property_getter!(bloom_filter_enabled, bool, DEFAULT_BLOOM_FILTER_ENABLED); + def_col_property_getter!(bloom_filter_fpp, f64, DEFAULT_BLOOM_FILTER_FPP); + def_col_property_getter!(bloom_filter_ndv, u64); + def_col_property_getter!(bloom_filter_max_bytes, u32, DEFAULT_BLOOM_FILTER_MAX_BYTES); } /// Writer properties builder. @@ -273,16 +293,40 @@ pub struct WriterPropertiesBuilder { column_properties: HashMap<ColumnPath, ColumnProperties>, } -macro_rules! def_per_col_setter { - ($field:ident, $field_type:expr) => { - // The macro will expand into the contents of this block. - pub fn concat_idents!(set_, $field)(mut self, value: $field_type) -> Self { - self.$field = value; - self +macro_rules! def_opt_field_setter { + ($field: ident, $type: ty) => { + paste! { + pub fn [<set_ $field>](&mut self, value: $type) -> &mut Self { + self.$field = Some(value); + self + } + } + }; +} + +macro_rules! def_opt_field_getter { + ($field: ident, $type: ty) => { + paste! { + #[doc = "Returns " $field " if set."] + pub fn $field(&self) -> Option<$type> { + self.$field + } } }; } +macro_rules! def_per_col_setter { + ($field:ident, $field_type:ty) => { + paste! { + #[doc = "Sets " $field " for a column. Takes precedence over globally defined settings."] + pub fn [<set_column_ $field>](mut self, col: ColumnPath, value: $field_type) -> Self { + self.get_mut_props(col).[<set_ $field>](value); + self + } + } + } +} + impl WriterPropertiesBuilder { /// Returns default state of the builder. fn with_defaults() -> Self { @@ -325,8 +369,6 @@ impl WriterPropertiesBuilder { self } - def_per_col_setter!(writer_version, WriterVersion); - /// Sets best effort maximum size of a data page in bytes. /// /// Note: this is a best effort limit based on value of @@ -498,16 +540,10 @@ impl WriterPropertiesBuilder { self } - /// Sets bloom filter enabled for a column. - /// Takes precedence over globally defined settings. - pub fn set_column_bloom_filter_enabled( - mut self, - col: ColumnPath, - value: bool, - ) -> Self { - self.get_mut_props(col).set_bloom_filter_enabled(value); - self - } + def_per_col_setter!(bloom_filter_enabled, bool); + def_per_col_setter!(bloom_filter_fpp, f64); + def_per_col_setter!(bloom_filter_max_bytes, u32); + def_per_col_setter!(bloom_filter_ndv, u64); } /// Controls the level of statistics to be computed by the writer @@ -585,25 +621,10 @@ impl ColumnProperties { self.max_statistics_size = Some(value); } - /// Sets bloom filter enabled - fn set_bloom_filter_enabled(&mut self, enabled: bool) { - self.bloom_filter_enabled = Some(enabled); - } - - /// Sets bloom filter max size in bytes - fn set_bloom_filter_max_size(&mut self, value: u32) { - self.bloom_filter_max_bytes = Some(value); - } - - /// Sets bloom filter expected number of distinct values - fn set_bloom_filter_ndv(&mut self, value: u64) { - self.bloom_filter_ndv = Some(value); - } - - /// Sets bloom filter false positive probability - fn set_bloom_filter_fpp(&mut self, value: f64) { - self.bloom_filter_fpp = Some(value); - } + def_opt_field_setter!(bloom_filter_enabled, bool); + def_opt_field_setter!(bloom_filter_fpp, f64); + def_opt_field_setter!(bloom_filter_max_bytes, u32); + def_opt_field_setter!(bloom_filter_ndv, u64); /// Returns optional encoding for this column. fn encoding(&self) -> Option<Encoding> { @@ -633,29 +654,10 @@ impl ColumnProperties { self.max_statistics_size } - /// Returns `Some(true)` if bloom filter is enabled for this column, if disabled then - /// returns `Some(false)`. If result is `None`, then no setting has been provided. - fn bloom_filter_enabled(&self) -> Option<bool> { - self.bloom_filter_enabled - } - - /// Returns `Some(u32)` if bloom filter max size in bytes is set for this column, - /// if not set then returns `None`. - fn bloom_filter_max_bytes(&self) -> Option<u32> { - self.bloom_filter_max_bytes - } - - /// Returns `Some(u64)` if bloom filter number of distinct values is set for this column, - /// if not set then returns `None`. - fn bloom_filter_ndv(&self) -> Option<u64> { - self.bloom_filter_ndv - } - - /// Returns `Some(f64)` if bloom filter false positive probability is set for this column, - /// if not set then returns `None`. - fn bloom_filter_fpp(&self) -> Option<f64> { - self.bloom_filter_fpp - } + def_opt_field_getter!(bloom_filter_enabled, bool); + def_opt_field_getter!(bloom_filter_fpp, f64); + def_opt_field_getter!(bloom_filter_max_bytes, u32); + def_opt_field_getter!(bloom_filter_ndv, u64); } /// Reference counted reader properties. @@ -759,6 +761,12 @@ mod tests { DEFAULT_MAX_STATISTICS_SIZE ); assert_eq!(props.bloom_filter_enabled(&ColumnPath::from("col")), false); + assert_eq!(props.bloom_filter_fpp(&ColumnPath::from("col")), 0.01); + assert_eq!(props.bloom_filter_ndv(&ColumnPath::from("col")), None); + assert_eq!( + props.bloom_filter_max_bytes(&ColumnPath::from("col")), + 1024 * 1024 + ); } #[test] @@ -842,6 +850,10 @@ mod tests { EnabledStatistics::Chunk, ) .set_column_max_statistics_size(ColumnPath::from("col"), 123) + .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) + .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100) + .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1) + .set_column_bloom_filter_max_bytes(ColumnPath::from("col"), 1000) .build(); assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);