This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch add-bloom-filter-3
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit 63fa6434aca28e9f646bf6840334fdf6fe0abedc
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Tue Nov 15 21:23:02 2022 +0800

    add writer properties
---
 parquet/src/file/properties.rs | 148 ++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 68 deletions(-)

diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index c0e789ca1..c62bfe0bc 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -64,6 +64,7 @@
 //!     .build();
 //! ```
 
+use paste::paste;
 use std::{collections::HashMap, sync::Arc};
 
 use crate::basic::{Compression, Encoding};
@@ -81,6 +82,9 @@ const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
 const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
 const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
 const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY");
+const DEFAULT_BLOOM_FILTER_ENABLED: bool = false;
+const DEFAULT_BLOOM_FILTER_MAX_BYTES: u32 = 1024 * 1024;
+const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.01;
 
 /// Parquet writer version.
 ///
@@ -123,6 +127,26 @@ pub struct WriterProperties {
     column_properties: HashMap<ColumnPath, ColumnProperties>,
 }
 
+macro_rules! def_col_property_getter {
+    ($field:ident, $field_type:ty) => {
+        pub fn $field(&self, col: &ColumnPath) -> Option<$field_type> {
+            self.column_properties
+                .get(col)
+                .and_then(|c| c.$field())
+                .or_else(|| self.default_column_properties.$field())
+        }
+    };
+    ($field:ident, $field_type:ty, $default_val:expr) => {
+        pub fn $field(&self, col: &ColumnPath) -> $field_type {
+            self.column_properties
+                .get(col)
+                .and_then(|c| c.$field())
+                .or_else(|| self.default_column_properties.$field())
+                .unwrap_or($default_val)
+        }
+    };
+}
+
 impl WriterProperties {
     /// Returns builder for writer properties with default values.
     pub fn builder() -> WriterPropertiesBuilder {
@@ -249,14 +273,10 @@ impl WriterProperties {
             .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
     }
 
-    /// Returns `true` if bloom filter is enabled for a column.
-    pub fn bloom_filter_enabled(&self, col: &ColumnPath) -> bool {
-        self.column_properties
-            .get(col)
-            .and_then(|c| c.bloom_filter_enabled())
-            .or_else(|| self.default_column_properties.bloom_filter_enabled())
-            .unwrap_or(false)
-    }
+    def_col_property_getter!(bloom_filter_enabled, bool, DEFAULT_BLOOM_FILTER_ENABLED);
+    def_col_property_getter!(bloom_filter_fpp, f64, DEFAULT_BLOOM_FILTER_FPP);
+    def_col_property_getter!(bloom_filter_ndv, u64);
+    def_col_property_getter!(bloom_filter_max_bytes, u32, DEFAULT_BLOOM_FILTER_MAX_BYTES);
 }
 
 /// Writer properties builder.
@@ -273,16 +293,40 @@ pub struct WriterPropertiesBuilder {
     column_properties: HashMap<ColumnPath, ColumnProperties>,
 }
 
-macro_rules! def_per_col_setter {
-    ($field:ident, $field_type:expr) => {
-        // The macro will expand into the contents of this block.
-        pub fn concat_idents!(set_, $field)(mut self, value: $field_type) -> Self {
-            self.$field = value;
-            self
+macro_rules! def_opt_field_setter {
+    ($field: ident, $type: ty) => {
+        paste! {
+            pub fn [<set_ $field>](&mut self, value: $type) -> &mut Self {
+                self.$field = Some(value);
+                self
+            }
+        }
+    };
+}
+
+macro_rules! def_opt_field_getter {
+    ($field: ident, $type: ty) => {
+        paste! {
+            #[doc = "Returns " $field " if set."]
+            pub fn $field(&self) -> Option<$type> {
+                self.$field
+            }
         }
     };
 }
 
+macro_rules! def_per_col_setter {
+    ($field:ident, $field_type:ty) => {
+        paste! {
+            #[doc = "Sets " $field " for a column. Takes precedence over globally defined settings."]
+            pub fn [<set_column_ $field>](mut self, col: ColumnPath, value: $field_type) -> Self {
+                self.get_mut_props(col).[<set_ $field>](value);
+                self
+            }
+        }
+    }
+}
+
 impl WriterPropertiesBuilder {
     /// Returns default state of the builder.
     fn with_defaults() -> Self {
@@ -325,8 +369,6 @@ impl WriterPropertiesBuilder {
         self
     }
 
-    def_per_col_setter!(writer_version, WriterVersion);
-
     /// Sets best effort maximum size of a data page in bytes.
     ///
     /// Note: this is a best effort limit based on value of
@@ -498,16 +540,10 @@ impl WriterPropertiesBuilder {
         self
     }
 
-    /// Sets bloom filter enabled for a column.
-    /// Takes precedence over globally defined settings.
-    pub fn set_column_bloom_filter_enabled(
-        mut self,
-        col: ColumnPath,
-        value: bool,
-    ) -> Self {
-        self.get_mut_props(col).set_bloom_filter_enabled(value);
-        self
-    }
+    def_per_col_setter!(bloom_filter_enabled, bool);
+    def_per_col_setter!(bloom_filter_fpp, f64);
+    def_per_col_setter!(bloom_filter_max_bytes, u32);
+    def_per_col_setter!(bloom_filter_ndv, u64);
 }
 
 /// Controls the level of statistics to be computed by the writer
@@ -585,25 +621,10 @@ impl ColumnProperties {
         self.max_statistics_size = Some(value);
     }
 
-    /// Sets bloom filter enabled
-    fn set_bloom_filter_enabled(&mut self, enabled: bool) {
-        self.bloom_filter_enabled = Some(enabled);
-    }
-
-    /// Sets bloom filter max size in bytes
-    fn set_bloom_filter_max_size(&mut self, value: u32) {
-        self.bloom_filter_max_bytes = Some(value);
-    }
-
-    /// Sets bloom filter expected number of distinct values
-    fn set_bloom_filter_ndv(&mut self, value: u64) {
-        self.bloom_filter_ndv = Some(value);
-    }
-
-    /// Sets bloom filter false positive probability
-    fn set_bloom_filter_fpp(&mut self, value: f64) {
-        self.bloom_filter_fpp = Some(value);
-    }
+    def_opt_field_setter!(bloom_filter_enabled, bool);
+    def_opt_field_setter!(bloom_filter_fpp, f64);
+    def_opt_field_setter!(bloom_filter_max_bytes, u32);
+    def_opt_field_setter!(bloom_filter_ndv, u64);
 
     /// Returns optional encoding for this column.
     fn encoding(&self) -> Option<Encoding> {
@@ -633,29 +654,10 @@ impl ColumnProperties {
         self.max_statistics_size
     }
 
-    /// Returns `Some(true)` if bloom filter is enabled for this column, if disabled then
-    /// returns `Some(false)`. If result is `None`, then no setting has been provided.
-    fn bloom_filter_enabled(&self) -> Option<bool> {
-        self.bloom_filter_enabled
-    }
-
-    /// Returns `Some(u32)` if bloom filter max size in bytes is set for this column,
-    /// if not set then returns `None`.
-    fn bloom_filter_max_bytes(&self) -> Option<u32> {
-        self.bloom_filter_max_bytes
-    }
-
-    /// Returns `Some(u64)` if bloom filter number of distinct values is set for this column,
-    /// if not set then returns `None`.
-    fn bloom_filter_ndv(&self) -> Option<u64> {
-        self.bloom_filter_ndv
-    }
-
-    /// Returns `Some(f64)` if bloom filter false positive probability is set for this column,
-    /// if not set then returns `None`.
-    fn bloom_filter_fpp(&self) -> Option<f64> {
-        self.bloom_filter_fpp
-    }
+    def_opt_field_getter!(bloom_filter_enabled, bool);
+    def_opt_field_getter!(bloom_filter_fpp, f64);
+    def_opt_field_getter!(bloom_filter_max_bytes, u32);
+    def_opt_field_getter!(bloom_filter_ndv, u64);
 }
 
 /// Reference counted reader properties.
@@ -759,6 +761,12 @@ mod tests {
             DEFAULT_MAX_STATISTICS_SIZE
         );
         assert_eq!(props.bloom_filter_enabled(&ColumnPath::from("col")), false);
+        assert_eq!(props.bloom_filter_fpp(&ColumnPath::from("col")), 0.01);
+        assert_eq!(props.bloom_filter_ndv(&ColumnPath::from("col")), None);
+        assert_eq!(
+            props.bloom_filter_max_bytes(&ColumnPath::from("col")),
+            1024 * 1024
+        );
     }
 
     #[test]
@@ -842,6 +850,10 @@ mod tests {
                 EnabledStatistics::Chunk,
             )
             .set_column_max_statistics_size(ColumnPath::from("col"), 123)
+            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
+            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100)
+            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
+            .set_column_bloom_filter_max_bytes(ColumnPath::from("col"), 1000)
             .build();
 
         assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);

Reply via email to