wgtmac commented on code in PR #37400:
URL: https://github.com/apache/arrow/pull/37400#discussion_r2121578249


##########
cpp/src/parquet/properties.h:
##########
@@ -167,6 +167,32 @@ static constexpr Compression::type 
DEFAULT_COMPRESSION_TYPE = Compression::UNCOM
 static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
 static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
     SizeStatisticsLevel::PageAndColumnChunk;
+static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
+static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;
+
+struct PARQUET_EXPORT BloomFilterOptions {
+  /// Expected number of distinct values to be inserted into the bloom filter.
+  ///
+  /// Usage of bloom filter is most beneficial for columns with large 
cardinality,

Review Comment:
   I think there are abundant resources available for bloom filters so we don't 
need to add the introduction about BF with (biased) suggestions. It would be 
good enough to just let readers know what each parameter is about and the 
consequence of it.



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+///   bloom_filter_builder->AppendRowGroup();
+///   auto* bloom_filter =
+///   bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+///   // Add bloom filter entries in `bloom_filter`.
+///   // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+  /// @brief API to create a BloomFilterBuilder.
+  ///
+  /// @param schema The schema of the file. The life cycle of the schema must 
be
+  /// longer than the BloomFilterBuilder.
+  /// @param properties The properties of the file. The life cycle of the 
properties
+  /// must be longer than the BloomFilterBuilder.
+  static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* 
schema,
+                                                  const WriterProperties* 
properties);
+
+  /// @brief Append a new row group to host all incoming bloom filters.

Review Comment:
   ```suggestion
     /// @brief Start a new row group to host all bloom filters belong to it.
   ```



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+///   bloom_filter_builder->AppendRowGroup();
+///   auto* bloom_filter =
+///   bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+///   // Add bloom filter entries in `bloom_filter`.
+///   // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+  /// @brief API to create a BloomFilterBuilder.
+  ///
+  /// @param schema The schema of the file. The life cycle of the schema must 
be
+  /// longer than the BloomFilterBuilder.
+  /// @param properties The properties of the file. The life cycle of the 
properties
+  /// must be longer than the BloomFilterBuilder.
+  static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* 
schema,
+                                                  const WriterProperties* 
properties);
+
+  /// @brief Append a new row group to host all incoming bloom filters.
+  ///
+  /// This method must be called before `GetOrCreateBloomFilter` for a new row 
group.

Review Comment:
   ```suggestion
     /// This method must be called before `GetOrCreateBloomFilter` for columns 
of a new row group.
   ```



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+///   bloom_filter_builder->AppendRowGroup();
+///   auto* bloom_filter =
+///   bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+///   // Add bloom filter entries in `bloom_filter`.
+///   // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+  /// @brief API to create a BloomFilterBuilder.
+  ///
+  /// @param schema The schema of the file. The life cycle of the schema must 
be
+  /// longer than the BloomFilterBuilder.
+  /// @param properties The properties of the file. The life cycle of the 
properties
+  /// must be longer than the BloomFilterBuilder.
+  static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* 
schema,
+                                                  const WriterProperties* 
properties);
+
+  /// @brief Append a new row group to host all incoming bloom filters.
+  ///
+  /// This method must be called before `GetOrCreateBloomFilter` for a new row 
group.
+  ///
+  /// @throws ParquetException if WriteTo() has been called to flush bloom 
filters.
+  virtual void AppendRowGroup() = 0;
+
+  /// @brief Get the BloomFilter from column ordinal.
+  ///
+  /// @param column_ordinal Column ordinal in schema, which is only for leaf 
columns.
+  ///
+  /// \return BloomFilter for the column and its memory ownership belongs to 
the
+  /// BloomFilterBuilder. It will return nullptr if bloom filter is not 
enabled for the
+  /// column.
+  ///
+  /// @throws ParquetException if any of following conditions applies:
+  /// 1) column_ordinal is out of bound.
+  /// 2) `WriteTo()` has been called already.
+  /// 3) `AppendRowGroup()` is not called before `GetOrCreateBloomFilter()`.

Review Comment:
   This is only true for the 1st row group. We should be clear about this.



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);

Review Comment:
   Although I agree that it is better to provide good examples, current example 
is incomplete and not very clear on `bloom_filter_column` and what to do next 
with `bloom_filter`. What about removing this example since it is not a public 
API?



##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -373,8 +374,11 @@ set_source_files_properties(public_api_test.cc PROPERTIES 
SKIP_UNITY_BUILD_INCLU
 
 add_parquet_test(internals-test
                  SOURCES
-                 bloom_filter_reader_test.cc
                  bloom_filter_test.cc
+                 bloom_filter_reader_writer_test.cc
+                 properties_test.cc
+                 statistics_test.cc
+                 encoding_test.cc

Review Comment:
   ```suggestion
                    bloom_filter_reader_writer_test.cc
                    encoding_test.cc
   ```



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -157,6 +160,8 @@ inline const T* AddIfNotNull(const T* base, int64_t offset) 
{
   return nullptr;
 }
 
+constexpr int64_t kHashBatchSize = 256;

Review Comment:
   What about moving it closer to line 1238?



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+///   bloom_filter_builder->AppendRowGroup();
+///   auto* bloom_filter =
+///   bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+///   // Add bloom filter entries in `bloom_filter`.
+///   // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+  /// @brief API to create a BloomFilterBuilder.
+  ///
+  /// @param schema The schema of the file. The life cycle of the schema must 
be
+  /// longer than the BloomFilterBuilder.
+  /// @param properties The properties of the file. The life cycle of the 
properties
+  /// must be longer than the BloomFilterBuilder.

Review Comment:
   ```suggestion
     /// @param properties The properties of the file with a set of 
`BloomFilterOption`s
     /// for columns enabling bloom filters. It must outlive the created 
BloomFilterBuilder.
   ```



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.

Review Comment:
   ```suggestion
   /// @brief Interface for building bloom filters of a parquet file.
   ```



##########
cpp/src/parquet/bloom_filter_builder_internal.cc:
##########
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#include "parquet/bloom_filter_builder_internal.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/logging.h"
+
+#include "parquet/bloom_filter.h"
+#include "parquet/exception.h"
+#include "parquet/metadata.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+
+namespace {
+/// BloomFilterBuilderImpl is the implementation of BloomFilterBuilder.
+///
+/// Note: Column encryption for bloom filter is not implemented yet.
+class BloomFilterBuilderImpl : public BloomFilterBuilder {
+ public:
+  explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema,
+                                  const WriterProperties* properties)
+      : schema_(schema), properties_(properties) {}
+  BloomFilterBuilderImpl(const BloomFilterBuilderImpl&) = delete;
+  BloomFilterBuilderImpl(BloomFilterBuilderImpl&&) = default;
+
+  /// Append a new row group to host all incoming bloom filters.
+  void AppendRowGroup() override;
+
+  BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) override;
+
+  /// Serialize all bloom filters with header and bitset in the order of row 
group and
+  /// column id. The side effect is that it deletes all bloom filters after 
they have
+  /// been flushed.

Review Comment:
   ```suggestion
     void AppendRowGroup() override;
   
     BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) override;
   
   ```
   
   I don't think we need to add comments for an override function.



##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+///   bloom_filter_builder->AppendRowGroup();
+///   auto* bloom_filter =
+///   bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+///   // Add bloom filter entries in `bloom_filter`.
+///   // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+  /// @brief API to create a BloomFilterBuilder.
+  ///
+  /// @param schema The schema of the file. The life cycle of the schema must 
be
+  /// longer than the BloomFilterBuilder.

Review Comment:
   ```suggestion
     /// @param schema The schema of the file and it must outlive the created 
BloomFilterBuilder.
   ```



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1230,6 +1235,166 @@ Status ConvertDictionaryToDense(const ::arrow::Array& 
array, MemoryPool* pool,
   return Status::OK();
 }
 
+template <typename ParquetType>
+class BloomFilterWriterImpl {

Review Comment:
   Is there any compelling reason not to move these `BloomFilterWriterImpl`s to 
`bloom_filter_builder_internal.h/cc`?



##########
cpp/src/parquet/properties.h:
##########
@@ -167,6 +167,32 @@ static constexpr Compression::type 
DEFAULT_COMPRESSION_TYPE = Compression::UNCOM
 static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
 static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
     SizeStatisticsLevel::PageAndColumnChunk;
+static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
+static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;
+
+struct PARQUET_EXPORT BloomFilterOptions {
+  /// Expected number of distinct values to be inserted into the bloom filter.
+  ///
+  /// Usage of bloom filter is most beneficial for columns with large 
cardinality,
+  /// so a good heuristic is to set ndv to number of rows. However, it can 
reduce
+  /// disk size if you know in advance a smaller number of distinct values.
+  /// For very small ndv value it is probably not worth it to use bloom filter 
anyway.
+  ///
+  /// Increasing this value (without increasing fpp) will result in an 
increase in
+  /// disk or memory size.
+  int32_t ndv = DEFAULT_BLOOM_FILTER_NDV;
+  /// False-positive probability of the bloom filter.
+  ///
+  /// The bloom filter data structure is a trade of between disk and memory 
space

Review Comment:
   ```suggestion
     /// The bloom filter data structure is a trade-off between disk and memory 
space
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to