wgtmac commented on code in PR #37400:
URL: https://github.com/apache/arrow/pull/37400#discussion_r2121578249
##########
cpp/src/parquet/properties.h:
##########
@@ -167,6 +167,32 @@ static constexpr Compression::type
DEFAULT_COMPRESSION_TYPE = Compression::UNCOM
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
SizeStatisticsLevel::PageAndColumnChunk;
+static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
+static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;
+
+struct PARQUET_EXPORT BloomFilterOptions {
+ /// Expected number of distinct values to be inserted into the bloom filter.
+ ///
+ /// Usage of bloom filter is most beneficial for columns with large
cardinality,
Review Comment:
I think there are abundant resources available for bloom filters so we don't
need to add the introduction about BF with (biased) suggestions. It would be
good enough to just let readers know what each parameter is about and the
consequence of it.
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+/// bloom_filter_builder->AppendRowGroup();
+/// auto* bloom_filter =
+/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+/// // Add bloom filter entries in `bloom_filter`.
+/// // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+ /// @brief API to create a BloomFilterBuilder.
+ ///
+ /// @param schema The schema of the file. The life cycle of the schema must
be
+ /// longer than the BloomFilterBuilder.
+ /// @param properties The properties of the file. The life cycle of the
properties
+ /// must be longer than the BloomFilterBuilder.
+ static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor*
schema,
+ const WriterProperties*
properties);
+
+ /// @brief Append a new row group to host all incoming bloom filters.
Review Comment:
```suggestion
/// @brief Start a new row group to host all bloom filters belong to it.
```
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+/// bloom_filter_builder->AppendRowGroup();
+/// auto* bloom_filter =
+/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+/// // Add bloom filter entries in `bloom_filter`.
+/// // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+ /// @brief API to create a BloomFilterBuilder.
+ ///
+ /// @param schema The schema of the file. The life cycle of the schema must
be
+ /// longer than the BloomFilterBuilder.
+ /// @param properties The properties of the file. The life cycle of the
properties
+ /// must be longer than the BloomFilterBuilder.
+ static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor*
schema,
+ const WriterProperties*
properties);
+
+ /// @brief Append a new row group to host all incoming bloom filters.
+ ///
+ /// This method must be called before `GetOrCreateBloomFilter` for a new row
group.
Review Comment:
```suggestion
/// This method must be called before `GetOrCreateBloomFilter` for columns
of a new row group.
```
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+/// bloom_filter_builder->AppendRowGroup();
+/// auto* bloom_filter =
+/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+/// // Add bloom filter entries in `bloom_filter`.
+/// // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+ /// @brief API to create a BloomFilterBuilder.
+ ///
+ /// @param schema The schema of the file. The life cycle of the schema must
be
+ /// longer than the BloomFilterBuilder.
+ /// @param properties The properties of the file. The life cycle of the
properties
+ /// must be longer than the BloomFilterBuilder.
+ static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor*
schema,
+ const WriterProperties*
properties);
+
+ /// @brief Append a new row group to host all incoming bloom filters.
+ ///
+ /// This method must be called before `GetOrCreateBloomFilter` for a new row
group.
+ ///
+ /// @throws ParquetException if WriteTo() has been called to flush bloom
filters.
+ virtual void AppendRowGroup() = 0;
+
+ /// @brief Get the BloomFilter from column ordinal.
+ ///
+ /// @param column_ordinal Column ordinal in schema, which is only for leaf
columns.
+ ///
+ /// \return BloomFilter for the column and its memory ownership belongs to
the
+ /// BloomFilterBuilder. It will return nullptr if bloom filter is not
enabled for the
+ /// column.
+ ///
+ /// @throws ParquetException if any of following conditions applies:
+ /// 1) column_ordinal is out of bound.
+ /// 2) `WriteTo()` has been called already.
+ /// 3) `AppendRowGroup()` is not called before `GetOrCreateBloomFilter()`.
Review Comment:
This is only true for the 1st row group. We should be clear about this.
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
Review Comment:
Although I agree that it is better to provide good examples, current example
is incomplete and not very clear on `bloom_filter_column` and what to do next
with `bloom_filter`. What about removing this example since it is not a public
API?
##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -373,8 +374,11 @@ set_source_files_properties(public_api_test.cc PROPERTIES
SKIP_UNITY_BUILD_INCLU
add_parquet_test(internals-test
SOURCES
- bloom_filter_reader_test.cc
bloom_filter_test.cc
+ bloom_filter_reader_writer_test.cc
+ properties_test.cc
+ statistics_test.cc
+ encoding_test.cc
Review Comment:
```suggestion
bloom_filter_reader_writer_test.cc
encoding_test.cc
```
##########
cpp/src/parquet/column_writer.cc:
##########
@@ -157,6 +160,8 @@ inline const T* AddIfNotNull(const T* base, int64_t offset)
{
return nullptr;
}
+constexpr int64_t kHashBatchSize = 256;
Review Comment:
What about moving it closer to line 1238?
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+/// bloom_filter_builder->AppendRowGroup();
+/// auto* bloom_filter =
+/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+/// // Add bloom filter entries in `bloom_filter`.
+/// // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+ /// @brief API to create a BloomFilterBuilder.
+ ///
+ /// @param schema The schema of the file. The life cycle of the schema must
be
+ /// longer than the BloomFilterBuilder.
+ /// @param properties The properties of the file. The life cycle of the
properties
+ /// must be longer than the BloomFilterBuilder.
Review Comment:
```suggestion
/// @param properties The properties of the file with a set of
`BloomFilterOption`s
/// for columns enabling bloom filters. It must outlive the created
BloomFilterBuilder.
```
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
Review Comment:
```suggestion
/// @brief Interface for building bloom filters of a parquet file.
```
##########
cpp/src/parquet/bloom_filter_builder_internal.cc:
##########
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#include "parquet/bloom_filter_builder_internal.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/logging.h"
+
+#include "parquet/bloom_filter.h"
+#include "parquet/exception.h"
+#include "parquet/metadata.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+
+namespace {
+/// BloomFilterBuilderImpl is the implementation of BloomFilterBuilder.
+///
+/// Note: Column encryption for bloom filter is not implemented yet.
+class BloomFilterBuilderImpl : public BloomFilterBuilder {
+ public:
+ explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema,
+ const WriterProperties* properties)
+ : schema_(schema), properties_(properties) {}
+ BloomFilterBuilderImpl(const BloomFilterBuilderImpl&) = delete;
+ BloomFilterBuilderImpl(BloomFilterBuilderImpl&&) = default;
+
+ /// Append a new row group to host all incoming bloom filters.
+ void AppendRowGroup() override;
+
+ BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) override;
+
+ /// Serialize all bloom filters with header and bitset in the order of row
group and
+ /// column id. The side effect is that it deletes all bloom filters after
they have
+ /// been flushed.
Review Comment:
```suggestion
void AppendRowGroup() override;
BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) override;
```
I don't think we need to add comments for an override function.
##########
cpp/src/parquet/bloom_filter_builder_internal.h:
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// @brief Interface for collecting bloom filter of a parquet file.
+///
+/// ```
+/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties);
+/// for (int i = 0; i < num_row_groups; i++) {
+/// bloom_filter_builder->AppendRowGroup();
+/// auto* bloom_filter =
+/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column);
+/// // Add bloom filter entries in `bloom_filter`.
+/// // ...
+/// }
+/// bloom_filter_builder->WriteTo(sink, location);
+/// ```
+class PARQUET_EXPORT BloomFilterBuilder {
+ public:
+ /// @brief API to create a BloomFilterBuilder.
+ ///
+ /// @param schema The schema of the file. The life cycle of the schema must
be
+ /// longer than the BloomFilterBuilder.
Review Comment:
```suggestion
/// @param schema The schema of the file and it must outlive the created
BloomFilterBuilder.
```
##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1230,6 +1235,166 @@ Status ConvertDictionaryToDense(const ::arrow::Array&
array, MemoryPool* pool,
return Status::OK();
}
+template <typename ParquetType>
+class BloomFilterWriterImpl {
Review Comment:
Is there any compelling reason not to move these `BloomFilterWriterImpl`s to
`bloom_filter_builder_internal.h/cc`?
##########
cpp/src/parquet/properties.h:
##########
@@ -167,6 +167,32 @@ static constexpr Compression::type
DEFAULT_COMPRESSION_TYPE = Compression::UNCOM
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
SizeStatisticsLevel::PageAndColumnChunk;
+static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
+static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;
+
+struct PARQUET_EXPORT BloomFilterOptions {
+ /// Expected number of distinct values to be inserted into the bloom filter.
+ ///
+ /// Usage of bloom filter is most beneficial for columns with large
cardinality,
+ /// so a good heuristic is to set ndv to number of rows. However, it can
reduce
+ /// disk size if you know in advance a smaller number of distinct values.
+ /// For very small ndv value it is probably not worth it to use bloom filter
anyway.
+ ///
+ /// Increasing this value (without increasing fpp) will result in an
increase in
+ /// disk or memory size.
+ int32_t ndv = DEFAULT_BLOOM_FILTER_NDV;
+ /// False-positive probability of the bloom filter.
+ ///
+ /// The bloom filter data structure is a trade of between disk and memory
space
Review Comment:
```suggestion
/// The bloom filter data structure is a trade-off between disk and memory
space
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]