HuaHuaY commented on code in PR #50008:
URL: https://github.com/apache/arrow/pull/50008#discussion_r3354901481
##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -345,9 +348,88 @@ void BlockSplitBloomFilter::WriteTo(ArrowOutputStream*
sink) const {
PARQUET_THROW_NOT_OK(sink->Write(data_->data(), num_bytes_));
}
+void BlockSplitBloomFilter::FoldToTargetFpp(double target_fpp) {
+ const uint32_t num_folds = NumFoldsForTargetFpp(target_fpp);
+ if (num_folds > 0) {
+ Fold(num_folds);
+ }
+}
+
+uint32_t BlockSplitBloomFilter::NumFoldsForTargetFpp(double target_fpp) const {
+ const uint32_t num_blocks = NumBlocks();
+ if (num_blocks < 2) {
+ return 0;
+ }
+ DCHECK_EQ(num_blocks & (num_blocks - 1), 0);
+
+ // Estimate the fill rate after folding from the current average fill rate.
+ // Folding ORs block groups together, so each fold changes the estimated
fill rate
+ // from f to 1 - (1 - f)^2. A membership check tests kBitsSetPerBlock bits,
making
+ // the estimated FPP equal to std::pow(folded_fill_rate, kBitsSetPerBlock).
+ //
+ // See also: Sailhan and Stehr, "Folding and Unfolding Bloom Filters", 2012:
+ // https://hal.science/hal-01126174v1
+ uint64_t total_set_bits = 0;
+ const auto* bitset32 = reinterpret_cast<const uint32_t*>(data_->data());
+ const uint32_t num_words = num_bytes_ /
static_cast<uint32_t>(sizeof(uint32_t));
+ for (uint32_t i = 0; i < num_words; ++i) {
+ total_set_bits += static_cast<uint64_t>(std::popcount(bitset32[i]));
+ }
+
+ const double avg_fill = static_cast<double>(total_set_bits) /
+ (static_cast<double>(num_blocks) *
kBytesPerFilterBlock * 8);
+ const auto max_folds = static_cast<uint32_t>(std::countr_zero(num_blocks));
+
+ if (avg_fill == 0.0) {
+ return max_folds;
+ }
+
+ uint32_t num_folds = 0;
+ double unset_probability_after_folds = 1.0 - avg_fill;
+ for (uint32_t i = 0; i < max_folds; ++i) {
+ unset_probability_after_folds *= unset_probability_after_folds;
+ const double folded_fill_rate = 1.0 - unset_probability_after_folds;
+ const double estimated_fpp = std::pow(folded_fill_rate, kBitsSetPerBlock);
+ if (estimated_fpp > target_fpp) {
+ break;
+ }
+ ++num_folds;
+ }
+ return num_folds;
Review Comment:
I have add a comment in front of `group_size`.
```cpp
// A fold group is a consecutive run of blocks ORed into one output block.
// Keeping the group size as (1 << num_folds) preserves a power-of-two
bitset
// size, which existing Arrow C++ readers validate on deserialization.
const uint32_t group_size = UINT32_C(1) << num_folds;
```
After more thinking, I think the actual size reduction must be a power of 2.
Because the block index is calculated by `static_cast<uint32_t>(((hash >> 32) *
NumBlocks()) >> 32);`, which is required by parquet's document. And we must
ensure that the calculated block index is the same before and after the fold.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]