Copilot commented on code in PR #9206:
URL: https://github.com/apache/arrow-rs/pull/9206#discussion_r2701137342
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -489,6 +571,74 @@ fn hash_as_bytes<A: AsBytes + ?Sized>(value: &A) -> u64 {
mod tests {
    use super::*;
+    /// Exact FPP calculation from Putze et al. equation 3.
+    /// This is a direct port of the C `libfilter_block_fpp_detail` function for validation purposes.
+    /// See https://github.com/jbapple/libfilter/blob/master/c/lib/util.c
+    ///
+    /// For Parquet SBBF:
+    /// - word_bits = 32 (each word is 32 bits)
+    /// - bucket_words = 8 (8 words per block, i.e., 256 bits per block)
+    /// - hash_bits = 32 (32-bit hash for block selection)
+    fn exact_fpp_from_formula(ndv: u64, bytes: usize) -> f64 {
+        const CHAR_BIT: usize = 8;
+        const WORD_BITS: f64 = 32.0;
+        const BUCKET_WORDS: f64 = 8.0;
+        const HASH_BITS: f64 = 32.0;
+
+        // Edge cases from the C implementation
+        if ndv == 0 {
+            return 0.0;
+        }
+        if bytes == 0 {
+            return 1.0;
+        }
+        let bits = bytes * CHAR_BIT;
+        if ndv as f64 / bits as f64 > 3.0 {
+            return 1.0;
+        }
+
+        let mut result = 0.0;
+
+        // Lambda parameter: average number of elements per block
+        // lam = bucket_words * word_bits / ((bytes * CHAR_BIT) / ndv)
+        //     = bucket_words * word_bits * ndv / (bytes * CHAR_BIT)
+        let lam = BUCKET_WORDS * WORD_BITS * (ndv as f64) / (bits as f64);
+        let loglam = lam.ln();
+        let log1collide = -HASH_BITS * 2.0_f64.ln();
+
+        const MAX_J: u64 = 10000;
+        for j in 0..MAX_J {
+            let i = MAX_J - 1 - j;
+            let i_f64 = i as f64;
+
+            // logp: log of Poisson probability P(X = i) where X ~ Poisson(lam)
+            // P(X = i) = (lam^i * e^(-lam)) / i!
+            // log(P(X = i)) = i*log(lam) - lam - log(i!)
+            let logp = i_f64 * loglam - lam - libm::lgamma(i_f64 + 1.0);
+
+            // Probability that a query against a block holding i elements is a
+            // false positive: after inserting i elements, each of the
+            // bucket_words checked bits (one per word) has probability
+            // (1 - 1/word_bits)^i of still being 0, and a false positive
+            // requires all bucket_words checked bits to be set.
+            let logfinner = if i == 0 {
+                f64::NEG_INFINITY // log(0) = -infinity, contributes 0 to result
+            } else {
+                BUCKET_WORDS * (1.0 - (1.0 - 1.0 / WORD_BITS).powf(i_f64)).ln()
Review Comment:
Potential numerical precision issue for large i values: when i is large, the expression `(1.0 - 1.0 / WORD_BITS).powf(i_f64)` becomes very small, and subtracting it from 1.0 suffers catastrophic cancellation. The relative error grows as the power shrinks, and once it drops below machine epsilon (roughly i > 1150, well within this loop's range of up to 10000) the expression `1.0 - x` rounds to exactly 1.0, so `ln` returns 0.0 instead of the correct small negative value. Consider using `log1p` for better numerical stability: `BUCKET_WORDS * (-(1.0 - 1.0 / WORD_BITS).powf(i_f64)).log1p()`. Note: this only affects test code, not production logic.
```suggestion
                BUCKET_WORDS * (-(1.0 - 1.0 / WORD_BITS).powf(i_f64)).log1p()
```
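As a quick illustration (a standalone sketch, not part of the PR; the sample values of `i` are chosen only to show where the two formulations diverge):

```rust
fn main() {
    let base: f64 = 1.0 - 1.0 / 32.0; // (1 - 1/WORD_BITS) from the test helper
    for i in [100.0_f64, 600.0, 1200.0, 5000.0] {
        let x = base.powf(i); // probability a given checked bit is still 0
        let naive = (1.0 - x).ln(); // loses precision once x approaches machine epsilon
        let stable = (-x).log1p(); // ln(1 - x) without forming the intermediate 1 - x
        println!("i = {i:>6}: naive = {naive:.6e}, log1p = {stable:.6e}");
    }
}
```

For i = 1200 and 5000 the naive form prints exactly 0.0, while the `log1p` form still returns the correct small negative value.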
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
    num_bytes.next_power_of_two()
}
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of distinct values
+/// Compensation table from Table I of the paper referenced in the module documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c') for B=512.
+const COMPENSATION_TABLE: &[(f64, f64)] = &[
+    (5.0, 6.0),
+    (6.0, 7.0),
+    (7.0, 8.0),
+    (8.0, 9.0),
+    (9.0, 10.0),
+    (10.0, 11.0),
+    (11.0, 12.0),
+    (12.0, 13.0),
+    (13.0, 14.0),
+    (14.0, 16.0),
+    (15.0, 17.0),
+    (16.0, 18.0),
+    (17.0, 20.0),
+    (18.0, 21.0),
+    (19.0, 23.0),
+    (20.0, 25.0),
+    (21.0, 26.0),
+    (22.0, 28.0),
+    (23.0, 30.0),
+    (24.0, 32.0),
+    (25.0, 35.0),
+    (26.0, 38.0),
+    (27.0, 40.0),
+    (28.0, 44.0),
+    (29.0, 48.0),
+    (30.0, 51.0),
+    (31.0, 58.0),
+    (32.0, 64.0),
+    (33.0, 74.0),
+    (34.0, 90.0),
+];
+
+/// Calculates the number of bits needed for a Split Block Bloom Filter given NDV and FPP.
+///
+/// Uses the compensation approach described in the module documentation to account for
+/// block structure overhead. The calculation:
+/// 1. Determines the theoretical bits-per-element (c) for an ideal Bloom filter with k=8
+/// 2. Applies compensation factors using linear interpolation between table entries
+/// 3. Multiplies by NDV to get the total number of bits needed
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
-    let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
-    num_bits as usize
+    // Calculate the theoretical 'c' (bits per element) for an IDEAL filter.
+    // With k=8 fixed: c = -k / ln(1 - fpp^(1/k))
+    let k = 8.0;
+    let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
+
+    // Apply compensation using linear interpolation between table entries
+    let compensated_c = if theoretical_c <= COMPENSATION_TABLE[0].0 {
+        // Below table range, use first entry
+        COMPENSATION_TABLE[0].1
Review Comment:
When theoretical_c is below 5.0 (the minimum value in the compensation table), the code always uses 6.0 as the compensated value. For very high FPP values (e.g., 0.5 or 0.7), which yield low theoretical_c values (roughly 2-4), this significantly over-allocates space. Consider handling this edge case more precisely, for example by scaling proportionally or by capping at theoretical_c if it already provides adequate FPP.
```suggestion
        // Below table range: scale proportionally using the first entry's ratio
        let (c0, c0_prime) = COMPENSATION_TABLE[0];
        theoretical_c * (c0_prime / c0)
```
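For context, a minimal self-contained sketch of how the suggested fallback composes with the interpolation path. The table is abridged to a few entries, and `compensate` is a hypothetical helper, not code from the PR:

```rust
// Abridged excerpt of the PR's COMPENSATION_TABLE (first entries plus two later ones).
const TABLE: &[(f64, f64)] = &[(5.0, 6.0), (6.0, 7.0), (9.0, 10.0), (10.0, 11.0)];

/// Hypothetical helper: compensated bits-per-element, using the proportional
/// below-range fallback suggested above.
fn compensate(theoretical_c: f64) -> f64 {
    let (c0, c0_prime) = TABLE[0];
    if theoretical_c <= c0 {
        // Below table range: scale by the first entry's ratio instead of
        // clamping to c0_prime, so very high FPPs are not over-allocated.
        return theoretical_c * (c0_prime / c0);
    }
    // Linear interpolation between neighbouring table entries.
    for w in TABLE.windows(2) {
        let ((c_lo, p_lo), (c_hi, p_hi)) = (w[0], w[1]);
        if theoretical_c <= c_hi {
            let t = (theoretical_c - c_lo) / (c_hi - c_lo);
            return p_lo + t * (p_hi - p_lo);
        }
    }
    TABLE[TABLE.len() - 1].1 // above the table: clamp to the last entry
}

fn main() {
    for fpp in [0.5_f64, 0.1, 0.01] {
        let k = 8.0_f64;
        // Same ideal-filter formula as the PR: c = -k / ln(1 - fpp^(1/k))
        let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
        let c = compensate(theoretical_c);
        println!("fpp = {fpp}: theoretical c = {theoretical_c:.2}, compensated c' = {c:.2}");
    }
}
```

At fpp = 0.5 this gives theoretical c ≈ 3.2, so the proportional fallback allocates about 3.9 bits per element instead of the clamped 6.0, roughly a third less space at that FPP.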