Copilot commented on code in PR #9206:
URL: https://github.com/apache/arrow-rs/pull/9206#discussion_r2701137342


##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -489,6 +571,74 @@ fn hash_as_bytes<A: AsBytes + ?Sized>(value: &A) -> u64 {
 mod tests {
     use super::*;
 
+    /// Exact FPP calculation from Putze et al. equation 3.
+    /// This is a direct port of the C `libfilter_block_fpp_detail` function 
for validation purposes.
+    /// See https://github.com/jbapple/libfilter/blob/master/c/lib/util.c
+    ///
+    /// For Parquet SBBF:
+    /// - word_bits = 32 (each word is 32 bits)
+    /// - bucket_words = 8 (8 words per block, i.e., 256 bits per block)
+    /// - hash_bits = 32 (32-bit hash for block selection)
+    fn exact_fpp_from_formula(ndv: u64, bytes: usize) -> f64 {
+        const CHAR_BIT: usize = 8;
+        const WORD_BITS: f64 = 32.0;
+        const BUCKET_WORDS: f64 = 8.0;
+        const HASH_BITS: f64 = 32.0;
+
+        // Edge cases from the C implementation
+        if ndv == 0 {
+            return 0.0;
+        }
+        if bytes == 0 {
+            return 1.0;
+        }
+        let bits = bytes * CHAR_BIT;
+        if ndv as f64 / bits as f64 > 3.0 {
+            return 1.0;
+        }
+
+        let mut result = 0.0;
+
+        // Lambda parameter: average number of elements per block
+        // lam = bucket_words * word_bits / ((bytes * CHAR_BIT) / ndv)
+        //     = bucket_words * word_bits * ndv / (bytes * CHAR_BIT)
+        let lam = BUCKET_WORDS * WORD_BITS * (ndv as f64) / (bits as f64);
+        let loglam = lam.ln();
+        let log1collide = -HASH_BITS * 2.0_f64.ln();
+
+        const MAX_J: u64 = 10000;
+        for j in 0..MAX_J {
+            let i = MAX_J - 1 - j;
+            let i_f64 = i as f64;
+
+            // logp: log of Poisson probability P(X = i) where X ~ Poisson(lam)
+            // P(X = i) = (lam^i * e^(-lam)) / i!
+            // log(P(X = i)) = i*log(lam) - lam - log(i!)
+            let logp = i_f64 * loglam - lam - libm::lgamma(i_f64 + 1.0);
+
+            // Probability that a block with i elements is saturated (all bits 
set)
+            // After inserting i elements, each of bucket_words bits has 
probability
+            // (1 - 1/word_bits)^i of being 0. We need all bucket_words bits 
to have
+            // at least one bit set.
+            let logfinner = if i == 0 {
+                f64::NEG_INFINITY // log(0) = -infinity, contributes 0 to 
result
+            } else {
+                BUCKET_WORDS * (1.0 - (1.0 - 1.0 / WORD_BITS).powf(i_f64)).ln()

Review Comment:
   Potential numerical precision issue for large i values: When i is large 
(e.g., > 100), the expression (1.0 - 1.0 / WORD_BITS).powf(i_f64) becomes very 
small, and 1.0 minus this small value can suffer from catastrophic 
cancellation. The result might round to exactly 1.0, making ln(1.0) equal to 
0.0 instead of the correct small positive value. Consider using log1p for 
better numerical stability: `BUCKET_WORDS * (-(1.0 - 1.0 / 
WORD_BITS).powf(i_f64)).log1p()`. Note: This only affects test code, not 
production logic.
   ```suggestion
                   BUCKET_WORDS * (-(1.0 - 1.0 / WORD_BITS).powf(i_f64)).log1p()
   ```



##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
     num_bytes.next_power_of_two()
 }
 
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of 
distinct values
+/// Compensation table from Table I of the paper referenced in the module 
documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c') 
for B=512.
+const COMPENSATION_TABLE: &[(f64, f64)] = &[
+    (5.0, 6.0),
+    (6.0, 7.0),
+    (7.0, 8.0),
+    (8.0, 9.0),
+    (9.0, 10.0),
+    (10.0, 11.0),
+    (11.0, 12.0),
+    (12.0, 13.0),
+    (13.0, 14.0),
+    (14.0, 16.0),
+    (15.0, 17.0),
+    (16.0, 18.0),
+    (17.0, 20.0),
+    (18.0, 21.0),
+    (19.0, 23.0),
+    (20.0, 25.0),
+    (21.0, 26.0),
+    (22.0, 28.0),
+    (23.0, 30.0),
+    (24.0, 32.0),
+    (25.0, 35.0),
+    (26.0, 38.0),
+    (27.0, 40.0),
+    (28.0, 44.0),
+    (29.0, 48.0),
+    (30.0, 51.0),
+    (31.0, 58.0),
+    (32.0, 64.0),
+    (33.0, 74.0),
+    (34.0, 90.0),
+];
+
+/// Calculates the number of bits needed for a Split Block Bloom Filter given 
NDV and FPP.
+///
+/// Uses the compensation approach described in the module documentation to 
account for
+/// block structure overhead. The calculation:
+/// 1. Determines the theoretical bits-per-element (c) for an ideal Bloom 
filter with k=8
+/// 2. Applies compensation factors using linear interpolation between table 
entries
+/// 3. Multiplies by NDV to get the total number of bits needed
 #[inline]
 fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
-    let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
-    num_bits as usize
+    // Calculate the theoretical 'c' (bits per element) for an IDEAL filter.
+    // With k=8 fixed: c = -k / ln(1 - fpp^(1/k))
+    let k = 8.0;
+    let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
+
+    // Apply compensation using linear interpolation between table entries
+    let compensated_c = if theoretical_c <= COMPENSATION_TABLE[0].0 {
+        // Below table range, use first entry
+        COMPENSATION_TABLE[0].1

Review Comment:
   When theoretical_c is below 5.0 (the minimum value in the compensation 
table), the code always uses 6.0 as the compensated value. However, this means 
that for very high FPP values (e.g., 0.5, 0.7) which would have low 
theoretical_c values (e.g., 2-4), we're significantly over-allocating space. 
Consider adding more precise handling for this edge case, such as using a 
proportional adjustment or capping at theoretical_c if it provides adequate FPP.
   ```suggestion
           // Below table range: scale proportionally using the first entry's 
ratio
           let (c0, c0_prime) = COMPENSATION_TABLE[0];
           theoretical_c * (c0_prime / c0)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to