Copilot commented on code in PR #9206:
URL: https://github.com/apache/arrow-rs/pull/9206#discussion_r2700606513
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
num_bytes.next_power_of_two()
}
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of
distinct values
+/// Compensation table from Table I of the paper referenced in the module
documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c')
for B=512.
+const COMPENSATION_TABLE: &[(f64, f64)] = &[
+ (5.0, 6.0),
+ (6.0, 7.0),
+ (7.0, 8.0),
+ (8.0, 9.0),
+ (9.0, 10.0),
+ (10.0, 11.0),
+ (11.0, 12.0),
+ (12.0, 13.0),
+ (13.0, 14.0),
+ (14.0, 16.0),
+ (15.0, 17.0),
+ (16.0, 18.0),
+ (17.0, 20.0),
+ (18.0, 21.0),
+ (19.0, 23.0),
+ (20.0, 25.0),
+ (21.0, 26.0),
+ (22.0, 28.0),
+ (23.0, 30.0),
+ (24.0, 32.0),
+ (25.0, 35.0),
+ (26.0, 38.0),
+ (27.0, 40.0),
+ (28.0, 44.0),
+ (29.0, 48.0),
+ (30.0, 51.0),
+ (31.0, 58.0),
+ (32.0, 64.0),
+ (33.0, 74.0),
+ (34.0, 90.0),
+];
+
+/// Calculates the number of bits needed for a Split Block Bloom Filter given
NDV and FPP.
+///
+/// Uses the compensation approach described in the module documentation to
account for
+/// block structure overhead. The calculation:
+/// 1. Determines the theoretical bits-per-element (c) for an ideal Bloom
filter with k=8
+/// 2. Applies compensation factors using linear interpolation between table
entries
+/// 3. Multiplies by NDV to get the total number of bits needed
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
Review Comment:
The function doesn't validate that ndv is greater than zero. If ndv is 0,
the calculation would produce 0 bits, which could lead to incorrect bloom
filter behavior. Consider adding validation to ensure ndv > 0 and return an
appropriate error if it's not.
```suggestion
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
assert!(
ndv > 0,
"num_of_bits_from_ndv_fpp: ndv (number of distinct values) must be
greater than zero"
);
```
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
num_bytes.next_power_of_two()
}
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of
distinct values
+/// Compensation table from Table I of the paper referenced in the module
documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c')
for B=512.
+const COMPENSATION_TABLE: &[(f64, f64)] = &[
+ (5.0, 6.0),
+ (6.0, 7.0),
+ (7.0, 8.0),
+ (8.0, 9.0),
+ (9.0, 10.0),
+ (10.0, 11.0),
+ (11.0, 12.0),
+ (12.0, 13.0),
+ (13.0, 14.0),
+ (14.0, 16.0),
+ (15.0, 17.0),
+ (16.0, 18.0),
+ (17.0, 20.0),
+ (18.0, 21.0),
+ (19.0, 23.0),
+ (20.0, 25.0),
+ (21.0, 26.0),
+ (22.0, 28.0),
+ (23.0, 30.0),
+ (24.0, 32.0),
+ (25.0, 35.0),
+ (26.0, 38.0),
+ (27.0, 40.0),
+ (28.0, 44.0),
+ (29.0, 48.0),
+ (30.0, 51.0),
+ (31.0, 58.0),
+ (32.0, 64.0),
+ (33.0, 74.0),
+ (34.0, 90.0),
+];
+
+/// Calculates the number of bits needed for a Split Block Bloom Filter given
NDV and FPP.
+///
+/// Uses the compensation approach described in the module documentation to
account for
+/// block structure overhead. The calculation:
+/// 1. Determines the theoretical bits-per-element (c) for an ideal Bloom
filter with k=8
+/// 2. Applies compensation factors using linear interpolation between table
entries
+/// 3. Multiplies by NDV to get the total number of bits needed
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
- let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
- num_bits as usize
+ // Calculate the theoretical 'c' (bits per element) for an IDEAL filter.
+ // With k=8 fixed: c = -k / ln(1 - fpp^(1/k))
+ let k = 8.0;
+ let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
+
+ // Apply compensation using linear interpolation between table entries
+ let compensated_c = if theoretical_c <= COMPENSATION_TABLE[0].0 {
+ // Below table range, use first entry
+ COMPENSATION_TABLE[0].1
+ } else if theoretical_c >= COMPENSATION_TABLE[COMPENSATION_TABLE.len() -
1].0 {
+ // Beyond c=34, SBBF efficiency drops off a cliff
+ theoretical_c * 2.65
+ } else {
+ // Find the two table entries to interpolate between
+ let idx = COMPENSATION_TABLE
+ .iter()
+ .position(|(c, _)| *c >= theoretical_c)
+ .unwrap(); // Safe because we checked bounds above
+
+ if COMPENSATION_TABLE[idx].0 == theoretical_c {
+ // Exact match
+ COMPENSATION_TABLE[idx].1
+ } else {
+ // Linear interpolation between consecutive entries
+ let (c_low, c_prime_low) = COMPENSATION_TABLE[idx - 1];
+ let (c_high, c_prime_high) = COMPENSATION_TABLE[idx];
+ let ratio = (theoretical_c - c_low) / (c_high - c_low);
+ c_prime_low + ratio * (c_prime_high - c_prime_low)
+ }
+ };
+
+ (ndv as f64 * compensated_c).ceil() as usize
Review Comment:
The conversion from f64 to usize without overflow checking could cause
issues for very large NDV and FPP combinations. While the
`optimal_num_of_bytes` function caps the result at `BITSET_MAX_LENGTH`, this
overflow could occur before that check. Consider adding a bounds check or
saturation arithmetic to handle extremely large calculations gracefully, or at
minimum document the valid range of NDV and FPP values that this function
supports.
```suggestion
// Compute the number of bits, guarding against overflow and non-finite
values
let bits_f64 = (ndv as f64 * compensated_c).ceil();
if !bits_f64.is_finite() {
// Saturate on NaN/inf rather than overflowing silently on cast
usize::MAX
} else if bits_f64 <= 0.0 {
0
} else if bits_f64 >= usize::MAX as f64 {
usize::MAX
} else {
bits_f64 as usize
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]