Jimexist commented on code in PR #3119:
URL: https://github.com/apache/arrow-rs/pull/3119#discussion_r1027043345
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -113,7 +116,48 @@ fn read_bloom_filter_header_and_length(
     ))
 }
 
+const BITSET_MIN_LENGTH: usize = 32;
+const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024;
+
+#[inline]
+fn optimal_num_of_bytes(num_bytes: usize) -> usize {
+    let num_bytes = num_bytes.min(BITSET_MAX_LENGTH);
+    let num_bytes = num_bytes.max(BITSET_MIN_LENGTH);
+    num_bytes.next_power_of_two()
+}
+
+// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
+// given fpp = (1 - e^(-k * n / m)) ^ k
+// we have m = -k * n / ln(1 - fpp ^ (1 / k))
+// where k = number of hash functions, m = number of bits, n = number of distinct values
+#[inline]
+fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> f64 {
+    -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln()
+}
+
 impl Sbbf {
+    /// Create a new [Sbbf] with the given number of distinct values and false positive probability.
+    /// Will panic if `fpp` is greater than 1.0 or less than 0.0.
+    pub fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Self {

Review Comment:
   will update once we decide on using fpp or ndv or both:
   - #3138
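For readers following the sizing math, here is a minimal standalone sketch of how the two helpers in the diff combine: the sizing formula gives a bit count for the requested `ndv`/`fpp` (with k = 8 hash functions, as in the split block bloom filter), and the byte count is then clamped and rounded up to a power of two. The `main` driver and the example values are purely illustrative and are not part of the PR.

```rust
const BITSET_MIN_LENGTH: usize = 32;
const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024;

/// Clamp the requested size to the allowed bitset range, then round up
/// to the next power of two.
fn optimal_num_of_bytes(num_bytes: usize) -> usize {
    let num_bytes = num_bytes.min(BITSET_MAX_LENGTH);
    let num_bytes = num_bytes.max(BITSET_MIN_LENGTH);
    num_bytes.next_power_of_two()
}

/// m = -k * n / ln(1 - fpp^(1/k)), with k fixed at 8 hash functions.
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> f64 {
    -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln()
}

fn main() {
    // Illustrative sizing: 1M distinct values at a 1% false positive rate.
    let bits = num_of_bits_from_ndv_fpp(1_000_000, 0.01); // ~9.7 million bits
    let bytes = optimal_num_of_bytes((bits / 8.0).ceil() as usize);
    println!("{bytes} bytes"); // ~1.21 MB rounded up to 2_097_152 (2 MiB)
}
```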