This is an automated email from the ASF dual-hosted git repository.
reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 949f2c5e4a OAK-11858: Remove remains of Oak bloom filter in
oak-run-commons (#2449)
949f2c5e4a is described below
commit 949f2c5e4a6d2bb09e937d2e066d1fc2ac371926
Author: Julian Reschke <[email protected]>
AuthorDate: Thu Aug 14 11:49:35 2025 +0200
OAK-11858: Remove remains of Oak bloom filter in oak-run-commons (#2449)
---
.../flatfile/analysis/utils/BloomFilter.java | 162 ------------------
.../document/flatfile/analysis/utils/Hash.java | 73 --------
.../flatfile/analysis/utils/HyperLogLog.java | 95 -----------
.../flatfile/analysis/utils/BloomFilterTest.java | 126 --------------
.../flatfile/analysis/utils/HyperLogLogTest.java | 188 ---------------------
5 files changed, 644 deletions(-)
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
deleted file mode 100644
index 3ad439a909..0000000000
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-
-/**
- * A Bloom filter implementation.
- */
-public class BloomFilter {
-
- private final int k;
- private final int arraySize;
- private final long[] data;
-
- private BloomFilter(long[] data, int k) {
- this.data = data;
- this.k = k;
- this.arraySize = data.length;
- }
-
- /**
- * Construct a Bloom filter. With a fpp of 0.01, the memory usage is
roughly 1
- * byte per entry.
- *
- * @param bytes the size in number of bytes (eg. 64_000_000 for 64 MB
memory
- * usage)
- * @param fpp the false-positive probability (eg. 0.01 for a 1%
false-positive
- * probability)
- * @return the Bloom filter
- */
- public static BloomFilter construct(long n, double fpp) {
- long m = calculateBits(n, fpp);
- int k = calculateK((double) m / n);
- return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
- }
-
- // See also https://hur.st/bloomfilter
-
- /**
- * Calculate the best k parameter for a Bloom filter.
- *
- * @param bitsPerKey the number of bits per key (eg. 10)
- * @return the k parameter
- */
- public static int calculateK(double bitsPerKey) {
- return Math.max(1, (int) Math.round(bitsPerKey * Math.log(2)));
- }
-
- /**
- * Calculate the number of bits needed for a Bloom filter, given a number
of entries and the k parameter.
- *
- * @param n the number of entries (eg. 1_000_000)
- * @param fpp the false positive probability (eg. 0.01)
- * @return the bits needed
- */
- public static long calculateBits(long n, double fpp) {
- return (long) Math.ceil((n * Math.log(fpp)) / Math.log(1 / Math.pow(2,
Math.log(2))));
- }
-
- /**
- * Calculate the maximum number of entries in the set, given the the
memory size
- * in bits, and a target false positive probability.
- *
- * @param bits the number of bits (eg. 10_000_000)
- * @param fpp the false positive probability (eg. 0.01)
- * @return the maximum number of entries to be added
- */
- public static long calculateN(long bits, double fpp) {
- return (long) Math.ceil((bits * Math.log(Math.pow(0.5, Math.log(2))) /
Math.log(fpp)));
- }
-
- /**
- * Calculate the false positive probability.
- *
- * @param bits the number of bits (eg. 10_000_000)
- * @param fpp the false positive probability (eg. 0.01)
- * @return the maximum number of entries to be added
- */
- public static double calculateFpp(long n, long bits, int k) {
- // p = pow(1 - exp(-k / (m / n)), k)
- return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
- }
-
- /**
- * Add an entry.
- *
- * @param hash the hash value (need to be a high quality hash code, with
all
- * bits having high entropy)
- */
- public void add(long hash) {
- long a = (hash >>> 32) | (hash << 32);
- long b = hash;
- for (int i = 0; i < k; i++) {
- data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
- a += b;
- }
- }
-
- /**
- * Whether the entry may be in the set.
- *
- * @param hash the hash value (need to be a high quality hash code, with
all
- * bits having high entropy)
- * @return true if the entry was added, or, with a certain false positive
- * probability, even if it was not added
- */
- public boolean mayContain(long hash) {
- long a = (hash >>> 32) | (hash << 32);
- long b = hash;
- for (int i = 0; i < k; i++) {
- if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) ==
0) {
- return false;
- }
- a += b;
- }
- return true;
- }
-
- /**
- * Get the number of bits needed for the array.
- *
- * @return the number of bits
- */
- public long getBitCount() {
- return data.length * 64L;
- }
-
- public int getK() {
- return k;
- }
-
- /**
- * Get the estimated entry count (number of distinct items added). This
- * operation is relatively slow, as it loops over all the entries.
- *
- * @return the estimated entry count, or Long.MAX_VALUE if the number can
not be estimated.
- */
- public long getEstimatedEntryCount() {
- long x = 0;
- for (long d : data) {
- x += Long.bitCount(d);
- }
- double m = getBitCount();
- return (long) (-(m / k) * Math.log(1 - (x / m)));
- }
-
-}
\ No newline at end of file
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
deleted file mode 100644
index a2cd599e7b..0000000000
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-
-/**
- * A hash function utility class.
- */
-public class Hash {
-
- private Hash() {
- // utility class
- }
-
- /**
- * Calculate a 64-bit hash value from a value, using a seed.
- *
- * The current algorithm used the finalizer of the MurmurHash3 hash
function,
- * but callers shouldn't rely on that.
- *
- * @param x the value
- * @param seed the seed
- * @return the hash value
- */
- public static long hash64(long x, long seed) {
- x += seed;
- x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL;
- x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L;
- x = x ^ (x >>> 33);
- return x;
- }
-
- /**
- * Calculate a 64-bit hash value from a value. The input is a 64-bit value
and
- * the output is a 64-bit values. Two different inputs are never mapped to
the
- * same output. The operation is reversible.
- *
- * @param x the value
- * @return the hash value
- */
- public static long hash64(long x) {
- return hash64(x, 100);
- }
-
- /**
- * Shrink the hash to a value 0..n. Kind of like modulo, but using
- * multiplication and shift, which are faster to compute.
- *
- * @param hash the hash
- * @param n the maximum of the result
- * @return the reduced value
- */
- public static int reduce(int hash, int n) {
- //
http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
- return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
- }
-
-}
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
deleted file mode 100644
index dd20a1e777..0000000000
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-
-import java.util.HashSet;
-
-/**
- * A HyperLogLog implementation.
- */
-public class HyperLogLog {
-
- private final double amm2;
- private final int m;
- private final byte[] counters;
- private final int maxSmallSetSize;
- private HashSet<Long> smallSet;
-
- public HyperLogLog(int m, int maxSmallSetSize) {
- this.maxSmallSetSize = maxSmallSetSize;
- if (maxSmallSetSize > 0) {
- smallSet = new HashSet<>();
- } else {
- smallSet = null;
- }
- if (m < 16) {
- throw new IllegalArgumentException("Must be >= 16, is " + m);
- }
- if (Integer.bitCount(m) != 1) {
- throw new IllegalArgumentException("Must be a power of 2, is " +
m);
- }
- this.m = m;
- double am;
- switch (m) {
- case 32:
- am = 0.697;
- break;
- case 64:
- am = 0.709;
- break;
- default:
- am = 0.7213 / (1.0 + 1.079 / m);
- }
- amm2 = am * m * m;
- this.counters = new byte[m];
- }
-
- public void add(long hash) {
- if (smallSet != null) {
- smallSet.add(hash);
- if (smallSet.size() > maxSmallSetSize) {
- smallSet = null;
- }
- }
- int i = (int) (hash & (m - 1));
- counters[i] = (byte) Math.max(counters[i], 1 +
Long.numberOfLeadingZeros(hash));
- }
-
- public long estimate() {
- if (smallSet != null) {
- return smallSet.size();
- }
- double sum = 0;
- int countZero = 0;
- for(int c : counters) {
- countZero += c == 0 ? 1 : 0;
- sum += 1. / (1L << (c & 0xff));
- }
- if (sum == 0) {
- sum = 1;
- }
- long est = (long) (1. / sum * amm2);
- if (est <= 5 * m && countZero > 0) {
- // linear counting
- est = (long) (m * Math.log((double) m / countZero));
- }
- return est;
- }
-
-}
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
deleted file mode 100644
index c3813f340d..0000000000
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.junit.Test;
-
-/**
- * Tests the Bloom filter implementation.
- */
-public class BloomFilterTest {
-
- @Test
- public void calculateBits() {
- assertEquals(9_585_059, BloomFilter.calculateBits(1_000_000, 0.01));
- // test some extreme values
- assertEquals(0, BloomFilter.calculateBits(1, 1.0));
- assertEquals(1, BloomFilter.calculateBits(1, 0.99));
- assertEquals(0, BloomFilter.calculateBits(0, 0.0));
- assertTrue(BloomFilter.calculateBits(Integer.MAX_VALUE, 0.0) >
Integer.MAX_VALUE);
- }
-
- @Test
- public void calculateK() {
- assertEquals(7, BloomFilter.calculateK(10));
- // test some extreme values
- assertEquals(1, BloomFilter.calculateK(1));
- assertEquals(1, BloomFilter.calculateK(0));
- assertEquals(69, BloomFilter.calculateK(100));
- }
-
- @Test
- public void calculateN() {
- assertEquals(11, BloomFilter.calculateN(100, 0.01));
- // test some extreme values
- assertEquals(1, BloomFilter.calculateN(1, 0.01));
- assertEquals(1, BloomFilter.calculateN(1, 0.1));
- assertEquals(0, BloomFilter.calculateN(0, 0.01));
- }
-
- @Test
- public void construct() {
- BloomFilter f = BloomFilter.construct(100, 0.01);
- assertEquals(960, f.getBitCount());
- assertEquals(7, f.getK());
- f = BloomFilter.construct(0, 0.01);
- assertEquals(0, f.getBitCount());
- assertEquals(1, f.getK());
- }
-
- @Test
- public void fpp() {
- for (double fpp = 0.001; fpp < 1; fpp *= 2) {
- int size = 500_000;
- BloomFilter f = BloomFilter.construct(size, fpp);
- for (int i = 0; i < size; i++) {
- f.add(Hash.hash64(i));
- }
- for (int i = 0; i < size; i++) {
- assertTrue(f.mayContain(Hash.hash64(i)));
- }
- int falsePositives = 0;
- for (int i = 0; i < size; i++) {
- if (f.mayContain(Hash.hash64(i + size))) {
- falsePositives++;
- }
- }
- double realFpp = (double) falsePositives / size;
- // expected to be within 10%
- assertTrue("expected fpp: " + fpp + " got: " + realFpp, realFpp >=
fpp * 0.9 && realFpp <= fpp * 1.1);
- long est = f.getEstimatedEntryCount();
- assertTrue("expected n: " + size + " got: " + est, size >= est *
0.9 && size <= est * 1.1);
-
- double fpp2 = BloomFilter.calculateFpp(size, f.getBitCount(),
f.getK());
- assertTrue("expected fpp: " + fpp + " got: " + fpp2, fpp2 >= fpp *
0.9 && fpp2 <= fpp * 1.1);
- }
- }
-
- @Test
- public void estimatedEntryCount() {
- // let's assume we have a 1 KB Bloom filter with a false positive rate
of 1%:
- double fpp = 0.01;
- long bits = 1000 * 8;
- long n = BloomFilter.calculateN(bits, fpp);
- BloomFilter bloom = BloomFilter.construct(n, fpp);
- // and a HyperLogLog of 1 KB:
- HyperLogLog hll = new HyperLogLog(1024, 0);
- // now we calculate estimations with both the Bloom filter and
HyperLogLog
- for(int i = 0; i < 20_000; i++) {
- long x = Hash.hash64(i);
- bloom.add(x);
- hll.add(x);
- if (i > 0 && i % 1000 == 0) {
- long estBloom = bloom.getEstimatedEntryCount();
- long estHll = hll.estimate();
- int errBloom = (int) (Math.abs((double) i / estBloom - 1) *
10000);
- int errHll = (int) (Math.abs((double) i / estHll - 1) * 10000);
- if (i < 10_000) {
- assertTrue(errBloom < 1000);
- } else {
- assertEquals(Long.MAX_VALUE, estBloom);
- }
- assertTrue(errHll < 1000);
- }
- }
- }
-
-}
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
deleted file mode 100644
index 570af9f16f..0000000000
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.junit.Test;
-
-public class HyperLogLogTest {
-
- @Test(expected = IllegalArgumentException.class)
- public void illegalHyperLogLogTooSmall() {
- new HyperLogLog(8, 0);
- }
-
- @Test(expected = IllegalArgumentException.class)
- public void illegalHyperLogLogNotPowerOfTwo() {
- new HyperLogLog(30, 0);
- }
-
- @Test
- public void smallSet() {
- HyperLogLog hll100 = new HyperLogLog(16, 100);
- assertEquals(0, hll100.estimate());
- HyperLogLog hll0 = new HyperLogLog(16, 0);
- assertEquals(0, hll0.estimate());
- for (int i = 0; i < 10_000; i++) {
- hll100.add(i % 100);
- hll0.add(i % 100);
- }
- assertEquals(100, hll100.estimate());
- assertNotEquals(100, hll0.estimate());
- }
-
- @Test
- public void test() {
- int testCount = 50;
- for (int m = 8; m <= 128; m *= 2) {
- double avg = Math.sqrt(averageOverRange(m, 30_000, testCount,
false, 2));
- int min, max;
- switch (m) {
- case 8:
- min = 16;
- max = 17;
- break;
- case 16:
- min = 22;
- max = 23;
- break;
- case 32:
- min = 15;
- max = 16;
- break;
- case 64:
- min = 10;
- max = 11;
- break;
- case 128:
- min = 7;
- max = 8;
- break;
- default:
- min = 0;
- max = 0;
- break;
- }
- // System.out.println(type + " expected " + min + ".." + max + "
got " + avg);
- assertTrue("m " + m + " expected " + min + ".." + max + " got " +
avg, min < avg && avg < max);
- }
- }
-
- private static double averageOverRange(int m, long maxSize, int testCount,
boolean debug, double exponent) {
- double sum = 0;
- int count = 0;
- for (long size = 1; size <= 20; size++) {
- sum += test(m, size, testCount, debug, exponent);
- count++;
- }
- for (long size = 22; size <= 300; size += size / 5) {
- sum += test(m, size, testCount, debug, exponent);
- count++;
- }
- for (long size = 400; size <= maxSize; size *= 2) {
- sum += test(m, size, testCount, debug, exponent);
- count++;
- }
- return sum / count;
- }
-
- private static double test(int m, long size, int testCount, boolean debug,
double exponent) {
- long x = 0;
- long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
- long ns = System.nanoTime();
- double sumSquareError = 0;
- double sum = 0;
- double sumFirst = 0;
- int repeat = 10;
- int runs = 2;
- for (int test = 0; test < testCount; test++) {
- HyperLogLog hll;
- if (m == 8) {
- hll = new HyperLogLogUsingLong(16, 0);
- } else {
- hll = new HyperLogLog(m, 0);
- }
- long baseX = x;
- for (int i = 0; i < size; i++) {
- hll.add(Hash.hash64(x));
- x++;
- }
- long e = hll.estimate();
- sum += e;
- min = Math.min(min, e);
- max = Math.max(max, e);
- long error = e - size;
- sumSquareError += error * error;
- sumFirst += e;
- for (int add = 0; add < repeat; add++) {
- long x2 = baseX;
- for (int i = 0; i < size; i++) {
- hll.add(Hash.hash64(x2));
- x2++;
- }
- }
- e = hll.estimate();
- sum += e;
- min = Math.min(min, e);
- max = Math.max(max, e);
- error = e - size;
- sumSquareError += error * error;
- }
- ns = System.nanoTime() - ns;
- long nsPerItem = ns / testCount / runs / (1 + repeat) / size;
- double stdDev = Math.sqrt(sumSquareError / testCount / runs);
- double relStdDevP = stdDev / size * 100;
- int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100);
- int biasP = (int) (100 * (sum / testCount / runs / size) - 100);
- if (debug) {
- System.out.println("m " + m + " size " + size + " relStdDev% " +
(int) relStdDevP +
- " range " + min + ".." + max +
- " biasFirst% " + biasFirstP +
- " bias% " + biasP +
- " avg " + (sum / testCount / runs) +
- " time " + nsPerItem);
- }
- // we try to reduce the relStdDevP, make sure there are no large values
- // (trying to reduce sumSquareError directly
- // would mean we care more about larger sets, but we don't)
- return Math.pow(relStdDevP, exponent);
- }
-
- static class HyperLogLogUsingLong extends HyperLogLog {
-
- private long value;
-
- public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
- super(m, maxSmallSetSize);
- }
-
- public void add(long hash) {
- value = HyperLogLog3Linear64.add(value, hash);
- }
-
- public long estimate() {
- return HyperLogLog3Linear64.estimate(value);
- }
-
- }
-
-}