This is an automated email from the ASF dual-hosted git repository.
reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 391318cf79 OAK-11842: Copy Bloom Filter implementation from
oak-run-commons to oak-commons for re-use (#2432)
391318cf79 is described below
commit 391318cf791074943d93c93e9517927de48fa57d
Author: Julian Reschke <[email protected]>
AuthorDate: Tue Aug 12 08:51:27 2025 +0200
OAK-11842: Copy Bloom Filter implementation from oak-run-commons to
oak-commons for re-use (#2432)
* OAK-11842: Copy Bloom Filter implementation from oak-run-commons to
oak-commons for re-use
* OAK-11842: Copy Bloom Filter implementation from oak-run-commons to
oak-commons for re-use - remove unintended changes
* OAK-11842 Bloom Filter: improve javadocs
---------
Co-authored-by: Thomas Mueller <[email protected]>
---
oak-commons/pom.xml | 4 +
.../oak/commons/collections/BloomFilter.java | 200 +++++++++++++++++++++
.../oak/commons/collections/HashUtils.java | 86 +++++++++
.../oak/commons/collections/HyperLogLog.java | 95 ++++++++++
.../oak/commons/collections/package-info.java | 2 +-
.../oak/commons/collections/BloomFilterTest.java | 200 +++++++++++++++++++++
.../oak/commons/collections/HyperLogLogTest.java | 161 +++++++++++++++++
7 files changed, 747 insertions(+), 1 deletion(-)
diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml
index 8c493f40cb..7ec4827f78 100644
--- a/oak-commons/pom.xml
+++ b/oak-commons/pom.xml
@@ -102,6 +102,10 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
</dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.jackrabbit</groupId>
<artifactId>jackrabbit-jcr-commons</artifactId>
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
new file mode 100644
index 0000000000..95f1b2cc71
--- /dev/null
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+/**
+ * A Bloom filter implementation.
+ */
public class BloomFilter {

    // number of hash functions (probes) applied per entry
    private final int k;
    // cached copy of data.length, to avoid re-reading the array length in loops
    private final int arraySize;
    // the bit set, packed into 64-bit words
    private final long[] data;

    private BloomFilter(long[] data, int k) {
        this.data = data;
        this.k = k;
        this.arraySize = data.length;
    }

    /**
     * Construct a Bloom filter. With a fpp of 0.01, the memory usage is roughly 1
     * byte per entry.
     *
     * @param n the number of expected entries (eg. 1_000_000)
     * @param fpp the false-positive probability (eg. 0.01 for a 1% false-positive
     *            probability)
     * @return the Bloom filter
     * @throws IllegalArgumentException if n is not positive, or fpp is not strictly between 0 and 1
     */
    public static BloomFilter construct(long n, double fpp) {
        if (n <= 0) {
            throw new IllegalArgumentException("n must be greater than 0");
        }
        if (fpp <= 0 || fpp >= 1) {
            throw new IllegalArgumentException("fpp must be between 0 and 1");
        }
        long m = calculateBits(n, fpp);
        int k = calculateK((double) m / n);
        // round the bit count up to a whole number of 64-bit words
        return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
    }

    // See also https://hur.st/bloomfilter

    /**
     * Calculate the best k parameter for a Bloom filter.
     * (k is the number of hash functions to use for one entry).
     *
     * @param bitsPerKey the number of bits per key (eg. 10)
     * @return the k parameter (at least 1)
     */
    public static int calculateK(double bitsPerKey) {
        // optimal k = (m/n) * ln(2), rounded; clamped so k is never 0
        return Math.max(1, (int) Math.round(bitsPerKey * Math.log(2)));
    }

    /**
     * Calculate the number of bits needed for a Bloom filter,
     * for a given false positive probability.
     *
     * @param n the number of entries (eg. 1_000_000)
     * @param fpp the false positive probability (eg. 0.01)
     * @return the bits needed
     */
    public static long calculateBits(long n, double fpp) {
        // m = ceil(n * ln(fpp) / ln(1 / 2^ln(2)))
        return (long) Math.ceil((n * Math.log(fpp)) / Math.log(1 / Math.pow(2, Math.log(2))));
    }

    /**
     * Calculate the maximum number of entries in the set, given the memory size
     * in bits, and a target false positive probability.
     *
     * @param bits the number of bits (eg. 10_000_000)
     * @param fpp the false positive probability (eg. 0.01)
     * @return the maximum number of entries to be added
     */
    public static long calculateN(long bits, double fpp) {
        // inverse of calculateBits: n = ceil(m * ln(0.5^ln(2)) / ln(fpp))
        return (long) Math.ceil((bits * Math.log(Math.pow(0.5, Math.log(2))) / Math.log(fpp)));
    }

    /**
     * Calculate the false positive probability.
     *
     * @param n the number of entries (eg. 1_000_000)
     * @param bits the number of bits (eg. 10_000_000)
     * @param k the number of hash functions
     * @return the false positive probability (eg. 0.01)
     */
    public static double calculateFpp(long n, long bits, int k) {
        // p = pow(1 - exp(-k / (m / n)), k)
        return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
    }

    /**
     * Add an entry.
     *
     * @param value the value to add
     */
    public void add(String value) {
        add(HashUtils.hash64(value));
    }

    /**
     * Add an entry.
     *
     * Note that the false positive rate will increase if the quality of the hash
     * value is low, eg. if only 32 bit hash values are used.
     * If needed, use HashUtils.hash64(value) as a supplemental hash.
     *
     * @param hash the hash value (need to be a high quality hash code, with all
     *            bits having high entropy)
     */
    public void add(long hash) {
        // double hashing: derive k probe positions from a single 64-bit hash.
        // a starts as the hash with its 32-bit halves swapped; b advances a each round.
        long a = (hash >>> 32) | (hash << 32);
        long b = hash;
        for (int i = 0; i < k; i++) {
            // word index comes from the high 32 bits of a; the bit position
            // within the word comes from the low 6 bits of a ("1L << a" uses
            // only the low 6 bits of the shift count for a long shift)
            data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
            a += b;
        }
    }

    /**
     * Tests whether the entry might be in the set.
     *
     * @param value the value to check
     * @return true if the entry was added, or, with a certain false positive
     *         probability, even if it was not added
     */
    public boolean mayContain(String value) {
        return mayContain(HashUtils.hash64(value));
    }

    /**
     * Tests whether the entry might be in the set.
     *
     * @param hash the hash value (need to be a high quality hash code, with all
     *            bits having high entropy)
     * @return true if the entry was added, or, with a certain false positive
     *         probability, even if it was not added
     */
    public boolean mayContain(long hash) {
        // must probe exactly the same sequence of bits as add(long)
        long a = (hash >>> 32) | (hash << 32);
        long b = hash;
        for (int i = 0; i < k; i++) {
            if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) {
                // one probe bit is clear: the entry was definitely never added
                return false;
            }
            a += b;
        }
        return true;
    }

    /**
     * Get the number of bits needed for the array.
     *
     * @return the number of bits
     */
    public long getBitCount() {
        return data.length * 64L;
    }

    /**
     * Get the k parameter (the number of hash functions for an entry).
     *
     * @return the k parameter
     */
    public int getK() {
        return k;
    }

    /**
     * Get the estimated entry count (number of distinct items added). This
     * operation is relatively slow, as it loops over all the entries.
     *
     * @return the estimated entry count,
     *         or Long.MAX_VALUE if the number can not be estimated.
     */
    public long getEstimatedEntryCount() {
        // count the set bits (x), then invert the expected fill rate:
        // n ~ -(m / k) * ln(1 - x / m)
        long x = 0;
        for (long d : data) {
            x += Long.bitCount(d);
        }
        double m = getBitCount();
        // if every bit is set (x == m), log(0) is -infinity and the cast
        // saturates to Long.MAX_VALUE, matching the documented contract
        return (long) (-(m / k) * Math.log(1 - (x / m)));
    }

}
\ No newline at end of file
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
new file mode 100644
index 0000000000..90d998664c
--- /dev/null
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.codec.digest.MurmurHash3;
+
+/**
+ * A hash function utility class.
+ */
+public class HashUtils {
+
+ private HashUtils() {
+ // utility class
+ }
+
+ /**
+ * Calculate a 64-bit hash value from a string.
+ *
+ * @param s the string
+ * @return the hash value
+ */
+ public static long hash64(String s) {
+ return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0];
+ }
+
+ /**
+ * Calculate a 64-bit hash value from a value, using a seed.
+ *
+ * The current algorithm used the finalizer of the MurmurHash3 hash
function,
+ * but callers shouldn't rely on that.
+ *
+ * @param x the value
+ * @param seed the seed
+ * @return the hash value
+ */
+ public static long hash64(long x, long seed) {
+ x += seed;
+ x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL;
+ x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L;
+ x = x ^ (x >>> 33);
+ return x;
+ }
+
+ /**
+ * Calculate a 64-bit hash value from a value. The input is a 64-bit value
and
+ * the output is a 64-bit values. Two different inputs are never mapped to
the
+ * same output. The operation is reversible.
+ *
+ * @param x the value
+ * @return the hash value
+ */
+ public static long hash64(long x) {
+ return hash64(x, 100);
+ }
+
+ /**
+ * Shrink the hash to a value 0..n. Kind of like modulo, but using
+ * multiplication and shift, which are faster to compute.
+ *
+ * @param hash the hash
+ * @param n the maximum of the result
+ * @return the reduced value
+ */
+ public static int reduce(int hash, int n) {
+ //
http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
+ }
+
+}
\ No newline at end of file
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
new file mode 100644
index 0000000000..2054decff0
--- /dev/null
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import java.util.HashSet;
+
+/**
+ * A HyperLogLog implementation.
+ */
/**
 * A HyperLogLog implementation.
 */
public class HyperLogLog {

    // bias-correction factor: alpha_m * m^2 (alpha_m depends on m)
    private final double amm2;
    // the number of buckets (a power of 2, at least 16)
    private final int m;
    // one counter per bucket; each holds the largest "leading zeros + 1" seen
    private final byte[] counters;
    // while the number of distinct hashes is at most this, count exactly
    private final int maxSmallSetSize;
    // exact set of hashes seen so far; set to null once it grows too large
    private HashSet<Long> smallSet;

    /**
     * Create a HyperLogLog estimator.
     *
     * @param m the number of buckets; must be a power of 2 and at least 16
     * @param maxSmallSetSize up to this many distinct hashes are counted
     *            exactly in a hash set (0 disables exact counting)
     * @throws IllegalArgumentException if m is smaller than 16 or not a power of 2
     */
    public HyperLogLog(int m, int maxSmallSetSize) {
        this.maxSmallSetSize = maxSmallSetSize;
        if (maxSmallSetSize > 0) {
            smallSet = new HashSet<>();
        } else {
            smallSet = null;
        }
        if (m < 16) {
            throw new IllegalArgumentException("Must be >= 16, is " + m);
        }
        if (Integer.bitCount(m) != 1) {
            throw new IllegalArgumentException("Must be a power of 2, is " + m);
        }
        this.m = m;
        double am;
        // alpha_m constants; for m >= 128 the general formula is used
        switch (m) {
        case 32:
            am = 0.697;
            break;
        case 64:
            am = 0.709;
            break;
        default:
            am = 0.7213 / (1.0 + 1.079 / m);
        }
        amm2 = am * m * m;
        this.counters = new byte[m];
    }

    /**
     * Add a hash value. The hash needs to have high entropy in all bits.
     *
     * @param hash the hash value of the entry
     */
    public void add(long hash) {
        if (smallSet != null) {
            smallSet.add(hash);
            if (smallSet.size() > maxSmallSetSize) {
                // too many distinct entries: drop the exact set, rely on the counters
                smallSet = null;
            }
        }
        // bucket index from the low bits; counter value from the leading zeros
        int i = (int) (hash & (m - 1));
        counters[i] = (byte) Math.max(counters[i], 1 + Long.numberOfLeadingZeros(hash));
    }

    /**
     * Estimate the number of distinct hash values added.
     *
     * @return the estimated count (exact while the small set is still tracked)
     */
    public long estimate() {
        if (smallSet != null) {
            // still counting exactly
            return smallSet.size();
        }
        // harmonic mean of 2^-counter over all buckets
        double sum = 0;
        int countZero = 0;
        for(int c : counters) {
            countZero += c == 0 ? 1 : 0;
            sum += 1. / (1L << (c & 0xff));
        }
        if (sum == 0) {
            sum = 1;
        }
        long est = (long) (1. / sum * amm2);
        if (est <= 5 * m && countZero > 0) {
            // linear counting
            est = (long) (m * Math.log((double) m / countZero));
        }
        return est;
    }

}
\ No newline at end of file
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
index 81c20db5ca..5d1490f210 100644
---
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
@@ -21,7 +21,7 @@
* Utilities for Java collections and streams.
*/
@Internal(since = "1.0.0")
-@Version("2.2.0")
+@Version("2.3.0")
package org.apache.jackrabbit.oak.commons.collections;
import org.apache.jackrabbit.oak.commons.annotations.Internal;
import org.osgi.annotation.versioning.Version;
diff --git
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
new file mode 100644
index 0000000000..464de8c4ef
--- /dev/null
+++
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThrows;
+
+import org.junit.Test;
+
+/**
+ * Tests the Bloom filter implementation.
+ */
public class BloomFilterTest {

    @Test
    public void calculateBits() {
        assertEquals(9_585_059, BloomFilter.calculateBits(1_000_000, 0.01));
        // test some extreme values
        assertEquals(0, BloomFilter.calculateBits(1, 1.0));
        assertEquals(1, BloomFilter.calculateBits(1, 0.99));
        assertEquals(0, BloomFilter.calculateBits(0, 0.0));
        assertTrue(BloomFilter.calculateBits(Integer.MAX_VALUE, 0.0) > Integer.MAX_VALUE);
    }

    @Test
    public void calculateK() {
        assertEquals(7, BloomFilter.calculateK(10));
        // test some extreme values
        assertEquals(1, BloomFilter.calculateK(1));
        assertEquals(1, BloomFilter.calculateK(0));
        assertEquals(69, BloomFilter.calculateK(100));
    }

    @Test
    public void calculateN() {
        assertEquals(11, BloomFilter.calculateN(100, 0.01));
        // test some extreme values
        assertEquals(1, BloomFilter.calculateN(1, 0.01));
        assertEquals(1, BloomFilter.calculateN(1, 0.1));
        assertEquals(0, BloomFilter.calculateN(0, 0.01));
    }

    @Test
    public void construct() {
        // for n=100, fpp=0.01: 959 bits rounded up to 15 words = 960 bits, k=7
        BloomFilter f = BloomFilter.construct(100, 0.01);
        assertEquals(960, f.getBitCount());
        assertEquals(7, f.getK());
    }

    // verify the observed false-positive rate and the entry-count estimation
    // match the configured targets within 10%, over a range of fpp values
    @Test
    public void fpp() {
        for (double fpp = 0.001; fpp < 1; fpp *= 2) {
            int size = 500_000;
            BloomFilter f = BloomFilter.construct(size, fpp);
            for (int i = 0; i < size; i++) {
                f.add(HashUtils.hash64(i));
            }
            // no false negatives allowed: every added entry must be found
            for (int i = 0; i < size; i++) {
                assertTrue(f.mayContain(HashUtils.hash64(i)));
            }
            int falsePositives = 0;
            for (int i = 0; i < size; i++) {
                if (f.mayContain(HashUtils.hash64(i + size))) {
                    falsePositives++;
                }
            }
            double realFpp = (double) falsePositives / size;
            // expected to be within 10%
            assertTrue("expected fpp: " + fpp + " got: " + realFpp, realFpp >= fpp * 0.9 && realFpp <= fpp * 1.1);
            long est = f.getEstimatedEntryCount();
            assertTrue("expected n: " + size + " got: " + est, size >= est * 0.9 && size <= est * 1.1);

            double fpp2 = BloomFilter.calculateFpp(size, f.getBitCount(), f.getK());
            assertTrue("expected fpp: " + fpp + " got: " + fpp2, fpp2 >= fpp * 0.9 && fpp2 <= fpp * 1.1);
        }
    }

    // compare Bloom-filter-based estimation against HyperLogLog: the Bloom
    // estimate is accurate below capacity and saturates to Long.MAX_VALUE
    // once the filter is overfull
    @Test
    public void estimatedEntryCount() {
        // let's assume we have a 1 KB Bloom filter with a false positive rate of 1%:
        double fpp = 0.01;
        long bits = 1000 * 8;
        long n = BloomFilter.calculateN(bits, fpp);
        BloomFilter bloom = BloomFilter.construct(n, fpp);
        // and a HyperLogLog of 1 KB:
        HyperLogLog hll = new HyperLogLog(1024, 0);
        // now we calculate estimations with both the Bloom filter and HyperLogLog
        for(int i = 0; i < 20_000; i++) {
            long x = HashUtils.hash64(i);
            bloom.add(x);
            hll.add(x);
            if (i > 0 && i % 1000 == 0) {
                long estBloom = bloom.getEstimatedEntryCount();
                long estHll = hll.estimate();
                // relative error in basis points (1/100 of a percent)
                int errBloom = (int) (Math.abs((double) i / estBloom - 1) * 10000);
                int errHll = (int) (Math.abs((double) i / estHll - 1) * 10000);
                if (i < 10_000) {
                    assertTrue(errBloom < 1000);
                } else {
                    assertEquals(Long.MAX_VALUE, estBloom);
                }
                assertTrue(errHll < 1000);
            }
        }
    }

    @Test
    public void filterFunctionality() {
        BloomFilter filter = BloomFilter.construct(100, 0.01);
        String testValue = "test-value";

        // Initially should not contain anything
        assertFalse(filter.mayContain(testValue));

        // Add the item and verify it's found
        filter.add(testValue);
        assertTrue(filter.mayContain(testValue));

        // Verify another value is not found
        assertFalse(filter.mayContain("different-value"));
    }

    @Test
    public void filterWithMultipleEntries() {
        BloomFilter filter = BloomFilter.construct(100, 0.01);

        // Add multiple entries
        for (int i = 0; i < 100; i++) {
            filter.add("value-" + i);
        }

        // Verify all entries are found
        for (int i = 0; i < 100; i++) {
            assertTrue(filter.mayContain("value-" + i));
        }
    }

    @Test
    public void falsePositiveProbability() {
        // Create a filter with high false positive probability for testing
        double fpp = 0.3;
        BloomFilter filter = BloomFilter.construct(100, fpp);

        // Fill the filter to capacity
        for (int i = 0; i < 100; i++) {
            filter.add("existing-" + i);
        }

        // Test with values not in the filter
        int falsePositives = 0;
        int trials = 1000;

        for (int i = 0; i < trials; i++) {
            if (filter.mayContain("nonexistent-" + i)) {
                falsePositives++;
            }
        }

        // The false positive rate should be approximately fpp
        double actualFpp = (double) falsePositives / trials;
        assertTrue("False positive rate should be close to expected, got " + actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15);
    }

    @Test
    public void invalidEntries() {
        // Should throw exception for entries < 1
        assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(0, 0.01));
    }

    @Test
    public void invalidFppZero() {
        // Should throw exception for fpp <= 0
        assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(100, 0.0));
    }

    @Test
    public void invalidFppOne() {
        // Should throw exception for fpp >= 1
        assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(100, 1.0));
    }

}
\ No newline at end of file
diff --git
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
new file mode 100644
index 0000000000..fd3fb39bff
--- /dev/null
+++
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
public class HyperLogLogTest {

    // m must be at least 16
    @Test(expected = IllegalArgumentException.class)
    public void illegalHyperLogLogTooSmall() {
        new HyperLogLog(8, 0);
    }

    // m must be a power of 2
    @Test(expected = IllegalArgumentException.class)
    public void illegalHyperLogLogNotPowerOfTwo() {
        new HyperLogLog(30, 0);
    }

    @Test
    public void smallSet() {
        // with maxSmallSetSize = 100, up to 100 distinct values are counted exactly
        HyperLogLog hll100 = new HyperLogLog(16, 100);
        assertEquals(0, hll100.estimate());
        HyperLogLog hll0 = new HyperLogLog(16, 0);
        assertEquals(0, hll0.estimate());
        for (int i = 0; i < 10_000; i++) {
            hll100.add(i % 100);
            hll0.add(i % 100);
        }
        // the small set yields the exact count; the probabilistic path does not
        assertEquals(100, hll100.estimate());
        assertNotEquals(100, hll0.estimate());
    }

    // check the relative standard deviation of the estimate stays within the
    // expected band for each bucket count m (more buckets -> less error)
    @Test
    public void test() {
        int testCount = 50;
        for (int m = 16; m <= 128; m *= 2) {
            double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, false, 2));
            int min, max;
            switch (m) {
            case 16:
                min = 22;
                max = 23;
                break;
            case 32:
                min = 15;
                max = 16;
                break;
            case 64:
                min = 10;
                max = 11;
                break;
            case 128:
                min = 7;
                max = 8;
                break;
            default:
                min = 0;
                max = 0;
                break;
            }
            assertTrue("m " + m + " expected " + min + ".." + max + " got " + avg, min < avg && avg < max);
        }
    }

    // average the per-size error metric over a range of set sizes:
    // every size 1..20, then sparser steps up to 300, then doubling up to maxSize
    private static double averageOverRange(int m, long maxSize, int testCount, boolean debug, double exponent) {
        double sum = 0;
        int count = 0;
        for (long size = 1; size <= 20; size++) {
            sum += test(m, size, testCount, debug, exponent);
            count++;
        }
        for (long size = 22; size <= 300; size += size / 5) {
            sum += test(m, size, testCount, debug, exponent);
            count++;
        }
        for (long size = 400; size <= maxSize; size *= 2) {
            sum += test(m, size, testCount, debug, exponent);
            count++;
        }
        return sum / count;
    }

    // run testCount experiments of the given size and return the relative
    // standard deviation (in percent) raised to the given exponent
    private static double test(int m, long size, int testCount, boolean debug, double exponent) {
        long x = 0;
        long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
        long ns = System.nanoTime();
        double sumSquareError = 0;
        double sum = 0;
        double sumFirst = 0;
        int repeat = 10;
        int runs = 2;
        for (int test = 0; test < testCount; test++) {
            HyperLogLog hll;
            hll = new HyperLogLog(m, 0);
            long baseX = x;
            for (int i = 0; i < size; i++) {
                hll.add(HashUtils.hash64(x));
                x++;
            }
            long e = hll.estimate();
            sum += e;
            min = Math.min(min, e);
            max = Math.max(max, e);
            long error = e - size;
            sumSquareError += error * error;
            sumFirst += e;
            // re-add the same values several times: duplicates must not
            // change the set of distinct entries
            for (int add = 0; add < repeat; add++) {
                long x2 = baseX;
                for (int i = 0; i < size; i++) {
                    hll.add(HashUtils.hash64(x2));
                    x2++;
                }
            }
            e = hll.estimate();
            sum += e;
            min = Math.min(min, e);
            max = Math.max(max, e);
            error = e - size;
            sumSquareError += error * error;
        }
        ns = System.nanoTime() - ns;
        long nsPerItem = ns / testCount / runs / (1 + repeat) / size;
        double stdDev = Math.sqrt(sumSquareError / testCount / runs);
        double relStdDevP = stdDev / size * 100;
        int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100);
        int biasP = (int) (100 * (sum / testCount / runs / size) - 100);
        if (debug) {
            System.out.println("m " + m + " size " + size + " relStdDev% " + (int) relStdDevP +
                    " range " + min + ".." + max +
                    " biasFirst% " + biasFirstP +
                    " bias% " + biasP +
                    " avg " + (sum / testCount / runs) +
                    " time " + nsPerItem);
        }
        // we try to reduce the relStdDevP, make sure there are no large values
        // (trying to reduce sumSquareError directly
        // would mean we care more about larger sets, but we don't)
        return Math.pow(relStdDevP, exponent);
    }

}
\ No newline at end of file