This is an automated email from the ASF dual-hosted git repository.

reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 391318cf79 OAK-11842: Copy Bloom Filter implementation from 
oak-run-commons to oak-commons for re-use (#2432)
391318cf79 is described below

commit 391318cf791074943d93c93e9517927de48fa57d
Author: Julian Reschke <[email protected]>
AuthorDate: Tue Aug 12 08:51:27 2025 +0200

    OAK-11842: Copy Bloom Filter implementation from oak-run-commons to 
oak-commons for re-use (#2432)
    
    * OAK-11842: Copy Bloom Filter implementation from oak-run-commons to 
oak-commons for re-use
    
    * OAK-11842: Copy Bloom Filter implementation from oak-run-commons to 
oak-commons for re-use - remove unintended changes
    
    * OAK-11842 Bloom Filter: improve javadocs
    
    ---------
    
    Co-authored-by: Thomas Mueller <[email protected]>
---
 oak-commons/pom.xml                                |   4 +
 .../oak/commons/collections/BloomFilter.java       | 200 +++++++++++++++++++++
 .../oak/commons/collections/HashUtils.java         |  86 +++++++++
 .../oak/commons/collections/HyperLogLog.java       |  95 ++++++++++
 .../oak/commons/collections/package-info.java      |   2 +-
 .../oak/commons/collections/BloomFilterTest.java   | 200 +++++++++++++++++++++
 .../oak/commons/collections/HyperLogLogTest.java   | 161 +++++++++++++++++
 7 files changed, 747 insertions(+), 1 deletion(-)

diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml
index 8c493f40cb..7ec4827f78 100644
--- a/oak-commons/pom.xml
+++ b/oak-commons/pom.xml
@@ -102,6 +102,10 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-collections4</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.jackrabbit</groupId>
       <artifactId>jackrabbit-jcr-commons</artifactId>
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
new file mode 100644
index 0000000000..95f1b2cc71
--- /dev/null
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+/**
+ * A Bloom filter implementation.
+ */
+public class BloomFilter {
+
+    private final int k;
+    private final int arraySize;
+    private final long[] data;
+
+    private BloomFilter(long[] data, int k) {
+        this.data = data;
+        this.k = k;
+        this.arraySize = data.length;
+    }
+
+    /**
+     * Construct a Bloom filter. With a fpp of 0.01, the memory usage is 
roughly 1
+     * byte per entry.
+     *
+     * @param n     the number of expected entries (eg. 1_000_000)
+     * @param fpp   the false-positive probability (eg. 0.01 for a 1% 
false-positive
+     *              probability)
+     * @return the Bloom filter
+     */
+    public static BloomFilter construct(long n, double fpp) {
+        if (n <= 0) {
+            throw new IllegalArgumentException("n must be greater than 0");
+        }
+        if (fpp <= 0 || fpp >= 1) {
+            throw new IllegalArgumentException("fpp must be between 0 and 1");
+        }
+        long m = calculateBits(n, fpp);
+        int k = calculateK((double) m / n);
+        return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
+    }
+
+    // See also https://hur.st/bloomfilter
+
+    /**
+     * Calculate the best k parameter for a Bloom filter.
+     * (k is the number of hash functions to use for one entry).
+     *
+     * @param bitsPerKey the number of bits per key (eg. 10)
+     * @return the k parameter
+     */
+    public static int calculateK(double bitsPerKey) {
+        return Math.max(1, (int) Math.round(bitsPerKey * Math.log(2)));
+    }
+
+    /**
+     * Calculate the number of bits needed for a Bloom filter,
+     * for a given false positive probability.
+     *
+     * @param n the number of entries (eg. 1_000_000)
+     * @param fpp the false positive probability (eg. 0.01)
+     * @return the bits needed
+     */
+    public static long calculateBits(long n, double fpp) {
+        return (long) Math.ceil((n * Math.log(fpp)) / Math.log(1 / Math.pow(2, 
Math.log(2))));
+    }
+
+    /**
+     * Calculate the maximum number of entries in the set, given the memory 
size
+     * in bits, and a target false positive probability.
+     *
+     * @param bits the number of bits (eg. 10_000_000)
+     * @param fpp  the false positive probability (eg. 0.01)
+     * @return the maximum number of entries to be added
+     */
+    public static long calculateN(long bits, double fpp) {
+        return (long) Math.ceil((bits * Math.log(Math.pow(0.5, Math.log(2))) / 
Math.log(fpp)));
+    }
+
+    /**
+     * Calculate the false positive probability.
+     *
+     * @param n    the number of entries (eg. 1_000_000)
+     * @param bits the number of bits (eg. 10_000_000)
+     * @param k    the number of hash functions
+     * @return the false positive probability (eg. 0.01)
+     */
+    public static double calculateFpp(long n, long bits, int k) {
+        // p = pow(1 - exp(-k / (m / n)), k)
+        return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
+    }
+
+    /**
+     * Add an entry.
+     *
+     * @param value the value to add
+     */
+    public void add(String value) {
+        add(HashUtils.hash64(value));
+    }
+
+    /**
+     * Add an entry.
+     *
+     * Note that the false positive rate will increase if the quality of the 
hash
+     * value is low, eg. if only 32 bit hash values are used.
+     * If needed, use HashUtils.hash64(value) as a supplemental hash.
+     *
+     * @param hash the hash value (need to be a high quality hash code, with 
all
+     *             bits having high entropy)
+     */
+    public void add(long hash) {
+        long a = (hash >>> 32) | (hash << 32);
+        long b = hash;
+        for (int i = 0; i < k; i++) {
+            data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
+            a += b;
+        }
+    }
+
+    /**
+     * Tests whether the entry might be in the set.
+     *
+     * @param value the value to check
+     * @return true if the entry was added, or, with a certain false positive
+     *         probability, even if it was not added
+     */
+    public boolean mayContain(String value) {
+        return mayContain(HashUtils.hash64(value));
+    }
+
+    /**
+     * Tests whether the entry might be in the set.
+     *
+     * @param hash the hash value (need to be a high quality hash code, with 
all
+     *             bits having high entropy)
+     * @return true if the entry was added, or, with a certain false positive
+     *         probability, even if it was not added
+     */
+    public boolean mayContain(long hash) {
+        long a = (hash >>> 32) | (hash << 32);
+        long b = hash;
+        for (int i = 0; i < k; i++) {
+            if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L << 
a) == 0) {
+                return false;
+            }
+            a += b;
+        }
+        return true;
+    }
+
+    /**
+     * Get the number of bits needed for the array.
+     *
+     * @return the number of bits
+     */
+    public long getBitCount() {
+        return data.length * 64L;
+    }
+
+    /**
+     * Get the k parameter (the number of hash functions for an entry).
+     *
+     * @return the k parameter
+     */
+    public int getK() {
+        return k;
+    }
+
+    /**
+     * Get the estimated entry count (number of distinct items added). This
+     * operation is relatively slow, as it loops over all the entries.
+     *
+     * @return the estimated entry count,
+     *         or Long.MAX_VALUE if the number can not be estimated.
+     */
+    public long getEstimatedEntryCount() {
+        long x = 0;
+        for (long d : data) {
+            x += Long.bitCount(d);
+        }
+        double m = getBitCount();
+        return (long) (-(m / k) * Math.log(1 - (x / m)));
+    }
+
+} 
\ No newline at end of file
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
new file mode 100644
index 0000000000..90d998664c
--- /dev/null
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.codec.digest.MurmurHash3;
+
+/**
+ * A hash function utility class.
+ */
+public class HashUtils {
+
+    private HashUtils() {
+        // utility class
+    }
+
+    /**
+     * Calculate a 64-bit hash value from a string.
+     *
+     * @param s the string
+     * @return the hash value
+     */
+    public static long hash64(String s) {
+        return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0];
+    }
+
+    /**
+     * Calculate a 64-bit hash value from a value, using a seed.
+     *
+     * The current algorithm used the finalizer of the MurmurHash3 hash 
function,
+     * but callers shouldn't rely on that.
+     *
+     * @param x    the value
+     * @param seed the seed
+     * @return the hash value
+     */
+    public static long hash64(long x, long seed) {
+        x += seed;
+        x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL;
+        x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L;
+        x = x ^ (x >>> 33);
+        return x;
+    }
+
+    /**
+     * Calculate a 64-bit hash value from a value. The input is a 64-bit value 
and
+     * the output is a 64-bit values. Two different inputs are never mapped to 
the
+     * same output. The operation is reversible.
+     *
+     * @param x the value
+     * @return the hash value
+     */
+    public static long hash64(long x) {
+        return hash64(x, 100);
+    }
+
+    /**
+     * Shrink the hash to a value 0..n. Kind of like modulo, but using
+     * multiplication and shift, which are faster to compute.
+     *
+     * @param hash the hash
+     * @param n the maximum of the result
+     * @return the reduced value
+     */
+    public static int reduce(int hash, int n) {
+        // 
http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+        return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
+    }
+
+} 
\ No newline at end of file
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
new file mode 100644
index 0000000000..2054decff0
--- /dev/null
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import java.util.HashSet;
+
+/**
+ * A HyperLogLog implementation.
+ */
+public class HyperLogLog {
+
+    private final double amm2;
+    private final int m;
+    private final byte[] counters;
+    private final int maxSmallSetSize;
+    private HashSet<Long> smallSet;
+
+    public HyperLogLog(int m, int maxSmallSetSize) {
+        this.maxSmallSetSize = maxSmallSetSize;
+        if (maxSmallSetSize > 0) {
+            smallSet = new HashSet<>();
+        } else {
+            smallSet = null;
+        }
+        if (m < 16) {
+            throw new IllegalArgumentException("Must be >= 16, is " + m);
+        }
+        if (Integer.bitCount(m) != 1) {
+            throw new IllegalArgumentException("Must be a power of 2, is " + 
m);
+        }
+        this.m = m;
+        double am;
+        switch (m) {
+        case 32:
+            am = 0.697;
+            break;
+        case 64:
+            am = 0.709;
+            break;
+        default:
+            am = 0.7213 / (1.0 + 1.079 / m);
+        }
+        amm2 = am * m * m;
+        this.counters = new byte[m];
+    }
+
+    public void add(long hash) {
+        if (smallSet != null) {
+            smallSet.add(hash);
+            if (smallSet.size() > maxSmallSetSize) {
+                smallSet = null;
+            }
+        }
+        int i = (int) (hash & (m - 1));
+        counters[i] = (byte) Math.max(counters[i], 1 + 
Long.numberOfLeadingZeros(hash));
+    }
+
+    public long estimate() {
+        if (smallSet != null) {
+            return smallSet.size();
+        }
+        double sum = 0;
+        int countZero = 0;
+        for(int c : counters) {
+            countZero += c == 0 ? 1 : 0;
+            sum += 1. / (1L << (c & 0xff));
+        }
+        if (sum == 0) {
+            sum = 1;
+        }
+        long est = (long) (1. / sum * amm2);
+        if (est <= 5 * m && countZero > 0) {
+            // linear counting
+            est = (long) (m * Math.log((double) m / countZero));
+        }
+        return est;
+    }
+
+} 
\ No newline at end of file
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
index 81c20db5ca..5d1490f210 100644
--- 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
@@ -21,7 +21,7 @@
  * Utilities for Java collections and streams.
  */
 @Internal(since = "1.0.0")
-@Version("2.2.0")
+@Version("2.3.0")
 package org.apache.jackrabbit.oak.commons.collections;
 import org.apache.jackrabbit.oak.commons.annotations.Internal;
 import org.osgi.annotation.versioning.Version;
diff --git 
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
new file mode 100644
index 0000000000..464de8c4ef
--- /dev/null
+++ 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThrows;
+
+import org.junit.Test;
+
+/**
+ * Tests the Bloom filter implementation.
+ */
+public class BloomFilterTest {
+
+    @Test
+    public void calculateBits() {
+        assertEquals(9_585_059, BloomFilter.calculateBits(1_000_000, 0.01));
+        // test some extreme values
+        assertEquals(0, BloomFilter.calculateBits(1, 1.0));
+        assertEquals(1, BloomFilter.calculateBits(1, 0.99));
+        assertEquals(0, BloomFilter.calculateBits(0, 0.0));
+        assertTrue(BloomFilter.calculateBits(Integer.MAX_VALUE, 0.0) > 
Integer.MAX_VALUE);
+    }
+
+    @Test
+    public void calculateK() {
+        assertEquals(7, BloomFilter.calculateK(10));
+        // test some extreme values
+        assertEquals(1, BloomFilter.calculateK(1));
+        assertEquals(1, BloomFilter.calculateK(0));
+        assertEquals(69, BloomFilter.calculateK(100));
+    }
+
+    @Test
+    public void calculateN() {
+        assertEquals(11, BloomFilter.calculateN(100, 0.01));
+        // test some extreme values
+        assertEquals(1, BloomFilter.calculateN(1, 0.01));
+        assertEquals(1, BloomFilter.calculateN(1, 0.1));
+        assertEquals(0, BloomFilter.calculateN(0, 0.01));
+    }
+
+    @Test
+    public void construct() {
+        BloomFilter f = BloomFilter.construct(100, 0.01);
+        assertEquals(960, f.getBitCount());
+        assertEquals(7, f.getK());
+    }
+
+    @Test
+    public void fpp() {
+        for (double fpp = 0.001; fpp < 1; fpp *= 2) {
+            int size = 500_000;
+            BloomFilter f = BloomFilter.construct(size, fpp);
+            for (int i = 0; i < size; i++) {
+                f.add(HashUtils.hash64(i));
+            }
+            for (int i = 0; i < size; i++) {
+                assertTrue(f.mayContain(HashUtils.hash64(i)));
+            }
+            int falsePositives = 0;
+            for (int i = 0; i < size; i++) {
+                if (f.mayContain(HashUtils.hash64(i + size))) {
+                    falsePositives++;
+                }
+            }
+            double realFpp = (double) falsePositives / size;
+            // expected to be within 10%
+            assertTrue("expected fpp: " + fpp + " got: " + realFpp, realFpp >= 
fpp * 0.9 && realFpp <= fpp * 1.1);
+            long est = f.getEstimatedEntryCount();
+            assertTrue("expected n: " + size + " got: " + est, size >= est * 
0.9 && size <= est * 1.1);
+
+            double fpp2 = BloomFilter.calculateFpp(size, f.getBitCount(), 
f.getK());
+            assertTrue("expected fpp: " + fpp + " got: " + fpp2, fpp2 >= fpp * 
0.9 && fpp2 <= fpp * 1.1);
+        }
+    }
+
+    @Test
+    public void estimatedEntryCount() {
+        // let's assume we have a 1 KB Bloom filter with a false positive rate 
of 1%:
+        double fpp = 0.01;
+        long bits = 1000 * 8;
+        long n = BloomFilter.calculateN(bits, fpp);
+        BloomFilter bloom = BloomFilter.construct(n, fpp);
+        // and a HyperLogLog of 1 KB:
+        HyperLogLog hll = new HyperLogLog(1024, 0);
+        // now we calculate estimations with both the Bloom filter and 
HyperLogLog
+        for(int i = 0; i < 20_000; i++) {
+            long x = HashUtils.hash64(i);
+            bloom.add(x);
+            hll.add(x);
+            if (i > 0 && i % 1000 == 0) {
+                long estBloom = bloom.getEstimatedEntryCount();
+                long estHll = hll.estimate();
+                int errBloom = (int) (Math.abs((double) i / estBloom - 1) * 
10000);
+                int errHll = (int) (Math.abs((double) i / estHll - 1) * 10000);
+                if (i < 10_000) {
+                    assertTrue(errBloom < 1000);
+                } else {
+                    assertEquals(Long.MAX_VALUE, estBloom);
+                }
+                assertTrue(errHll < 1000);
+            }
+        }
+    }
+
+    @Test
+    public void filterFunctionality() {
+        BloomFilter filter = BloomFilter.construct(100, 0.01);
+        String testValue = "test-value";
+
+        // Initially should not contain anything
+        assertFalse(filter.mayContain(testValue));
+
+        // Add the item and verify it's found
+        filter.add(testValue);
+        assertTrue(filter.mayContain(testValue));
+
+        // Verify another value is not found
+        assertFalse(filter.mayContain("different-value"));
+    }
+
+    @Test
+    public void filterWithMultipleEntries() {
+        BloomFilter filter = BloomFilter.construct(100, 0.01);
+
+        // Add multiple entries
+        for (int i = 0; i < 100; i++) {
+            filter.add("value-" + i);
+        }
+
+        // Verify all entries are found
+        for (int i = 0; i < 100; i++) {
+            assertTrue(filter.mayContain("value-" + i));
+        }
+    }
+
+    @Test
+    public void falsePositiveProbability() {
+        // Create a filter with high false positive probability for testing
+        double fpp = 0.3;
+        BloomFilter filter = BloomFilter.construct(100, fpp);
+
+        // Fill the filter to capacity
+        for (int i = 0; i < 100; i++) {
+            filter.add("existing-" + i);
+        }
+
+        // Test with values not in the filter
+        int falsePositives = 0;
+        int trials = 1000;
+
+        for (int i = 0; i < trials; i++) {
+            if (filter.mayContain("nonexistent-" + i)) {
+                falsePositives++;
+            }
+        }
+
+        // The false positive rate should be approximately fpp
+        double actualFpp = (double) falsePositives / trials;
+        assertTrue("False positive rate should be close to expected, got " + 
actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15);
+    }
+
+    @Test
+    public void invalidEntries() {
+        // Should throw exception for entries < 1
+        assertThrows(IllegalArgumentException.class,() -> 
BloomFilter.construct(0, 0.01));
+    }
+
+    @Test
+    public void invalidFppZero() {
+        // Should throw exception for fpp <= 0
+        assertThrows(IllegalArgumentException.class,() -> 
BloomFilter.construct(100, 0.0));
+    }
+
+    @Test
+    public void invalidFppOne() {
+        // Should throw exception for fpp >= 1
+        assertThrows(IllegalArgumentException.class,() -> 
BloomFilter.construct(100, 1.0));
+    }    
+
+} 
\ No newline at end of file
diff --git 
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
new file mode 100644
index 0000000000..fd3fb39bff
--- /dev/null
+++ 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.commons.collections;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+public class HyperLogLogTest {
+
+    @Test(expected = IllegalArgumentException.class)
+    public void illegalHyperLogLogTooSmall() {
+        new HyperLogLog(8, 0);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void illegalHyperLogLogNotPowerOfTwo() {
+        new HyperLogLog(30, 0);
+    }
+
+    @Test
+    public void smallSet() {
+        HyperLogLog hll100 = new HyperLogLog(16, 100);
+        assertEquals(0, hll100.estimate());
+        HyperLogLog hll0 = new HyperLogLog(16, 0);
+        assertEquals(0, hll0.estimate());
+        for (int i = 0; i < 10_000; i++) {
+            hll100.add(i % 100);
+            hll0.add(i % 100);
+        }
+        assertEquals(100, hll100.estimate());
+        assertNotEquals(100, hll0.estimate());
+    }
+
+    @Test
+    public void test() {
+        int testCount = 50;
+        for (int m = 16; m <= 128; m *= 2) {
+            double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, 
false, 2));
+            int min, max;
+            switch (m) {
+            case 16:
+                min = 22;
+                max = 23;
+                break;
+            case 32:
+                min = 15;
+                max = 16;
+                break;
+            case 64:
+                min = 10;
+                max = 11;
+                break;
+            case 128:
+                min = 7;
+                max = 8;
+                break;
+            default:
+                min = 0;
+                max = 0;
+                break;
+            }
+            assertTrue("m " + m + " expected " + min + ".." + max + " got " + 
avg, min < avg && avg < max);
+        }
+    }
+
+    private static double averageOverRange(int m, long maxSize, int testCount, 
boolean debug, double exponent) {
+        double sum = 0;
+        int count = 0;
+        for (long size = 1; size <= 20; size++) {
+            sum += test(m, size, testCount, debug, exponent);
+            count++;
+        }
+        for (long size = 22; size <= 300; size += size / 5) {
+            sum += test(m, size, testCount, debug, exponent);
+            count++;
+        }
+        for (long size = 400; size <= maxSize; size *= 2) {
+            sum += test(m, size, testCount, debug, exponent);
+            count++;
+        }
+        return sum / count;
+    }
+
+    private static double test(int m, long size, int testCount, boolean debug, 
double exponent) {
+        long x = 0;
+        long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+        long ns = System.nanoTime();
+        double sumSquareError = 0;
+        double sum = 0;
+        double sumFirst = 0;
+        int repeat = 10;
+        int runs = 2;
+        for (int test = 0; test < testCount; test++) {
+            HyperLogLog hll;
+            hll = new HyperLogLog(m, 0);
+            long baseX = x;
+            for (int i = 0; i < size; i++) {
+                hll.add(HashUtils.hash64(x));
+                x++;
+            }
+            long e = hll.estimate();
+            sum += e;
+            min = Math.min(min, e);
+            max = Math.max(max, e);
+            long error = e - size;
+            sumSquareError += error * error;
+            sumFirst += e;
+            for (int add = 0; add < repeat; add++) {
+                long x2 = baseX;
+                for (int i = 0; i < size; i++) {
+                    hll.add(HashUtils.hash64(x2));
+                    x2++;
+                }
+            }
+            e = hll.estimate();
+            sum += e;
+            min = Math.min(min, e);
+            max = Math.max(max, e);
+            error = e - size;
+            sumSquareError += error * error;
+        }
+        ns = System.nanoTime() - ns;
+        long nsPerItem = ns / testCount / runs / (1 + repeat) / size;
+        double stdDev = Math.sqrt(sumSquareError / testCount / runs);
+        double relStdDevP = stdDev / size * 100;
+        int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100);
+        int biasP = (int) (100 * (sum / testCount / runs / size) - 100);
+        if (debug) {
+            System.out.println("m " + m + " size " + size + " relStdDev% " + 
(int) relStdDevP +
+                    " range " + min + ".." + max +
+                    " biasFirst% " + biasFirstP +
+                    " bias% " + biasP +
+                    " avg " + (sum / testCount / runs) +
+                    " time " + nsPerItem);
+        }
+        // we try to reduce the relStdDevP, make sure there are no large values
+        // (trying to reduce sumSquareError directly
+        // would mean we care more about larger sets, but we don't)
+        return Math.pow(relStdDevP, exponent);
+    }
+
+} 
\ No newline at end of file

Reply via email to