This is an automated email from the ASF dual-hosted git repository.

thomasm pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new c438a3f96a OAK-10674 Use Oak's Bloom filter (#2412)
c438a3f96a is described below

commit c438a3f96a3b7c148a478b098aa59b7a57650c7d
Author: Thomas Mueller <thom...@apache.org>
AuthorDate: Mon Aug 4 20:26:20 2025 +0200

    OAK-10674 Use Oak's Bloom filter (#2412)
    
    * OAK-11787 ElasticRegexPropertyIndexTest.regexPropertyWithoutFlattened
    
    * OAK-10674 Use Oak's Bloom filter
    
    * OAK-10674 Use Oak's Bloom filter
    
    * OAK-10674 Use Oak's Bloom filter
    
    * OAK-10674 Use Oak's Bloom filter
    
    * OAK-10674 Use Oak's Bloom filter
---
 .../jackrabbit/oak/spi/blob/split/BlobIdSet.java   |  16 +-
 .../oak/spi/blob/split/BlobIdSetTest.java          | 238 +++++++++++++++++++++
 oak-commons/pom.xml                                |   4 +
 .../oak/commons/collections}/BloomFilter.java      |  34 ++-
 .../oak/commons/collections/HashUtils.java         |  21 +-
 .../oak/commons/collections}/HyperLogLog.java      |   4 +-
 .../oak/commons/collections/package-info.java      |   2 +-
 .../oak/commons/collections}/BloomFilterTest.java  |  92 +++++++-
 .../oak/commons/collections}/HyperLogLogTest.java  |  39 +---
 oak-run-commons/pom.xml                            |   5 +
 .../flatfile/analysis/modules/BinaryId.java        |   8 +-
 .../analysis/modules/DistinctBinarySize.java       |   4 +-
 .../modules/DistinctBinarySizeHistogram.java       |   2 +-
 .../flatfile/analysis/modules/PropertyStats.java   |   4 +-
 .../flatfile/analysis/utils/TopKValues.java        |   3 +-
 .../index/indexer/document/tree/Prefetcher.java    |   6 +-
 .../analysis/modules/PropertyStatsTest.java        |   2 +-
 .../analysis/utils/CountMinSketchTest.java         |   5 +-
 ...gLogTest.java => HyperLogLog3Linear64Test.java} |  39 +---
 .../flatfile/analysis/utils/TopKValuesTest.java    |  10 +-
 .../document/cache/CacheChangesTracker.java        |  44 ++--
 .../cache/CacheChangesTrackerConcurrencyTest.java  | 215 +++++++++++++++++++
 22 files changed, 658 insertions(+), 139 deletions(-)

diff --git 
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
 
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
index e62084fcfe..49acb59232 100644
--- 
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
+++ 
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
@@ -25,7 +25,6 @@ import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
 
 import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
@@ -33,8 +32,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.jackrabbit.guava.common.cache.Cache;
 import org.apache.jackrabbit.guava.common.cache.CacheBuilder;
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnels;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
 
 class BlobIdSet {
 
@@ -42,19 +40,19 @@ class BlobIdSet {
 
     private final File store;
 
-    private final BloomFilter<CharSequence> bloomFilter;
+    private final BloomFilter bloomFilter;
 
     private final Cache<String, Boolean> cache;
 
     BlobIdSet(String repositoryDir, String filename) {
         store = new File(new File(repositoryDir), filename);
-        bloomFilter = 
BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); // 
about 8MB
+        bloomFilter = BloomFilter.construct(9000000, 0.03); // 9M entries, 3% 
false positive rate
         cache = CacheBuilder.newBuilder().maximumSize(1000).build();
         fillBloomFilter();
     }
 
     synchronized boolean contains(String blobId) throws IOException {
-        if (!bloomFilter.apply(blobId)) {
+        if (!bloomFilter.mayContain(blobId)) {
             return false;
         }
         Boolean cached = cache.getIfPresent(blobId);
@@ -64,7 +62,7 @@ class BlobIdSet {
 
         if (isPresentInStore(blobId)) {
             cache.put(blobId, Boolean.TRUE);
-            bloomFilter.put(blobId);
+            bloomFilter.add(blobId);
             return true;
         } else {
             cache.put(blobId, Boolean.FALSE);
@@ -74,7 +72,7 @@ class BlobIdSet {
 
     synchronized void add(String blobId) throws IOException {
         addToStore(blobId);
-        bloomFilter.put(blobId);
+        bloomFilter.add(blobId);
         cache.put(blobId, Boolean.TRUE);
     }
 
@@ -114,7 +112,7 @@ class BlobIdSet {
             reader = new BufferedReader(new FileReader(store));
             String line;
             while ((line = reader.readLine()) != null) {
-                bloomFilter.put(line);
+                bloomFilter.add(line);
             }
         } catch (IOException e) {
             log.error("Can't fill bloom filter", e);
diff --git 
a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
 
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
new file mode 100644
index 0000000000..d933bf29d4
--- /dev/null
+++ 
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.spi.blob.split;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class BlobIdSetTest {
+
+    private static final String TEST_FILENAME = "test-blob-ids.txt";
+
+    private File tempDir;
+    private BlobIdSet blobIdSet;
+
+    @Before
+    public void setup() throws IOException {
+        tempDir = Files.createTempDirectory("blob-id-set-test").toFile();
+        blobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME);
+    }
+
+    @After
+    public void cleanup() {
+        File testFile = new File(tempDir, TEST_FILENAME);
+        if (testFile.exists()) {
+            testFile.delete();
+        }
+        tempDir.delete();
+    }
+
+    @Test
+    public void testAddAndContains() throws IOException {
+        String blobId = "testblob123";
+        Assert.assertFalse("New set should not contain blob ID", 
blobIdSet.contains(blobId));
+
+        blobIdSet.add(blobId);
+        Assert.assertTrue("Set should contain added blob ID", 
blobIdSet.contains(blobId));
+    }
+
+    @Test
+    public void testMultipleAddAndContains() throws IOException {
+        String[] blobIds = {"blob1", "blob2", "blob3", "blob4", "blob5"};
+
+        // Add all blob IDs
+        for (String blobId : blobIds) {
+            blobIdSet.add(blobId);
+        }
+
+        // Check all blob IDs are present
+        for (String blobId : blobIds) {
+            Assert.assertTrue("Set should contain: " + blobId, 
blobIdSet.contains(blobId));
+        }
+
+        // Check a non-existent blob ID
+        Assert.assertFalse("Set should not contain non-existent blob ID", 
blobIdSet.contains("nonexistentblob"));
+    }
+
+    @Test
+    public void testPersistenceAcrossInstances() throws IOException {
+        String blobId = "persistenceblob";
+
+        // Add to the first instance
+        blobIdSet.add(blobId);
+
+        // Create a new instance pointing to the same file
+        BlobIdSet newSet = new BlobIdSet(tempDir.getAbsolutePath(), 
TEST_FILENAME);
+
+        // Verify the new instance sees the previously added blob ID
+        Assert.assertTrue("New instance should see blob ID from file", 
newSet.contains(blobId));
+    }
+
+    @Test
+    public void testEmptyFileStore() throws IOException {
+        // Create with non-existent file
+        File nonExistentDir = 
Files.createTempDirectory("non-existent").toFile();
+        BlobIdSet emptySet = new BlobIdSet(nonExistentDir.getAbsolutePath(), 
"empty.txt");
+
+        // Should not contain any blob IDs
+        Assert.assertFalse(emptySet.contains("anyblob"));
+
+        // Clean up
+        nonExistentDir.delete();
+    }
+
+    @Test
+    public void testLargeNumberOfEntries() throws IOException {
+        // Add a moderate number of entries
+        int count = 1000;
+        for (int i = 0; i < count; i++) {
+            blobIdSet.add("blob-" + i);
+        }
+
+        // Verify all entries can be found
+        for (int i = 0; i < count; i++) {
+            Assert.assertTrue(blobIdSet.contains("blob-" + i));
+        }
+
+        // Non-existent entries should return false
+        for (int i = 0; i < 10; i++) {
+            Assert.assertFalse(blobIdSet.contains("nonexistent-blob-" + i));
+        }
+    }
+
+    @Test
+    public void testFileContainsAddedEntries() throws IOException {
+        // Add several blob IDs
+        String[] blobIds = {"a", "b", "c"};
+        for (String id : blobIds) {
+            blobIdSet.add(id);
+        }
+
+        // Verify the file contains the added blob IDs
+        File storeFile = new File(tempDir, TEST_FILENAME);
+        Assert.assertTrue("Store file should exist", storeFile.exists());
+
+        List<String> fileContent = Files.readAllLines(storeFile.toPath());
+        Assert.assertEquals("File should contain all added blob IDs", 
blobIds.length, fileContent.size());
+
+        for (int i = 0; i < blobIds.length; i++) {
+            Assert.assertEquals("File line should match blob ID", blobIds[i], 
fileContent.get(i));
+        }
+    }
+
+    @Test
+    public void testBloomFilterPreventsUnnecessaryFileReads() throws 
IOException {
+        // The bloom filter should prevent checking the file for non-existent 
IDs
+
+        // Add some entries to populate the bloom filter
+        for (int i = 0; i < 10; i++) {
+            blobIdSet.add("existing-" + i);
+        }
+
+        // Using a unique pattern for non-existent IDs to ensure they hash 
differently
+        // than the ones we added
+        for (int i = 0; i < 10; i++) {
+            Assert.assertFalse(blobIdSet.contains("definitely-not-there-" + 
i));
+        }
+    }
+
+    @Test
+    public void testCachingBehavior() throws IOException {
+        String blobId = "cachedblob";
+
+        // Add the blob ID
+        blobIdSet.add(blobId);
+
+        // First check should populate cache
+        Assert.assertTrue(blobIdSet.contains(blobId));
+
+        // Even if we delete the file, the cached result should be used
+        File storeFile = new File(tempDir, TEST_FILENAME);
+        storeFile.delete();
+
+        // Should still return true due to cache
+        Assert.assertTrue(blobIdSet.contains(blobId));
+    }
+
+    @Test
+    public void testContainsFindsExistingEntriesInFile() throws IOException {
+        // Create some blob IDs
+        String[] blobIds = {"fileblob1", "fileblob2", "fileblob3"};
+
+        // Write blob IDs directly to file (not using BlobIdSet.add())
+        File storeFile = new File(tempDir, TEST_FILENAME);
+        try (FileWriter writer = new FileWriter(storeFile)) {
+            for (String id : blobIds) {
+                writer.write(id + "\n");
+            }
+        }
+
+        // Create a new BlobIdSet instance (which should load from the file)
+        BlobIdSet newBlobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), 
TEST_FILENAME);
+
+        // Verify that contains() finds all the IDs
+        for (String id : blobIds) {
+            Assert.assertTrue("Should contain blob ID written directly to 
file: " + id, newBlobIdSet.contains(id));
+        }
+
+        // Verify a non-existent ID still returns false
+        Assert.assertFalse(newBlobIdSet.contains("notinfile"));
+    }
+
+    @Test
+    public void testBloomFilterFalsePositiveProbabilityLessThanThreePercent() 
throws IOException {
+        // Load the bloom filter with a significant number of entries (about 
5% of configured capacity)
+        final int numToAdd = 5000;
+
+        // Add entries to the bloom filter
+        for (int i = 0; i < numToAdd; i++) {
+            blobIdSet.add("entry-" + i);
+        }
+
+        // Test with non-existent entries using carefully crafted IDs
+        int numTests = 1000;
+        int falsePositives = 0;
+
+        // Use a distinct prefix to ensure test IDs don't conflict with added 
entries
+        for (int i = 0; i < numTests; i++) {
+            String nonExistentId = "non-existent-" + i;
+
+            if (blobIdSet.contains(nonExistentId)) {
+                falsePositives++;
+            }
+        }
+
+        final double falsePositiveRate = (double) falsePositives / numTests;
+
+        // Verify the false positive rate is below the configured 3%
+        Assert.assertTrue(
+                "False positive rate should be less than 3%, was: " + 
(falsePositiveRate * 100) + "%",
+                falsePositiveRate < 0.03
+        );
+    }
+}
\ No newline at end of file
diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml
index 5e8396cd65..8b774a03dc 100644
--- a/oak-commons/pom.xml
+++ b/oak-commons/pom.xml
@@ -102,6 +102,10 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-collections4</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.jackrabbit</groupId>
       <artifactId>jackrabbit-jcr-commons</artifactId>
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
similarity index 83%
rename from 
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
rename to 
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
index 3ad439a909..3c266ea1bb 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
 
 /**
  * A Bloom filter implementation.
@@ -44,6 +44,12 @@ public class BloomFilter {
      * @return the Bloom filter
      */
     public static BloomFilter construct(long n, double fpp) {
+        if (n <= 0) {
+            throw new IllegalArgumentException("n must be greater than 0");
+        }
+        if (fpp <= 0 || fpp >= 1) {
+            throw new IllegalArgumentException("fpp must be between 0 and 1");
+        }
         long m = calculateBits(n, fpp);
         int k = calculateK((double) m / n);
         return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
@@ -96,6 +102,15 @@ public class BloomFilter {
         return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
     }
 
+    /**
+     * Add an entry, using the 64-bit hash code of the given string.
+     *
+     * @param value the value to add
+     */
+    public void add(String value) {
+        add(HashUtils.hash64(value));
+    }
+
     /**
      * Add an entry.
      *
@@ -106,11 +121,22 @@ public class BloomFilter {
         long a = (hash >>> 32) | (hash << 32);
         long b = hash;
         for (int i = 0; i < k; i++) {
-            data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
+            data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
             a += b;
         }
     }
 
+    /**
+     * Whether the entry may be in the set.
+     *
+     * @param value the value to check
+     * @return true if the entry was added, or, with a certain false positive
+     *         probability, even if it was not added
+     */
+    public boolean mayContain(String value) {
+        return mayContain(HashUtils.hash64(value));
+    }
+
     /**
      * Whether the entry may be in the set.
      *
@@ -123,7 +149,7 @@ public class BloomFilter {
         long a = (hash >>> 32) | (hash << 32);
         long b = hash;
         for (int i = 0; i < k; i++) {
-            if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 
0) {
+            if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L << 
a) == 0) {
                 return false;
             }
             a += b;
@@ -159,4 +185,4 @@ public class BloomFilter {
         return (long) (-(m / k) * Math.log(1 - (x / m)));
     }
 
-}
\ No newline at end of file
+} 
\ No newline at end of file
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
similarity index 83%
rename from 
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
rename to 
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
index a2cd599e7b..90d998664c 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
@@ -16,17 +16,30 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.codec.digest.MurmurHash3;
 
 /**
  * A hash function utility class.
  */
-public class Hash {
+public class HashUtils {
 
-    private Hash() {
+    private HashUtils() {
         // utility class
     }
 
+    /**
+     * Calculate a 64-bit hash value from a string.
+     *
+     * @param s the string
+     * @return the hash value
+     */
+    public static long hash64(String s) {
+        return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0];
+    }
+
     /**
      * Calculate a 64-bit hash value from a value, using a seed.
      *
@@ -70,4 +83,4 @@ public class Hash {
         return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
     }
 
-}
+} 
\ No newline at end of file
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
similarity index 97%
rename from 
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
rename to 
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
index dd20a1e777..2054decff0 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
 
 import java.util.HashSet;
 
@@ -92,4 +92,4 @@ public class HyperLogLog {
         return est;
     }
 
-}
+} 
\ No newline at end of file
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
index 81c20db5ca..5d1490f210 100644
--- 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
@@ -21,7 +21,7 @@
  * Utilities for Java collections and streams.
  */
 @Internal(since = "1.0.0")
-@Version("2.2.0")
+@Version("2.3.0")
 package org.apache.jackrabbit.oak.commons.collections;
 import org.apache.jackrabbit.oak.commons.annotations.Internal;
 import org.osgi.annotation.versioning.Version;
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
similarity index 61%
rename from 
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
rename to 
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
index c3813f340d..464de8c4ef 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
+++ 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
@@ -16,10 +16,12 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThrows;
 
 import org.junit.Test;
 
@@ -61,9 +63,6 @@ public class BloomFilterTest {
         BloomFilter f = BloomFilter.construct(100, 0.01);
         assertEquals(960, f.getBitCount());
         assertEquals(7, f.getK());
-        f = BloomFilter.construct(0, 0.01);
-        assertEquals(0, f.getBitCount());
-        assertEquals(1, f.getK());
     }
 
     @Test
@@ -72,14 +71,14 @@ public class BloomFilterTest {
             int size = 500_000;
             BloomFilter f = BloomFilter.construct(size, fpp);
             for (int i = 0; i < size; i++) {
-                f.add(Hash.hash64(i));
+                f.add(HashUtils.hash64(i));
             }
             for (int i = 0; i < size; i++) {
-                assertTrue(f.mayContain(Hash.hash64(i)));
+                assertTrue(f.mayContain(HashUtils.hash64(i)));
             }
             int falsePositives = 0;
             for (int i = 0; i < size; i++) {
-                if (f.mayContain(Hash.hash64(i + size))) {
+                if (f.mayContain(HashUtils.hash64(i + size))) {
                     falsePositives++;
                 }
             }
@@ -105,7 +104,7 @@ public class BloomFilterTest {
         HyperLogLog hll = new HyperLogLog(1024, 0);
         // now we calculate estimations with both the Bloom filter and 
HyperLogLog
         for(int i = 0; i < 20_000; i++) {
-            long x = Hash.hash64(i);
+            long x = HashUtils.hash64(i);
             bloom.add(x);
             hll.add(x);
             if (i > 0 && i % 1000 == 0) {
@@ -123,4 +122,79 @@ public class BloomFilterTest {
         }
     }
 
-}
+    @Test
+    public void filterFunctionality() {
+        BloomFilter filter = BloomFilter.construct(100, 0.01);
+        String testValue = "test-value";
+
+        // Initially should not contain anything
+        assertFalse(filter.mayContain(testValue));
+
+        // Add the item and verify it's found
+        filter.add(testValue);
+        assertTrue(filter.mayContain(testValue));
+
+        // Verify another value is not found
+        assertFalse(filter.mayContain("different-value"));
+    }
+
+    @Test
+    public void filterWithMultipleEntries() {
+        BloomFilter filter = BloomFilter.construct(100, 0.01);
+
+        // Add multiple entries
+        for (int i = 0; i < 100; i++) {
+            filter.add("value-" + i);
+        }
+
+        // Verify all entries are found
+        for (int i = 0; i < 100; i++) {
+            assertTrue(filter.mayContain("value-" + i));
+        }
+    }
+
+    @Test
+    public void falsePositiveProbability() {
+        // Create a filter with high false positive probability for testing
+        double fpp = 0.3;
+        BloomFilter filter = BloomFilter.construct(100, fpp);
+
+        // Fill the filter to capacity
+        for (int i = 0; i < 100; i++) {
+            filter.add("existing-" + i);
+        }
+
+        // Test with values not in the filter
+        int falsePositives = 0;
+        int trials = 1000;
+
+        for (int i = 0; i < trials; i++) {
+            if (filter.mayContain("nonexistent-" + i)) {
+                falsePositives++;
+            }
+        }
+
+        // The false positive rate should be approximately fpp
+        double actualFpp = (double) falsePositives / trials;
+        assertTrue("False positive rate should be close to expected, got " + 
actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15);
+    }
+
+    @Test
+    public void invalidEntries() {
+        // Should throw exception for entries < 1
+        assertThrows(IllegalArgumentException.class,() -> 
BloomFilter.construct(0, 0.01));
+    }
+
+    @Test
+    public void invalidFppZero() {
+        // Should throw exception for fpp <= 0
+        assertThrows(IllegalArgumentException.class,() -> 
BloomFilter.construct(100, 0.0));
+    }
+
+    @Test
+    public void invalidFppOne() {
+        // Should throw exception for fpp >= 1
+        assertThrows(IllegalArgumentException.class,() -> 
BloomFilter.construct(100, 1.0));
+    }    
+
+} 
\ No newline at end of file
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
similarity index 84%
copy from 
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
copy to 
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
index 570af9f16f..fd3fb39bff 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
+++ 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
@@ -53,14 +53,10 @@ public class HyperLogLogTest {
     @Test
     public void test() {
         int testCount = 50;
-        for (int m = 8; m <= 128; m *= 2) {
+        for (int m = 16; m <= 128; m *= 2) {
             double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, 
false, 2));
             int min, max;
             switch (m) {
-            case 8:
-                min = 16;
-                max = 17;
-                break;
             case 16:
                 min = 22;
                 max = 23;
@@ -82,7 +78,6 @@ public class HyperLogLogTest {
                 max = 0;
                 break;
             }
-            // System.out.println(type + " expected " + min + ".." + max + " 
got " + avg);
             assertTrue("m " + m + " expected " + min + ".." + max + " got " + 
avg, min < avg && avg < max);
         }
     }
@@ -116,14 +111,10 @@ public class HyperLogLogTest {
         int runs = 2;
         for (int test = 0; test < testCount; test++) {
             HyperLogLog hll;
-            if (m == 8) {
-                hll = new HyperLogLogUsingLong(16, 0);
-            } else {
-                hll = new HyperLogLog(m, 0);
-            }
+            hll = new HyperLogLog(m, 0);
             long baseX = x;
             for (int i = 0; i < size; i++) {
-                hll.add(Hash.hash64(x));
+                hll.add(HashUtils.hash64(x));
                 x++;
             }
             long e = hll.estimate();
@@ -136,7 +127,7 @@ public class HyperLogLogTest {
             for (int add = 0; add < repeat; add++) {
                 long x2 = baseX;
                 for (int i = 0; i < size; i++) {
-                    hll.add(Hash.hash64(x2));
+                    hll.add(HashUtils.hash64(x2));
                     x2++;
                 }
             }
@@ -167,22 +158,4 @@ public class HyperLogLogTest {
         return Math.pow(relStdDevP, exponent);
     }
 
-    static class HyperLogLogUsingLong extends HyperLogLog {
-
-        private long value;
-
-        public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
-            super(m, maxSmallSetSize);
-        }
-
-        public void add(long hash) {
-            value = HyperLogLog3Linear64.add(value, hash);
-        }
-
-        public long estimate() {
-            return HyperLogLog3Linear64.estimate(value);
-        }
-
-    }
-
-}
+} 
\ No newline at end of file
diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml
index e1288dccc2..efd3657e51 100644
--- a/oak-run-commons/pom.xml
+++ b/oak-run-commons/pom.xml
@@ -110,6 +110,11 @@
             <artifactId>oak-jcr</artifactId>
             <version>${project.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.jackrabbit</groupId>
+            <artifactId>oak-commons</artifactId>
+            <version>${project.version}</version>
+        </dependency>
 
         <dependency>
             <groupId>org.jetbrains</groupId>
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
index 3ec93f175d..741c1cceae 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
@@ -20,7 +20,7 @@ package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul
 
 import java.util.Objects;
 
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 
 /**
  * A binary id.
@@ -51,9 +51,9 @@ public class BinaryId {
         // we need to hash again because some of the bits are fixed
         // in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx
         // (the hash64 is a reversible mapping, so there is no risk of 
conflicts)
-        this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 
16));
-        this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 
16));
-        this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32, 
Math.min(48, buff.length())), 16));
+        this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0, 
16), 16));
+        this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16, 
32), 16));
+        this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32, 
Math.min(48, buff.length())), 16));
     }
 
     @Override
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
index 272c7f357b..878375f1f8 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
@@ -28,8 +28,8 @@ import java.util.stream.Collectors;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 
 /**
  * Collects the number and size of distinct binaries.
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
index 349a3fe35b..061aa39d52 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
@@ -27,7 +27,7 @@ import java.util.stream.Collectors;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 
 /**
  * A histogram of distinct binaries. For each size range, we calculate the
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
index 80227f5744..c749a1ab59 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
@@ -31,7 +31,7 @@ import java.util.stream.Collectors;
 
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues;
 
@@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector {
         stats.values += p.getValues().length;
         if (stats.count > MIN_PROPERTY_COUNT) {
             for (String v : p.getValues()) {
-                long hash = Hash.hash64(v.hashCode(), seed);
+                long hash = HashUtils.hash64(v.hashCode(), seed);
                 stats.hll = HyperLogLog3Linear64.add(stats.hll, hash);
                 stats.size += v.length();
                 stats.maxSize = Math.max(stats.maxSize, v.length());
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
index 2c235f2197..48b4a63b0b 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
@@ -21,6 +21,7 @@ package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils
 import java.util.ArrayList;
 import java.util.HashMap;
 
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
 
 /**
@@ -91,7 +92,7 @@ public class TopKValues {
         if (countedCount > 1000) {
             skipRemaining = SKIP;
         }
-        long hash = Hash.hash64(value.hashCode());
+        long hash = HashUtils.hash64(value);
         long est = sketch.addAndEstimate(hash);
         if (est < min) {
             return;
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
index 10992f5c06..8f4e6d2cac 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
@@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -240,7 +240,7 @@ public class Prefetcher {
         // and then we use a secondary hash
         // otherwise the estimation is way off
         int h = blob.getContentIdentity().hashCode();
-        return Hash.hash64(h | (blob.length() << 32));
+        return HashUtils.hash64(h | (blob.length() << 32));
     }
 
     static enum PrefetchType {
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
index 7803a54400..5fe819335d 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
@@ -67,7 +67,7 @@ public class PropertyStatsTest {
             pc.add(n);
         }
         assertEquals("PropertyStats\n"
-                + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 
maxSize 11 top 
{\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+                + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 
maxSize 11 top 
{\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
                 + "", pc.toString());
     }
 
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
index 92935ca286..212c0fba22 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue;
 import java.util.Arrays;
 import java.util.Random;
 
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 import org.junit.Test;
 
 public class CountMinSketchTest {
@@ -83,11 +84,11 @@ public class CountMinSketchTest {
                     }
                     CountMinSketch est = new CountMinSketch(5, 16);
                     for (int i = 0; i < size; i++) {
-                        est.add(Hash.hash64(x + data[i]));
+                        est.add(HashUtils.hash64(x + data[i]));
                     }
                     int[] counts = getCounts(data);
                     for (int i = 0; i < 10; i++) {
-                        long e = est.estimate(Hash.hash64(x + i));
+                        long e = est.estimate(HashUtils.hash64(x + i));
                         long expectedPercent = (int) (100. * counts[i] / size);
                         long estPercent = (int) (100. * e / size);
                         if (debug) {
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
similarity index 83%
rename from 
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
rename to 
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
index 570af9f16f..eb7ab9a692 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
@@ -18,37 +18,13 @@
  */
 package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 import org.junit.Test;
 
-public class HyperLogLogTest {
-
-    @Test(expected = IllegalArgumentException.class)
-    public void illegalHyperLogLogTooSmall() {
-        new HyperLogLog(8, 0);
-    }
-
-    @Test(expected = IllegalArgumentException.class)
-    public void illegalHyperLogLogNotPowerOfTwo() {
-        new HyperLogLog(30, 0);
-    }
-
-    @Test
-    public void smallSet() {
-        HyperLogLog hll100 = new HyperLogLog(16, 100);
-        assertEquals(0, hll100.estimate());
-        HyperLogLog hll0 = new HyperLogLog(16, 0);
-        assertEquals(0, hll0.estimate());
-        for (int i = 0; i < 10_000; i++) {
-            hll100.add(i % 100);
-            hll0.add(i % 100);
-        }
-        assertEquals(100, hll100.estimate());
-        assertNotEquals(100, hll0.estimate());
-    }
+public class HyperLogLog3Linear64Test {
 
     @Test
     public void test() {
@@ -82,7 +58,6 @@ public class HyperLogLogTest {
                 max = 0;
                 break;
             }
-            // System.out.println(type + " expected " + min + ".." + max + " 
got " + avg);
             assertTrue("m " + m + " expected " + min + ".." + max + " got " + 
avg, min < avg && avg < max);
         }
     }
@@ -123,7 +98,7 @@ public class HyperLogLogTest {
             }
             long baseX = x;
             for (int i = 0; i < size; i++) {
-                hll.add(Hash.hash64(x));
+                hll.add(HashUtils.hash64(x));
                 x++;
             }
             long e = hll.estimate();
@@ -136,7 +111,7 @@ public class HyperLogLogTest {
             for (int add = 0; add < repeat; add++) {
                 long x2 = baseX;
                 for (int i = 0; i < size; i++) {
-                    hll.add(Hash.hash64(x2));
+                    hll.add(HashUtils.hash64(x2));
                     x2++;
                 }
             }
@@ -175,14 +150,16 @@ public class HyperLogLogTest {
             super(m, maxSmallSetSize);
         }
 
+        @Override
         public void add(long hash) {
             value = HyperLogLog3Linear64.add(value, hash);
         }
 
+        @Override
         public long estimate() {
             return HyperLogLog3Linear64.estimate(value);
         }
 
     }
 
-}
+} 
\ No newline at end of file
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
index 696e2e724f..dfd36a6eba 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
@@ -30,17 +30,17 @@ public class TopKValuesTest {
     public void test() {
         TopKValues v = new TopKValues(3);
         Random r = new Random(1);
-        for(int i=0; i<1000000; i++) {
-            if(r.nextBoolean()) {
+        for (int i = 0; i < 1000000; i++) {
+            if (r.nextBoolean()) {
                 v.add("common" + r.nextInt(2));
             } else {
                 v.add("rare" + r.nextInt(100));
             }
         }
-        
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}",
 v.toString());
+        
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}",
 v.toString());
         assertEquals(91809, v.getCount());
-        assertEquals(24849, v.getTopCount());
-        assertEquals(24652, v.getSecondCount());
+        assertEquals(24231, v.getTopCount());
+        assertEquals(23844, v.getSecondCount());
     }
 
 }
diff --git 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
index 14e30cd673..196f6188d4 100644
--- 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
+++ 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
@@ -16,14 +16,13 @@
  */
 package org.apache.jackrabbit.oak.plugins.document.cache;
 
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnel;
-import org.apache.jackrabbit.guava.common.hash.PrimitiveSink;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.Closeable;
 import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Predicate;
 
 public class CacheChangesTracker implements Closeable {
@@ -62,10 +61,10 @@ public class CacheChangesTracker implements Closeable {
         changeTrackers.remove(this);
 
         if (LOG.isDebugEnabled()) {
-            if (lazyBloomFilter.filter == null) {
+            if (lazyBloomFilter.filterRef.get() == null) {
                 LOG.debug("Disposing CacheChangesTracker for {}, no filter was 
needed", keyFilter);
             } else {
-                LOG.debug("Disposing CacheChangesTracker for {}, filter fpp 
was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp());
+                LOG.debug("Disposing CacheChangesTracker for {}, filter fpp 
was: {}", keyFilter, LazyBloomFilter.FPP);
             }
         }
     }
@@ -76,38 +75,33 @@ public class CacheChangesTracker implements Closeable {
 
         private final int entries;
 
-        private volatile BloomFilter<String> filter;
+        private final AtomicReference<BloomFilter> filterRef;
 
         public LazyBloomFilter(int entries) {
             this.entries = entries;
+            this.filterRef = new AtomicReference<>();
         }
 
         public synchronized void put(String entry) {
-            getFilter().put(entry);
+            getFilter().add(entry);
         }
 
         public boolean mightContain(String entry) {
-            if (filter == null) {
-                return false;
-            } else {
-                synchronized (this) {
-                    return filter.mightContain(entry);
-                }
-            }
+            BloomFilter f = filterRef.get();
+            return f != null && f.mayContain(entry);
         }
 
-        private BloomFilter<String> getFilter() {
-            if (filter == null) {
-                filter = BloomFilter.create(new Funnel<String>() {
-                    private static final long serialVersionUID = 
-7114267990225941161L;
-
-                    @Override
-                    public void funnel(String from, PrimitiveSink into) {
-                        into.putUnencodedChars(from);
-                    }
-                }, entries, FPP);
+        private BloomFilter getFilter() {
+            BloomFilter result = filterRef.get();
+            if (result == null) {
+                BloomFilter newFilter = BloomFilter.construct(entries, FPP);
+                if (filterRef.compareAndSet(null, newFilter)) {
+                    result = newFilter;
+                } else {
+                    result = filterRef.get();
+                }
             }
-            return filter;
+            return result;
         }
     }
 }
diff --git 
a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
 
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
new file mode 100644
index 0000000000..5d593df76c
--- /dev/null
+++ 
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.document.cache;
+
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests for CacheChangesTracker concurrency scenarios, particularly
+ * the LazyBloomFilter double-checked locking implementation.
+ */
+public class CacheChangesTrackerConcurrencyTest {
+
+    /**
+     * Test concurrent initialization of LazyBloomFilter to ensure
+     * double-checked locking prevents race conditions.
+     */
+    @Test
+    public void testLazyBloomFilterConcurrentInitialization() throws 
InterruptedException {
+        final int threadCount = 20;
+        final int entriesPerThread = 50;
+        final CountDownLatch startLatch = new CountDownLatch(1);
+        final CountDownLatch doneLatch = new CountDownLatch(threadCount);
+        final ExecutorService executor = 
Executors.newFixedThreadPool(threadCount);
+        
+        // Create a LazyBloomFilter instance
+        final CacheChangesTracker.LazyBloomFilter lazyFilter = 
+            new CacheChangesTracker.LazyBloomFilter(1000);
+        
+        final AtomicInteger putOperations = new AtomicInteger(0);
+        final List<Exception> exceptions = Collections.synchronizedList(new 
ArrayList<>());
+        
+        try {
+            // Create multiple threads that will all try to initialize and use 
the filter simultaneously
+            for (int i = 0; i < threadCount; i++) {
+                final int threadId = i;
+                executor.submit(() -> {
+                    try {
+                        // Wait for all threads to be ready
+                        startLatch.await();
+                        
+                        // Each thread adds multiple entries
+                        for (int j = 0; j < entriesPerThread; j++) {
+                            String key = "thread-" + threadId + "-key-" + j;
+                            lazyFilter.put(key);
+                            putOperations.incrementAndGet();
+                            
+                            // Add a small random delay to increase chance of 
race condition
+                            if (j % 10 == 0) {
+                                Thread.sleep(1);
+                            }
+                        }
+                    } catch (Exception e) {
+                        exceptions.add(e);
+                    } finally {
+                        doneLatch.countDown();
+                    }
+                });
+            }
+            
+            // Start all threads simultaneously
+            startLatch.countDown();
+            
+            // Wait for all threads to complete
+            assertTrue("Test timed out", doneLatch.await(30, 
TimeUnit.SECONDS));
+            
+            // Verify no exceptions occurred
+            if (!exceptions.isEmpty()) {
+                fail("Exceptions occurred during concurrent access: " + 
exceptions.get(0));
+            }
+            
+            // Verify all put operations completed
+            assertEquals(threadCount * entriesPerThread, putOperations.get());
+            
+            // Verify the filter works correctly after concurrent 
initialization
+            for (int i = 0; i < threadCount; i++) {
+                for (int j = 0; j < entriesPerThread; j++) {
+                    String key = "thread-" + i + "-key-" + j;
+                    assertTrue("Filter should contain key: " + key, 
lazyFilter.mightContain(key));
+                }
+            }
+            
+            // Verify false positive behavior (some keys that weren't added 
should return false)
+            int falsePositives = 0;
+            int testKeys = 100;
+            for (int i = 0; i < testKeys; i++) {
+                String nonExistentKey = "non-existent-key-" + i;
+                if (lazyFilter.mightContain(nonExistentKey)) {
+                    falsePositives++;
+                }
+            }
+            
+            // With 1000 entries and 1% FPP, we expect roughly 1% false 
positives for non-existent keys
+            // Allow for some variance but it shouldn't be too high
+            assertTrue("False positive rate too high: " + falsePositives + "/" 
+ testKeys, 
+                       falsePositives < testKeys * 0.05); // Allow up to 5% to 
account for variance
+            
+        } finally {
+            executor.shutdown();
+            if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
+                executor.shutdownNow();
+            }
+        }
+    }
+    
+    /**
+     * Test concurrent put and mightContain operations to ensure thread safety.
+     */
+    @Test
+    public void testLazyBloomFilterConcurrentReadWrite() throws 
InterruptedException {
+        final int threadCount = 10;
+        final int operationsPerThread = 100;
+        final CountDownLatch startLatch = new CountDownLatch(1);
+        final CountDownLatch doneLatch = new CountDownLatch(threadCount);
+        final ExecutorService executor = 
Executors.newFixedThreadPool(threadCount);
+        
+        final CacheChangesTracker.LazyBloomFilter lazyFilter = 
+            new CacheChangesTracker.LazyBloomFilter(2000);
+        
+        final AtomicInteger readOperations = new AtomicInteger(0);
+        final AtomicInteger writeOperations = new AtomicInteger(0);
+        final List<Exception> exceptions = Collections.synchronizedList(new 
ArrayList<>());
+        
+        try {
+            // Create mixed read/write threads
+            for (int i = 0; i < threadCount; i++) {
+                final int threadId = i;
+                final boolean isWriter = (i % 2 == 0);
+                
+                executor.submit(() -> {
+                    try {
+                        startLatch.await();
+                        
+                        for (int j = 0; j < operationsPerThread; j++) {
+                            String key = "mixed-thread-" + threadId + "-key-" 
+ j;
+                            
+                            if (isWriter || j < 10) { // Writers, or first few 
operations of readers
+                                lazyFilter.put(key);
+                                writeOperations.incrementAndGet();
+                            }
+                            
+                            // All threads also do reads
+                            boolean result = lazyFilter.mightContain(key);
+                            readOperations.incrementAndGet();
+                            
+                            // If we just wrote the key, it should definitely 
be found
+                            if (isWriter || j < 10) {
+                                assertTrue("Key should be found after being 
added: " + key, result);
+                            }
+                        }
+                    } catch (Exception e) {
+                        exceptions.add(e);
+                    } finally {
+                        doneLatch.countDown();
+                    }
+                });
+            }
+            
+            startLatch.countDown();
+            assertTrue("Test timed out", doneLatch.await(30, 
TimeUnit.SECONDS));
+            
+            if (!exceptions.isEmpty()) {
+                fail("Exceptions occurred during concurrent read/write: " + 
exceptions.get(0));
+            }
+            
+            assertTrue("Should have performed read operations", 
readOperations.get() > 0);
+            assertTrue("Should have performed write operations", 
writeOperations.get() > 0);
+            
+        } finally {
+            executor.shutdown();
+            if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
+                executor.shutdownNow();
+            }
+        }
+    }
+    
+    /**
+     * Test that LazyBloomFilter behaves correctly when filter is never 
initialized
+     * (i.e., only mightContain is called, never put).
+     */
+    @Test
+    public void testLazyBloomFilterNoInitialization() {
+        CacheChangesTracker.LazyBloomFilter lazyFilter = 
+            new CacheChangesTracker.LazyBloomFilter(1000);
+        
+        // Should return false for any key when filter is not initialized
+        assertFalse(lazyFilter.mightContain("any-key"));
+        assertFalse(lazyFilter.mightContain("another-key"));
+        assertFalse(lazyFilter.mightContain(""));
+    }
+} 
\ No newline at end of file

Reply via email to