This is an automated email from the ASF dual-hosted git repository.

daim pushed a commit to branch OAK-11809
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git

commit 00ff914d06891b843588901766ae9f5c9c7e31e2
Author: rishabhdaim <[email protected]>
AuthorDate: Wed Jul 23 13:40:52 2025 +0530

    OAK-11809 : replaced guava's bloom filter with apache commons-collections4 
implementation
---
 .../jackrabbit/oak/spi/blob/split/BlobIdSet.java   |  23 +--
 oak-commons/pom.xml                                |   4 +
 .../oak/commons/collections/BloomFilterUtils.java  |  78 ++++++++
 .../oak/commons/collections/package-info.java      |   2 +-
 .../commons/collections/BloomFilterUtilsTest.java  | 202 +++++++++++++++++++++
 .../document/cache/CacheChangesTracker.java        |  27 ++-
 6 files changed, 308 insertions(+), 28 deletions(-)

diff --git 
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
 
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
index e62084fcfe..6ed481890d 100644
--- 
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
+++ 
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
@@ -21,20 +21,20 @@ package org.apache.jackrabbit.oak.spi.blob.split;
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
 
+import org.apache.commons.collections4.bloomfilter.BloomFilter;
+import org.apache.commons.collections4.bloomfilter.Hasher;
+import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter;
 import org.apache.commons.io.IOUtils;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilterUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.jackrabbit.guava.common.cache.Cache;
 import org.apache.jackrabbit.guava.common.cache.CacheBuilder;
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnels;
 
 class BlobIdSet {
 
@@ -42,19 +42,20 @@ class BlobIdSet {
 
     private final File store;
 
-    private final BloomFilter<CharSequence> bloomFilter;
+    private final BloomFilter<SimpleBloomFilter> bloomFilter;
 
     private final Cache<String, Boolean> cache;
 
     BlobIdSet(String repositoryDir, String filename) {
         store = new File(new File(repositoryDir), filename);
-        bloomFilter = 
BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); // 
about 8MB
+        bloomFilter = BloomFilterUtils.createFilter(9000000, 0.03); // about 
8MB & 3% false positive rate
         cache = CacheBuilder.newBuilder().maximumSize(1000).build();
         fillBloomFilter();
     }
 
     synchronized boolean contains(String blobId) throws IOException {
-        if (!bloomFilter.apply(blobId)) {
+        final Hasher hasher = BloomFilterUtils.hasher(blobId);
+        if (!bloomFilter.contains(hasher)) {
             return false;
         }
         Boolean cached = cache.getIfPresent(blobId);
@@ -64,7 +65,7 @@ class BlobIdSet {
 
         if (isPresentInStore(blobId)) {
             cache.put(blobId, Boolean.TRUE);
-            bloomFilter.put(blobId);
+            bloomFilter.merge(hasher);
             return true;
         } else {
             cache.put(blobId, Boolean.FALSE);
@@ -74,11 +75,11 @@ class BlobIdSet {
 
     synchronized void add(String blobId) throws IOException {
         addToStore(blobId);
-        bloomFilter.put(blobId);
+        bloomFilter.merge(BloomFilterUtils.hasher(blobId));
         cache.put(blobId, Boolean.TRUE);
     }
 
-    private boolean isPresentInStore(String blobId) throws 
FileNotFoundException, IOException {
+    private boolean isPresentInStore(String blobId) throws IOException {
         if (!store.exists()) {
             return false;
         }
@@ -114,7 +115,7 @@ class BlobIdSet {
             reader = new BufferedReader(new FileReader(store));
             String line;
             while ((line = reader.readLine()) != null) {
-                bloomFilter.put(line);
+                bloomFilter.merge(BloomFilterUtils.hasher(line));
             }
         } catch (IOException e) {
             log.error("Can't fill bloom filter", e);
diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml
index 5e8396cd65..8b774a03dc 100644
--- a/oak-commons/pom.xml
+++ b/oak-commons/pom.xml
@@ -102,6 +102,10 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-collections4</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.jackrabbit</groupId>
       <artifactId>jackrabbit-jcr-commons</artifactId>
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtils.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtils.java
new file mode 100644
index 0000000000..ba212ce75e
--- /dev/null
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtils.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.commons.collections;
+
+import org.apache.commons.codec.digest.MurmurHash3;
+import org.apache.commons.collections4.bloomfilter.BloomFilter;
+import org.apache.commons.collections4.bloomfilter.EnhancedDoubleHasher;
+import org.apache.commons.collections4.bloomfilter.Hasher;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter;
+import org.jetbrains.annotations.NotNull;
+
+import java.nio.charset.StandardCharsets;
+
+public class BloomFilterUtils {
+
+    private BloomFilterUtils() {
+        // no instances for you
+    }
+
+    /**
+     * Creates a new Bloom filter with the specified expected entries and 
false positive probability.
+     * <p>
+     * The method constructs a properly configured Bloom filter by first 
calculating the optimal
+     * filter shape (bit size and hash functions) based on the provided 
parameters, then instantiating
+     * a SimpleBloomFilter with that shape.
+     * <p>
+     * The resulting Bloom filter provides efficient membership testing with a 
predictable
+     * false positive rate.
+     *
+     * @param entries the expected number of entries to be inserted into the 
filter
+     * @param fpp the desired false positive probability (between 0 and 1 
exclusive)
+     * @return a new empty BloomFilter instance configured with the specified 
parameters
+     * @throws IllegalArgumentException if entries is less than 1 or if fpp is 
not between 0 and 1 exclusive
+     */
+    public static BloomFilter<SimpleBloomFilter> createFilter(final int 
entries, final double fpp) {
+        final Shape shape = Shape.fromNP(entries, fpp);
+        return new SimpleBloomFilter(shape);
+    }
+
+
+    /**
+     * Creates a Bloom filter-compatible {@link Hasher} for the provided 
string value.
+     *
+     * @param value the string value to hash must not be null
+     * @return a non-null {@link Hasher} implementation based on the hash of 
the input string
+     * @throws NullPointerException if the input value is null
+     */
+
+    // This method generates a 128-bit MurmurHash3 hash from the UTF-8 
representation
+    // of the input string, then creates an {@link EnhancedDoubleHasher} using 
the
+    // two 64-bit components of the hash. The resulting hasher can be used 
with Bloom
+    // filters from the Apache Commons Collections.
+
+    public static @NotNull Hasher hasher(final @NotNull String value) {
+        final long[] hashed128 = 
MurmurHash3.hash128(value.getBytes(StandardCharsets.UTF_8));
+        return new EnhancedDoubleHasher(hashed128[0], hashed128[1]);
+    }
+
+
+}
diff --git 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
index 81c20db5ca..5d1490f210 100644
--- 
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
+++ 
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
@@ -21,7 +21,7 @@
  * Utilities for Java collections and streams.
  */
 @Internal(since = "1.0.0")
-@Version("2.2.0")
+@Version("2.3.0")
 package org.apache.jackrabbit.oak.commons.collections;
 import org.apache.jackrabbit.oak.commons.annotations.Internal;
 import org.osgi.annotation.versioning.Version;
diff --git 
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtilsTest.java
 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtilsTest.java
new file mode 100644
index 0000000000..bbd0ec25ed
--- /dev/null
+++ 
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtilsTest.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.commons.collections;
+
+import org.apache.commons.collections4.bloomfilter.BloomFilter;
+import org.apache.commons.collections4.bloomfilter.EnhancedDoubleHasher;
+import org.apache.commons.collections4.bloomfilter.Hasher;
+import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.lang.reflect.Field;
+
+public class BloomFilterUtilsTest {
+
+    @Test
+    public void testCreateFilterWithValidParameters() {
+        int entries = 1000;
+        double fpp = 0.01;
+        BloomFilter<SimpleBloomFilter> filter = 
BloomFilterUtils.createFilter(entries, fpp);
+
+        Assert.assertNotNull(filter);
+        Assert.assertTrue(filter instanceof SimpleBloomFilter);
+    }
+
+    @Test
+    public void testFilterFunctionality() {
+        BloomFilter<SimpleBloomFilter> filter = 
BloomFilterUtils.createFilter(100, 0.01);
+        String testValue = "test-value";
+
+        // Initially should not contain anything
+        
Assert.assertFalse(filter.contains(BloomFilterUtils.hasher(testValue)));
+
+        // Add the item and verify it's found
+        filter.merge(BloomFilterUtils.hasher(testValue));
+        Assert.assertTrue(filter.contains(BloomFilterUtils.hasher(testValue)));
+
+        // Verify another value is not found
+        
Assert.assertFalse(filter.contains(BloomFilterUtils.hasher("different-value")));
+    }
+
+    @Test
+    public void testFilterWithMultipleEntries() {
+        BloomFilter<SimpleBloomFilter> filter = 
BloomFilterUtils.createFilter(1000, 0.01);
+
+        // Add multiple entries
+        for (int i = 0; i < 100; i++) {
+            filter.merge(BloomFilterUtils.hasher("value-" + i));
+        }
+
+        // Verify all entries are found
+        for (int i = 0; i < 100; i++) {
+            Assert.assertTrue(filter.contains(BloomFilterUtils.hasher("value-" 
+ i)));
+        }
+    }
+
+    @Test
+    public void testFalsePositiveProbability() {
+        // Create a filter with high false positive probability for testing
+        double fpp = 0.3;
+        BloomFilter<SimpleBloomFilter> filter = 
BloomFilterUtils.createFilter(100, fpp);
+
+        // Fill the filter to capacity
+        for (int i = 0; i < 100; i++) {
+            filter.merge(BloomFilterUtils.hasher("existing-" + i));
+        }
+
+        // Test with values not in the filter
+        int falsePositives = 0;
+        int trials = 1000;
+
+        for (int i = 0; i < trials; i++) {
+            if (filter.contains(BloomFilterUtils.hasher("nonexistent-" + i))) {
+                falsePositives++;
+            }
+        }
+
+        // The false positive rate should be approximately fpp
+        double actualFpp = (double) falsePositives / trials;
+        Assert.assertTrue("False positive rate should be close to expected", 
Math.abs(actualFpp - fpp) < 0.15);
+    }
+
+    @Test
+    public void testInvalidEntries() {
+        // Should throw exception for entries < 1
+        Assert.assertThrows(IllegalArgumentException.class,() -> 
BloomFilterUtils.createFilter(0, 0.01));
+    }
+
+    @Test
+    public void testInvalidFppZero() {
+        // Should throw exception for fpp <= 0
+        Assert.assertThrows(IllegalArgumentException.class,() -> 
BloomFilterUtils.createFilter(100, 0.0));
+    }
+
+    @Test
+    public void testInvalidFppOne() {
+        // Should throw exception for fpp >= 1
+        Assert.assertThrows(IllegalArgumentException.class,() -> 
BloomFilterUtils.createFilter(100, 1.0));
+    }
+
+    @Test
+    public void testHasherWithNormalString() {
+        // Create a hasher from a string
+        Hasher hasher = BloomFilterUtils.hasher("test string");
+
+        // Verify the hasher is not null and correct type
+        Assert.assertNotNull(hasher);
+        Assert.assertTrue(hasher instanceof EnhancedDoubleHasher);
+    }
+
+    @Test
+    public void testHasherWithEmptyString() {
+        // Empty strings should also work
+        Hasher hasher = BloomFilterUtils.hasher("");
+
+        Assert.assertNotNull(hasher);
+        Assert.assertTrue(hasher instanceof EnhancedDoubleHasher);
+    }
+
+    @Test
+    public void testConsistentHashing() throws ReflectiveOperationException {
+        // Create two hashers from the same string
+        Hasher hasher1 = BloomFilterUtils.hasher("consistent");
+        Hasher hasher2 = BloomFilterUtils.hasher("consistent");
+
+        // Same string should produce identical hash values
+        Assert.assertTrue(compareHasherContents(hasher1, hasher2));
+    }
+
+    @Test
+    public void testDifferentStringsProduceDifferentHashes() throws 
ReflectiveOperationException {
+        Hasher hasher1 = BloomFilterUtils.hasher("string1");
+        Hasher hasher2 = BloomFilterUtils.hasher("string2");
+
+        // Different strings should produce different hashers
+        Assert.assertFalse(compareHasherContents(hasher1, hasher2));
+    }
+
+    @Test
+    public void testHasherWithNullThrowsNPE() {
+        // Method should throw NullPointerException when given null
+        Assert.assertThrows(NullPointerException.class, () -> 
BloomFilterUtils.hasher(null));
+    }
+
+    @Test
+    public void testHasherWithSpecialCharacters() {
+        // Special characters should be handled properly
+        Hasher hasher = BloomFilterUtils.hasher("!@#$%^&*()_+");
+        Assert.assertNotNull(hasher);
+    }
+
+    // util mehtods
+    /**
+     * Compares two EnhancedDoubleHasher instances by examining their internal 
hash values.
+     *
+     * @param h1 first Hasher to compare, must be EnhancedDoubleHasher
+     * @param h2 second Hasher to compare, must be EnhancedDoubleHasher
+     * @return true if both hashers have identical internal hash values
+     * @throws IllegalArgumentException if either hasher is not 
EnhancedDoubleHasher
+     * @throws ReflectiveOperationException if reflection fails
+     */
+    private boolean compareHasherContents(final Hasher h1, final Hasher h2)
+            throws ReflectiveOperationException {
+
+        if (!(h1 instanceof EnhancedDoubleHasher) ||
+                !(h2 instanceof EnhancedDoubleHasher)) {
+            throw new IllegalArgumentException("Both hashers must be 
EnhancedDoubleHasher instances");
+        }
+
+        // Use reflection to access the private fields
+        Field value1Field = 
EnhancedDoubleHasher.class.getDeclaredField("initial");
+        Field value2Field = 
EnhancedDoubleHasher.class.getDeclaredField("increment");
+
+        value1Field.setAccessible(true);
+        value2Field.setAccessible(true);
+
+        long value1A = (long) value1Field.get(h1);
+        long value2A = (long) value2Field.get(h1);
+        long value1B = (long) value1Field.get(h2);
+        long value2B = (long) value2Field.get(h2);
+
+        return (value1A == value1B) && (value2A == value2B);
+    }
+
+}
\ No newline at end of file
diff --git 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
index 14e30cd673..d31b1086a7 100644
--- 
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
+++ 
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
@@ -16,9 +16,10 @@
  */
 package org.apache.jackrabbit.oak.plugins.document.cache;
 
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnel;
-import org.apache.jackrabbit.guava.common.hash.PrimitiveSink;
+import org.apache.commons.collections4.bloomfilter.BloomFilter;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilterUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -65,7 +66,8 @@ public class CacheChangesTracker implements Closeable {
             if (lazyBloomFilter.filter == null) {
                 LOG.debug("Disposing CacheChangesTracker for {}, no filter was 
needed", keyFilter);
             } else {
-                LOG.debug("Disposing CacheChangesTracker for {}, filter fpp 
was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp());
+                Shape shape = lazyBloomFilter.filter.getShape();
+                LOG.debug("Disposing CacheChangesTracker for {}, filter fpp 
was: {}", keyFilter, shape.getProbability((int)shape.estimateMaxN()));
             }
         }
     }
@@ -76,14 +78,14 @@ public class CacheChangesTracker implements Closeable {
 
         private final int entries;
 
-        private volatile BloomFilter<String> filter;
+        private volatile BloomFilter<SimpleBloomFilter> filter;
 
         public LazyBloomFilter(int entries) {
             this.entries = entries;
         }
 
         public synchronized void put(String entry) {
-            getFilter().put(entry);
+            getFilter().merge(BloomFilterUtils.hasher(entry));
         }
 
         public boolean mightContain(String entry) {
@@ -91,21 +93,14 @@ public class CacheChangesTracker implements Closeable {
                 return false;
             } else {
                 synchronized (this) {
-                    return filter.mightContain(entry);
+                    return filter.contains(BloomFilterUtils.hasher(entry));
                 }
             }
         }
 
-        private BloomFilter<String> getFilter() {
+        private BloomFilter<SimpleBloomFilter> getFilter() {
             if (filter == null) {
-                filter = BloomFilter.create(new Funnel<String>() {
-                    private static final long serialVersionUID = 
-7114267990225941161L;
-
-                    @Override
-                    public void funnel(String from, PrimitiveSink into) {
-                        into.putUnencodedChars(from);
-                    }
-                }, entries, FPP);
+                filter = BloomFilterUtils.createFilter(entries, FPP);
             }
             return filter;
         }

Reply via email to