This is an automated email from the ASF dual-hosted git repository.

reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new b0af31296c OAK-11855: update oak-blob to use oak-commons bloom filter 
(#2445)
b0af31296c is described below

commit b0af31296ce0794200560d844698012713390129
Author: Julian Reschke <[email protected]>
AuthorDate: Wed Aug 13 08:22:07 2025 +0200

    OAK-11855: update oak-blob to use oak-commons bloom filter (#2445)
---
 .../jackrabbit/oak/spi/blob/split/BlobIdSet.java   |  17 +-
 .../oak/spi/blob/split/BlobIdSetTest.java          | 237 +++++++++++++++++++++
 2 files changed, 244 insertions(+), 10 deletions(-)

diff --git 
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
 
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
index e62084fcfe..ffed802581 100644
--- 
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
+++ 
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.jackrabbit.oak.spi.blob.split;
 
 import java.io.BufferedReader;
@@ -25,7 +24,6 @@ import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
 
 import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
@@ -33,8 +31,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.jackrabbit.guava.common.cache.Cache;
 import org.apache.jackrabbit.guava.common.cache.CacheBuilder;
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnels;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
 
 class BlobIdSet {
 
@@ -42,19 +39,19 @@ class BlobIdSet {
 
     private final File store;
 
-    private final BloomFilter<CharSequence> bloomFilter;
+    private final BloomFilter bloomFilter;
 
     private final Cache<String, Boolean> cache;
 
     BlobIdSet(String repositoryDir, String filename) {
         store = new File(new File(repositoryDir), filename);
-        bloomFilter = 
BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); // 
about 8MB
+        bloomFilter = BloomFilter.construct(9000000, 0.03); // 9M entries, 3% 
false positive rate
         cache = CacheBuilder.newBuilder().maximumSize(1000).build();
         fillBloomFilter();
     }
 
     synchronized boolean contains(String blobId) throws IOException {
-        if (!bloomFilter.apply(blobId)) {
+        if (!bloomFilter.mayContain(blobId)) {
             return false;
         }
         Boolean cached = cache.getIfPresent(blobId);
@@ -64,7 +61,7 @@ class BlobIdSet {
 
         if (isPresentInStore(blobId)) {
             cache.put(blobId, Boolean.TRUE);
-            bloomFilter.put(blobId);
+            bloomFilter.add(blobId);
             return true;
         } else {
             cache.put(blobId, Boolean.FALSE);
@@ -74,7 +71,7 @@ class BlobIdSet {
 
     synchronized void add(String blobId) throws IOException {
         addToStore(blobId);
-        bloomFilter.put(blobId);
+        bloomFilter.add(blobId);
         cache.put(blobId, Boolean.TRUE);
     }
 
@@ -114,7 +111,7 @@ class BlobIdSet {
             reader = new BufferedReader(new FileReader(store));
             String line;
             while ((line = reader.readLine()) != null) {
-                bloomFilter.put(line);
+                bloomFilter.add(line);
             }
         } catch (IOException e) {
             log.error("Can't fill bloom filter", e);
diff --git 
a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
 
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
new file mode 100644
index 0000000000..c31e6be9e5
--- /dev/null
+++ 
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.spi.blob.split;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class BlobIdSetTest {
+
+    private static final String TEST_FILENAME = "test-blob-ids.txt";
+
+    private File tempDir;
+    private BlobIdSet blobIdSet;
+
+    @Before
+    public void setup() throws IOException {
+        tempDir = Files.createTempDirectory("blob-id-set-test").toFile();
+        blobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME);
+    }
+
+    @After
+    public void cleanup() {
+        File testFile = new File(tempDir, TEST_FILENAME);
+        if (testFile.exists()) {
+            testFile.delete();
+        }
+        tempDir.delete();
+    }
+
+    @Test
+    public void testAddAndContains() throws IOException {
+        String blobId = "testblob123";
+        Assert.assertFalse("New set should not contain blob ID", 
blobIdSet.contains(blobId));
+
+        blobIdSet.add(blobId);
+        Assert.assertTrue("Set should contain added blob ID", 
blobIdSet.contains(blobId));
+    }
+
+    @Test
+    public void testMultipleAddAndContains() throws IOException {
+        String[] blobIds = {"blob1", "blob2", "blob3", "blob4", "blob5"};
+
+        // Add all blob IDs
+        for (String blobId : blobIds) {
+            blobIdSet.add(blobId);
+        }
+
+        // Check all blob IDs are present
+        for (String blobId : blobIds) {
+            Assert.assertTrue("Set should contain: " + blobId, 
blobIdSet.contains(blobId));
+        }
+
+        // Check a non-existent blob ID
+        Assert.assertFalse("Set should not contain non-existent blob ID", 
blobIdSet.contains("nonexistentblob"));
+    }
+
+    @Test
+    public void testPersistenceAcrossInstances() throws IOException {
+        String blobId = "persistenceblob";
+
+        // Add to the first instance
+        blobIdSet.add(blobId);
+
+        // Create a new instance pointing to the same file
+        BlobIdSet newSet = new BlobIdSet(tempDir.getAbsolutePath(), 
TEST_FILENAME);
+
+        // Verify the new instance sees the previously added blob ID
+        Assert.assertTrue("New instance should see blob ID from file", 
newSet.contains(blobId));
+    }
+
+    @Test
+    public void testEmptyFileStore() throws IOException {
+        // Create with non-existent file
+        File nonExistentDir = 
Files.createTempDirectory("non-existent").toFile();
+        BlobIdSet emptySet = new BlobIdSet(nonExistentDir.getAbsolutePath(), 
"empty.txt");
+
+        // Should not contain any blob IDs
+        Assert.assertFalse(emptySet.contains("anyblob"));
+
+        // Clean up
+        nonExistentDir.delete();
+    }
+
+    @Test
+    public void testLargeNumberOfEntries() throws IOException {
+        // Add a moderate number of entries
+        int count = 1000;
+        for (int i = 0; i < count; i++) {
+            blobIdSet.add("blob-" + i);
+        }
+
+        // Verify all entries can be found
+        for (int i = 0; i < count; i++) {
+            Assert.assertTrue(blobIdSet.contains("blob-" + i));
+        }
+
+        // Non-existent entries should return false
+        for (int i = 0; i < 10; i++) {
+            Assert.assertFalse(blobIdSet.contains("nonexistent-blob-" + i));
+        }
+    }
+
+    @Test
+    public void testFileContainsAddedEntries() throws IOException {
+        // Add several blob IDs
+        String[] blobIds = {"a", "b", "c"};
+        for (String id : blobIds) {
+            blobIdSet.add(id);
+        }
+
+        // Verify the file contains the added blob IDs
+        File storeFile = new File(tempDir, TEST_FILENAME);
+        Assert.assertTrue("Store file should exist", storeFile.exists());
+
+        List<String> fileContent = Files.readAllLines(storeFile.toPath());
+        Assert.assertEquals("File should contain all added blob IDs", 
blobIds.length, fileContent.size());
+
+        for (int i = 0; i < blobIds.length; i++) {
+            Assert.assertEquals("File line should match blob ID", blobIds[i], 
fileContent.get(i));
+        }
+    }
+
+    @Test
+    public void testBloomFilterPreventsUnnecessaryFileReads() throws 
IOException {
+        // The bloom filter should prevent checking the file for non-existent 
IDs
+
+        // Add some entries to populate the bloom filter
+        for (int i = 0; i < 10; i++) {
+            blobIdSet.add("existing-" + i);
+        }
+
+        // Using a unique pattern for non-existent IDs to ensure they hash 
differently
+        // than the ones we added
+        for (int i = 0; i < 10; i++) {
+            Assert.assertFalse(blobIdSet.contains("definitely-not-there-" + 
i));
+        }
+    }
+
+    @Test
+    public void testCachingBehavior() throws IOException {
+        String blobId = "cachedblob";
+
+        // Add the blob ID
+        blobIdSet.add(blobId);
+
+        // First check should populate cache
+        Assert.assertTrue(blobIdSet.contains(blobId));
+
+        // Even if we delete the file, the cached result should be used
+        File storeFile = new File(tempDir, TEST_FILENAME);
+        storeFile.delete();
+
+        // Should still return true due to cache
+        Assert.assertTrue(blobIdSet.contains(blobId));
+    }
+
+    @Test
+    public void testContainsFindsExistingEntriesInFile() throws IOException {
+        // Create some blob IDs
+        String[] blobIds = {"fileblob1", "fileblob2", "fileblob3"};
+
+        // Write blob IDs directly to file (not using BlobIdSet.add())
+        File storeFile = new File(tempDir, TEST_FILENAME);
+        try (FileWriter writer = new FileWriter(storeFile)) {
+            for (String id : blobIds) {
+                writer.write(id + "\n");
+            }
+        }
+
+        // Create a new BlobIdSet instance (which should load from the file)
+        BlobIdSet newBlobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), 
TEST_FILENAME);
+
+        // Verify that contains() finds all the IDs
+        for (String id : blobIds) {
+            Assert.assertTrue("Should contain blob ID written directly to 
file: " + id, newBlobIdSet.contains(id));
+        }
+
+        // Verify a non-existent ID still returns false
+        Assert.assertFalse(newBlobIdSet.contains("notinfile"));
+    }
+
+    @Test
+    public void testBloomFilterFalsePositiveProbabilityLessThanThreePercent() 
throws IOException {
+        // Load the bloom filter with a significant number of entries (about 
5% of configured capacity)
+        final int numToAdd = 5000;
+
+        // Add entries to the bloom filter
+        for (int i = 0; i < numToAdd; i++) {
+            blobIdSet.add("entry-" + i);
+        }
+
+        // Test with non-existent entries using carefully crafted IDs
+        int numTests = 1000;
+        int falsePositives = 0;
+
+        // Use a distinct prefix to ensure test IDs don't conflict with added 
entries
+        for (int i = 0; i < numTests; i++) {
+            String nonExistentId = "non-existent-" + i;
+
+            if (blobIdSet.contains(nonExistentId)) {
+                falsePositives++;
+            }
+        }
+
+        final double falsePositiveRate = (double) falsePositives / numTests;
+
+        // Verify the false positive rate is below the configured 3%
+        Assert.assertTrue(
+                "False positive rate should be less than 3%, was: " + 
(falsePositiveRate * 100) + "%",
+                falsePositiveRate < 0.03
+        );
+    }
+}
\ No newline at end of file

Reply via email to