This is an automated email from the ASF dual-hosted git repository. daim pushed a commit to branch OAK-11809 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 00ff914d06891b843588901766ae9f5c9c7e31e2 Author: rishabhdaim <[email protected]> AuthorDate: Wed Jul 23 13:40:52 2025 +0530 OAK-11809 : replaced guava's bloom filter with apache commons-collections4 implementation --- .../jackrabbit/oak/spi/blob/split/BlobIdSet.java | 23 +-- oak-commons/pom.xml | 4 + .../oak/commons/collections/BloomFilterUtils.java | 78 ++++++++ .../oak/commons/collections/package-info.java | 2 +- .../commons/collections/BloomFilterUtilsTest.java | 202 +++++++++++++++++++++ .../document/cache/CacheChangesTracker.java | 27 ++- 6 files changed, 308 insertions(+), 28 deletions(-) diff --git a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java index e62084fcfe..6ed481890d 100644 --- a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java +++ b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java @@ -21,20 +21,20 @@ package org.apache.jackrabbit.oak.spi.blob.split; import java.io.BufferedReader; import java.io.File; -import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; -import java.nio.charset.StandardCharsets; +import org.apache.commons.collections4.bloomfilter.BloomFilter; +import org.apache.commons.collections4.bloomfilter.Hasher; +import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter; import org.apache.commons.io.IOUtils; +import org.apache.jackrabbit.oak.commons.collections.BloomFilterUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.jackrabbit.guava.common.cache.Cache; import org.apache.jackrabbit.guava.common.cache.CacheBuilder; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnels; class BlobIdSet { @@ -42,19 +42,20 @@ class BlobIdSet { private final File store; - private final BloomFilter<CharSequence> bloomFilter; + private final BloomFilter<SimpleBloomFilter> bloomFilter; private final Cache<String, Boolean> cache; BlobIdSet(String repositoryDir, String filename) { store = new File(new File(repositoryDir), filename); - bloomFilter = BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); // about 8MB + bloomFilter = BloomFilterUtils.createFilter(9000000, 0.03); // about 8MB & 3% false positive rate cache = CacheBuilder.newBuilder().maximumSize(1000).build(); fillBloomFilter(); } synchronized boolean contains(String blobId) throws IOException { - if (!bloomFilter.apply(blobId)) { + final Hasher hasher = BloomFilterUtils.hasher(blobId); + if (!bloomFilter.contains(hasher)) { return false; } Boolean cached = cache.getIfPresent(blobId); @@ -64,7 +65,7 @@ class BlobIdSet { if (isPresentInStore(blobId)) { cache.put(blobId, Boolean.TRUE); - bloomFilter.put(blobId); + bloomFilter.merge(hasher); return true; } else { cache.put(blobId, Boolean.FALSE); @@ -74,11 +75,11 @@ class BlobIdSet { synchronized void add(String blobId) throws IOException { addToStore(blobId); - bloomFilter.put(blobId); + bloomFilter.merge(BloomFilterUtils.hasher(blobId)); cache.put(blobId, Boolean.TRUE); } - private boolean isPresentInStore(String blobId) throws FileNotFoundException, IOException { + private boolean isPresentInStore(String blobId) throws IOException { if (!store.exists()) { return false; } @@ -114,7 +115,7 @@ class BlobIdSet { reader = new BufferedReader(new FileReader(store)); String line; while ((line = reader.readLine()) != null) { - bloomFilter.put(line); + bloomFilter.merge(BloomFilterUtils.hasher(line)); } } catch (IOException e) { log.error("Can't fill bloom filter", e); diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml index 5e8396cd65..8b774a03dc 100644 --- a/oak-commons/pom.xml +++ b/oak-commons/pom.xml @@ -102,6 +102,10 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-collections4</artifactId> </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + </dependency> <dependency> <groupId>org.apache.jackrabbit</groupId> <artifactId>jackrabbit-jcr-commons</artifactId> diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtils.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtils.java new file mode 100644 index 0000000000..ba212ce75e --- /dev/null +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtils.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.commons.collections; + +import org.apache.commons.codec.digest.MurmurHash3; +import org.apache.commons.collections4.bloomfilter.BloomFilter; +import org.apache.commons.collections4.bloomfilter.EnhancedDoubleHasher; +import org.apache.commons.collections4.bloomfilter.Hasher; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter; +import org.jetbrains.annotations.NotNull; + +import java.nio.charset.StandardCharsets; + +public class BloomFilterUtils { + + private BloomFilterUtils() { + // no instances for you + } + + /** + * Creates a new Bloom filter with the specified expected entries and false positive probability. + * <p> + * The method constructs a properly configured Bloom filter by first calculating the optimal + * filter shape (bit size and hash functions) based on the provided parameters, then instantiating + * a SimpleBloomFilter with that shape. + * <p> + * The resulting Bloom filter provides efficient membership testing with a predictable + * false positive rate. + * + * @param entries the expected number of entries to be inserted into the filter + * @param fpp the desired false positive probability (between 0 and 1 exclusive) + * @return a new empty BloomFilter instance configured with the specified parameters + * @throws IllegalArgumentException if entries is less than 1 or if fpp is not between 0 and 1 exclusive + */ + public static BloomFilter<SimpleBloomFilter> createFilter(final int entries, final double fpp) { + final Shape shape = Shape.fromNP(entries, fpp); + return new SimpleBloomFilter(shape); + } + + + /** + * Creates a Bloom filter-compatible {@link Hasher} for the provided string value. + * + * @param value the string value to hash must not be null + * @return a non-null {@link Hasher} implementation based on the hash of the input string + * @throws NullPointerException if the input value is null + */ + + // This method generates a 128-bit MurmurHash3 hash from the UTF-8 representation + // of the input string, then creates an {@link EnhancedDoubleHasher} using the + // two 64-bit components of the hash. The resulting hasher can be used with Bloom + // filters from the Apache Commons Collections. + + public static @NotNull Hasher hasher(final @NotNull String value) { + final long[] hashed128 = MurmurHash3.hash128(value.getBytes(StandardCharsets.UTF_8)); + return new EnhancedDoubleHasher(hashed128[0], hashed128[1]); + } + + +} diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java index 81c20db5ca..5d1490f210 100644 --- a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java @@ -21,7 +21,7 @@ * Utilities for Java collections and streams. */ @Internal(since = "1.0.0") -@Version("2.2.0") +@Version("2.3.0") package org.apache.jackrabbit.oak.commons.collections; import org.apache.jackrabbit.oak.commons.annotations.Internal; import org.osgi.annotation.versioning.Version; diff --git a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtilsTest.java b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtilsTest.java new file mode 100644 index 0000000000..bbd0ec25ed --- /dev/null +++ b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterUtilsTest.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.commons.collections; + +import org.apache.commons.collections4.bloomfilter.BloomFilter; +import org.apache.commons.collections4.bloomfilter.EnhancedDoubleHasher; +import org.apache.commons.collections4.bloomfilter.Hasher; +import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter; +import org.junit.Assert; +import org.junit.Test; + +import java.lang.reflect.Field; + +public class BloomFilterUtilsTest { + + @Test + public void testCreateFilterWithValidParameters() { + int entries = 1000; + double fpp = 0.01; + BloomFilter<SimpleBloomFilter> filter = BloomFilterUtils.createFilter(entries, fpp); + + Assert.assertNotNull(filter); + Assert.assertTrue(filter instanceof SimpleBloomFilter); + } + + @Test + public void testFilterFunctionality() { + BloomFilter<SimpleBloomFilter> filter = BloomFilterUtils.createFilter(100, 0.01); + String testValue = "test-value"; + + // Initially should not contain anything + Assert.assertFalse(filter.contains(BloomFilterUtils.hasher(testValue))); + + // Add the item and verify it's found + filter.merge(BloomFilterUtils.hasher(testValue)); + Assert.assertTrue(filter.contains(BloomFilterUtils.hasher(testValue))); + + // Verify another value is not found + Assert.assertFalse(filter.contains(BloomFilterUtils.hasher("different-value"))); + } + + @Test + public void testFilterWithMultipleEntries() { + BloomFilter<SimpleBloomFilter> filter = BloomFilterUtils.createFilter(1000, 0.01); + + // Add multiple entries + for (int i = 0; i < 100; i++) { + filter.merge(BloomFilterUtils.hasher("value-" + i)); + } + + // Verify all entries are found + for (int i = 0; i < 100; i++) { + Assert.assertTrue(filter.contains(BloomFilterUtils.hasher("value-" + i))); + } + } + + @Test + public void testFalsePositiveProbability() { + // Create a filter with high false positive probability for testing + double fpp = 0.3; + BloomFilter<SimpleBloomFilter> filter = BloomFilterUtils.createFilter(100, fpp); + + // Fill the filter to capacity + for (int i = 0; i < 100; i++) { + filter.merge(BloomFilterUtils.hasher("existing-" + i)); + } + + // Test with values not in the filter + int falsePositives = 0; + int trials = 1000; + + for (int i = 0; i < trials; i++) { + if (filter.contains(BloomFilterUtils.hasher("nonexistent-" + i))) { + falsePositives++; + } + } + + // The false positive rate should be approximately fpp + double actualFpp = (double) falsePositives / trials; + Assert.assertTrue("False positive rate should be close to expected", Math.abs(actualFpp - fpp) < 0.15); + } + + @Test + public void testInvalidEntries() { + // Should throw exception for entries < 1 + Assert.assertThrows(IllegalArgumentException.class,() -> BloomFilterUtils.createFilter(0, 0.01)); + } + + @Test + public void testInvalidFppZero() { + // Should throw exception for fpp <= 0 + Assert.assertThrows(IllegalArgumentException.class,() -> BloomFilterUtils.createFilter(100, 0.0)); + } + + @Test + public void testInvalidFppOne() { + // Should throw exception for fpp >= 1 + Assert.assertThrows(IllegalArgumentException.class,() -> BloomFilterUtils.createFilter(100, 1.0)); + } + + @Test + public void testHasherWithNormalString() { + // Create a hasher from a string + Hasher hasher = BloomFilterUtils.hasher("test string"); + + // Verify the hasher is not null and correct type + Assert.assertNotNull(hasher); + Assert.assertTrue(hasher instanceof EnhancedDoubleHasher); + } + + @Test + public void testHasherWithEmptyString() { + // Empty strings should also work + Hasher hasher = BloomFilterUtils.hasher(""); + + Assert.assertNotNull(hasher); + Assert.assertTrue(hasher instanceof EnhancedDoubleHasher); + } + + @Test + public void testConsistentHashing() throws ReflectiveOperationException { + // Create two hashers from the same string + Hasher hasher1 = BloomFilterUtils.hasher("consistent"); + Hasher hasher2 = BloomFilterUtils.hasher("consistent"); + + // Same string should produce identical hash values + Assert.assertTrue(compareHasherContents(hasher1, hasher2)); + } + + @Test + public void testDifferentStringsProduceDifferentHashes() throws ReflectiveOperationException { + Hasher hasher1 = BloomFilterUtils.hasher("string1"); + Hasher hasher2 = BloomFilterUtils.hasher("string2"); + + // Different strings should produce different hashers + Assert.assertFalse(compareHasherContents(hasher1, hasher2)); + } + + @Test + public void testHasherWithNullThrowsNPE() { + // Method should throw NullPointerException when given null + Assert.assertThrows(NullPointerException.class, () -> BloomFilterUtils.hasher(null)); + } + + @Test + public void testHasherWithSpecialCharacters() { + // Special characters should be handled properly + Hasher hasher = BloomFilterUtils.hasher("!@#$%^&*()_+"); + Assert.assertNotNull(hasher); + } + + // util mehtods + /** + * Compares two EnhancedDoubleHasher instances by examining their internal hash values. + * + * @param h1 first Hasher to compare, must be EnhancedDoubleHasher + * @param h2 second Hasher to compare, must be EnhancedDoubleHasher + * @return true if both hashers have identical internal hash values + * @throws IllegalArgumentException if either hasher is not EnhancedDoubleHasher + * @throws ReflectiveOperationException if reflection fails + */ + private boolean compareHasherContents(final Hasher h1, final Hasher h2) + throws ReflectiveOperationException { + + if (!(h1 instanceof EnhancedDoubleHasher) || + !(h2 instanceof EnhancedDoubleHasher)) { + throw new IllegalArgumentException("Both hashers must be EnhancedDoubleHasher instances"); + } + + // Use reflection to access the private fields + Field value1Field = EnhancedDoubleHasher.class.getDeclaredField("initial"); + Field value2Field = EnhancedDoubleHasher.class.getDeclaredField("increment"); + + value1Field.setAccessible(true); + value2Field.setAccessible(true); + + long value1A = (long) value1Field.get(h1); + long value2A = (long) value2Field.get(h1); + long value1B = (long) value1Field.get(h2); + long value2B = (long) value2Field.get(h2); + + return (value1A == value1B) && (value2A == value2B); + } + +} \ No newline at end of file diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java index 14e30cd673..d31b1086a7 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java @@ -16,9 +16,10 @@ */ package org.apache.jackrabbit.oak.plugins.document.cache; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnel; -import org.apache.jackrabbit.guava.common.hash.PrimitiveSink; +import org.apache.commons.collections4.bloomfilter.BloomFilter; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.apache.commons.collections4.bloomfilter.SimpleBloomFilter; +import org.apache.jackrabbit.oak.commons.collections.BloomFilterUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -65,7 +66,8 @@ public class CacheChangesTracker implements Closeable { if (lazyBloomFilter.filter == null) { LOG.debug("Disposing CacheChangesTracker for {}, no filter was needed", keyFilter); } else { - LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp()); + Shape shape = lazyBloomFilter.filter.getShape(); + LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, shape.getProbability((int)shape.estimateMaxN())); } } } @@ -76,14 +78,14 @@ public class CacheChangesTracker implements Closeable { private final int entries; - private volatile BloomFilter<String> filter; + private volatile BloomFilter<SimpleBloomFilter> filter; public LazyBloomFilter(int entries) { this.entries = entries; } public synchronized void put(String entry) { - getFilter().put(entry); + getFilter().merge(BloomFilterUtils.hasher(entry)); } public boolean mightContain(String entry) { @@ -91,21 +93,14 @@ public class CacheChangesTracker implements Closeable { return false; } else { synchronized (this) { - return filter.mightContain(entry); + return filter.contains(BloomFilterUtils.hasher(entry)); } } } - private BloomFilter<String> getFilter() { + private BloomFilter<SimpleBloomFilter> getFilter() { if (filter == null) { - filter = BloomFilter.create(new Funnel<String>() { - private static final long serialVersionUID = -7114267990225941161L; - - @Override - public void funnel(String from, PrimitiveSink into) { - into.putUnencodedChars(from); - } - }, entries, FPP); + filter = BloomFilterUtils.createFilter(entries, FPP); } return filter; }
