This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch OAK-10674-b in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit f7cd60321388882e80c49f3407d602d3abd2529a Author: Thomas Mueller <[email protected]> AuthorDate: Tue Jul 29 15:26:33 2025 +0200 OAK-10674 Use Oak's Bloom filter --- .../jackrabbit/oak/spi/blob/split/BlobIdSet.java | 16 +- .../oak/spi/blob/split/BlobIdSetTest.java | 238 +++++++++++++++++++++ oak-commons/pom.xml | 4 + .../oak/commons/collections/BloomFilter.class | Bin 0 -> 1858 bytes .../oak/commons/collections}/BloomFilter.java | 34 ++- .../jackrabbit/oak/commons/collections/Hash.class | Bin 0 -> 510 bytes .../oak/commons/collections/HashUtils.java | 21 +- .../oak/commons/collections/HyperLogLog.class | Bin 0 -> 2147 bytes .../oak/commons/collections}/HyperLogLog.java | 4 +- .../oak/commons/collections/package-info.java | 2 +- .../oak/commons/collections}/BloomFilterTest.java | 92 +++++++- .../oak/commons/collections}/HyperLogLogTest.java | 38 +--- oak-run-commons/pom.xml | 5 + .../flatfile/analysis/modules/BinaryId.java | 8 +- .../analysis/modules/DistinctBinarySize.java | 4 +- .../modules/DistinctBinarySizeHistogram.java | 2 +- .../flatfile/analysis/modules/PropertyStats.java | 4 +- .../flatfile/analysis/utils/TopKValues.java | 3 +- .../index/indexer/document/tree/Prefetcher.java | 6 +- .../analysis/modules/PropertyStatsTest.java | 2 +- .../analysis/utils/CountMinSketchTest.java | 5 +- ...gLogTest.java => HyperLogLog3Linear64Test.java} | 36 +--- .../flatfile/analysis/utils/TopKValuesTest.java | 10 +- .../document/cache/CacheChangesTracker.java | 23 +- 24 files changed, 429 insertions(+), 128 deletions(-) diff --git a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java index e62084fcfe..a3916938f2 100644 --- a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java +++ b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java @@ -25,7 +25,6 @@ import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; -import java.nio.charset.StandardCharsets; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; @@ -33,8 +32,7 @@ import org.slf4j.LoggerFactory; import org.apache.jackrabbit.guava.common.cache.Cache; import org.apache.jackrabbit.guava.common.cache.CacheBuilder; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnels; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; class BlobIdSet { @@ -42,19 +40,19 @@ class BlobIdSet { private final File store; - private final BloomFilter<CharSequence> bloomFilter; + private final BloomFilter bloomFilter; private final Cache<String, Boolean> cache; BlobIdSet(String repositoryDir, String filename) { store = new File(new File(repositoryDir), filename); - bloomFilter = BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); // about 8MB + bloomFilter = BloomFilter.construct(9000000, 0.01); // 9M entries, 1% false positive rate cache = CacheBuilder.newBuilder().maximumSize(1000).build(); fillBloomFilter(); } synchronized boolean contains(String blobId) throws IOException { - if (!bloomFilter.apply(blobId)) { + if (!bloomFilter.mayContain(blobId)) { return false; } Boolean cached = cache.getIfPresent(blobId); @@ -64,7 +62,7 @@ class BlobIdSet { if (isPresentInStore(blobId)) { cache.put(blobId, Boolean.TRUE); - bloomFilter.put(blobId); + bloomFilter.add(blobId); return true; } else { cache.put(blobId, Boolean.FALSE); @@ -74,7 +72,7 @@ class BlobIdSet { synchronized void add(String blobId) throws IOException { addToStore(blobId); - bloomFilter.put(blobId); + bloomFilter.add(blobId); cache.put(blobId, Boolean.TRUE); } @@ -114,7 +112,7 @@ class BlobIdSet { reader = new BufferedReader(new FileReader(store)); String line; while ((line = reader.readLine()) != null) { - bloomFilter.put(line); + bloomFilter.add(line); } } catch (IOException e) { log.error("Can't fill bloom filter", e); diff --git a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java new file mode 100644 index 0000000000..d933bf29d4 --- /dev/null +++ b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.spi.blob.split; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class BlobIdSetTest { + + private static final String TEST_FILENAME = "test-blob-ids.txt"; + + private File tempDir; + private BlobIdSet blobIdSet; + + @Before + public void setup() throws IOException { + tempDir = Files.createTempDirectory("blob-id-set-test").toFile(); + blobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME); + } + + @After + public void cleanup() { + File testFile = new File(tempDir, TEST_FILENAME); + if (testFile.exists()) { + testFile.delete(); + } + tempDir.delete(); + } + + @Test + public void testAddAndContains() throws IOException { + String blobId = "testblob123"; + Assert.assertFalse("New set should not contain blob ID", blobIdSet.contains(blobId)); + + blobIdSet.add(blobId); + Assert.assertTrue("Set should contain added blob ID", blobIdSet.contains(blobId)); + } + + @Test + public void testMultipleAddAndContains() throws IOException { + String[] blobIds = {"blob1", "blob2", "blob3", "blob4", "blob5"}; + + // Add all blob IDs + for (String blobId : blobIds) { + blobIdSet.add(blobId); + } + + // Check all blob IDs are present + for (String blobId : blobIds) { + Assert.assertTrue("Set should contain: " + blobId, blobIdSet.contains(blobId)); + } + + // Check a non-existent blob ID + Assert.assertFalse("Set should not contain non-existent blob ID", blobIdSet.contains("nonexistentblob")); + } + + @Test + public void testPersistenceAcrossInstances() throws IOException { + String blobId = "persistenceblob"; + + // Add to the first instance + blobIdSet.add(blobId); + + // Create a new instance pointing to the same file + BlobIdSet newSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME); + + // Verify the new instance sees the previously added blob ID + Assert.assertTrue("New instance should see blob ID from file", newSet.contains(blobId)); + } + + @Test + public void testEmptyFileStore() throws IOException { + // Create with non-existent file + File nonExistentDir = Files.createTempDirectory("non-existent").toFile(); + BlobIdSet emptySet = new BlobIdSet(nonExistentDir.getAbsolutePath(), "empty.txt"); + + // Should not contain any blob IDs + Assert.assertFalse(emptySet.contains("anyblob")); + + // Clean up + nonExistentDir.delete(); + } + + @Test + public void testLargeNumberOfEntries() throws IOException { + // Add a moderate number of entries + int count = 1000; + for (int i = 0; i < count; i++) { + blobIdSet.add("blob-" + i); + } + + // Verify all entries can be found + for (int i = 0; i < count; i++) { + Assert.assertTrue(blobIdSet.contains("blob-" + i)); + } + + // Non-existent entries should return false + for (int i = 0; i < 10; i++) { + Assert.assertFalse(blobIdSet.contains("nonexistent-blob-" + i)); + } + } + + @Test + public void testFileContainsAddedEntries() throws IOException { + // Add several blob IDs + String[] blobIds = {"a", "b", "c"}; + for (String id : blobIds) { + blobIdSet.add(id); + } + + // Verify the file contains the added blob IDs + File storeFile = new File(tempDir, TEST_FILENAME); + Assert.assertTrue("Store file should exist", storeFile.exists()); + + List<String> fileContent = Files.readAllLines(storeFile.toPath()); + Assert.assertEquals("File should contain all added blob IDs", blobIds.length, fileContent.size()); + + for (int i = 0; i < blobIds.length; i++) { + Assert.assertEquals("File line should match blob ID", blobIds[i], fileContent.get(i)); + } + } + + @Test + public void testBloomFilterPreventsUnnecessaryFileReads() throws IOException { + // The bloom filter should prevent checking the file for non-existent IDs + + // Add some entries to populate the bloom filter + for (int i = 0; i < 10; i++) { + blobIdSet.add("existing-" + i); + } + + // Using a unique pattern for non-existent IDs to ensure they hash differently + // than the ones we added + for (int i = 0; i < 10; i++) { + Assert.assertFalse(blobIdSet.contains("definitely-not-there-" + i)); + } + } + + @Test + public void testCachingBehavior() throws IOException { + String blobId = "cachedblob"; + + // Add the blob ID + blobIdSet.add(blobId); + + // First check should populate cache + Assert.assertTrue(blobIdSet.contains(blobId)); + + // Even if we delete the file, the cached result should be used + File storeFile = new File(tempDir, TEST_FILENAME); + storeFile.delete(); + + // Should still return true due to cache + Assert.assertTrue(blobIdSet.contains(blobId)); + } + + @Test + public void testContainsFindsExistingEntriesInFile() throws IOException { + // Create some blob IDs + String[] blobIds = {"fileblob1", "fileblob2", "fileblob3"}; + + // Write blob IDs directly to file (not using BlobIdSet.add()) + File storeFile = new File(tempDir, TEST_FILENAME); + try (FileWriter writer = new FileWriter(storeFile)) { + for (String id : blobIds) { + writer.write(id + "\n"); + } + } + + // Create a new BlobIdSet instance (which should load from the file) + BlobIdSet newBlobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME); + + // Verify that contains() finds all the IDs + for (String id : blobIds) { + Assert.assertTrue("Should contain blob ID written directly to file: " + id, newBlobIdSet.contains(id)); + } + + // Verify a non-existent ID still returns false + Assert.assertFalse(newBlobIdSet.contains("notinfile")); + } + + @Test + public void testBloomFilterFalsePositiveProbabilityLessThanThreePercent() throws IOException { + // Load the bloom filter with a significant number of entries (about 5% of configured capacity) + final int numToAdd = 5000; + + // Add entries to the bloom filter + for (int i = 0; i < numToAdd; i++) { + blobIdSet.add("entry-" + i); + } + + // Test with non-existent entries using carefully crafted IDs + int numTests = 1000; + int falsePositives = 0; + + // Use a distinct prefix to ensure test IDs don't conflict with added entries + for (int i = 0; i < numTests; i++) { + String nonExistentId = "non-existent-" + i; + + if (blobIdSet.contains(nonExistentId)) { + falsePositives++; + } + } + + final double falsePositiveRate = (double) falsePositives / numTests; + + // Verify the false positive rate is below the configured 3% + Assert.assertTrue( + "False positive rate should be less than 3%, was: " + (falsePositiveRate * 100) + "%", + falsePositiveRate < 0.03 + ); + } +} \ No newline at end of file diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml index 5e8396cd65..8b774a03dc 100644 --- a/oak-commons/pom.xml +++ b/oak-commons/pom.xml @@ -102,6 +102,10 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-collections4</artifactId> </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + </dependency> <dependency> <groupId>org.apache.jackrabbit</groupId> <artifactId>jackrabbit-jcr-commons</artifactId> diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.class b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.class new file mode 100644 index 0000000000..7212602c35 Binary files /dev/null and b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.class differ diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java similarity index 83% rename from oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java rename to oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java index 3ad439a909..3c266ea1bb 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; /** * A Bloom filter implementation. @@ -44,6 +44,12 @@ public class BloomFilter { * @return the Bloom filter */ public static BloomFilter construct(long n, double fpp) { + if (n <= 0) { + throw new IllegalArgumentException("n must be greater than 0"); + } + if (fpp <= 0 || fpp >= 1) { + throw new IllegalArgumentException("fpp must be between 0 and 1"); + } long m = calculateBits(n, fpp); int k = calculateK((double) m / n); return new BloomFilter(new long[(int) ((m + 63) / 64)], k); @@ -96,6 +102,15 @@ public class BloomFilter { return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k); } + /** + * Add an entry, using an int hash code. + * + * @param value the value to check + */ + public void add(String value) { + add(HashUtils.hash64(value)); + } + /** * Add an entry. * @@ -106,11 +121,22 @@ public class BloomFilter { long a = (hash >>> 32) | (hash << 32); long b = hash; for (int i = 0; i < k; i++) { - data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a; + data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a; a += b; } } + /** + * Whether the entry may be in the set. + * + * @param value the value to check + * @return true if the entry was added, or, with a certain false positive + * probability, even if it was not added + */ + public boolean mayContain(String value) { + return mayContain(HashUtils.hash64(value)); + } + /** * Whether the entry may be in the set. * @@ -123,7 +149,7 @@ public class BloomFilter { long a = (hash >>> 32) | (hash << 32); long b = hash; for (int i = 0; i < k; i++) { - if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) { + if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) { return false; } a += b; @@ -159,4 +185,4 @@ public class BloomFilter { return (long) (-(m / k) * Math.log(1 - (x / m))); } -} \ No newline at end of file +} \ No newline at end of file diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/Hash.class b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/Hash.class new file mode 100644 index 0000000000..a2ce8b37b5 Binary files /dev/null and b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/Hash.class differ diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java similarity index 83% rename from oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java rename to oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java index a2cd599e7b..90d998664c 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java @@ -16,17 +16,30 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; + +import java.nio.charset.StandardCharsets; +import org.apache.commons.codec.digest.MurmurHash3; /** * A hash function utility class. */ -public class Hash { +public class HashUtils { - private Hash() { + private HashUtils() { // utility class } + /** + * Calculate a 64-bit hash value from a string. + * + * @param s the string + * @return the hash value + */ + public static long hash64(String s) { + return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0]; + } + /** * Calculate a 64-bit hash value from a value, using a seed. * @@ -70,4 +83,4 @@ public class Hash { return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32); } -} +} \ No newline at end of file diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.class b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.class new file mode 100644 index 0000000000..9dc87020c4 Binary files /dev/null and b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.class differ diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java similarity index 97% rename from oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java rename to oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java index dd20a1e777..2054decff0 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; import java.util.HashSet; @@ -92,4 +92,4 @@ public class HyperLogLog { return est; } -} +} \ No newline at end of file diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java index 81c20db5ca..5d1490f210 100644 --- a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java @@ -21,7 +21,7 @@ * Utilities for Java collections and streams. */ @Internal(since = "1.0.0") -@Version("2.2.0") +@Version("2.3.0") package org.apache.jackrabbit.oak.commons.collections; import org.apache.jackrabbit.oak.commons.annotations.Internal; import org.osgi.annotation.versioning.Version; diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java similarity index 61% rename from oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java rename to oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java index c3813f340d..464de8c4ef 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java +++ b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java @@ -16,10 +16,12 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertThrows; import org.junit.Test; @@ -61,9 +63,6 @@ public class BloomFilterTest { BloomFilter f = BloomFilter.construct(100, 0.01); assertEquals(960, f.getBitCount()); assertEquals(7, f.getK()); - f = BloomFilter.construct(0, 0.01); - assertEquals(0, f.getBitCount()); - assertEquals(1, f.getK()); } @Test @@ -72,14 +71,14 @@ public class BloomFilterTest { int size = 500_000; BloomFilter f = BloomFilter.construct(size, fpp); for (int i = 0; i < size; i++) { - f.add(Hash.hash64(i)); + f.add(HashUtils.hash64(i)); } for (int i = 0; i < size; i++) { - assertTrue(f.mayContain(Hash.hash64(i))); + assertTrue(f.mayContain(HashUtils.hash64(i))); } int falsePositives = 0; for (int i = 0; i < size; i++) { - if (f.mayContain(Hash.hash64(i + size))) { + if (f.mayContain(HashUtils.hash64(i + size))) { falsePositives++; } } @@ -105,7 +104,7 @@ public class BloomFilterTest { HyperLogLog hll = new HyperLogLog(1024, 0); // now we calculate estimations with both the Bloom filter and HyperLogLog for(int i = 0; i < 20_000; i++) { - long x = Hash.hash64(i); + long x = HashUtils.hash64(i); bloom.add(x); hll.add(x); if (i > 0 && i % 1000 == 0) { @@ -123,4 +122,79 @@ public class BloomFilterTest { } } -} + @Test + public void filterFunctionality() { + BloomFilter filter = BloomFilter.construct(100, 0.01); + String testValue = "test-value"; + + // Initially should not contain anything + assertFalse(filter.mayContain(testValue)); + + // Add the item and verify it's found + filter.add(testValue); + assertTrue(filter.mayContain(testValue)); + + // Verify another value is not found + assertFalse(filter.mayContain("different-value")); + } + + @Test + public void filterWithMultipleEntries() { + BloomFilter filter = BloomFilter.construct(100, 0.01); + + // Add multiple entries + for (int i = 0; i < 100; i++) { + filter.add("value-" + i); + } + + // Verify all entries are found + for (int i = 0; i < 100; i++) { + assertTrue(filter.mayContain("value-" + i)); + } + } + + @Test + public void falsePositiveProbability() { + // Create a filter with high false positive probability for testing + double fpp = 0.3; + BloomFilter filter = BloomFilter.construct(100, fpp); + + // Fill the filter to capacity + for (int i = 0; i < 100; i++) { + filter.add("existing-" + i); + } + + // Test with values not in the filter + int falsePositives = 0; + int trials = 1000; + + for (int i = 0; i < trials; i++) { + if (filter.mayContain("nonexistent-" + i)) { + falsePositives++; + } + } + + // The false positive rate should be approximately fpp + double actualFpp = (double) falsePositives / trials; + assertTrue("False positive rate should be close to expected, got " + actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15); + } + + @Test + public void invalidEntries() { + // Should throw exception for entries < 1 + assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(0, 0.01)); + } + + @Test + public void invalidFppZero() { + // Should throw exception for fpp <= 0 + assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(100, 0.0)); + } + + @Test + public void invalidFppOne() { + // Should throw exception for fpp >= 1 + assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(100, 1.0)); + } + +} \ No newline at end of file diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java similarity index 85% copy from oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java copy to oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java index 570af9f16f..351149d304 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java +++ b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; @@ -53,14 +53,10 @@ public class HyperLogLogTest { @Test public void test() { int testCount = 50; - for (int m = 8; m <= 128; m *= 2) { + for (int m = 16; m <= 128; m *= 2) { double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, false, 2)); int min, max; switch (m) { - case 8: - min = 16; - max = 17; - break; case 16: min = 22; max = 23; @@ -116,14 +112,10 @@ public class HyperLogLogTest { int runs = 2; for (int test = 0; test < testCount; test++) { HyperLogLog hll; - if (m == 8) { - hll = new HyperLogLogUsingLong(16, 0); - } else { - hll = new HyperLogLog(m, 0); - } + hll = new HyperLogLog(m, 0); long baseX = x; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x)); + hll.add(HashUtils.hash64(x)); x++; } long e = hll.estimate(); @@ -136,7 +128,7 @@ public class HyperLogLogTest { for (int add = 0; add < repeat; add++) { long x2 = baseX; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x2)); + hll.add(HashUtils.hash64(x2)); x2++; } } @@ -167,22 +159,4 @@ public class HyperLogLogTest { return Math.pow(relStdDevP, exponent); } - static class HyperLogLogUsingLong extends HyperLogLog { - - private long value; - - public HyperLogLogUsingLong(int m, int maxSmallSetSize) { - super(m, maxSmallSetSize); - } - - public void add(long hash) { - value = HyperLogLog3Linear64.add(value, hash); - } - - public long estimate() { - return HyperLogLog3Linear64.estimate(value); - } - - } - -} +} \ No newline at end of file diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml index fae8a8bdcb..f6fe0ebbe5 100644 --- a/oak-run-commons/pom.xml +++ b/oak-run-commons/pom.xml @@ -110,6 +110,11 @@ <artifactId>oak-jcr</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.jackrabbit</groupId> + <artifactId>oak-commons</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>org.jetbrains</groupId> diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java index 3ec93f175d..741c1cceae 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java @@ -20,7 +20,7 @@ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul import java.util.Objects; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; /** * A binary id. @@ -51,9 +51,9 @@ public class BinaryId { // we need to hash again because some of the bits are fixed // in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx // (the hash64 is a reversible mapping, so there is no risk of conflicts) - this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 16)); - this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 16)); - this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32, Math.min(48, buff.length())), 16)); + this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 16)); + this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 16)); + this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32, Math.min(48, buff.length())), 16)); } @Override diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java index 272c7f357b..878375f1f8 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java @@ -28,8 +28,8 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; /** * Collects the number and size of distinct binaries. diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java index 349a3fe35b..061aa39d52 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java @@ -27,7 +27,7 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; /** * A histogram of distinct binaries. For each size range, we calculate the diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java index 80227f5744..c749a1ab59 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java @@ -31,7 +31,7 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues; @@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector { stats.values += p.getValues().length; if (stats.count > MIN_PROPERTY_COUNT) { for (String v : p.getValues()) { - long hash = Hash.hash64(v.hashCode(), seed); + long hash = HashUtils.hash64(v.hashCode(), seed); stats.hll = HyperLogLog3Linear64.add(stats.hll, hash); stats.size += v.length(); stats.maxSize = Math.max(stats.maxSize, v.length()); diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java index 2c235f2197..48b4a63b0b 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java @@ -21,6 +21,7 @@ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils import java.util.ArrayList; import java.util.HashMap; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.apache.jackrabbit.oak.commons.json.JsopBuilder; /** @@ -91,7 +92,7 @@ public class TopKValues { if (countedCount > 1000) { skipRemaining = SKIP; } - long hash = Hash.hash64(value.hashCode()); + long hash = HashUtils.hash64(value); long est = sketch.addAndEstimate(hash); if (est < min) { return; diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java index 10992f5c06..8f4e6d2cac 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java @@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -240,7 +240,7 @@ public class Prefetcher { // and then we use a secondary hash // otherwise the estimation is way off int h = blob.getContentIdentity().hashCode(); - return Hash.hash64(h | (blob.length() << 32)); + return HashUtils.hash64(h | (blob.length() << 32)); } static enum PrefetchType { diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java index 7803a54400..5fe819335d 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java @@ -67,7 +67,7 @@ public class PropertyStatsTest { pc.add(n); } assertEquals("PropertyStats\n" - + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 maxSize 11 top {\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n" + + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 maxSize 11 top {\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n" + "", pc.toString()); } diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java index 92935ca286..212c0fba22 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue; import java.util.Arrays; import java.util.Random; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.junit.Test; public class CountMinSketchTest { @@ -83,11 +84,11 @@ public class CountMinSketchTest { } CountMinSketch est = new CountMinSketch(5, 16); for (int i = 0; i < size; i++) { - est.add(Hash.hash64(x + data[i])); + est.add(HashUtils.hash64(x + data[i])); } int[] counts = getCounts(data); for (int i = 0; i < 10; i++) { - long e = est.estimate(Hash.hash64(x + i)); + long e = est.estimate(HashUtils.hash64(x + i)); long expectedPercent = (int) (100. * counts[i] / size); long estPercent = (int) (100. * e / size); if (debug) { diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java similarity index 85% rename from oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java rename to oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java index 570af9f16f..42ef921e30 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java @@ -18,37 +18,13 @@ */ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; import org.junit.Test; -public class HyperLogLogTest { - - @Test(expected = IllegalArgumentException.class) - public void illegalHyperLogLogTooSmall() { - new HyperLogLog(8, 0); - } - - @Test(expected = IllegalArgumentException.class) - public void illegalHyperLogLogNotPowerOfTwo() { - new HyperLogLog(30, 0); - } - - @Test - public void smallSet() { - HyperLogLog hll100 = new HyperLogLog(16, 100); - assertEquals(0, hll100.estimate()); - HyperLogLog hll0 = new HyperLogLog(16, 0); - assertEquals(0, hll0.estimate()); - for (int i = 0; i < 10_000; i++) { - hll100.add(i % 100); - hll0.add(i % 100); - } - assertEquals(100, hll100.estimate()); - assertNotEquals(100, hll0.estimate()); - } +public class HyperLogLog3Linear64Test { @Test public void test() { @@ -123,7 +99,7 @@ public class HyperLogLogTest { } long baseX = x; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x)); + hll.add(HashUtils.hash64(x)); x++; } long e = hll.estimate(); @@ -136,7 +112,7 @@ public class HyperLogLogTest { for (int add = 0; add < repeat; add++) { long x2 = baseX; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x2)); + hll.add(HashUtils.hash64(x2)); x2++; } } @@ -185,4 +161,4 @@ public class HyperLogLogTest { } -} +} \ No newline at end of file diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java index 696e2e724f..c2878da651 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java @@ -30,17 +30,17 @@ public class TopKValuesTest { public void test() { TopKValues v = new TopKValues(3); Random r = new Random(1); - for(int i=0; i<1000000; i++) { - if(r.nextBoolean()) { + for (int i = 0; i < 1000000; i++) { + if (r.nextBoolean()) { v.add("common" + r.nextInt(2)); } else { v.add("rare" + r.nextInt(100)); } } - assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}", v.toString()); + assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}", v.toString()); assertEquals(91809, v.getCount()); - assertEquals(24849, v.getTopCount()); - assertEquals(24652, v.getSecondCount()); + assertEquals(25336, v.getTopCount()); + assertEquals(24679, v.getSecondCount()); } } diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java index 14e30cd673..5cdf866439 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java @@ -16,9 +16,7 @@ */ package org.apache.jackrabbit.oak.plugins.document.cache; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnel; -import org.apache.jackrabbit.guava.common.hash.PrimitiveSink; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -65,7 +63,7 @@ public class CacheChangesTracker implements Closeable { if (lazyBloomFilter.filter == null) { LOG.debug("Disposing CacheChangesTracker for {}, no filter was needed", keyFilter); } else { - LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp()); + LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, LazyBloomFilter.FPP); } } } @@ -76,14 +74,14 @@ public class CacheChangesTracker implements Closeable { private final int entries; - private volatile BloomFilter<String> filter; + private volatile BloomFilter filter; public LazyBloomFilter(int entries) { this.entries = entries; } public synchronized void put(String entry) { - getFilter().put(entry); + getFilter().add(entry); } public boolean mightContain(String entry) { @@ -91,21 +89,14 @@ public class CacheChangesTracker implements Closeable { return false; } else { synchronized (this) { - return filter.mightContain(entry); + return filter.mayContain(entry); } } } - private BloomFilter<String> getFilter() { + private BloomFilter getFilter() { if (filter == null) { - filter = BloomFilter.create(new Funnel<String>() { - private static final long serialVersionUID = -7114267990225941161L; - - @Override - public void funnel(String from, PrimitiveSink into) { - into.putUnencodedChars(from); - } - }, entries, FPP); + filter = BloomFilter.construct(entries, FPP); } return filter; }
