This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch trunk in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push: new c438a3f96a OAK-10674 Use Oak's Bloom filter (#2412) c438a3f96a is described below commit c438a3f96a3b7c148a478b098aa59b7a57650c7d Author: Thomas Mueller <thom...@apache.org> AuthorDate: Mon Aug 4 20:26:20 2025 +0200 OAK-10674 Use Oak's Bloom filter (#2412) * OAK-11787 ElasticRegexPropertyIndexTest.regexPropertyWithoutFlattened * OAK-10674 Use Oak's Bloom filter * OAK-10674 Use Oak's Bloom filter * OAK-10674 Use Oak's Bloom filter * OAK-10674 Use Oak's Bloom filter * OAK-10674 Use Oak's Bloom filter --- .../jackrabbit/oak/spi/blob/split/BlobIdSet.java | 16 +- .../oak/spi/blob/split/BlobIdSetTest.java | 238 +++++++++++++++++++++ oak-commons/pom.xml | 4 + .../oak/commons/collections}/BloomFilter.java | 34 ++- .../oak/commons/collections/HashUtils.java | 21 +- .../oak/commons/collections}/HyperLogLog.java | 4 +- .../oak/commons/collections/package-info.java | 2 +- .../oak/commons/collections}/BloomFilterTest.java | 92 +++++++- .../oak/commons/collections}/HyperLogLogTest.java | 39 +--- oak-run-commons/pom.xml | 5 + .../flatfile/analysis/modules/BinaryId.java | 8 +- .../analysis/modules/DistinctBinarySize.java | 4 +- .../modules/DistinctBinarySizeHistogram.java | 2 +- .../flatfile/analysis/modules/PropertyStats.java | 4 +- .../flatfile/analysis/utils/TopKValues.java | 3 +- .../index/indexer/document/tree/Prefetcher.java | 6 +- .../analysis/modules/PropertyStatsTest.java | 2 +- .../analysis/utils/CountMinSketchTest.java | 5 +- ...gLogTest.java => HyperLogLog3Linear64Test.java} | 39 +--- .../flatfile/analysis/utils/TopKValuesTest.java | 10 +- .../document/cache/CacheChangesTracker.java | 44 ++-- .../cache/CacheChangesTrackerConcurrencyTest.java | 215 +++++++++++++++++++ 22 files changed, 658 insertions(+), 139 deletions(-) diff --git a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java index 
e62084fcfe..49acb59232 100644 --- a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java +++ b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java @@ -25,7 +25,6 @@ import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; -import java.nio.charset.StandardCharsets; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; @@ -33,8 +32,7 @@ import org.slf4j.LoggerFactory; import org.apache.jackrabbit.guava.common.cache.Cache; import org.apache.jackrabbit.guava.common.cache.CacheBuilder; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnels; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; class BlobIdSet { @@ -42,19 +40,19 @@ class BlobIdSet { private final File store; - private final BloomFilter<CharSequence> bloomFilter; + private final BloomFilter bloomFilter; private final Cache<String, Boolean> cache; BlobIdSet(String repositoryDir, String filename) { store = new File(new File(repositoryDir), filename); - bloomFilter = BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); // about 8MB + bloomFilter = BloomFilter.construct(9000000, 0.03); // 9M entries, 3% false positive rate cache = CacheBuilder.newBuilder().maximumSize(1000).build(); fillBloomFilter(); } synchronized boolean contains(String blobId) throws IOException { - if (!bloomFilter.apply(blobId)) { + if (!bloomFilter.mayContain(blobId)) { return false; } Boolean cached = cache.getIfPresent(blobId); @@ -64,7 +62,7 @@ class BlobIdSet { if (isPresentInStore(blobId)) { cache.put(blobId, Boolean.TRUE); - bloomFilter.put(blobId); + bloomFilter.add(blobId); return true; } else { cache.put(blobId, Boolean.FALSE); @@ -74,7 +72,7 @@ class BlobIdSet { synchronized void add(String blobId) throws IOException { addToStore(blobId); - bloomFilter.put(blobId); + bloomFilter.add(blobId); 
cache.put(blobId, Boolean.TRUE); } @@ -114,7 +112,7 @@ class BlobIdSet { reader = new BufferedReader(new FileReader(store)); String line; while ((line = reader.readLine()) != null) { - bloomFilter.put(line); + bloomFilter.add(line); } } catch (IOException e) { log.error("Can't fill bloom filter", e); diff --git a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java new file mode 100644 index 0000000000..d933bf29d4 --- /dev/null +++ b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.jackrabbit.oak.spi.blob.split; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class BlobIdSetTest { + + private static final String TEST_FILENAME = "test-blob-ids.txt"; + + private File tempDir; + private BlobIdSet blobIdSet; + + @Before + public void setup() throws IOException { + tempDir = Files.createTempDirectory("blob-id-set-test").toFile(); + blobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME); + } + + @After + public void cleanup() { + File testFile = new File(tempDir, TEST_FILENAME); + if (testFile.exists()) { + testFile.delete(); + } + tempDir.delete(); + } + + @Test + public void testAddAndContains() throws IOException { + String blobId = "testblob123"; + Assert.assertFalse("New set should not contain blob ID", blobIdSet.contains(blobId)); + + blobIdSet.add(blobId); + Assert.assertTrue("Set should contain added blob ID", blobIdSet.contains(blobId)); + } + + @Test + public void testMultipleAddAndContains() throws IOException { + String[] blobIds = {"blob1", "blob2", "blob3", "blob4", "blob5"}; + + // Add all blob IDs + for (String blobId : blobIds) { + blobIdSet.add(blobId); + } + + // Check all blob IDs are present + for (String blobId : blobIds) { + Assert.assertTrue("Set should contain: " + blobId, blobIdSet.contains(blobId)); + } + + // Check a non-existent blob ID + Assert.assertFalse("Set should not contain non-existent blob ID", blobIdSet.contains("nonexistentblob")); + } + + @Test + public void testPersistenceAcrossInstances() throws IOException { + String blobId = "persistenceblob"; + + // Add to the first instance + blobIdSet.add(blobId); + + // Create a new instance pointing to the same file + BlobIdSet newSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME); + + // Verify the new instance sees the 
previously added blob ID + Assert.assertTrue("New instance should see blob ID from file", newSet.contains(blobId)); + } + + @Test + public void testEmptyFileStore() throws IOException { + // Create with non-existent file + File nonExistentDir = Files.createTempDirectory("non-existent").toFile(); + BlobIdSet emptySet = new BlobIdSet(nonExistentDir.getAbsolutePath(), "empty.txt"); + + // Should not contain any blob IDs + Assert.assertFalse(emptySet.contains("anyblob")); + + // Clean up + nonExistentDir.delete(); + } + + @Test + public void testLargeNumberOfEntries() throws IOException { + // Add a moderate number of entries + int count = 1000; + for (int i = 0; i < count; i++) { + blobIdSet.add("blob-" + i); + } + + // Verify all entries can be found + for (int i = 0; i < count; i++) { + Assert.assertTrue(blobIdSet.contains("blob-" + i)); + } + + // Non-existent entries should return false + for (int i = 0; i < 10; i++) { + Assert.assertFalse(blobIdSet.contains("nonexistent-blob-" + i)); + } + } + + @Test + public void testFileContainsAddedEntries() throws IOException { + // Add several blob IDs + String[] blobIds = {"a", "b", "c"}; + for (String id : blobIds) { + blobIdSet.add(id); + } + + // Verify the file contains the added blob IDs + File storeFile = new File(tempDir, TEST_FILENAME); + Assert.assertTrue("Store file should exist", storeFile.exists()); + + List<String> fileContent = Files.readAllLines(storeFile.toPath()); + Assert.assertEquals("File should contain all added blob IDs", blobIds.length, fileContent.size()); + + for (int i = 0; i < blobIds.length; i++) { + Assert.assertEquals("File line should match blob ID", blobIds[i], fileContent.get(i)); + } + } + + @Test + public void testBloomFilterPreventsUnnecessaryFileReads() throws IOException { + // The bloom filter should prevent checking the file for non-existent IDs + + // Add some entries to populate the bloom filter + for (int i = 0; i < 10; i++) { + blobIdSet.add("existing-" + i); + } + + // Using a 
unique pattern for non-existent IDs to ensure they hash differently + // than the ones we added + for (int i = 0; i < 10; i++) { + Assert.assertFalse(blobIdSet.contains("definitely-not-there-" + i)); + } + } + + @Test + public void testCachingBehavior() throws IOException { + String blobId = "cachedblob"; + + // Add the blob ID + blobIdSet.add(blobId); + + // First check should populate cache + Assert.assertTrue(blobIdSet.contains(blobId)); + + // Even if we delete the file, the cached result should be used + File storeFile = new File(tempDir, TEST_FILENAME); + storeFile.delete(); + + // Should still return true due to cache + Assert.assertTrue(blobIdSet.contains(blobId)); + } + + @Test + public void testContainsFindsExistingEntriesInFile() throws IOException { + // Create some blob IDs + String[] blobIds = {"fileblob1", "fileblob2", "fileblob3"}; + + // Write blob IDs directly to file (not using BlobIdSet.add()) + File storeFile = new File(tempDir, TEST_FILENAME); + try (FileWriter writer = new FileWriter(storeFile)) { + for (String id : blobIds) { + writer.write(id + "\n"); + } + } + + // Create a new BlobIdSet instance (which should load from the file) + BlobIdSet newBlobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME); + + // Verify that contains() finds all the IDs + for (String id : blobIds) { + Assert.assertTrue("Should contain blob ID written directly to file: " + id, newBlobIdSet.contains(id)); + } + + // Verify a non-existent ID still returns false + Assert.assertFalse(newBlobIdSet.contains("notinfile")); + } + + @Test + public void testBloomFilterFalsePositiveProbabilityLessThanThreePercent() throws IOException { + // Load the bloom filter with a significant number of entries (about 0.06% of configured capacity) + final int numToAdd = 5000; + + // Add entries to the bloom filter + for (int i = 0; i < numToAdd; i++) { + blobIdSet.add("entry-" + i); + } + + // Test with non-existent entries using carefully crafted IDs + int numTests = 1000; + 
int falsePositives = 0; + + // Use a distinct prefix to ensure test IDs don't conflict with added entries + for (int i = 0; i < numTests; i++) { + String nonExistentId = "non-existent-" + i; + + if (blobIdSet.contains(nonExistentId)) { + falsePositives++; + } + } + + final double falsePositiveRate = (double) falsePositives / numTests; + + // Verify the false positive rate is below the configured 3% + Assert.assertTrue( + "False positive rate should be less than 3%, was: " + (falsePositiveRate * 100) + "%", + falsePositiveRate < 0.03 + ); + } +} \ No newline at end of file diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml index 5e8396cd65..8b774a03dc 100644 --- a/oak-commons/pom.xml +++ b/oak-commons/pom.xml @@ -102,6 +102,10 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-collections4</artifactId> </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + </dependency> <dependency> <groupId>org.apache.jackrabbit</groupId> <artifactId>jackrabbit-jcr-commons</artifactId> diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java similarity index 83% rename from oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java rename to oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java index 3ad439a909..3c266ea1bb 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; /** * A Bloom filter implementation. @@ -44,6 +44,12 @@ public class BloomFilter { * @return the Bloom filter */ public static BloomFilter construct(long n, double fpp) { + if (n <= 0) { + throw new IllegalArgumentException("n must be greater than 0"); + } + if (fpp <= 0 || fpp >= 1) { + throw new IllegalArgumentException("fpp must be between 0 and 1"); + } long m = calculateBits(n, fpp); int k = calculateK((double) m / n); return new BloomFilter(new long[(int) ((m + 63) / 64)], k); @@ -96,6 +102,15 @@ public class BloomFilter { return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k); } + /** + * Add an entry, using the string's 64-bit hash code. + * + * @param value the value to add + */ + public void add(String value) { + add(HashUtils.hash64(value)); + } + /** * Add an entry. * @@ -106,11 +121,22 @@ public class BloomFilter { long a = (hash >>> 32) | (hash << 32); long b = hash; for (int i = 0; i < k; i++) { - data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a; + data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a; a += b; } } + /** + * Whether the entry may be in the set. + * + * @param value the value to check + * @return true if the entry was added, or, with a certain false positive + * probability, even if it was not added + */ + public boolean mayContain(String value) { + return mayContain(HashUtils.hash64(value)); + } + /** * Whether the entry may be in the set. 
* @@ -123,7 +149,7 @@ public class BloomFilter { long a = (hash >>> 32) | (hash << 32); long b = hash; for (int i = 0; i < k; i++) { - if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) { + if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) { return false; } a += b; @@ -159,4 +185,4 @@ public class BloomFilter { return (long) (-(m / k) * Math.log(1 - (x / m))); } -} \ No newline at end of file +} \ No newline at end of file diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java similarity index 83% rename from oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java rename to oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java index a2cd599e7b..90d998664c 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java @@ -16,17 +16,30 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; + +import java.nio.charset.StandardCharsets; +import org.apache.commons.codec.digest.MurmurHash3; /** * A hash function utility class. */ -public class Hash { +public class HashUtils { - private Hash() { + private HashUtils() { // utility class } + /** + * Calculate a 64-bit hash value from a string. + * + * @param s the string + * @return the hash value + */ + public static long hash64(String s) { + return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0]; + } + /** * Calculate a 64-bit hash value from a value, using a seed. 
* @@ -70,4 +83,4 @@ public class Hash { return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32); } -} +} \ No newline at end of file diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java similarity index 97% rename from oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java rename to oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java index dd20a1e777..2054decff0 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; import java.util.HashSet; @@ -92,4 +92,4 @@ public class HyperLogLog { return est; } -} +} \ No newline at end of file diff --git a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java index 81c20db5ca..5d1490f210 100644 --- a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java +++ b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java @@ -21,7 +21,7 @@ * Utilities for Java collections and streams. 
*/ @Internal(since = "1.0.0") -@Version("2.2.0") +@Version("2.3.0") package org.apache.jackrabbit.oak.commons.collections; import org.apache.jackrabbit.oak.commons.annotations.Internal; import org.osgi.annotation.versioning.Version; diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java similarity index 61% rename from oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java rename to oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java index c3813f340d..464de8c4ef 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java +++ b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java @@ -16,10 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertThrows; import org.junit.Test; @@ -61,9 +63,6 @@ public class BloomFilterTest { BloomFilter f = BloomFilter.construct(100, 0.01); assertEquals(960, f.getBitCount()); assertEquals(7, f.getK()); - f = BloomFilter.construct(0, 0.01); - assertEquals(0, f.getBitCount()); - assertEquals(1, f.getK()); } @Test @@ -72,14 +71,14 @@ public class BloomFilterTest { int size = 500_000; BloomFilter f = BloomFilter.construct(size, fpp); for (int i = 0; i < size; i++) { - f.add(Hash.hash64(i)); + f.add(HashUtils.hash64(i)); } for (int i = 0; i < size; i++) { - assertTrue(f.mayContain(Hash.hash64(i))); + assertTrue(f.mayContain(HashUtils.hash64(i))); } int falsePositives = 0; for (int i = 0; i < size; i++) { - if (f.mayContain(Hash.hash64(i + size))) { + if (f.mayContain(HashUtils.hash64(i + size))) { falsePositives++; } } @@ -105,7 +104,7 @@ public class BloomFilterTest { HyperLogLog hll = new HyperLogLog(1024, 0); // now we calculate estimations with both the Bloom filter and HyperLogLog for(int i = 0; i < 20_000; i++) { - long x = Hash.hash64(i); + long x = HashUtils.hash64(i); bloom.add(x); hll.add(x); if (i > 0 && i % 1000 == 0) { @@ -123,4 +122,79 @@ public class BloomFilterTest { } } -} + @Test + public void filterFunctionality() { + BloomFilter filter = BloomFilter.construct(100, 0.01); + String testValue = "test-value"; + + // Initially should not contain anything + assertFalse(filter.mayContain(testValue)); + + // Add the item and verify it's found + filter.add(testValue); + assertTrue(filter.mayContain(testValue)); + + // Verify another value is not found + assertFalse(filter.mayContain("different-value")); + } + + @Test + public void filterWithMultipleEntries() { 
+ BloomFilter filter = BloomFilter.construct(100, 0.01); + + // Add multiple entries + for (int i = 0; i < 100; i++) { + filter.add("value-" + i); + } + + // Verify all entries are found + for (int i = 0; i < 100; i++) { + assertTrue(filter.mayContain("value-" + i)); + } + } + + @Test + public void falsePositiveProbability() { + // Create a filter with high false positive probability for testing + double fpp = 0.3; + BloomFilter filter = BloomFilter.construct(100, fpp); + + // Fill the filter to capacity + for (int i = 0; i < 100; i++) { + filter.add("existing-" + i); + } + + // Test with values not in the filter + int falsePositives = 0; + int trials = 1000; + + for (int i = 0; i < trials; i++) { + if (filter.mayContain("nonexistent-" + i)) { + falsePositives++; + } + } + + // The false positive rate should be approximately fpp + double actualFpp = (double) falsePositives / trials; + assertTrue("False positive rate should be close to expected, got " + actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15); + } + + @Test + public void invalidEntries() { + // Should throw exception for entries < 1 + assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(0, 0.01)); + } + + @Test + public void invalidFppZero() { + // Should throw exception for fpp <= 0 + assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(100, 0.0)); + } + + @Test + public void invalidFppOne() { + // Should throw exception for fpp >= 1 + assertThrows(IllegalArgumentException.class,() -> BloomFilter.construct(100, 1.0)); + } + +} \ No newline at end of file diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java similarity index 84% copy from oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java copy to 
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java index 570af9f16f..fd3fb39bff 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java +++ b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; +package org.apache.jackrabbit.oak.commons.collections; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; @@ -53,14 +53,10 @@ public class HyperLogLogTest { @Test public void test() { int testCount = 50; - for (int m = 8; m <= 128; m *= 2) { + for (int m = 16; m <= 128; m *= 2) { double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, false, 2)); int min, max; switch (m) { - case 8: - min = 16; - max = 17; - break; case 16: min = 22; max = 23; @@ -82,7 +78,6 @@ public class HyperLogLogTest { max = 0; break; } - // System.out.println(type + " expected " + min + ".." + max + " got " + avg); assertTrue("m " + m + " expected " + min + ".." 
+ max + " got " + avg, min < avg && avg < max); } } @@ -116,14 +111,10 @@ public class HyperLogLogTest { int runs = 2; for (int test = 0; test < testCount; test++) { HyperLogLog hll; - if (m == 8) { - hll = new HyperLogLogUsingLong(16, 0); - } else { - hll = new HyperLogLog(m, 0); - } + hll = new HyperLogLog(m, 0); long baseX = x; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x)); + hll.add(HashUtils.hash64(x)); x++; } long e = hll.estimate(); @@ -136,7 +127,7 @@ public class HyperLogLogTest { for (int add = 0; add < repeat; add++) { long x2 = baseX; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x2)); + hll.add(HashUtils.hash64(x2)); x2++; } } @@ -167,22 +158,4 @@ public class HyperLogLogTest { return Math.pow(relStdDevP, exponent); } - static class HyperLogLogUsingLong extends HyperLogLog { - - private long value; - - public HyperLogLogUsingLong(int m, int maxSmallSetSize) { - super(m, maxSmallSetSize); - } - - public void add(long hash) { - value = HyperLogLog3Linear64.add(value, hash); - } - - public long estimate() { - return HyperLogLog3Linear64.estimate(value); - } - - } - -} +} \ No newline at end of file diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml index e1288dccc2..efd3657e51 100644 --- a/oak-run-commons/pom.xml +++ b/oak-run-commons/pom.xml @@ -110,6 +110,11 @@ <artifactId>oak-jcr</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.jackrabbit</groupId> + <artifactId>oak-commons</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>org.jetbrains</groupId> diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java index 3ec93f175d..741c1cceae 100644 --- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java @@ -20,7 +20,7 @@ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul import java.util.Objects; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; /** * A binary id. @@ -51,9 +51,9 @@ public class BinaryId { // we need to hash again because some of the bits are fixed // in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx // (the hash64 is a reversible mapping, so there is no risk of conflicts) - this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 16)); - this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 16)); - this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32, Math.min(48, buff.length())), 16)); + this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 16)); + this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 16)); + this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32, Math.min(48, buff.length())), 16)); } @Override diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java index 272c7f357b..878375f1f8 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java @@ -28,8 +28,8 @@ import java.util.stream.Collectors; import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; /** * Collects the number and size of distinct binaries. diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java index 349a3fe35b..061aa39d52 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java @@ -27,7 +27,7 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; /** * A histogram of distinct binaries. 
For each size range, we calculate the diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java index 80227f5744..c749a1ab59 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java @@ -31,7 +31,7 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues; @@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector { stats.values += p.getValues().length; if (stats.count > MIN_PROPERTY_COUNT) { for (String v : p.getValues()) { - long hash = Hash.hash64(v.hashCode(), seed); + long hash = HashUtils.hash64(v.hashCode(), seed); stats.hll = HyperLogLog3Linear64.add(stats.hll, hash); stats.size += v.length(); stats.maxSize = Math.max(stats.maxSize, v.length()); diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java index 2c235f2197..48b4a63b0b 100644 --- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java @@ -21,6 +21,7 @@ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils import java.util.ArrayList; import java.util.HashMap; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.apache.jackrabbit.oak.commons.json.JsopBuilder; /** @@ -91,7 +92,7 @@ public class TopKValues { if (countedCount > 1000) { skipRemaining = SKIP; } - long hash = Hash.hash64(value.hashCode()); + long hash = HashUtils.hash64(value); long est = sketch.addAndEstimate(hash); if (est < min) { return; diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java index 10992f5c06..8f4e6d2cac 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java @@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -240,7 +240,7 @@ public class Prefetcher { // and then we use a secondary hash // otherwise the estimation is way off int h = blob.getContentIdentity().hashCode(); 
- return Hash.hash64(h | (blob.length() << 32)); + return HashUtils.hash64(h | (blob.length() << 32)); } static enum PrefetchType { diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java index 7803a54400..5fe819335d 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java @@ -67,7 +67,7 @@ public class PropertyStatsTest { pc.add(n); } assertEquals("PropertyStats\n" - + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 maxSize 11 top {\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n" + + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 maxSize 11 top {\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n" + "", pc.toString()); } diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java index 92935ca286..212c0fba22 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue; import java.util.Arrays; import 
java.util.Random; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.junit.Test; public class CountMinSketchTest { @@ -83,11 +84,11 @@ public class CountMinSketchTest { } CountMinSketch est = new CountMinSketch(5, 16); for (int i = 0; i < size; i++) { - est.add(Hash.hash64(x + data[i])); + est.add(HashUtils.hash64(x + data[i])); } int[] counts = getCounts(data); for (int i = 0; i < 10; i++) { - long e = est.estimate(Hash.hash64(x + i)); + long e = est.estimate(HashUtils.hash64(x + i)); long expectedPercent = (int) (100. * counts[i] / size); long estPercent = (int) (100. * e / size); if (debug) { diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java similarity index 83% rename from oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java rename to oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java index 570af9f16f..eb7ab9a692 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java @@ -18,37 +18,13 @@ */ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; import org.junit.Test; -public class HyperLogLogTest { - - @Test(expected = IllegalArgumentException.class) - public 
void illegalHyperLogLogTooSmall() { - new HyperLogLog(8, 0); - } - - @Test(expected = IllegalArgumentException.class) - public void illegalHyperLogLogNotPowerOfTwo() { - new HyperLogLog(30, 0); - } - - @Test - public void smallSet() { - HyperLogLog hll100 = new HyperLogLog(16, 100); - assertEquals(0, hll100.estimate()); - HyperLogLog hll0 = new HyperLogLog(16, 0); - assertEquals(0, hll0.estimate()); - for (int i = 0; i < 10_000; i++) { - hll100.add(i % 100); - hll0.add(i % 100); - } - assertEquals(100, hll100.estimate()); - assertNotEquals(100, hll0.estimate()); - } +public class HyperLogLog3Linear64Test { @Test public void test() { @@ -82,7 +58,6 @@ public class HyperLogLogTest { max = 0; break; } - // System.out.println(type + " expected " + min + ".." + max + " got " + avg); assertTrue("m " + m + " expected " + min + ".." + max + " got " + avg, min < avg && avg < max); } } @@ -123,7 +98,7 @@ public class HyperLogLogTest { } long baseX = x; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x)); + hll.add(HashUtils.hash64(x)); x++; } long e = hll.estimate(); @@ -136,7 +111,7 @@ public class HyperLogLogTest { for (int add = 0; add < repeat; add++) { long x2 = baseX; for (int i = 0; i < size; i++) { - hll.add(Hash.hash64(x2)); + hll.add(HashUtils.hash64(x2)); x2++; } } @@ -175,14 +150,16 @@ public class HyperLogLogTest { super(m, maxSmallSetSize); } + @Override public void add(long hash) { value = HyperLogLog3Linear64.add(value, hash); } + @Override public long estimate() { return HyperLogLog3Linear64.estimate(value); } } -} +} \ No newline at end of file diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java index 696e2e724f..dfd36a6eba 100644 --- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java @@ -30,17 +30,17 @@ public class TopKValuesTest { public void test() { TopKValues v = new TopKValues(3); Random r = new Random(1); - for(int i=0; i<1000000; i++) { - if(r.nextBoolean()) { + for (int i = 0; i < 1000000; i++) { + if (r.nextBoolean()) { v.add("common" + r.nextInt(2)); } else { v.add("rare" + r.nextInt(100)); } } - assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}", v.toString()); + assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}", v.toString()); assertEquals(91809, v.getCount()); - assertEquals(24849, v.getTopCount()); - assertEquals(24652, v.getSecondCount()); + assertEquals(24231, v.getTopCount()); + assertEquals(23844, v.getSecondCount()); } } diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java index 14e30cd673..196f6188d4 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java @@ -16,14 +16,13 @@ */ package org.apache.jackrabbit.oak.plugins.document.cache; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnel; -import org.apache.jackrabbit.guava.common.hash.PrimitiveSink; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Closeable; import java.util.List; +import 
java.util.concurrent.atomic.AtomicReference; import java.util.function.Predicate; public class CacheChangesTracker implements Closeable { @@ -62,10 +61,10 @@ public class CacheChangesTracker implements Closeable { changeTrackers.remove(this); if (LOG.isDebugEnabled()) { - if (lazyBloomFilter.filter == null) { + if (lazyBloomFilter.filterRef.get() == null) { LOG.debug("Disposing CacheChangesTracker for {}, no filter was needed", keyFilter); } else { - LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp()); + LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, LazyBloomFilter.FPP); } } } @@ -76,38 +75,33 @@ public class CacheChangesTracker implements Closeable { private final int entries; - private volatile BloomFilter<String> filter; + private final AtomicReference<BloomFilter> filterRef; public LazyBloomFilter(int entries) { this.entries = entries; + this.filterRef = new AtomicReference<>(); } public synchronized void put(String entry) { - getFilter().put(entry); + getFilter().add(entry); } public boolean mightContain(String entry) { - if (filter == null) { - return false; - } else { - synchronized (this) { - return filter.mightContain(entry); - } - } + BloomFilter f = filterRef.get(); + return f != null && f.mayContain(entry); } - private BloomFilter<String> getFilter() { - if (filter == null) { - filter = BloomFilter.create(new Funnel<String>() { - private static final long serialVersionUID = -7114267990225941161L; - - @Override - public void funnel(String from, PrimitiveSink into) { - into.putUnencodedChars(from); - } - }, entries, FPP); + private BloomFilter getFilter() { + BloomFilter result = filterRef.get(); + if (result == null) { + BloomFilter newFilter = BloomFilter.construct(entries, FPP); + if (filterRef.compareAndSet(null, newFilter)) { + result = newFilter; + } else { + result = filterRef.get(); + } } - return filter; + return result; } } } diff --git 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.jackrabbit.oak.plugins.document.cache;

import org.junit.Test;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * Tests for {@link CacheChangesTracker} concurrency scenarios, particularly
 * the CAS-based lazy initialization of {@code LazyBloomFilter}.
 */
public class CacheChangesTrackerConcurrencyTest {

    /**
     * Verifies that concurrent first use of the LazyBloomFilter does not
     * race: all threads must end up adding to the same filter instance,
     * so every added key must subsequently be reported as (possibly)
     * contained.
     */
    @Test
    public void testLazyBloomFilterConcurrentInitialization() throws InterruptedException {
        final int threadCount = 20;
        final int entriesPerThread = 50;
        final CountDownLatch startLatch = new CountDownLatch(1);
        final CountDownLatch doneLatch = new CountDownLatch(threadCount);
        final ExecutorService executor = Executors.newFixedThreadPool(threadCount);

        final CacheChangesTracker.LazyBloomFilter lazyFilter =
                new CacheChangesTracker.LazyBloomFilter(1000);

        final AtomicInteger putOperations = new AtomicInteger(0);
        final List<Exception> exceptions = Collections.synchronizedList(new ArrayList<>());

        try {
            // all threads initialize and use the filter at the same moment
            for (int i = 0; i < threadCount; i++) {
                final int threadId = i;
                executor.submit(() -> {
                    try {
                        startLatch.await();
                        for (int j = 0; j < entriesPerThread; j++) {
                            String key = "thread-" + threadId + "-key-" + j;
                            lazyFilter.put(key);
                            putOperations.incrementAndGet();
                            // occasional pause to widen the race window
                            if (j % 10 == 0) {
                                Thread.sleep(1);
                            }
                        }
                    } catch (Exception e) {
                        exceptions.add(e);
                    } finally {
                        doneLatch.countDown();
                    }
                });
            }

            startLatch.countDown();
            assertTrue("Test timed out", doneLatch.await(30, TimeUnit.SECONDS));
            failOnExceptions(exceptions);

            // every put must have completed
            assertEquals(threadCount * entriesPerThread, putOperations.get());

            // a Bloom filter never yields false negatives: every added key
            // must be reported as possibly contained
            for (int i = 0; i < threadCount; i++) {
                for (int j = 0; j < entriesPerThread; j++) {
                    String key = "thread-" + i + "-key-" + j;
                    assertTrue("Filter should contain key: " + key, lazyFilter.mightContain(key));
                }
            }

            // sanity check on false positives: with 1000 expected entries and
            // the tracker's 1% FPP, 100 unknown keys should yield only a few
            // positives; allow up to 5% for variance
            int falsePositives = 0;
            final int testKeys = 100;
            for (int i = 0; i < testKeys; i++) {
                if (lazyFilter.mightContain("non-existent-key-" + i)) {
                    falsePositives++;
                }
            }
            assertTrue("False positive rate too high: " + falsePositives + "/" + testKeys,
                    falsePositives < testKeys * 0.05);
        } finally {
            shutdown(executor);
        }
    }

    /**
     * Mixed readers and writers hammering the filter concurrently; a key a
     * thread has just added must be visible to that same thread's read.
     */
    @Test
    public void testLazyBloomFilterConcurrentReadWrite() throws InterruptedException {
        final int threadCount = 10;
        final int operationsPerThread = 100;
        final CountDownLatch startLatch = new CountDownLatch(1);
        final CountDownLatch doneLatch = new CountDownLatch(threadCount);
        final ExecutorService executor = Executors.newFixedThreadPool(threadCount);

        final CacheChangesTracker.LazyBloomFilter lazyFilter =
                new CacheChangesTracker.LazyBloomFilter(2000);

        final AtomicInteger readOperations = new AtomicInteger(0);
        final AtomicInteger writeOperations = new AtomicInteger(0);
        final List<Exception> exceptions = Collections.synchronizedList(new ArrayList<>());

        try {
            for (int i = 0; i < threadCount; i++) {
                final int threadId = i;
                // even threads write throughout; odd threads write only their
                // first few keys and otherwise just read
                final boolean isWriter = (i % 2 == 0);

                executor.submit(() -> {
                    try {
                        startLatch.await();
                        for (int j = 0; j < operationsPerThread; j++) {
                            String key = "mixed-thread-" + threadId + "-key-" + j;
                            boolean wrote = isWriter || j < 10;
                            if (wrote) {
                                lazyFilter.put(key);
                                writeOperations.incrementAndGet();
                            }

                            boolean result = lazyFilter.mightContain(key);
                            readOperations.incrementAndGet();

                            // no false negatives: a key this thread just
                            // added must be found
                            if (wrote) {
                                assertTrue("Key should be found after being added: " + key, result);
                            }
                        }
                    } catch (Exception e) {
                        exceptions.add(e);
                    } finally {
                        doneLatch.countDown();
                    }
                });
            }

            startLatch.countDown();
            assertTrue("Test timed out", doneLatch.await(30, TimeUnit.SECONDS));
            failOnExceptions(exceptions);

            assertTrue("Should have performed read operations", readOperations.get() > 0);
            assertTrue("Should have performed write operations", writeOperations.get() > 0);
        } finally {
            shutdown(executor);
        }
    }

    /**
     * A filter that was never written to must report every key as absent
     * (the backing Bloom filter is only allocated on first put).
     */
    @Test
    public void testLazyBloomFilterNoInitialization() {
        CacheChangesTracker.LazyBloomFilter lazyFilter =
                new CacheChangesTracker.LazyBloomFilter(1000);

        assertFalse(lazyFilter.mightContain("any-key"));
        assertFalse(lazyFilter.mightContain("another-key"));
        assertFalse(lazyFilter.mightContain(""));
    }

    /** Fails the test reporting ALL collected worker exceptions, not just the first. */
    private static void failOnExceptions(List<Exception> exceptions) {
        if (!exceptions.isEmpty()) {
            StringBuilder msg = new StringBuilder("Exceptions occurred during concurrent access:");
            for (Exception e : exceptions) {
                msg.append('\n').append(e);
            }
            fail(msg.toString());
        }
    }

    /** Shuts the executor down, escalating to shutdownNow() after a grace period. */
    private static void shutdown(ExecutorService executor) throws InterruptedException {
        executor.shutdown();
        if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
            executor.shutdownNow();
        }
    }
}