This is an automated email from the ASF dual-hosted git repository.
thomasm pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new c438a3f96a OAK-10674 Use Oak's Bloom filter (#2412)
c438a3f96a is described below
commit c438a3f96a3b7c148a478b098aa59b7a57650c7d
Author: Thomas Mueller <[email protected]>
AuthorDate: Mon Aug 4 20:26:20 2025 +0200
OAK-10674 Use Oak's Bloom filter (#2412)
* OAK-11787 ElasticRegexPropertyIndexTest.regexPropertyWithoutFlattened
* OAK-10674 Use Oak's Bloom filter
* OAK-10674 Use Oak's Bloom filter
* OAK-10674 Use Oak's Bloom filter
* OAK-10674 Use Oak's Bloom filter
* OAK-10674 Use Oak's Bloom filter
---
.../jackrabbit/oak/spi/blob/split/BlobIdSet.java | 16 +-
.../oak/spi/blob/split/BlobIdSetTest.java | 238 +++++++++++++++++++++
oak-commons/pom.xml | 4 +
.../oak/commons/collections}/BloomFilter.java | 34 ++-
.../oak/commons/collections/HashUtils.java | 21 +-
.../oak/commons/collections}/HyperLogLog.java | 4 +-
.../oak/commons/collections/package-info.java | 2 +-
.../oak/commons/collections}/BloomFilterTest.java | 92 +++++++-
.../oak/commons/collections}/HyperLogLogTest.java | 39 +---
oak-run-commons/pom.xml | 5 +
.../flatfile/analysis/modules/BinaryId.java | 8 +-
.../analysis/modules/DistinctBinarySize.java | 4 +-
.../modules/DistinctBinarySizeHistogram.java | 2 +-
.../flatfile/analysis/modules/PropertyStats.java | 4 +-
.../flatfile/analysis/utils/TopKValues.java | 3 +-
.../index/indexer/document/tree/Prefetcher.java | 6 +-
.../analysis/modules/PropertyStatsTest.java | 2 +-
.../analysis/utils/CountMinSketchTest.java | 5 +-
...gLogTest.java => HyperLogLog3Linear64Test.java} | 39 +---
.../flatfile/analysis/utils/TopKValuesTest.java | 10 +-
.../document/cache/CacheChangesTracker.java | 44 ++--
.../cache/CacheChangesTrackerConcurrencyTest.java | 215 +++++++++++++++++++
22 files changed, 658 insertions(+), 139 deletions(-)
diff --git
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
index e62084fcfe..49acb59232 100644
---
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
+++
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
@@ -25,7 +25,6 @@ import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
-import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
@@ -33,8 +32,7 @@ import org.slf4j.LoggerFactory;
import org.apache.jackrabbit.guava.common.cache.Cache;
import org.apache.jackrabbit.guava.common.cache.CacheBuilder;
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnels;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
class BlobIdSet {
@@ -42,19 +40,19 @@ class BlobIdSet {
private final File store;
- private final BloomFilter<CharSequence> bloomFilter;
+ private final BloomFilter bloomFilter;
private final Cache<String, Boolean> cache;
BlobIdSet(String repositoryDir, String filename) {
store = new File(new File(repositoryDir), filename);
- bloomFilter =
BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); //
about 8MB
+ bloomFilter = BloomFilter.construct(9000000, 0.03); // 9M entries, 3%
false positive rate
cache = CacheBuilder.newBuilder().maximumSize(1000).build();
fillBloomFilter();
}
synchronized boolean contains(String blobId) throws IOException {
- if (!bloomFilter.apply(blobId)) {
+ if (!bloomFilter.mayContain(blobId)) {
return false;
}
Boolean cached = cache.getIfPresent(blobId);
@@ -64,7 +62,7 @@ class BlobIdSet {
if (isPresentInStore(blobId)) {
cache.put(blobId, Boolean.TRUE);
- bloomFilter.put(blobId);
+ bloomFilter.add(blobId);
return true;
} else {
cache.put(blobId, Boolean.FALSE);
@@ -74,7 +72,7 @@ class BlobIdSet {
synchronized void add(String blobId) throws IOException {
addToStore(blobId);
- bloomFilter.put(blobId);
+ bloomFilter.add(blobId);
cache.put(blobId, Boolean.TRUE);
}
@@ -114,7 +112,7 @@ class BlobIdSet {
reader = new BufferedReader(new FileReader(store));
String line;
while ((line = reader.readLine()) != null) {
- bloomFilter.put(line);
+ bloomFilter.add(line);
}
} catch (IOException e) {
log.error("Can't fill bloom filter", e);
diff --git
a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
new file mode 100644
index 0000000000..d933bf29d4
--- /dev/null
+++
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.spi.blob.split;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class BlobIdSetTest {
+
+ private static final String TEST_FILENAME = "test-blob-ids.txt";
+
+ private File tempDir;
+ private BlobIdSet blobIdSet;
+
+ @Before
+ public void setup() throws IOException {
+ tempDir = Files.createTempDirectory("blob-id-set-test").toFile();
+ blobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME);
+ }
+
+ @After
+ public void cleanup() {
+ File testFile = new File(tempDir, TEST_FILENAME);
+ if (testFile.exists()) {
+ testFile.delete();
+ }
+ tempDir.delete();
+ }
+
+ @Test
+ public void testAddAndContains() throws IOException {
+ String blobId = "testblob123";
+ Assert.assertFalse("New set should not contain blob ID",
blobIdSet.contains(blobId));
+
+ blobIdSet.add(blobId);
+ Assert.assertTrue("Set should contain added blob ID",
blobIdSet.contains(blobId));
+ }
+
+ @Test
+ public void testMultipleAddAndContains() throws IOException {
+ String[] blobIds = {"blob1", "blob2", "blob3", "blob4", "blob5"};
+
+ // Add all blob IDs
+ for (String blobId : blobIds) {
+ blobIdSet.add(blobId);
+ }
+
+ // Check all blob IDs are present
+ for (String blobId : blobIds) {
+ Assert.assertTrue("Set should contain: " + blobId,
blobIdSet.contains(blobId));
+ }
+
+ // Check a non-existent blob ID
+ Assert.assertFalse("Set should not contain non-existent blob ID",
blobIdSet.contains("nonexistentblob"));
+ }
+
+ @Test
+ public void testPersistenceAcrossInstances() throws IOException {
+ String blobId = "persistenceblob";
+
+ // Add to the first instance
+ blobIdSet.add(blobId);
+
+ // Create a new instance pointing to the same file
+ BlobIdSet newSet = new BlobIdSet(tempDir.getAbsolutePath(),
TEST_FILENAME);
+
+ // Verify the new instance sees the previously added blob ID
+ Assert.assertTrue("New instance should see blob ID from file",
newSet.contains(blobId));
+ }
+
+ @Test
+ public void testEmptyFileStore() throws IOException {
+ // Create with non-existent file
+ File nonExistentDir =
Files.createTempDirectory("non-existent").toFile();
+ BlobIdSet emptySet = new BlobIdSet(nonExistentDir.getAbsolutePath(),
"empty.txt");
+
+ // Should not contain any blob IDs
+ Assert.assertFalse(emptySet.contains("anyblob"));
+
+ // Clean up
+ nonExistentDir.delete();
+ }
+
+ @Test
+ public void testLargeNumberOfEntries() throws IOException {
+ // Add a moderate number of entries
+ int count = 1000;
+ for (int i = 0; i < count; i++) {
+ blobIdSet.add("blob-" + i);
+ }
+
+ // Verify all entries can be found
+ for (int i = 0; i < count; i++) {
+ Assert.assertTrue(blobIdSet.contains("blob-" + i));
+ }
+
+ // Non-existent entries should return false
+ for (int i = 0; i < 10; i++) {
+ Assert.assertFalse(blobIdSet.contains("nonexistent-blob-" + i));
+ }
+ }
+
+ @Test
+ public void testFileContainsAddedEntries() throws IOException {
+ // Add several blob IDs
+ String[] blobIds = {"a", "b", "c"};
+ for (String id : blobIds) {
+ blobIdSet.add(id);
+ }
+
+ // Verify the file contains the added blob IDs
+ File storeFile = new File(tempDir, TEST_FILENAME);
+ Assert.assertTrue("Store file should exist", storeFile.exists());
+
+ List<String> fileContent = Files.readAllLines(storeFile.toPath());
+ Assert.assertEquals("File should contain all added blob IDs",
blobIds.length, fileContent.size());
+
+ for (int i = 0; i < blobIds.length; i++) {
+ Assert.assertEquals("File line should match blob ID", blobIds[i],
fileContent.get(i));
+ }
+ }
+
+ @Test
+ public void testBloomFilterPreventsUnnecessaryFileReads() throws
IOException {
+ // The bloom filter should prevent checking the file for non-existent
IDs
+
+ // Add some entries to populate the bloom filter
+ for (int i = 0; i < 10; i++) {
+ blobIdSet.add("existing-" + i);
+ }
+
+ // Using a unique pattern for non-existent IDs to ensure they hash
differently
+ // than the ones we added
+ for (int i = 0; i < 10; i++) {
+ Assert.assertFalse(blobIdSet.contains("definitely-not-there-" +
i));
+ }
+ }
+
+ @Test
+ public void testCachingBehavior() throws IOException {
+ String blobId = "cachedblob";
+
+ // Add the blob ID
+ blobIdSet.add(blobId);
+
+ // First check should populate cache
+ Assert.assertTrue(blobIdSet.contains(blobId));
+
+ // Even if we delete the file, the cached result should be used
+ File storeFile = new File(tempDir, TEST_FILENAME);
+ storeFile.delete();
+
+ // Should still return true due to cache
+ Assert.assertTrue(blobIdSet.contains(blobId));
+ }
+
+ @Test
+ public void testContainsFindsExistingEntriesInFile() throws IOException {
+ // Create some blob IDs
+ String[] blobIds = {"fileblob1", "fileblob2", "fileblob3"};
+
+ // Write blob IDs directly to file (not using BlobIdSet.add())
+ File storeFile = new File(tempDir, TEST_FILENAME);
+ try (FileWriter writer = new FileWriter(storeFile)) {
+ for (String id : blobIds) {
+ writer.write(id + "\n");
+ }
+ }
+
+ // Create a new BlobIdSet instance (which should load from the file)
+ BlobIdSet newBlobIdSet = new BlobIdSet(tempDir.getAbsolutePath(),
TEST_FILENAME);
+
+ // Verify that contains() finds all the IDs
+ for (String id : blobIds) {
+ Assert.assertTrue("Should contain blob ID written directly to
file: " + id, newBlobIdSet.contains(id));
+ }
+
+ // Verify a non-existent ID still returns false
+ Assert.assertFalse(newBlobIdSet.contains("notinfile"));
+ }
+
+ @Test
+ public void testBloomFilterFalsePositiveProbabilityLessThanThreePercent()
throws IOException {
+ // Load the bloom filter with a significant number of entries (about
5% of configured capacity)
+ final int numToAdd = 5000;
+
+ // Add entries to the bloom filter
+ for (int i = 0; i < numToAdd; i++) {
+ blobIdSet.add("entry-" + i);
+ }
+
+ // Test with non-existent entries using carefully crafted IDs
+ int numTests = 1000;
+ int falsePositives = 0;
+
+ // Use a distinct prefix to ensure test IDs don't conflict with added
entries
+ for (int i = 0; i < numTests; i++) {
+ String nonExistentId = "non-existent-" + i;
+
+ if (blobIdSet.contains(nonExistentId)) {
+ falsePositives++;
+ }
+ }
+
+ final double falsePositiveRate = (double) falsePositives / numTests;
+
+ // Verify the false positive rate is below the configured 3%
+ Assert.assertTrue(
+ "False positive rate should be less than 3%, was: " +
(falsePositiveRate * 100) + "%",
+ falsePositiveRate < 0.03
+ );
+ }
+}
\ No newline at end of file
diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml
index 5e8396cd65..8b774a03dc 100644
--- a/oak-commons/pom.xml
+++ b/oak-commons/pom.xml
@@ -102,6 +102,10 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
</dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.jackrabbit</groupId>
<artifactId>jackrabbit-jcr-commons</artifactId>
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
similarity index 83%
rename from
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
rename to
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
index 3ad439a909..3c266ea1bb 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
/**
* A Bloom filter implementation.
@@ -44,6 +44,12 @@ public class BloomFilter {
* @return the Bloom filter
*/
public static BloomFilter construct(long n, double fpp) {
+ if (n <= 0) {
+ throw new IllegalArgumentException("n must be greater than 0");
+ }
+ if (fpp <= 0 || fpp >= 1) {
+ throw new IllegalArgumentException("fpp must be between 0 and 1");
+ }
long m = calculateBits(n, fpp);
int k = calculateK((double) m / n);
return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
@@ -96,6 +102,15 @@ public class BloomFilter {
return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
}
+ /**
+     * Add an entry, using a 64-bit hash of the string value.
+     *
+     * @param value the value to add
+     */
+ public void add(String value) {
+ add(HashUtils.hash64(value));
+ }
+
/**
* Add an entry.
*
@@ -106,11 +121,22 @@ public class BloomFilter {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
for (int i = 0; i < k; i++) {
- data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
+ data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
a += b;
}
}
+ /**
+ * Whether the entry may be in the set.
+ *
+ * @param value the value to check
+ * @return true if the entry was added, or, with a certain false positive
+ * probability, even if it was not added
+ */
+ public boolean mayContain(String value) {
+ return mayContain(HashUtils.hash64(value));
+ }
+
/**
* Whether the entry may be in the set.
*
@@ -123,7 +149,7 @@ public class BloomFilter {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
for (int i = 0; i < k; i++) {
- if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) ==
0) {
+ if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L <<
a) == 0) {
return false;
}
a += b;
@@ -159,4 +185,4 @@ public class BloomFilter {
return (long) (-(m / k) * Math.log(1 - (x / m)));
}
-}
\ No newline at end of file
+}
\ No newline at end of file
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
similarity index 83%
rename from
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
rename to
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
index a2cd599e7b..90d998664c 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
@@ -16,17 +16,30 @@
* specific language governing permissions and limitations
* under the License.
*/
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.codec.digest.MurmurHash3;
/**
* A hash function utility class.
*/
-public class Hash {
+public class HashUtils {
- private Hash() {
+ private HashUtils() {
// utility class
}
+ /**
+ * Calculate a 64-bit hash value from a string.
+ *
+ * @param s the string
+ * @return the hash value
+ */
+ public static long hash64(String s) {
+ return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0];
+ }
+
/**
* Calculate a 64-bit hash value from a value, using a seed.
*
@@ -70,4 +83,4 @@ public class Hash {
return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
}
-}
+}
\ No newline at end of file
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
similarity index 97%
rename from
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
rename to
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
index dd20a1e777..2054decff0 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
import java.util.HashSet;
@@ -92,4 +92,4 @@ public class HyperLogLog {
return est;
}
-}
+}
\ No newline at end of file
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
index 81c20db5ca..5d1490f210 100644
---
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
@@ -21,7 +21,7 @@
* Utilities for Java collections and streams.
*/
@Internal(since = "1.0.0")
-@Version("2.2.0")
+@Version("2.3.0")
package org.apache.jackrabbit.oak.commons.collections;
import org.apache.jackrabbit.oak.commons.annotations.Internal;
import org.osgi.annotation.versioning.Version;
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
similarity index 61%
rename from
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
rename to
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
index c3813f340d..464de8c4ef 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
+++
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
@@ -16,10 +16,12 @@
* specific language governing permissions and limitations
* under the License.
*/
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThrows;
import org.junit.Test;
@@ -61,9 +63,6 @@ public class BloomFilterTest {
BloomFilter f = BloomFilter.construct(100, 0.01);
assertEquals(960, f.getBitCount());
assertEquals(7, f.getK());
- f = BloomFilter.construct(0, 0.01);
- assertEquals(0, f.getBitCount());
- assertEquals(1, f.getK());
}
@Test
@@ -72,14 +71,14 @@ public class BloomFilterTest {
int size = 500_000;
BloomFilter f = BloomFilter.construct(size, fpp);
for (int i = 0; i < size; i++) {
- f.add(Hash.hash64(i));
+ f.add(HashUtils.hash64(i));
}
for (int i = 0; i < size; i++) {
- assertTrue(f.mayContain(Hash.hash64(i)));
+ assertTrue(f.mayContain(HashUtils.hash64(i)));
}
int falsePositives = 0;
for (int i = 0; i < size; i++) {
- if (f.mayContain(Hash.hash64(i + size))) {
+ if (f.mayContain(HashUtils.hash64(i + size))) {
falsePositives++;
}
}
@@ -105,7 +104,7 @@ public class BloomFilterTest {
HyperLogLog hll = new HyperLogLog(1024, 0);
// now we calculate estimations with both the Bloom filter and
HyperLogLog
for(int i = 0; i < 20_000; i++) {
- long x = Hash.hash64(i);
+ long x = HashUtils.hash64(i);
bloom.add(x);
hll.add(x);
if (i > 0 && i % 1000 == 0) {
@@ -123,4 +122,79 @@ public class BloomFilterTest {
}
}
-}
+ @Test
+ public void filterFunctionality() {
+ BloomFilter filter = BloomFilter.construct(100, 0.01);
+ String testValue = "test-value";
+
+ // Initially should not contain anything
+ assertFalse(filter.mayContain(testValue));
+
+ // Add the item and verify it's found
+ filter.add(testValue);
+ assertTrue(filter.mayContain(testValue));
+
+ // Verify another value is not found
+ assertFalse(filter.mayContain("different-value"));
+ }
+
+ @Test
+ public void filterWithMultipleEntries() {
+ BloomFilter filter = BloomFilter.construct(100, 0.01);
+
+ // Add multiple entries
+ for (int i = 0; i < 100; i++) {
+ filter.add("value-" + i);
+ }
+
+ // Verify all entries are found
+ for (int i = 0; i < 100; i++) {
+ assertTrue(filter.mayContain("value-" + i));
+ }
+ }
+
+ @Test
+ public void falsePositiveProbability() {
+ // Create a filter with high false positive probability for testing
+ double fpp = 0.3;
+ BloomFilter filter = BloomFilter.construct(100, fpp);
+
+ // Fill the filter to capacity
+ for (int i = 0; i < 100; i++) {
+ filter.add("existing-" + i);
+ }
+
+ // Test with values not in the filter
+ int falsePositives = 0;
+ int trials = 1000;
+
+ for (int i = 0; i < trials; i++) {
+ if (filter.mayContain("nonexistent-" + i)) {
+ falsePositives++;
+ }
+ }
+
+ // The false positive rate should be approximately fpp
+ double actualFpp = (double) falsePositives / trials;
+ assertTrue("False positive rate should be close to expected, got " +
actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15);
+ }
+
+ @Test
+ public void invalidEntries() {
+ // Should throw exception for entries < 1
+ assertThrows(IllegalArgumentException.class,() ->
BloomFilter.construct(0, 0.01));
+ }
+
+ @Test
+ public void invalidFppZero() {
+ // Should throw exception for fpp <= 0
+ assertThrows(IllegalArgumentException.class,() ->
BloomFilter.construct(100, 0.0));
+ }
+
+ @Test
+ public void invalidFppOne() {
+ // Should throw exception for fpp >= 1
+ assertThrows(IllegalArgumentException.class,() ->
BloomFilter.construct(100, 1.0));
+ }
+
+}
\ No newline at end of file
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
similarity index 84%
copy from
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
copy to
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
index 570af9f16f..fd3fb39bff 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
+++
b/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+package org.apache.jackrabbit.oak.commons.collections;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
@@ -53,14 +53,10 @@ public class HyperLogLogTest {
@Test
public void test() {
int testCount = 50;
- for (int m = 8; m <= 128; m *= 2) {
+ for (int m = 16; m <= 128; m *= 2) {
double avg = Math.sqrt(averageOverRange(m, 30_000, testCount,
false, 2));
int min, max;
switch (m) {
- case 8:
- min = 16;
- max = 17;
- break;
case 16:
min = 22;
max = 23;
@@ -82,7 +78,6 @@ public class HyperLogLogTest {
max = 0;
break;
}
- // System.out.println(type + " expected " + min + ".." + max + "
got " + avg);
assertTrue("m " + m + " expected " + min + ".." + max + " got " +
avg, min < avg && avg < max);
}
}
@@ -116,14 +111,10 @@ public class HyperLogLogTest {
int runs = 2;
for (int test = 0; test < testCount; test++) {
HyperLogLog hll;
- if (m == 8) {
- hll = new HyperLogLogUsingLong(16, 0);
- } else {
- hll = new HyperLogLog(m, 0);
- }
+ hll = new HyperLogLog(m, 0);
long baseX = x;
for (int i = 0; i < size; i++) {
- hll.add(Hash.hash64(x));
+ hll.add(HashUtils.hash64(x));
x++;
}
long e = hll.estimate();
@@ -136,7 +127,7 @@ public class HyperLogLogTest {
for (int add = 0; add < repeat; add++) {
long x2 = baseX;
for (int i = 0; i < size; i++) {
- hll.add(Hash.hash64(x2));
+ hll.add(HashUtils.hash64(x2));
x2++;
}
}
@@ -167,22 +158,4 @@ public class HyperLogLogTest {
return Math.pow(relStdDevP, exponent);
}
- static class HyperLogLogUsingLong extends HyperLogLog {
-
- private long value;
-
- public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
- super(m, maxSmallSetSize);
- }
-
- public void add(long hash) {
- value = HyperLogLog3Linear64.add(value, hash);
- }
-
- public long estimate() {
- return HyperLogLog3Linear64.estimate(value);
- }
-
- }
-
-}
+}
\ No newline at end of file
diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml
index e1288dccc2..efd3657e51 100644
--- a/oak-run-commons/pom.xml
+++ b/oak-run-commons/pom.xml
@@ -110,6 +110,11 @@
<artifactId>oak-jcr</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>oak-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>org.jetbrains</groupId>
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
index 3ec93f175d..741c1cceae 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
@@ -20,7 +20,7 @@ package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul
import java.util.Objects;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
/**
* A binary id.
@@ -51,9 +51,9 @@ public class BinaryId {
// we need to hash again because some of the bits are fixed
// in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx
// (the hash64 is a reversible mapping, so there is no risk of
conflicts)
- this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16),
16));
- this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32),
16));
- this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32,
Math.min(48, buff.length())), 16));
+ this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0,
16), 16));
+ this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16,
32), 16));
+ this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32,
Math.min(48, buff.length())), 16));
}
@Override
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
index 272c7f357b..878375f1f8 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
@@ -28,8 +28,8 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
/**
* Collects the number and size of distinct binaries.
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
index 349a3fe35b..061aa39d52 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
@@ -27,7 +27,7 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
/**
* A histogram of distinct binaries. For each size range, we calculate the
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
index 80227f5744..c749a1ab59 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
@@ -31,7 +31,7 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues;
@@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector {
stats.values += p.getValues().length;
if (stats.count > MIN_PROPERTY_COUNT) {
for (String v : p.getValues()) {
- long hash = Hash.hash64(v.hashCode(), seed);
+ long hash = HashUtils.hash64(v.hashCode(), seed);
stats.hll = HyperLogLog3Linear64.add(stats.hll, hash);
stats.size += v.length();
stats.maxSize = Math.max(stats.maxSize, v.length());
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
index 2c235f2197..48b4a63b0b 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
@@ -21,6 +21,7 @@ package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils
import java.util.ArrayList;
import java.util.HashMap;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
/**
@@ -91,7 +92,7 @@ public class TopKValues {
if (countedCount > 1000) {
skipRemaining = SKIP;
}
- long hash = Hash.hash64(value.hashCode());
+ long hash = HashUtils.hash64(value);
long est = sketch.addAndEstimate(hash);
if (est < min) {
return;
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
index 10992f5c06..8f4e6d2cac 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
@@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -240,7 +240,7 @@ public class Prefetcher {
// and then we use a secondary hash
// otherwise the estimation is way off
int h = blob.getContentIdentity().hashCode();
- return Hash.hash64(h | (blob.length() << 32));
+ return HashUtils.hash64(h | (blob.length() << 32));
}
static enum PrefetchType {
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
index 7803a54400..5fe819335d 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
@@ -67,7 +67,7 @@ public class PropertyStatsTest {
pc.add(n);
}
assertEquals("PropertyStats\n"
- + "skewed weight 3 count 1000000 distinct 394382 avgSize 7
maxSize 11 top
{\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+ + "skewed weight 3 count 1000000 distinct 394382 avgSize 7
maxSize 11 top
{\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+ "", pc.toString());
}
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
index 92935ca286..212c0fba22 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue;
import java.util.Arrays;
import java.util.Random;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import org.junit.Test;
public class CountMinSketchTest {
@@ -83,11 +84,11 @@ public class CountMinSketchTest {
}
CountMinSketch est = new CountMinSketch(5, 16);
for (int i = 0; i < size; i++) {
- est.add(Hash.hash64(x + data[i]));
+ est.add(HashUtils.hash64(x + data[i]));
}
int[] counts = getCounts(data);
for (int i = 0; i < 10; i++) {
- long e = est.estimate(Hash.hash64(x + i));
+ long e = est.estimate(HashUtils.hash64(x + i));
long expectedPercent = (int) (100. * counts[i] / size);
long estPercent = (int) (100. * e / size);
if (debug) {
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
similarity index 83%
rename from
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
rename to
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
index 570af9f16f..eb7ab9a692 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
@@ -18,37 +18,13 @@
*/
package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
import org.junit.Test;
-public class HyperLogLogTest {
-
- @Test(expected = IllegalArgumentException.class)
- public void illegalHyperLogLogTooSmall() {
- new HyperLogLog(8, 0);
- }
-
- @Test(expected = IllegalArgumentException.class)
- public void illegalHyperLogLogNotPowerOfTwo() {
- new HyperLogLog(30, 0);
- }
-
- @Test
- public void smallSet() {
- HyperLogLog hll100 = new HyperLogLog(16, 100);
- assertEquals(0, hll100.estimate());
- HyperLogLog hll0 = new HyperLogLog(16, 0);
- assertEquals(0, hll0.estimate());
- for (int i = 0; i < 10_000; i++) {
- hll100.add(i % 100);
- hll0.add(i % 100);
- }
- assertEquals(100, hll100.estimate());
- assertNotEquals(100, hll0.estimate());
- }
+public class HyperLogLog3Linear64Test {
@Test
public void test() {
@@ -82,7 +58,6 @@ public class HyperLogLogTest {
max = 0;
break;
}
- // System.out.println(type + " expected " + min + ".." + max + "
got " + avg);
assertTrue("m " + m + " expected " + min + ".." + max + " got " +
avg, min < avg && avg < max);
}
}
@@ -123,7 +98,7 @@ public class HyperLogLogTest {
}
long baseX = x;
for (int i = 0; i < size; i++) {
- hll.add(Hash.hash64(x));
+ hll.add(HashUtils.hash64(x));
x++;
}
long e = hll.estimate();
@@ -136,7 +111,7 @@ public class HyperLogLogTest {
for (int add = 0; add < repeat; add++) {
long x2 = baseX;
for (int i = 0; i < size; i++) {
- hll.add(Hash.hash64(x2));
+ hll.add(HashUtils.hash64(x2));
x2++;
}
}
@@ -175,14 +150,16 @@ public class HyperLogLogTest {
super(m, maxSmallSetSize);
}
+ @Override
public void add(long hash) {
value = HyperLogLog3Linear64.add(value, hash);
}
+ @Override
public long estimate() {
return HyperLogLog3Linear64.estimate(value);
}
}
-}
+}
\ No newline at end of file
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
index 696e2e724f..dfd36a6eba 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
@@ -30,17 +30,17 @@ public class TopKValuesTest {
public void test() {
TopKValues v = new TopKValues(3);
Random r = new Random(1);
- for(int i=0; i<1000000; i++) {
- if(r.nextBoolean()) {
+ for (int i = 0; i < 1000000; i++) {
+ if (r.nextBoolean()) {
v.add("common" + r.nextInt(2));
} else {
v.add("rare" + r.nextInt(100));
}
}
-
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}",
v.toString());
+
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}",
v.toString());
assertEquals(91809, v.getCount());
- assertEquals(24849, v.getTopCount());
- assertEquals(24652, v.getSecondCount());
+ assertEquals(24231, v.getTopCount());
+ assertEquals(23844, v.getSecondCount());
}
}
diff --git
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
index 14e30cd673..196f6188d4 100644
---
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
+++
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
@@ -16,14 +16,13 @@
*/
package org.apache.jackrabbit.oak.plugins.document.cache;
-import org.apache.jackrabbit.guava.common.hash.BloomFilter;
-import org.apache.jackrabbit.guava.common.hash.Funnel;
-import org.apache.jackrabbit.guava.common.hash.PrimitiveSink;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Closeable;
import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;
public class CacheChangesTracker implements Closeable {
@@ -62,10 +61,10 @@ public class CacheChangesTracker implements Closeable {
changeTrackers.remove(this);
if (LOG.isDebugEnabled()) {
- if (lazyBloomFilter.filter == null) {
+ if (lazyBloomFilter.filterRef.get() == null) {
LOG.debug("Disposing CacheChangesTracker for {}, no filter was
needed", keyFilter);
} else {
- LOG.debug("Disposing CacheChangesTracker for {}, filter fpp
was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp());
+ LOG.debug("Disposing CacheChangesTracker for {}, filter fpp
was: {}", keyFilter, LazyBloomFilter.FPP);
}
}
}
@@ -76,38 +75,33 @@ public class CacheChangesTracker implements Closeable {
private final int entries;
- private volatile BloomFilter<String> filter;
+ private final AtomicReference<BloomFilter> filterRef;
public LazyBloomFilter(int entries) {
this.entries = entries;
+ this.filterRef = new AtomicReference<>();
}
public synchronized void put(String entry) {
- getFilter().put(entry);
+ getFilter().add(entry);
}
public boolean mightContain(String entry) {
- if (filter == null) {
- return false;
- } else {
- synchronized (this) {
- return filter.mightContain(entry);
- }
- }
+ BloomFilter f = filterRef.get();
+ return f != null && f.mayContain(entry);
}
- private BloomFilter<String> getFilter() {
- if (filter == null) {
- filter = BloomFilter.create(new Funnel<String>() {
- private static final long serialVersionUID =
-7114267990225941161L;
-
- @Override
- public void funnel(String from, PrimitiveSink into) {
- into.putUnencodedChars(from);
- }
- }, entries, FPP);
+ private BloomFilter getFilter() {
+ BloomFilter result = filterRef.get();
+ if (result == null) {
+ BloomFilter newFilter = BloomFilter.construct(entries, FPP);
+ if (filterRef.compareAndSet(null, newFilter)) {
+ result = newFilter;
+ } else {
+ result = filterRef.get();
+ }
}
- return filter;
+ return result;
}
}
}
diff --git
a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
new file mode 100644
index 0000000000..5d593df76c
--- /dev/null
+++
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.document.cache;
+
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests for CacheChangesTracker concurrency scenarios, particularly
+ * the LazyBloomFilter double-checked locking implementation.
+ */
+public class CacheChangesTrackerConcurrencyTest {
+
+ /**
+ * Test concurrent initialization of LazyBloomFilter to ensure
+ * double-checked locking prevents race conditions.
+ */
+ @Test
+ public void testLazyBloomFilterConcurrentInitialization() throws
InterruptedException {
+ final int threadCount = 20;
+ final int entriesPerThread = 50;
+ final CountDownLatch startLatch = new CountDownLatch(1);
+ final CountDownLatch doneLatch = new CountDownLatch(threadCount);
+ final ExecutorService executor =
Executors.newFixedThreadPool(threadCount);
+
+ // Create a LazyBloomFilter instance
+ final CacheChangesTracker.LazyBloomFilter lazyFilter =
+ new CacheChangesTracker.LazyBloomFilter(1000);
+
+ final AtomicInteger putOperations = new AtomicInteger(0);
+ final List<Exception> exceptions = Collections.synchronizedList(new
ArrayList<>());
+
+ try {
+ // Create multiple threads that will all try to initialize and use
the filter simultaneously
+ for (int i = 0; i < threadCount; i++) {
+ final int threadId = i;
+ executor.submit(() -> {
+ try {
+ // Wait for all threads to be ready
+ startLatch.await();
+
+ // Each thread adds multiple entries
+ for (int j = 0; j < entriesPerThread; j++) {
+ String key = "thread-" + threadId + "-key-" + j;
+ lazyFilter.put(key);
+ putOperations.incrementAndGet();
+
+ // Add a small random delay to increase chance of
race condition
+ if (j % 10 == 0) {
+ Thread.sleep(1);
+ }
+ }
+ } catch (Exception e) {
+ exceptions.add(e);
+ } finally {
+ doneLatch.countDown();
+ }
+ });
+ }
+
+ // Start all threads simultaneously
+ startLatch.countDown();
+
+ // Wait for all threads to complete
+ assertTrue("Test timed out", doneLatch.await(30,
TimeUnit.SECONDS));
+
+ // Verify no exceptions occurred
+ if (!exceptions.isEmpty()) {
+ fail("Exceptions occurred during concurrent access: " +
exceptions.get(0));
+ }
+
+ // Verify all put operations completed
+ assertEquals(threadCount * entriesPerThread, putOperations.get());
+
+ // Verify the filter works correctly after concurrent
initialization
+ for (int i = 0; i < threadCount; i++) {
+ for (int j = 0; j < entriesPerThread; j++) {
+ String key = "thread-" + i + "-key-" + j;
+ assertTrue("Filter should contain key: " + key,
lazyFilter.mightContain(key));
+ }
+ }
+
+ // Verify false positive behavior (some keys that weren't added
should return false)
+ int falsePositives = 0;
+ int testKeys = 100;
+ for (int i = 0; i < testKeys; i++) {
+ String nonExistentKey = "non-existent-key-" + i;
+ if (lazyFilter.mightContain(nonExistentKey)) {
+ falsePositives++;
+ }
+ }
+
+ // With 1000 entries and 1% FPP, we expect roughly 1% false
positives for non-existent keys
+ // Allow for some variance but it shouldn't be too high
+ assertTrue("False positive rate too high: " + falsePositives + "/"
+ testKeys,
+ falsePositives < testKeys * 0.05); // Allow up to 5% to
account for variance
+
+ } finally {
+ executor.shutdown();
+ if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
+ executor.shutdownNow();
+ }
+ }
+ }
+
+ /**
+ * Test concurrent put and mightContain operations to ensure thread safety.
+ */
+ @Test
+ public void testLazyBloomFilterConcurrentReadWrite() throws
InterruptedException {
+ final int threadCount = 10;
+ final int operationsPerThread = 100;
+ final CountDownLatch startLatch = new CountDownLatch(1);
+ final CountDownLatch doneLatch = new CountDownLatch(threadCount);
+ final ExecutorService executor =
Executors.newFixedThreadPool(threadCount);
+
+ final CacheChangesTracker.LazyBloomFilter lazyFilter =
+ new CacheChangesTracker.LazyBloomFilter(2000);
+
+ final AtomicInteger readOperations = new AtomicInteger(0);
+ final AtomicInteger writeOperations = new AtomicInteger(0);
+ final List<Exception> exceptions = Collections.synchronizedList(new
ArrayList<>());
+
+ try {
+ // Create mixed read/write threads
+ for (int i = 0; i < threadCount; i++) {
+ final int threadId = i;
+ final boolean isWriter = (i % 2 == 0);
+
+ executor.submit(() -> {
+ try {
+ startLatch.await();
+
+ for (int j = 0; j < operationsPerThread; j++) {
+ String key = "mixed-thread-" + threadId + "-key-"
+ j;
+
+ if (isWriter || j < 10) { // Writers, or first few
operations of readers
+ lazyFilter.put(key);
+ writeOperations.incrementAndGet();
+ }
+
+ // All threads also do reads
+ boolean result = lazyFilter.mightContain(key);
+ readOperations.incrementAndGet();
+
+ // If we just wrote the key, it should definitely
be found
+ if (isWriter || j < 10) {
+ assertTrue("Key should be found after being
added: " + key, result);
+ }
+ }
+ } catch (Exception e) {
+ exceptions.add(e);
+ } finally {
+ doneLatch.countDown();
+ }
+ });
+ }
+
+ startLatch.countDown();
+ assertTrue("Test timed out", doneLatch.await(30,
TimeUnit.SECONDS));
+
+ if (!exceptions.isEmpty()) {
+ fail("Exceptions occurred during concurrent read/write: " +
exceptions.get(0));
+ }
+
+ assertTrue("Should have performed read operations",
readOperations.get() > 0);
+ assertTrue("Should have performed write operations",
writeOperations.get() > 0);
+
+ } finally {
+ executor.shutdown();
+ if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
+ executor.shutdownNow();
+ }
+ }
+ }
+
+ /**
+ * Test that LazyBloomFilter behaves correctly when filter is never
initialized
+ * (i.e., only mightContain is called, never put).
+ */
+ @Test
+ public void testLazyBloomFilterNoInitialization() {
+ CacheChangesTracker.LazyBloomFilter lazyFilter =
+ new CacheChangesTracker.LazyBloomFilter(1000);
+
+ // Should return false for any key when filter is not initialized
+ assertFalse(lazyFilter.mightContain("any-key"));
+ assertFalse(lazyFilter.mightContain("another-key"));
+ assertFalse(lazyFilter.mightContain(""));
+ }
+}
\ No newline at end of file