This is an automated email from the ASF dual-hosted git repository.
reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new a9641650ac Revert "OAK-10674 Use Oak's Bloom filter (#2412)"
a9641650ac is described below
commit a9641650acb173d8a10183f14e8414519a4c646e
Author: Julian Reschke <[email protected]>
AuthorDate: Tue Aug 5 05:09:29 2025 +0100
Revert "OAK-10674 Use Oak's Bloom filter (#2412)"
This reverts commit c438a3f96a3b7c148a478b098aa59b7a57650c7d.
---
.../jackrabbit/oak/spi/blob/split/BlobIdSet.java | 16 +-
.../oak/spi/blob/split/BlobIdSetTest.java | 238 ---------------------
oak-commons/pom.xml | 4 -
.../oak/commons/collections/package-info.java | 2 +-
oak-run-commons/pom.xml | 5 -
.../flatfile/analysis/modules/BinaryId.java | 8 +-
.../analysis/modules/DistinctBinarySize.java | 4 +-
.../modules/DistinctBinarySizeHistogram.java | 2 +-
.../flatfile/analysis/modules/PropertyStats.java | 4 +-
.../flatfile/analysis/utils}/BloomFilter.java | 34 +--
.../document/flatfile/analysis/utils/Hash.java | 21 +-
.../flatfile/analysis/utils}/HyperLogLog.java | 4 +-
.../flatfile/analysis/utils/TopKValues.java | 3 +-
.../index/indexer/document/tree/Prefetcher.java | 6 +-
.../analysis/modules/PropertyStatsTest.java | 2 +-
.../flatfile/analysis/utils}/BloomFilterTest.java | 92 +-------
.../analysis/utils/CountMinSketchTest.java | 5 +-
.../analysis/utils/HyperLogLog3Linear64Test.java | 165 --------------
.../flatfile/analysis/utils}/HyperLogLogTest.java | 39 +++-
.../flatfile/analysis/utils/TopKValuesTest.java | 10 +-
.../document/cache/CacheChangesTracker.java | 44 ++--
.../cache/CacheChangesTrackerConcurrencyTest.java | 215 -------------------
22 files changed, 108 insertions(+), 815 deletions(-)
diff --git
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
index 49acb59232..e62084fcfe 100644
---
a/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
+++
b/oak-blob/src/main/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSet.java
@@ -25,6 +25,7 @@ import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
@@ -32,7 +33,8 @@ import org.slf4j.LoggerFactory;
import org.apache.jackrabbit.guava.common.cache.Cache;
import org.apache.jackrabbit.guava.common.cache.CacheBuilder;
-import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
+import org.apache.jackrabbit.guava.common.hash.BloomFilter;
+import org.apache.jackrabbit.guava.common.hash.Funnels;
class BlobIdSet {
@@ -40,19 +42,19 @@ class BlobIdSet {
private final File store;
- private final BloomFilter bloomFilter;
+ private final BloomFilter<CharSequence> bloomFilter;
private final Cache<String, Boolean> cache;
BlobIdSet(String repositoryDir, String filename) {
store = new File(new File(repositoryDir), filename);
- bloomFilter = BloomFilter.construct(9000000, 0.03); // 9M entries, 3%
false positive rate
+ bloomFilter =
BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 9000000); //
about 8MB
cache = CacheBuilder.newBuilder().maximumSize(1000).build();
fillBloomFilter();
}
synchronized boolean contains(String blobId) throws IOException {
- if (!bloomFilter.mayContain(blobId)) {
+ if (!bloomFilter.apply(blobId)) {
return false;
}
Boolean cached = cache.getIfPresent(blobId);
@@ -62,7 +64,7 @@ class BlobIdSet {
if (isPresentInStore(blobId)) {
cache.put(blobId, Boolean.TRUE);
- bloomFilter.add(blobId);
+ bloomFilter.put(blobId);
return true;
} else {
cache.put(blobId, Boolean.FALSE);
@@ -72,7 +74,7 @@ class BlobIdSet {
synchronized void add(String blobId) throws IOException {
addToStore(blobId);
- bloomFilter.add(blobId);
+ bloomFilter.put(blobId);
cache.put(blobId, Boolean.TRUE);
}
@@ -112,7 +114,7 @@ class BlobIdSet {
reader = new BufferedReader(new FileReader(store));
String line;
while ((line = reader.readLine()) != null) {
- bloomFilter.add(line);
+ bloomFilter.put(line);
}
} catch (IOException e) {
log.error("Can't fill bloom filter", e);
diff --git
a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
b/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
deleted file mode 100644
index d933bf29d4..0000000000
---
a/oak-blob/src/test/java/org/apache/jackrabbit/oak/spi/blob/split/BlobIdSetTest.java
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.jackrabbit.oak.spi.blob.split;
-
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.util.List;
-
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-public class BlobIdSetTest {
-
- private static final String TEST_FILENAME = "test-blob-ids.txt";
-
- private File tempDir;
- private BlobIdSet blobIdSet;
-
- @Before
- public void setup() throws IOException {
- tempDir = Files.createTempDirectory("blob-id-set-test").toFile();
- blobIdSet = new BlobIdSet(tempDir.getAbsolutePath(), TEST_FILENAME);
- }
-
- @After
- public void cleanup() {
- File testFile = new File(tempDir, TEST_FILENAME);
- if (testFile.exists()) {
- testFile.delete();
- }
- tempDir.delete();
- }
-
- @Test
- public void testAddAndContains() throws IOException {
- String blobId = "testblob123";
- Assert.assertFalse("New set should not contain blob ID",
blobIdSet.contains(blobId));
-
- blobIdSet.add(blobId);
- Assert.assertTrue("Set should contain added blob ID",
blobIdSet.contains(blobId));
- }
-
- @Test
- public void testMultipleAddAndContains() throws IOException {
- String[] blobIds = {"blob1", "blob2", "blob3", "blob4", "blob5"};
-
- // Add all blob IDs
- for (String blobId : blobIds) {
- blobIdSet.add(blobId);
- }
-
- // Check all blob IDs are present
- for (String blobId : blobIds) {
- Assert.assertTrue("Set should contain: " + blobId,
blobIdSet.contains(blobId));
- }
-
- // Check a non-existent blob ID
- Assert.assertFalse("Set should not contain non-existent blob ID",
blobIdSet.contains("nonexistentblob"));
- }
-
- @Test
- public void testPersistenceAcrossInstances() throws IOException {
- String blobId = "persistenceblob";
-
- // Add to the first instance
- blobIdSet.add(blobId);
-
- // Create a new instance pointing to the same file
- BlobIdSet newSet = new BlobIdSet(tempDir.getAbsolutePath(),
TEST_FILENAME);
-
- // Verify the new instance sees the previously added blob ID
- Assert.assertTrue("New instance should see blob ID from file",
newSet.contains(blobId));
- }
-
- @Test
- public void testEmptyFileStore() throws IOException {
- // Create with non-existent file
- File nonExistentDir =
Files.createTempDirectory("non-existent").toFile();
- BlobIdSet emptySet = new BlobIdSet(nonExistentDir.getAbsolutePath(),
"empty.txt");
-
- // Should not contain any blob IDs
- Assert.assertFalse(emptySet.contains("anyblob"));
-
- // Clean up
- nonExistentDir.delete();
- }
-
- @Test
- public void testLargeNumberOfEntries() throws IOException {
- // Add a moderate number of entries
- int count = 1000;
- for (int i = 0; i < count; i++) {
- blobIdSet.add("blob-" + i);
- }
-
- // Verify all entries can be found
- for (int i = 0; i < count; i++) {
- Assert.assertTrue(blobIdSet.contains("blob-" + i));
- }
-
- // Non-existent entries should return false
- for (int i = 0; i < 10; i++) {
- Assert.assertFalse(blobIdSet.contains("nonexistent-blob-" + i));
- }
- }
-
- @Test
- public void testFileContainsAddedEntries() throws IOException {
- // Add several blob IDs
- String[] blobIds = {"a", "b", "c"};
- for (String id : blobIds) {
- blobIdSet.add(id);
- }
-
- // Verify the file contains the added blob IDs
- File storeFile = new File(tempDir, TEST_FILENAME);
- Assert.assertTrue("Store file should exist", storeFile.exists());
-
- List<String> fileContent = Files.readAllLines(storeFile.toPath());
- Assert.assertEquals("File should contain all added blob IDs",
blobIds.length, fileContent.size());
-
- for (int i = 0; i < blobIds.length; i++) {
- Assert.assertEquals("File line should match blob ID", blobIds[i],
fileContent.get(i));
- }
- }
-
- @Test
- public void testBloomFilterPreventsUnnecessaryFileReads() throws
IOException {
- // The bloom filter should prevent checking the file for non-existent
IDs
-
- // Add some entries to populate the bloom filter
- for (int i = 0; i < 10; i++) {
- blobIdSet.add("existing-" + i);
- }
-
- // Using a unique pattern for non-existent IDs to ensure they hash
differently
- // than the ones we added
- for (int i = 0; i < 10; i++) {
- Assert.assertFalse(blobIdSet.contains("definitely-not-there-" +
i));
- }
- }
-
- @Test
- public void testCachingBehavior() throws IOException {
- String blobId = "cachedblob";
-
- // Add the blob ID
- blobIdSet.add(blobId);
-
- // First check should populate cache
- Assert.assertTrue(blobIdSet.contains(blobId));
-
- // Even if we delete the file, the cached result should be used
- File storeFile = new File(tempDir, TEST_FILENAME);
- storeFile.delete();
-
- // Should still return true due to cache
- Assert.assertTrue(blobIdSet.contains(blobId));
- }
-
- @Test
- public void testContainsFindsExistingEntriesInFile() throws IOException {
- // Create some blob IDs
- String[] blobIds = {"fileblob1", "fileblob2", "fileblob3"};
-
- // Write blob IDs directly to file (not using BlobIdSet.add())
- File storeFile = new File(tempDir, TEST_FILENAME);
- try (FileWriter writer = new FileWriter(storeFile)) {
- for (String id : blobIds) {
- writer.write(id + "\n");
- }
- }
-
- // Create a new BlobIdSet instance (which should load from the file)
- BlobIdSet newBlobIdSet = new BlobIdSet(tempDir.getAbsolutePath(),
TEST_FILENAME);
-
- // Verify that contains() finds all the IDs
- for (String id : blobIds) {
- Assert.assertTrue("Should contain blob ID written directly to
file: " + id, newBlobIdSet.contains(id));
- }
-
- // Verify a non-existent ID still returns false
- Assert.assertFalse(newBlobIdSet.contains("notinfile"));
- }
-
- @Test
- public void testBloomFilterFalsePositiveProbabilityLessThanThreePercent()
throws IOException {
- // Load the bloom filter with a significant number of entries (about
5% of configured capacity)
- final int numToAdd = 5000;
-
- // Add entries to the bloom filter
- for (int i = 0; i < numToAdd; i++) {
- blobIdSet.add("entry-" + i);
- }
-
- // Test with non-existent entries using carefully crafted IDs
- int numTests = 1000;
- int falsePositives = 0;
-
- // Use a distinct prefix to ensure test IDs don't conflict with added
entries
- for (int i = 0; i < numTests; i++) {
- String nonExistentId = "non-existent-" + i;
-
- if (blobIdSet.contains(nonExistentId)) {
- falsePositives++;
- }
- }
-
- final double falsePositiveRate = (double) falsePositives / numTests;
-
- // Verify the false positive rate is below the configured 3%
- Assert.assertTrue(
- "False positive rate should be less than 3%, was: " +
(falsePositiveRate * 100) + "%",
- falsePositiveRate < 0.03
- );
- }
-}
\ No newline at end of file
diff --git a/oak-commons/pom.xml b/oak-commons/pom.xml
index 8b774a03dc..5e8396cd65 100644
--- a/oak-commons/pom.xml
+++ b/oak-commons/pom.xml
@@ -102,10 +102,6 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
</dependency>
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- </dependency>
<dependency>
<groupId>org.apache.jackrabbit</groupId>
<artifactId>jackrabbit-jcr-commons</artifactId>
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
index 5d1490f210..81c20db5ca 100644
---
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
+++
b/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/package-info.java
@@ -21,7 +21,7 @@
* Utilities for Java collections and streams.
*/
@Internal(since = "1.0.0")
-@Version("2.3.0")
+@Version("2.2.0")
package org.apache.jackrabbit.oak.commons.collections;
import org.apache.jackrabbit.oak.commons.annotations.Internal;
import org.osgi.annotation.versioning.Version;
diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml
index efd3657e51..e1288dccc2 100644
--- a/oak-run-commons/pom.xml
+++ b/oak-run-commons/pom.xml
@@ -110,11 +110,6 @@
<artifactId>oak-jcr</artifactId>
<version>${project.version}</version>
</dependency>
- <dependency>
- <groupId>org.apache.jackrabbit</groupId>
- <artifactId>oak-commons</artifactId>
- <version>${project.version}</version>
- </dependency>
<dependency>
<groupId>org.jetbrains</groupId>
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
index 741c1cceae..3ec93f175d 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
@@ -20,7 +20,7 @@ package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul
import java.util.Objects;
-import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
/**
* A binary id.
@@ -51,9 +51,9 @@ public class BinaryId {
// we need to hash again because some of the bits are fixed
// in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx
// (the hash64 is a reversible mapping, so there is no risk of
conflicts)
- this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0,
16), 16));
- this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16,
32), 16));
- this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32,
Math.min(48, buff.length())), 16));
+ this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16),
16));
+ this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32),
16));
+ this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32,
Math.min(48, buff.length())), 16));
}
@Override
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
index 878375f1f8..272c7f357b 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
@@ -28,8 +28,8 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
-import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
/**
* Collects the number and size of distinct binaries.
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
index 061aa39d52..349a3fe35b 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
@@ -27,7 +27,7 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
/**
* A histogram of distinct binaries. For each size range, we calculate the
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
index c749a1ab59..80227f5744 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
@@ -31,7 +31,7 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
-import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues;
@@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector {
stats.values += p.getValues().length;
if (stats.count > MIN_PROPERTY_COUNT) {
for (String v : p.getValues()) {
- long hash = HashUtils.hash64(v.hashCode(), seed);
+ long hash = Hash.hash64(v.hashCode(), seed);
stats.hll = HyperLogLog3Linear64.add(stats.hll, hash);
stats.size += v.length();
stats.maxSize = Math.max(stats.maxSize, v.length());
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
similarity index 83%
rename from
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
rename to
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
index 3c266ea1bb..3ad439a909 100644
---
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/BloomFilter.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilter.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.apache.jackrabbit.oak.commons.collections;
+package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
/**
* A Bloom filter implementation.
@@ -44,12 +44,6 @@ public class BloomFilter {
* @return the Bloom filter
*/
public static BloomFilter construct(long n, double fpp) {
- if (n <= 0) {
- throw new IllegalArgumentException("n must be greater than 0");
- }
- if (fpp <= 0 || fpp >= 1) {
- throw new IllegalArgumentException("fpp must be between 0 and 1");
- }
long m = calculateBits(n, fpp);
int k = calculateK((double) m / n);
return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
@@ -102,15 +96,6 @@ public class BloomFilter {
return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
}
- /**
- * Add an entry, using an int hash code.
- *
- * @param value the value to check
- */
- public void add(String value) {
- add(HashUtils.hash64(value));
- }
-
/**
* Add an entry.
*
@@ -121,22 +106,11 @@ public class BloomFilter {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
for (int i = 0; i < k; i++) {
- data[HashUtils.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
+ data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
a += b;
}
}
- /**
- * Whether the entry may be in the set.
- *
- * @param value the value to check
- * @return true if the entry was added, or, with a certain false positive
- * probability, even if it was not added
- */
- public boolean mayContain(String value) {
- return mayContain(HashUtils.hash64(value));
- }
-
/**
* Whether the entry may be in the set.
*
@@ -149,7 +123,7 @@ public class BloomFilter {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
for (int i = 0; i < k; i++) {
- if ((data[HashUtils.reduce((int) (a >>> 32), arraySize)] & 1L <<
a) == 0) {
+ if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) ==
0) {
return false;
}
a += b;
@@ -185,4 +159,4 @@ public class BloomFilter {
return (long) (-(m / k) * Math.log(1 - (x / m)));
}
-}
\ No newline at end of file
+}
\ No newline at end of file
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
similarity index 83%
rename from
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
rename to
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
index 90d998664c..a2cd599e7b 100644
---
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HashUtils.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/Hash.java
@@ -16,30 +16,17 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.apache.jackrabbit.oak.commons.collections;
-
-import java.nio.charset.StandardCharsets;
-import org.apache.commons.codec.digest.MurmurHash3;
+package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
/**
* A hash function utility class.
*/
-public class HashUtils {
+public class Hash {
- private HashUtils() {
+ private Hash() {
// utility class
}
- /**
- * Calculate a 64-bit hash value from a string.
- *
- * @param s the string
- * @return the hash value
- */
- public static long hash64(String s) {
- return MurmurHash3.hash128(s.getBytes(StandardCharsets.UTF_8))[0];
- }
-
/**
* Calculate a 64-bit hash value from a value, using a seed.
*
@@ -83,4 +70,4 @@ public class HashUtils {
return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
}
-}
\ No newline at end of file
+}
diff --git
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
similarity index 97%
rename from
oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
rename to
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
index 2054decff0..dd20a1e777 100644
---
a/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLog.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.apache.jackrabbit.oak.commons.collections;
+package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
import java.util.HashSet;
@@ -92,4 +92,4 @@ public class HyperLogLog {
return est;
}
-}
\ No newline at end of file
+}
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
index 48b4a63b0b..2c235f2197 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
@@ -21,7 +21,6 @@ package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils
import java.util.ArrayList;
import java.util.HashMap;
-import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
/**
@@ -92,7 +91,7 @@ public class TopKValues {
if (countedCount > 1000) {
skipRemaining = SKIP;
}
- long hash = HashUtils.hash64(value);
+ long hash = Hash.hash64(value.hashCode());
long est = sketch.addAndEstimate(hash);
if (est < min) {
return;
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
index 8f4e6d2cac..10992f5c06 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
@@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry;
-import org.apache.jackrabbit.oak.commons.collections.HashUtils;
-import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -240,7 +240,7 @@ public class Prefetcher {
// and then we use a secondary hash
// otherwise the estimation is way off
int h = blob.getContentIdentity().hashCode();
- return HashUtils.hash64(h | (blob.length() << 32));
+ return Hash.hash64(h | (blob.length() << 32));
}
static enum PrefetchType {
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
index 5fe819335d..7803a54400 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
@@ -67,7 +67,7 @@ public class PropertyStatsTest {
pc.add(n);
}
assertEquals("PropertyStats\n"
- + "skewed weight 3 count 1000000 distinct 394382 avgSize 7
maxSize 11 top
{\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+ + "skewed weight 3 count 1000000 distinct 394382 avgSize 7
maxSize 11 top
{\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+ "", pc.toString());
}
diff --git
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
similarity index 61%
rename from
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
rename to
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
index 464de8c4ef..c3813f340d 100644
---
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/BloomFilterTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/BloomFilterTest.java
@@ -16,12 +16,10 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.apache.jackrabbit.oak.commons.collections;
+package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThrows;
import org.junit.Test;
@@ -63,6 +61,9 @@ public class BloomFilterTest {
BloomFilter f = BloomFilter.construct(100, 0.01);
assertEquals(960, f.getBitCount());
assertEquals(7, f.getK());
+ f = BloomFilter.construct(0, 0.01);
+ assertEquals(0, f.getBitCount());
+ assertEquals(1, f.getK());
}
@Test
@@ -71,14 +72,14 @@ public class BloomFilterTest {
int size = 500_000;
BloomFilter f = BloomFilter.construct(size, fpp);
for (int i = 0; i < size; i++) {
- f.add(HashUtils.hash64(i));
+ f.add(Hash.hash64(i));
}
for (int i = 0; i < size; i++) {
- assertTrue(f.mayContain(HashUtils.hash64(i)));
+ assertTrue(f.mayContain(Hash.hash64(i)));
}
int falsePositives = 0;
for (int i = 0; i < size; i++) {
- if (f.mayContain(HashUtils.hash64(i + size))) {
+ if (f.mayContain(Hash.hash64(i + size))) {
falsePositives++;
}
}
@@ -104,7 +105,7 @@ public class BloomFilterTest {
HyperLogLog hll = new HyperLogLog(1024, 0);
// now we calculate estimations with both the Bloom filter and
HyperLogLog
for(int i = 0; i < 20_000; i++) {
- long x = HashUtils.hash64(i);
+ long x = Hash.hash64(i);
bloom.add(x);
hll.add(x);
if (i > 0 && i % 1000 == 0) {
@@ -122,79 +123,4 @@ public class BloomFilterTest {
}
}
- @Test
- public void filterFunctionality() {
- BloomFilter filter = BloomFilter.construct(100, 0.01);
- String testValue = "test-value";
-
- // Initially should not contain anything
- assertFalse(filter.mayContain(testValue));
-
- // Add the item and verify it's found
- filter.add(testValue);
- assertTrue(filter.mayContain(testValue));
-
- // Verify another value is not found
- assertFalse(filter.mayContain("different-value"));
- }
-
- @Test
- public void filterWithMultipleEntries() {
- BloomFilter filter = BloomFilter.construct(100, 0.01);
-
- // Add multiple entries
- for (int i = 0; i < 100; i++) {
- filter.add("value-" + i);
- }
-
- // Verify all entries are found
- for (int i = 0; i < 100; i++) {
- assertTrue(filter.mayContain("value-" + i));
- }
- }
-
- @Test
- public void falsePositiveProbability() {
- // Create a filter with high false positive probability for testing
- double fpp = 0.3;
- BloomFilter filter = BloomFilter.construct(100, fpp);
-
- // Fill the filter to capacity
- for (int i = 0; i < 100; i++) {
- filter.add("existing-" + i);
- }
-
- // Test with values not in the filter
- int falsePositives = 0;
- int trials = 1000;
-
- for (int i = 0; i < trials; i++) {
- if (filter.mayContain("nonexistent-" + i)) {
- falsePositives++;
- }
- }
-
- // The false positive rate should be approximately fpp
- double actualFpp = (double) falsePositives / trials;
- assertTrue("False positive rate should be close to expected, got " +
actualFpp + " expected " + fpp, Math.abs(actualFpp - fpp) < 0.15);
- }
-
- @Test
- public void invalidEntries() {
- // Should throw exception for entries < 1
- assertThrows(IllegalArgumentException.class,() ->
BloomFilter.construct(0, 0.01));
- }
-
- @Test
- public void invalidFppZero() {
- // Should throw exception for fpp <= 0
- assertThrows(IllegalArgumentException.class,() ->
BloomFilter.construct(100, 0.0));
- }
-
- @Test
- public void invalidFppOne() {
- // Should throw exception for fpp >= 1
- assertThrows(IllegalArgumentException.class,() ->
BloomFilter.construct(100, 1.0));
- }
-
-}
\ No newline at end of file
+}
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
index 212c0fba22..92935ca286 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
@@ -24,7 +24,6 @@ import static org.junit.Assert.assertTrue;
import java.util.Arrays;
import java.util.Random;
-import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import org.junit.Test;
public class CountMinSketchTest {
@@ -84,11 +83,11 @@ public class CountMinSketchTest {
}
CountMinSketch est = new CountMinSketch(5, 16);
for (int i = 0; i < size; i++) {
- est.add(HashUtils.hash64(x + data[i]));
+ est.add(Hash.hash64(x + data[i]));
}
int[] counts = getCounts(data);
for (int i = 0; i < 10; i++) {
- long e = est.estimate(HashUtils.hash64(x + i));
+ long e = est.estimate(Hash.hash64(x + i));
long expectedPercent = (int) (100. * counts[i] / size);
long estPercent = (int) (100. * e / size);
if (debug) {
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
deleted file mode 100644
index eb7ab9a692..0000000000
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.jackrabbit.oak.commons.collections.HashUtils;
-import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
-import org.junit.Test;
-
-public class HyperLogLog3Linear64Test {
-
- @Test
- public void test() {
- int testCount = 50;
- for (int m = 8; m <= 128; m *= 2) {
- double avg = Math.sqrt(averageOverRange(m, 30_000, testCount,
false, 2));
- int min, max;
- switch (m) {
- case 8:
- min = 16;
- max = 17;
- break;
- case 16:
- min = 22;
- max = 23;
- break;
- case 32:
- min = 15;
- max = 16;
- break;
- case 64:
- min = 10;
- max = 11;
- break;
- case 128:
- min = 7;
- max = 8;
- break;
- default:
- min = 0;
- max = 0;
- break;
- }
- assertTrue("m " + m + " expected " + min + ".." + max + " got " +
avg, min < avg && avg < max);
- }
- }
-
- private static double averageOverRange(int m, long maxSize, int testCount,
boolean debug, double exponent) {
- double sum = 0;
- int count = 0;
- for (long size = 1; size <= 20; size++) {
- sum += test(m, size, testCount, debug, exponent);
- count++;
- }
- for (long size = 22; size <= 300; size += size / 5) {
- sum += test(m, size, testCount, debug, exponent);
- count++;
- }
- for (long size = 400; size <= maxSize; size *= 2) {
- sum += test(m, size, testCount, debug, exponent);
- count++;
- }
- return sum / count;
- }
-
- private static double test(int m, long size, int testCount, boolean debug,
double exponent) {
- long x = 0;
- long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
- long ns = System.nanoTime();
- double sumSquareError = 0;
- double sum = 0;
- double sumFirst = 0;
- int repeat = 10;
- int runs = 2;
- for (int test = 0; test < testCount; test++) {
- HyperLogLog hll;
- if (m == 8) {
- hll = new HyperLogLogUsingLong(16, 0);
- } else {
- hll = new HyperLogLog(m, 0);
- }
- long baseX = x;
- for (int i = 0; i < size; i++) {
- hll.add(HashUtils.hash64(x));
- x++;
- }
- long e = hll.estimate();
- sum += e;
- min = Math.min(min, e);
- max = Math.max(max, e);
- long error = e - size;
- sumSquareError += error * error;
- sumFirst += e;
- for (int add = 0; add < repeat; add++) {
- long x2 = baseX;
- for (int i = 0; i < size; i++) {
- hll.add(HashUtils.hash64(x2));
- x2++;
- }
- }
- e = hll.estimate();
- sum += e;
- min = Math.min(min, e);
- max = Math.max(max, e);
- error = e - size;
- sumSquareError += error * error;
- }
- ns = System.nanoTime() - ns;
- long nsPerItem = ns / testCount / runs / (1 + repeat) / size;
- double stdDev = Math.sqrt(sumSquareError / testCount / runs);
- double relStdDevP = stdDev / size * 100;
- int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100);
- int biasP = (int) (100 * (sum / testCount / runs / size) - 100);
- if (debug) {
- System.out.println("m " + m + " size " + size + " relStdDev% " +
(int) relStdDevP +
- " range " + min + ".." + max +
- " biasFirst% " + biasFirstP +
- " bias% " + biasP +
- " avg " + (sum / testCount / runs) +
- " time " + nsPerItem);
- }
- // we try to reduce the relStdDevP, make sure there are no large values
- // (trying to reduce sumSquareError directly
- // would mean we care more about larger sets, but we don't)
- return Math.pow(relStdDevP, exponent);
- }
-
- static class HyperLogLogUsingLong extends HyperLogLog {
-
- private long value;
-
- public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
- super(m, maxSmallSetSize);
- }
-
- @Override
- public void add(long hash) {
- value = HyperLogLog3Linear64.add(value, hash);
- }
-
- @Override
- public long estimate() {
- return HyperLogLog3Linear64.estimate(value);
- }
-
- }
-
-}
\ No newline at end of file
diff --git
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
similarity index 84%
rename from
oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
rename to
oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
index fd3fb39bff..570af9f16f 100644
---
a/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/collections/HyperLogLogTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLogTest.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.apache.jackrabbit.oak.commons.collections;
+package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
@@ -53,10 +53,14 @@ public class HyperLogLogTest {
@Test
public void test() {
int testCount = 50;
- for (int m = 16; m <= 128; m *= 2) {
+ for (int m = 8; m <= 128; m *= 2) {
double avg = Math.sqrt(averageOverRange(m, 30_000, testCount,
false, 2));
int min, max;
switch (m) {
+ case 8:
+ min = 16;
+ max = 17;
+ break;
case 16:
min = 22;
max = 23;
@@ -78,6 +82,7 @@ public class HyperLogLogTest {
max = 0;
break;
}
+ // System.out.println(type + " expected " + min + ".." + max + "
got " + avg);
assertTrue("m " + m + " expected " + min + ".." + max + " got " +
avg, min < avg && avg < max);
}
}
@@ -111,10 +116,14 @@ public class HyperLogLogTest {
int runs = 2;
for (int test = 0; test < testCount; test++) {
HyperLogLog hll;
- hll = new HyperLogLog(m, 0);
+ if (m == 8) {
+ hll = new HyperLogLogUsingLong(16, 0);
+ } else {
+ hll = new HyperLogLog(m, 0);
+ }
long baseX = x;
for (int i = 0; i < size; i++) {
- hll.add(HashUtils.hash64(x));
+ hll.add(Hash.hash64(x));
x++;
}
long e = hll.estimate();
@@ -127,7 +136,7 @@ public class HyperLogLogTest {
for (int add = 0; add < repeat; add++) {
long x2 = baseX;
for (int i = 0; i < size; i++) {
- hll.add(HashUtils.hash64(x2));
+ hll.add(Hash.hash64(x2));
x2++;
}
}
@@ -158,4 +167,22 @@ public class HyperLogLogTest {
return Math.pow(relStdDevP, exponent);
}
-}
\ No newline at end of file
+ static class HyperLogLogUsingLong extends HyperLogLog {
+
+ private long value;
+
+ public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
+ super(m, maxSmallSetSize);
+ }
+
+ public void add(long hash) {
+ value = HyperLogLog3Linear64.add(value, hash);
+ }
+
+ public long estimate() {
+ return HyperLogLog3Linear64.estimate(value);
+ }
+
+ }
+
+}
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
index dfd36a6eba..696e2e724f 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
@@ -30,17 +30,17 @@ public class TopKValuesTest {
public void test() {
TopKValues v = new TopKValues(3);
Random r = new Random(1);
- for (int i = 0; i < 1000000; i++) {
- if (r.nextBoolean()) {
+ for(int i=0; i<1000000; i++) {
+ if(r.nextBoolean()) {
v.add("common" + r.nextInt(2));
} else {
v.add("rare" + r.nextInt(100));
}
}
-
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}",
v.toString());
+
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}",
v.toString());
assertEquals(91809, v.getCount());
- assertEquals(24231, v.getTopCount());
- assertEquals(23844, v.getSecondCount());
+ assertEquals(24849, v.getTopCount());
+ assertEquals(24652, v.getSecondCount());
}
}
diff --git
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
index 196f6188d4..14e30cd673 100644
---
a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
+++
b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java
@@ -16,13 +16,14 @@
*/
package org.apache.jackrabbit.oak.plugins.document.cache;
-import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
+import org.apache.jackrabbit.guava.common.hash.BloomFilter;
+import org.apache.jackrabbit.guava.common.hash.Funnel;
+import org.apache.jackrabbit.guava.common.hash.PrimitiveSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Closeable;
import java.util.List;
-import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;
public class CacheChangesTracker implements Closeable {
@@ -61,10 +62,10 @@ public class CacheChangesTracker implements Closeable {
changeTrackers.remove(this);
if (LOG.isDebugEnabled()) {
- if (lazyBloomFilter.filterRef.get() == null) {
+ if (lazyBloomFilter.filter == null) {
LOG.debug("Disposing CacheChangesTracker for {}, no filter was
needed", keyFilter);
} else {
- LOG.debug("Disposing CacheChangesTracker for {}, filter fpp
was: {}", keyFilter, LazyBloomFilter.FPP);
+ LOG.debug("Disposing CacheChangesTracker for {}, filter fpp
was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp());
}
}
}
@@ -75,33 +76,38 @@ public class CacheChangesTracker implements Closeable {
private final int entries;
- private final AtomicReference<BloomFilter> filterRef;
+ private volatile BloomFilter<String> filter;
public LazyBloomFilter(int entries) {
this.entries = entries;
- this.filterRef = new AtomicReference<>();
}
public synchronized void put(String entry) {
- getFilter().add(entry);
+ getFilter().put(entry);
}
public boolean mightContain(String entry) {
- BloomFilter f = filterRef.get();
- return f != null && f.mayContain(entry);
+ if (filter == null) {
+ return false;
+ } else {
+ synchronized (this) {
+ return filter.mightContain(entry);
+ }
+ }
}
- private BloomFilter getFilter() {
- BloomFilter result = filterRef.get();
- if (result == null) {
- BloomFilter newFilter = BloomFilter.construct(entries, FPP);
- if (filterRef.compareAndSet(null, newFilter)) {
- result = newFilter;
- } else {
- result = filterRef.get();
- }
+ private BloomFilter<String> getFilter() {
+ if (filter == null) {
+ filter = BloomFilter.create(new Funnel<String>() {
+ private static final long serialVersionUID =
-7114267990225941161L;
+
+ @Override
+ public void funnel(String from, PrimitiveSink into) {
+ into.putUnencodedChars(from);
+ }
+ }, entries, FPP);
}
- return result;
+ return filter;
}
}
}
diff --git
a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
deleted file mode 100644
index 5d593df76c..0000000000
---
a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTrackerConcurrencyTest.java
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.jackrabbit.oak.plugins.document.cache;
-
-import org.junit.Test;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import static org.junit.Assert.*;
-
-/**
- * Tests for CacheChangesTracker concurrency scenarios, particularly
- * the LazyBloomFilter double-checked locking implementation.
- */
-public class CacheChangesTrackerConcurrencyTest {
-
- /**
- * Test concurrent initialization of LazyBloomFilter to ensure
- * double-checked locking prevents race conditions.
- */
- @Test
- public void testLazyBloomFilterConcurrentInitialization() throws
InterruptedException {
- final int threadCount = 20;
- final int entriesPerThread = 50;
- final CountDownLatch startLatch = new CountDownLatch(1);
- final CountDownLatch doneLatch = new CountDownLatch(threadCount);
- final ExecutorService executor =
Executors.newFixedThreadPool(threadCount);
-
- // Create a LazyBloomFilter instance
- final CacheChangesTracker.LazyBloomFilter lazyFilter =
- new CacheChangesTracker.LazyBloomFilter(1000);
-
- final AtomicInteger putOperations = new AtomicInteger(0);
- final List<Exception> exceptions = Collections.synchronizedList(new
ArrayList<>());
-
- try {
- // Create multiple threads that will all try to initialize and use
the filter simultaneously
- for (int i = 0; i < threadCount; i++) {
- final int threadId = i;
- executor.submit(() -> {
- try {
- // Wait for all threads to be ready
- startLatch.await();
-
- // Each thread adds multiple entries
- for (int j = 0; j < entriesPerThread; j++) {
- String key = "thread-" + threadId + "-key-" + j;
- lazyFilter.put(key);
- putOperations.incrementAndGet();
-
- // Add a small random delay to increase chance of
race condition
- if (j % 10 == 0) {
- Thread.sleep(1);
- }
- }
- } catch (Exception e) {
- exceptions.add(e);
- } finally {
- doneLatch.countDown();
- }
- });
- }
-
- // Start all threads simultaneously
- startLatch.countDown();
-
- // Wait for all threads to complete
- assertTrue("Test timed out", doneLatch.await(30,
TimeUnit.SECONDS));
-
- // Verify no exceptions occurred
- if (!exceptions.isEmpty()) {
- fail("Exceptions occurred during concurrent access: " +
exceptions.get(0));
- }
-
- // Verify all put operations completed
- assertEquals(threadCount * entriesPerThread, putOperations.get());
-
- // Verify the filter works correctly after concurrent
initialization
- for (int i = 0; i < threadCount; i++) {
- for (int j = 0; j < entriesPerThread; j++) {
- String key = "thread-" + i + "-key-" + j;
- assertTrue("Filter should contain key: " + key,
lazyFilter.mightContain(key));
- }
- }
-
- // Verify false positive behavior (some keys that weren't added
should return false)
- int falsePositives = 0;
- int testKeys = 100;
- for (int i = 0; i < testKeys; i++) {
- String nonExistentKey = "non-existent-key-" + i;
- if (lazyFilter.mightContain(nonExistentKey)) {
- falsePositives++;
- }
- }
-
- // With 1000 entries and 1% FPP, we expect roughly 1% false
positives for non-existent keys
- // Allow for some variance but it shouldn't be too high
- assertTrue("False positive rate too high: " + falsePositives + "/"
+ testKeys,
- falsePositives < testKeys * 0.05); // Allow up to 5% to
account for variance
-
- } finally {
- executor.shutdown();
- if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
- executor.shutdownNow();
- }
- }
- }
-
- /**
- * Test concurrent put and mightContain operations to ensure thread safety.
- */
- @Test
- public void testLazyBloomFilterConcurrentReadWrite() throws
InterruptedException {
- final int threadCount = 10;
- final int operationsPerThread = 100;
- final CountDownLatch startLatch = new CountDownLatch(1);
- final CountDownLatch doneLatch = new CountDownLatch(threadCount);
- final ExecutorService executor =
Executors.newFixedThreadPool(threadCount);
-
- final CacheChangesTracker.LazyBloomFilter lazyFilter =
- new CacheChangesTracker.LazyBloomFilter(2000);
-
- final AtomicInteger readOperations = new AtomicInteger(0);
- final AtomicInteger writeOperations = new AtomicInteger(0);
- final List<Exception> exceptions = Collections.synchronizedList(new
ArrayList<>());
-
- try {
- // Create mixed read/write threads
- for (int i = 0; i < threadCount; i++) {
- final int threadId = i;
- final boolean isWriter = (i % 2 == 0);
-
- executor.submit(() -> {
- try {
- startLatch.await();
-
- for (int j = 0; j < operationsPerThread; j++) {
- String key = "mixed-thread-" + threadId + "-key-"
+ j;
-
- if (isWriter || j < 10) { // Writers, or first few
operations of readers
- lazyFilter.put(key);
- writeOperations.incrementAndGet();
- }
-
- // All threads also do reads
- boolean result = lazyFilter.mightContain(key);
- readOperations.incrementAndGet();
-
- // If we just wrote the key, it should definitely
be found
- if (isWriter || j < 10) {
- assertTrue("Key should be found after being
added: " + key, result);
- }
- }
- } catch (Exception e) {
- exceptions.add(e);
- } finally {
- doneLatch.countDown();
- }
- });
- }
-
- startLatch.countDown();
- assertTrue("Test timed out", doneLatch.await(30,
TimeUnit.SECONDS));
-
- if (!exceptions.isEmpty()) {
- fail("Exceptions occurred during concurrent read/write: " +
exceptions.get(0));
- }
-
- assertTrue("Should have performed read operations",
readOperations.get() > 0);
- assertTrue("Should have performed write operations",
writeOperations.get() > 0);
-
- } finally {
- executor.shutdown();
- if (!executor.awaitTermination(5, TimeUnit.SECONDS)) {
- executor.shutdownNow();
- }
- }
- }
-
- /**
- * Test that LazyBloomFilter behaves correctly when filter is never
initialized
- * (i.e., only mightContain is called, never put).
- */
- @Test
- public void testLazyBloomFilterNoInitialization() {
- CacheChangesTracker.LazyBloomFilter lazyFilter =
- new CacheChangesTracker.LazyBloomFilter(1000);
-
- // Should return false for any key when filter is not initialized
- assertFalse(lazyFilter.mightContain("any-key"));
- assertFalse(lazyFilter.mightContain("another-key"));
- assertFalse(lazyFilter.mightContain(""));
- }
-}
\ No newline at end of file