This is an automated email from the ASF dual-hosted git repository.
reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 1a05416897 OAK-11856: oak-run-commons: remove usage of old Oak bloom
filter, use oak-commons instead (#2446)
1a05416897 is described below
commit 1a054168971314018d4c19319e8955f2ad81f3c1
Author: Julian Reschke <[email protected]>
AuthorDate: Wed Aug 13 16:55:12 2025 +0200
OAK-11856: oak-run-commons: remove usage of old Oak bloom filter, use
oak-commons instead (#2446)
* OAK-10856: oak-run-commons: remove usage of old Oak bloom filter, use
oak-commons instead
* OAK-10856: oak-run-commons: remove usage of old Oak bloom filter, use
oak-commons instead - undo change to search-elastic
---
oak-run-commons/pom.xml | 5 +
.../flatfile/analysis/modules/BinaryId.java | 8 +-
.../analysis/modules/DistinctBinarySize.java | 4 +-
.../modules/DistinctBinarySizeHistogram.java | 2 +-
.../flatfile/analysis/modules/PropertyStats.java | 4 +-
.../flatfile/analysis/utils/TopKValues.java | 3 +-
.../index/indexer/document/tree/Prefetcher.java | 6 +-
.../analysis/modules/PropertyStatsTest.java | 2 +-
.../analysis/utils/CountMinSketchTest.java | 5 +-
.../analysis/utils/HyperLogLog3Linear64Test.java | 165 +++++++++++++++++++++
.../flatfile/analysis/utils/TopKValuesTest.java | 10 +-
11 files changed, 193 insertions(+), 21 deletions(-)
diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml
index d8801c4536..201bdfb27b 100644
--- a/oak-run-commons/pom.xml
+++ b/oak-run-commons/pom.xml
@@ -110,6 +110,11 @@
<artifactId>oak-jcr</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.jackrabbit</groupId>
+ <artifactId>oak-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>org.jetbrains</groupId>
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
index 3ec93f175d..741c1cceae 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
@@ -20,7 +20,7 @@ package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul
import java.util.Objects;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
/**
* A binary id.
@@ -51,9 +51,9 @@ public class BinaryId {
// we need to hash again because some of the bits are fixed
// in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx
// (the hash64 is a reversible mapping, so there is no risk of
conflicts)
- this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16),
16));
- this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32),
16));
- this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32,
Math.min(48, buff.length())), 16));
+ this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0,
16), 16));
+ this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16,
32), 16));
+ this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32,
Math.min(48, buff.length())), 16));
}
@Override
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
index 272c7f357b..878375f1f8 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
@@ -28,8 +28,8 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
/**
* Collects the number and size of distinct binaries.
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
index 349a3fe35b..061aa39d52 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
@@ -27,7 +27,7 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
/**
* A histogram of distinct binaries. For each size range, we calculate the
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
index 80227f5744..c749a1ab59 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
@@ -31,7 +31,7 @@ import java.util.stream.Collectors;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64;
import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues;
@@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector {
stats.values += p.getValues().length;
if (stats.count > MIN_PROPERTY_COUNT) {
for (String v : p.getValues()) {
- long hash = Hash.hash64(v.hashCode(), seed);
+ long hash = HashUtils.hash64(v.hashCode(), seed);
stats.hll = HyperLogLog3Linear64.add(stats.hll, hash);
stats.size += v.length();
stats.maxSize = Math.max(stats.maxSize, v.length());
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
index 2c235f2197..48b4a63b0b 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
@@ -21,6 +21,7 @@ package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils
import java.util.ArrayList;
import java.util.HashMap;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
/**
@@ -91,7 +92,7 @@ public class TopKValues {
if (countedCount > 1000) {
skipRemaining = SKIP;
}
- long hash = Hash.hash64(value.hashCode());
+ long hash = HashUtils.hash64(value);
long est = sketch.addAndEstimate(hash);
if (est < min) {
return;
diff --git
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
index 10992f5c06..8f4e6d2cac 100644
---
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
+++
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
@@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
-import
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -240,7 +240,7 @@ public class Prefetcher {
// and then we use a secondary hash
// otherwise the estimation is way off
int h = blob.getContentIdentity().hashCode();
- return Hash.hash64(h | (blob.length() << 32));
+ return HashUtils.hash64(h | (blob.length() << 32));
}
static enum PrefetchType {
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
index 7803a54400..5fe819335d 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
@@ -67,7 +67,7 @@ public class PropertyStatsTest {
pc.add(n);
}
assertEquals("PropertyStats\n"
- + "skewed weight 3 count 1000000 distinct 394382 avgSize 7
maxSize 11 top
{\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+ + "skewed weight 3 count 1000000 distinct 394382 avgSize 7
maxSize 11 top
{\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+ "", pc.toString());
}
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
index 92935ca286..212c0fba22 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue;
import java.util.Arrays;
import java.util.Random;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
import org.junit.Test;
public class CountMinSketchTest {
@@ -83,11 +84,11 @@ public class CountMinSketchTest {
}
CountMinSketch est = new CountMinSketch(5, 16);
for (int i = 0; i < size; i++) {
- est.add(Hash.hash64(x + data[i]));
+ est.add(HashUtils.hash64(x + data[i]));
}
int[] counts = getCounts(data);
for (int i = 0; i < 10; i++) {
- long e = est.estimate(Hash.hash64(x + i));
+ long e = est.estimate(HashUtils.hash64(x + i));
long expectedPercent = (int) (100. * counts[i] / size);
long estPercent = (int) (100. * e / size);
if (debug) {
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
new file mode 100644
index 0000000000..eb7ab9a692
--- /dev/null
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
+import org.junit.Test;
+
+public class HyperLogLog3Linear64Test {
+
+ @Test
+ public void test() {
+ int testCount = 50;
+ for (int m = 8; m <= 128; m *= 2) {
+ double avg = Math.sqrt(averageOverRange(m, 30_000, testCount,
false, 2));
+ int min, max;
+ switch (m) {
+ case 8:
+ min = 16;
+ max = 17;
+ break;
+ case 16:
+ min = 22;
+ max = 23;
+ break;
+ case 32:
+ min = 15;
+ max = 16;
+ break;
+ case 64:
+ min = 10;
+ max = 11;
+ break;
+ case 128:
+ min = 7;
+ max = 8;
+ break;
+ default:
+ min = 0;
+ max = 0;
+ break;
+ }
+ assertTrue("m " + m + " expected " + min + ".." + max + " got " +
avg, min < avg && avg < max);
+ }
+ }
+
+ private static double averageOverRange(int m, long maxSize, int testCount,
boolean debug, double exponent) {
+ double sum = 0;
+ int count = 0;
+ for (long size = 1; size <= 20; size++) {
+ sum += test(m, size, testCount, debug, exponent);
+ count++;
+ }
+ for (long size = 22; size <= 300; size += size / 5) {
+ sum += test(m, size, testCount, debug, exponent);
+ count++;
+ }
+ for (long size = 400; size <= maxSize; size *= 2) {
+ sum += test(m, size, testCount, debug, exponent);
+ count++;
+ }
+ return sum / count;
+ }
+
+ private static double test(int m, long size, int testCount, boolean debug,
double exponent) {
+ long x = 0;
+ long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+ long ns = System.nanoTime();
+ double sumSquareError = 0;
+ double sum = 0;
+ double sumFirst = 0;
+ int repeat = 10;
+ int runs = 2;
+ for (int test = 0; test < testCount; test++) {
+ HyperLogLog hll;
+ if (m == 8) {
+ hll = new HyperLogLogUsingLong(16, 0);
+ } else {
+ hll = new HyperLogLog(m, 0);
+ }
+ long baseX = x;
+ for (int i = 0; i < size; i++) {
+ hll.add(HashUtils.hash64(x));
+ x++;
+ }
+ long e = hll.estimate();
+ sum += e;
+ min = Math.min(min, e);
+ max = Math.max(max, e);
+ long error = e - size;
+ sumSquareError += error * error;
+ sumFirst += e;
+ for (int add = 0; add < repeat; add++) {
+ long x2 = baseX;
+ for (int i = 0; i < size; i++) {
+ hll.add(HashUtils.hash64(x2));
+ x2++;
+ }
+ }
+ e = hll.estimate();
+ sum += e;
+ min = Math.min(min, e);
+ max = Math.max(max, e);
+ error = e - size;
+ sumSquareError += error * error;
+ }
+ ns = System.nanoTime() - ns;
+ long nsPerItem = ns / testCount / runs / (1 + repeat) / size;
+ double stdDev = Math.sqrt(sumSquareError / testCount / runs);
+ double relStdDevP = stdDev / size * 100;
+ int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100);
+ int biasP = (int) (100 * (sum / testCount / runs / size) - 100);
+ if (debug) {
+ System.out.println("m " + m + " size " + size + " relStdDev% " +
(int) relStdDevP +
+ " range " + min + ".." + max +
+ " biasFirst% " + biasFirstP +
+ " bias% " + biasP +
+ " avg " + (sum / testCount / runs) +
+ " time " + nsPerItem);
+ }
+ // we try to reduce the relStdDevP, make sure there are no large values
+ // (trying to reduce sumSquareError directly
+ // would mean we care more about larger sets, but we don't)
+ return Math.pow(relStdDevP, exponent);
+ }
+
+ static class HyperLogLogUsingLong extends HyperLogLog {
+
+ private long value;
+
+ public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
+ super(m, maxSmallSetSize);
+ }
+
+ @Override
+ public void add(long hash) {
+ value = HyperLogLog3Linear64.add(value, hash);
+ }
+
+ @Override
+ public long estimate() {
+ return HyperLogLog3Linear64.estimate(value);
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
index 696e2e724f..dfd36a6eba 100644
---
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
+++
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
@@ -30,17 +30,17 @@ public class TopKValuesTest {
public void test() {
TopKValues v = new TopKValues(3);
Random r = new Random(1);
- for(int i=0; i<1000000; i++) {
- if(r.nextBoolean()) {
+ for (int i = 0; i < 1000000; i++) {
+ if (r.nextBoolean()) {
v.add("common" + r.nextInt(2));
} else {
v.add("rare" + r.nextInt(100));
}
}
-
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}",
v.toString());
+
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}",
v.toString());
assertEquals(91809, v.getCount());
- assertEquals(24849, v.getTopCount());
- assertEquals(24652, v.getSecondCount());
+ assertEquals(24231, v.getTopCount());
+ assertEquals(23844, v.getSecondCount());
}
}