This is an automated email from the ASF dual-hosted git repository. reschke pushed a commit to branch OAK-10856 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 3e379807f4ae7ed49bc71888f56a26ba48f239b2 Author: Julian Reschke <[email protected]> AuthorDate: Wed Aug 13 08:05:23 2025 +0100 OAK-10856: oak-run-commons: remove usage of old Oak bloom filter, use oak-commons instead --- oak-run-commons/pom.xml | 5 + .../flatfile/analysis/modules/BinaryId.java | 8 +- .../analysis/modules/DistinctBinarySize.java | 4 +- .../modules/DistinctBinarySizeHistogram.java | 2 +- .../flatfile/analysis/modules/PropertyStats.java | 4 +- .../flatfile/analysis/utils/TopKValues.java | 3 +- .../index/indexer/document/tree/Prefetcher.java | 6 +- .../analysis/modules/PropertyStatsTest.java | 2 +- .../analysis/utils/CountMinSketchTest.java | 5 +- .../analysis/utils/HyperLogLog3Linear64Test.java | 165 +++++++++++++++++++++ .../flatfile/analysis/utils/TopKValuesTest.java | 10 +- .../elastic/ElasticRegexPropertyIndexTest.java | 5 +- 12 files changed, 195 insertions(+), 24 deletions(-) diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml index d8801c4536..201bdfb27b 100644 --- a/oak-run-commons/pom.xml +++ b/oak-run-commons/pom.xml @@ -110,6 +110,11 @@ <artifactId>oak-jcr</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.jackrabbit</groupId> + <artifactId>oak-commons</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>org.jetbrains</groupId> diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java index 3ec93f175d..741c1cceae 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java @@ -20,7 +20,7 @@ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul import java.util.Objects; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; /** * A binary id. @@ -51,9 +51,9 @@ public class BinaryId { // we need to hash again because some of the bits are fixed // in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx // (the hash64 is a reversible mapping, so there is no risk of conflicts) - this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 16)); - this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 16)); - this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32, Math.min(48, buff.length())), 16)); + this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 16)); + this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 16)); + this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32, Math.min(48, buff.length())), 16)); } @Override diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java index 272c7f357b..878375f1f8 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java @@ -28,8 +28,8 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.BloomFilter; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; /** * Collects the number and size of distinct binaries. diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java index 349a3fe35b..061aa39d52 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java @@ -27,7 +27,7 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; /** * A histogram of distinct binaries. For each size range, we calculate the diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java index 80227f5744..c749a1ab59 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java @@ -31,7 +31,7 @@ import java.util.stream.Collectors; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues; @@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector { stats.values += p.getValues().length; if (stats.count > MIN_PROPERTY_COUNT) { for (String v : p.getValues()) { - long hash = Hash.hash64(v.hashCode(), seed); + long hash = HashUtils.hash64(v.hashCode(), seed); stats.hll = HyperLogLog3Linear64.add(stats.hll, hash); stats.size += v.length(); stats.maxSize = Math.max(stats.maxSize, v.length()); diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java index 2c235f2197..48b4a63b0b 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java @@ -21,6 +21,7 @@ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils import java.util.ArrayList; import java.util.HashMap; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.apache.jackrabbit.oak.commons.json.JsopBuilder; /** @@ -91,7 +92,7 @@ public class TopKValues { if (countedCount > 1000) { skipRemaining = SKIP; } - long hash = Hash.hash64(value.hashCode()); + long hash = HashUtils.hash64(value); long est = sketch.addAndEstimate(hash); if (est < min) { return; diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java index 10992f5c06..8f4e6d2cac 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java @@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash; -import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -240,7 +240,7 @@ public class Prefetcher { // and then we use a secondary hash // otherwise the estimation is way off int h = blob.getContentIdentity().hashCode(); - return Hash.hash64(h | (blob.length() << 32)); + return HashUtils.hash64(h | (blob.length() << 32)); } static enum PrefetchType { diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java index 7803a54400..5fe819335d 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java @@ -67,7 +67,7 @@ public class PropertyStatsTest { pc.add(n); } assertEquals("PropertyStats\n" - + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 maxSize 11 top {\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n" + + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 maxSize 11 top {\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n" + "", pc.toString()); } diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java index 92935ca286..212c0fba22 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue; import java.util.Arrays; import java.util.Random; +import org.apache.jackrabbit.oak.commons.collections.HashUtils; import org.junit.Test; public class CountMinSketchTest { @@ -83,11 +84,11 @@ public class CountMinSketchTest { } CountMinSketch est = new CountMinSketch(5, 16); for (int i = 0; i < size; i++) { - est.add(Hash.hash64(x + data[i])); + est.add(HashUtils.hash64(x + data[i])); } int[] counts = getCounts(data); for (int i = 0; i < 10; i++) { - long e = est.estimate(Hash.hash64(x + i)); + long e = est.estimate(HashUtils.hash64(x + i)); long expectedPercent = (int) (100. * counts[i] / size); long estPercent = (int) (100. * e / size); if (debug) { diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java new file mode 100644 index 0000000000..eb7ab9a692 --- /dev/null +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils; + +import static org.junit.Assert.assertTrue; + +import org.apache.jackrabbit.oak.commons.collections.HashUtils; +import org.apache.jackrabbit.oak.commons.collections.HyperLogLog; +import org.junit.Test; + +public class HyperLogLog3Linear64Test { + + @Test + public void test() { + int testCount = 50; + for (int m = 8; m <= 128; m *= 2) { + double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, false, 2)); + int min, max; + switch (m) { + case 8: + min = 16; + max = 17; + break; + case 16: + min = 22; + max = 23; + break; + case 32: + min = 15; + max = 16; + break; + case 64: + min = 10; + max = 11; + break; + case 128: + min = 7; + max = 8; + break; + default: + min = 0; + max = 0; + break; + } + assertTrue("m " + m + " expected " + min + ".." + max + " got " + avg, min < avg && avg < max); + } + } + + private static double averageOverRange(int m, long maxSize, int testCount, boolean debug, double exponent) { + double sum = 0; + int count = 0; + for (long size = 1; size <= 20; size++) { + sum += test(m, size, testCount, debug, exponent); + count++; + } + for (long size = 22; size <= 300; size += size / 5) { + sum += test(m, size, testCount, debug, exponent); + count++; + } + for (long size = 400; size <= maxSize; size *= 2) { + sum += test(m, size, testCount, debug, exponent); + count++; + } + return sum / count; + } + + private static double test(int m, long size, int testCount, boolean debug, double exponent) { + long x = 0; + long min = Long.MAX_VALUE, max = Long.MIN_VALUE; + long ns = System.nanoTime(); + double sumSquareError = 0; + double sum = 0; + double sumFirst = 0; + int repeat = 10; + int runs = 2; + for (int test = 0; test < testCount; test++) { + HyperLogLog hll; + if (m == 8) { + hll = new HyperLogLogUsingLong(16, 0); + } else { + hll = new HyperLogLog(m, 0); + } + long baseX = x; + for (int i = 0; i < size; i++) { + hll.add(HashUtils.hash64(x)); + x++; + } + long e = hll.estimate(); + sum += e; + min = Math.min(min, e); + max = Math.max(max, e); + long error = e - size; + sumSquareError += error * error; + sumFirst += e; + for (int add = 0; add < repeat; add++) { + long x2 = baseX; + for (int i = 0; i < size; i++) { + hll.add(HashUtils.hash64(x2)); + x2++; + } + } + e = hll.estimate(); + sum += e; + min = Math.min(min, e); + max = Math.max(max, e); + error = e - size; + sumSquareError += error * error; + } + ns = System.nanoTime() - ns; + long nsPerItem = ns / testCount / runs / (1 + repeat) / size; + double stdDev = Math.sqrt(sumSquareError / testCount / runs); + double relStdDevP = stdDev / size * 100; + int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100); + int biasP = (int) (100 * (sum / testCount / runs / size) - 100); + if (debug) { + System.out.println("m " + m + " size " + size + " relStdDev% " + (int) relStdDevP + + " range " + min + ".." + max + + " biasFirst% " + biasFirstP + + " bias% " + biasP + + " avg " + (sum / testCount / runs) + + " time " + nsPerItem); + } + // we try to reduce the relStdDevP, make sure there are no large values + // (trying to reduce sumSquareError directly + // would mean we care more about larger sets, but we don't) + return Math.pow(relStdDevP, exponent); + } + + static class HyperLogLogUsingLong extends HyperLogLog { + + private long value; + + public HyperLogLogUsingLong(int m, int maxSmallSetSize) { + super(m, maxSmallSetSize); + } + + @Override + public void add(long hash) { + value = HyperLogLog3Linear64.add(value, hash); + } + + @Override + public long estimate() { + return HyperLogLog3Linear64.estimate(value); + } + + } + +} \ No newline at end of file diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java index 696e2e724f..dfd36a6eba 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java @@ -30,17 +30,17 @@ public class TopKValuesTest { public void test() { TopKValues v = new TopKValues(3); Random r = new Random(1); - for(int i=0; i<1000000; i++) { - if(r.nextBoolean()) { + for (int i = 0; i < 1000000; i++) { + if (r.nextBoolean()) { v.add("common" + r.nextInt(2)); } else { v.add("rare" + r.nextInt(100)); } } - assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}", v.toString()); + assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}", v.toString()); assertEquals(91809, v.getCount()); - assertEquals(24849, v.getTopCount()); - assertEquals(24652, v.getSecondCount()); + assertEquals(24231, v.getTopCount()); + assertEquals(23844, v.getSecondCount()); } } diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java index 9daf0c0f9e..52d504d1e2 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticRegexPropertyIndexTest.java @@ -138,11 +138,10 @@ public class ElasticRegexPropertyIndexTest extends ElasticAbstractQueryTest { fail(); } catch (CommitFailedException e) { Throwable cause = e.getCause(); + String msg = cause.getMessage(); assertTrue("Unexpected exception type. Expected IOException. Was " + cause, cause instanceof IOException); - // String msg = cause.getMessage(); - // assertTrue(msg, msg.contains("Error indexing documents for index:")); + assertTrue(msg, msg.contains("Error indexing documents for index:")); // Typically, the root cause is "Limit of total fields [1000] has been exceeded" - // and some times it is "Service error while indexing." // but something this is suppressed, and so we can not have an assertion on it } }
