This is an automated email from the ASF dual-hosted git repository.

reschke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 1a05416897 OAK-11856: oak-run-commons: remove usage of old Oak bloom 
filter, use oak-commons instead (#2446)
1a05416897 is described below

commit 1a054168971314018d4c19319e8955f2ad81f3c1
Author: Julian Reschke <[email protected]>
AuthorDate: Wed Aug 13 16:55:12 2025 +0200

    OAK-11856: oak-run-commons: remove usage of old Oak bloom filter, use 
oak-commons instead (#2446)
    
    * OAK-10856: oak-run-commons: remove usage of old Oak bloom filter, use 
oak-commons instead
    
    * OAK-10856: oak-run-commons: remove usage of old Oak bloom filter, use 
oak-commons instead - undo change to search-elastic
---
 oak-run-commons/pom.xml                            |   5 +
 .../flatfile/analysis/modules/BinaryId.java        |   8 +-
 .../analysis/modules/DistinctBinarySize.java       |   4 +-
 .../modules/DistinctBinarySizeHistogram.java       |   2 +-
 .../flatfile/analysis/modules/PropertyStats.java   |   4 +-
 .../flatfile/analysis/utils/TopKValues.java        |   3 +-
 .../index/indexer/document/tree/Prefetcher.java    |   6 +-
 .../analysis/modules/PropertyStatsTest.java        |   2 +-
 .../analysis/utils/CountMinSketchTest.java         |   5 +-
 .../analysis/utils/HyperLogLog3Linear64Test.java   | 165 +++++++++++++++++++++
 .../flatfile/analysis/utils/TopKValuesTest.java    |  10 +-
 11 files changed, 193 insertions(+), 21 deletions(-)

diff --git a/oak-run-commons/pom.xml b/oak-run-commons/pom.xml
index d8801c4536..201bdfb27b 100644
--- a/oak-run-commons/pom.xml
+++ b/oak-run-commons/pom.xml
@@ -110,6 +110,11 @@
             <artifactId>oak-jcr</artifactId>
             <version>${project.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.jackrabbit</groupId>
+            <artifactId>oak-commons</artifactId>
+            <version>${project.version}</version>
+        </dependency>
 
         <dependency>
             <groupId>org.jetbrains</groupId>
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
index 3ec93f175d..741c1cceae 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinaryId.java
@@ -20,7 +20,7 @@ package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modul
 
 import java.util.Objects;
 
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 
 /**
  * A binary id.
@@ -51,9 +51,9 @@ public class BinaryId {
         // we need to hash again because some of the bits are fixed
         // in case of UUIDs: always a "4" here: xxxxxxxx-xxxx-4xxx
         // (the hash64 is a reversible mapping, so there is no risk of 
conflicts)
-        this.v0 = Hash.hash64(Long.parseUnsignedLong(buff.substring(0, 16), 
16));
-        this.v1 = Hash.hash64(Long.parseUnsignedLong(buff.substring(16, 32), 
16));
-        this.v2 = Hash.hash64(Long.parseUnsignedLong(buff.substring(32, 
Math.min(48, buff.length())), 16));
+        this.v0 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(0, 
16), 16));
+        this.v1 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(16, 
32), 16));
+        this.v2 = HashUtils.hash64(Long.parseUnsignedLong(buff.substring(32, 
Math.min(48, buff.length())), 16));
     }
 
     @Override
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
index 272c7f357b..878375f1f8 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySize.java
@@ -28,8 +28,8 @@ import java.util.stream.Collectors;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.BloomFilter;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.BloomFilter;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 
 /**
  * Collects the number and size of distinct binaries.
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
index 349a3fe35b..061aa39d52 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/DistinctBinarySizeHistogram.java
@@ -27,7 +27,7 @@ import java.util.stream.Collectors;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 
 /**
  * A histogram of distinct binaries. For each size range, we calculate the
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
index 80227f5744..c749a1ab59 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStats.java
@@ -31,7 +31,7 @@ import java.util.stream.Collectors;
 
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog3Linear64;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.TopKValues;
 
@@ -123,7 +123,7 @@ public class PropertyStats implements StatsCollector {
         stats.values += p.getValues().length;
         if (stats.count > MIN_PROPERTY_COUNT) {
             for (String v : p.getValues()) {
-                long hash = Hash.hash64(v.hashCode(), seed);
+                long hash = HashUtils.hash64(v.hashCode(), seed);
                 stats.hll = HyperLogLog3Linear64.add(stats.hll, hash);
                 stats.size += v.length();
                 stats.maxSize = Math.max(stats.maxSize, v.length());
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
index 2c235f2197..48b4a63b0b 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValues.java
@@ -21,6 +21,7 @@ package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils
 import java.util.ArrayList;
 import java.util.HashMap;
 
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 import org.apache.jackrabbit.oak.commons.json.JsopBuilder;
 
 /**
@@ -91,7 +92,7 @@ public class TopKValues {
         if (countedCount > 1000) {
             skipRemaining = SKIP;
         }
-        long hash = Hash.hash64(value.hashCode());
+        long hash = HashUtils.hash64(value);
         long est = sketch.addAndEstimate(hash);
         if (est < min) {
             return;
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
index 10992f5c06..8f4e6d2cac 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/tree/Prefetcher.java
@@ -34,8 +34,8 @@ import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.index.indexer.document.NodeStateEntry;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.Hash;
-import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils.HyperLogLog;
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -240,7 +240,7 @@ public class Prefetcher {
         // and then we use a secondary hash
         // otherwise the estimation is way off
         int h = blob.getContentIdentity().hashCode();
-        return Hash.hash64(h | (blob.length() << 32));
+        return HashUtils.hash64(h | (blob.length() << 32));
     }
 
     static enum PrefetchType {
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
index 7803a54400..5fe819335d 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/PropertyStatsTest.java
@@ -67,7 +67,7 @@ public class PropertyStatsTest {
             pc.add(n);
         }
         assertEquals("PropertyStats\n"
-                + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 
maxSize 11 top 
{\"skipped\":899091,\"counted\":90910,\"false\":25583,\"true\":25518,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
+                + "skewed weight 3 count 1000000 distinct 394382 avgSize 7 
maxSize 11 top 
{\"skipped\":899091,\"counted\":90910,\"false\":25618,\"true\":25543,\"-411461567\":1,\"1483286044\":1,\"1310925467\":1,\"-1752252714\":1,\"-1433290908\":1,\"-1209544007\":1}\n"
                 + "", pc.toString());
     }
 
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
index 92935ca286..212c0fba22 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/CountMinSketchTest.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue;
 import java.util.Arrays;
 import java.util.Random;
 
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
 import org.junit.Test;
 
 public class CountMinSketchTest {
@@ -83,11 +84,11 @@ public class CountMinSketchTest {
                     }
                     CountMinSketch est = new CountMinSketch(5, 16);
                     for (int i = 0; i < size; i++) {
-                        est.add(Hash.hash64(x + data[i]));
+                        est.add(HashUtils.hash64(x + data[i]));
                     }
                     int[] counts = getCounts(data);
                     for (int i = 0; i < 10; i++) {
-                        long e = est.estimate(Hash.hash64(x + i));
+                        long e = est.estimate(HashUtils.hash64(x + i));
                         long expectedPercent = (int) (100. * counts[i] / size);
                         long estPercent = (int) (100. * e / size);
                         if (debug) {
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
new file mode 100644
index 0000000000..eb7ab9a692
--- /dev/null
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/HyperLogLog3Linear64Test.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.utils;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.jackrabbit.oak.commons.collections.HashUtils;
+import org.apache.jackrabbit.oak.commons.collections.HyperLogLog;
+import org.junit.Test;
+
+public class HyperLogLog3Linear64Test {
+
+    @Test
+    public void test() {
+        int testCount = 50;
+        for (int m = 8; m <= 128; m *= 2) {
+            double avg = Math.sqrt(averageOverRange(m, 30_000, testCount, 
false, 2));
+            int min, max;
+            switch (m) {
+            case 8:
+                min = 16;
+                max = 17;
+                break;
+            case 16:
+                min = 22;
+                max = 23;
+                break;
+            case 32:
+                min = 15;
+                max = 16;
+                break;
+            case 64:
+                min = 10;
+                max = 11;
+                break;
+            case 128:
+                min = 7;
+                max = 8;
+                break;
+            default:
+                min = 0;
+                max = 0;
+                break;
+            }
+            assertTrue("m " + m + " expected " + min + ".." + max + " got " + 
avg, min < avg && avg < max);
+        }
+    }
+
+    private static double averageOverRange(int m, long maxSize, int testCount, 
boolean debug, double exponent) {
+        double sum = 0;
+        int count = 0;
+        for (long size = 1; size <= 20; size++) {
+            sum += test(m, size, testCount, debug, exponent);
+            count++;
+        }
+        for (long size = 22; size <= 300; size += size / 5) {
+            sum += test(m, size, testCount, debug, exponent);
+            count++;
+        }
+        for (long size = 400; size <= maxSize; size *= 2) {
+            sum += test(m, size, testCount, debug, exponent);
+            count++;
+        }
+        return sum / count;
+    }
+
+    private static double test(int m, long size, int testCount, boolean debug, 
double exponent) {
+        long x = 0;
+        long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+        long ns = System.nanoTime();
+        double sumSquareError = 0;
+        double sum = 0;
+        double sumFirst = 0;
+        int repeat = 10;
+        int runs = 2;
+        for (int test = 0; test < testCount; test++) {
+            HyperLogLog hll;
+            if (m == 8) {
+                hll = new HyperLogLogUsingLong(16, 0);
+            } else {
+                hll = new HyperLogLog(m, 0);
+            }
+            long baseX = x;
+            for (int i = 0; i < size; i++) {
+                hll.add(HashUtils.hash64(x));
+                x++;
+            }
+            long e = hll.estimate();
+            sum += e;
+            min = Math.min(min, e);
+            max = Math.max(max, e);
+            long error = e - size;
+            sumSquareError += error * error;
+            sumFirst += e;
+            for (int add = 0; add < repeat; add++) {
+                long x2 = baseX;
+                for (int i = 0; i < size; i++) {
+                    hll.add(HashUtils.hash64(x2));
+                    x2++;
+                }
+            }
+            e = hll.estimate();
+            sum += e;
+            min = Math.min(min, e);
+            max = Math.max(max, e);
+            error = e - size;
+            sumSquareError += error * error;
+        }
+        ns = System.nanoTime() - ns;
+        long nsPerItem = ns / testCount / runs / (1 + repeat) / size;
+        double stdDev = Math.sqrt(sumSquareError / testCount / runs);
+        double relStdDevP = stdDev / size * 100;
+        int biasFirstP = (int) (100 * (sumFirst / testCount / size) - 100);
+        int biasP = (int) (100 * (sum / testCount / runs / size) - 100);
+        if (debug) {
+            System.out.println("m " + m + " size " + size + " relStdDev% " + 
(int) relStdDevP +
+                    " range " + min + ".." + max +
+                    " biasFirst% " + biasFirstP +
+                    " bias% " + biasP +
+                    " avg " + (sum / testCount / runs) +
+                    " time " + nsPerItem);
+        }
+        // we try to reduce the relStdDevP, make sure there are no large values
+        // (trying to reduce sumSquareError directly
+        // would mean we care more about larger sets, but we don't)
+        return Math.pow(relStdDevP, exponent);
+    }
+
+    static class HyperLogLogUsingLong extends HyperLogLog {
+
+        private long value;
+
+        public HyperLogLogUsingLong(int m, int maxSmallSetSize) {
+            super(m, maxSmallSetSize);
+        }
+
+        @Override
+        public void add(long hash) {
+            value = HyperLogLog3Linear64.add(value, hash);
+        }
+
+        @Override
+        public long estimate() {
+            return HyperLogLog3Linear64.estimate(value);
+        }
+
+    }
+
+} 
\ No newline at end of file
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
index 696e2e724f..dfd36a6eba 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/utils/TopKValuesTest.java
@@ -30,17 +30,17 @@ public class TopKValuesTest {
     public void test() {
         TopKValues v = new TopKValues(3);
         Random r = new Random(1);
-        for(int i=0; i<1000000; i++) {
-            if(r.nextBoolean()) {
+        for (int i = 0; i < 1000000; i++) {
+            if (r.nextBoolean()) {
                 v.add("common" + r.nextInt(2));
             } else {
                 v.add("rare" + r.nextInt(100));
             }
         }
-        
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common1\":24849,\"common0\":24652,\"rare13\":2374}",
 v.toString());
+        
assertEquals("{\"notSkewed\":5,\"skipped\":908191,\"counted\":91809,\"common0\":24231,\"common1\":23844,\"rare13\":2722}",
 v.toString());
         assertEquals(91809, v.getCount());
-        assertEquals(24849, v.getTopCount());
-        assertEquals(24652, v.getSecondCount());
+        assertEquals(24231, v.getTopCount());
+        assertEquals(23844, v.getSecondCount());
     }
 
 }

Reply via email to