This is an automated email from the ASF dual-hosted git repository.

gianm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 4628e43081b perf: Optimize HLL sketch merge by avoiding unnecessary 
HLL_8 to HLL_4 conversion (#19482)
4628e43081b is described below

commit 4628e43081bf0ecc5aec4f904e544e03bf4ab7d6
Author: Maytas Monsereenusorn <[email protected]>
AuthorDate: Mon May 18 22:26:06 2026 -0700

    perf: Optimize HLL sketch merge by avoiding unnecessary HLL_8 to HLL_4 
conversion (#19482)
---
 .../druid/benchmark/DataSketchesHllBenchmark.java  |  36 ++++++
 .../datasketches/hll/HllSketchHolder.java          |  11 +-
 .../datasketches/hll/HllSketchHolderTest.java      | 139 +++++++++++++++++++++
 3 files changed, 185 insertions(+), 1 deletion(-)

diff --git 
a/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
 
b/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
index 9b75df49c3f..a9e084b6547 100644
--- 
a/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
+++ 
b/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
@@ -20,8 +20,10 @@
 package org.apache.druid.benchmark;
 
 import org.apache.datasketches.hll.HllSketch;
+import org.apache.datasketches.hll.Union;
 import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.apache.druid.query.aggregation.BufferAggregator;
+import org.apache.druid.query.aggregation.datasketches.hll.HllSketchHolder;
 import 
org.apache.druid.query.aggregation.datasketches.hll.HllSketchMergeAggregatorFactory;
 import org.apache.druid.query.dimension.DimensionSpec;
 import org.apache.druid.segment.ColumnSelectorFactory;
@@ -66,11 +68,27 @@ public class DataSketchesHllBenchmark
 
   private final ByteBuffer buf = 
ByteBuffer.allocateDirect(aggregatorFactory.getMaxIntermediateSize());
 
+  private static final int MERGE_LG_K = 12;
+  private static final int MERGE_NUM_VALUES = (1 << MERGE_LG_K) * 8;
+
   private BufferAggregator aggregator;
 
+  private byte[] mergeSketchBytes1;
+  private byte[] mergeSketchBytes2;
+  private HllSketchHolder mergeHolder1;
+  private HllSketchHolder mergeHolder2;
+
   @Setup(Level.Trial)
   public void setUp()
   {
+    HllSketch s1 = new HllSketch(MERGE_LG_K);
+    HllSketch s2 = new HllSketch(MERGE_LG_K);
+    for (int i = 0; i < MERGE_NUM_VALUES; i++) {
+      s1.update(i);
+      s2.update(MERGE_NUM_VALUES + i);
+    }
+    mergeSketchBytes1 = s1.toCompactByteArray();
+    mergeSketchBytes2 = s2.toCompactByteArray();
     aggregator = aggregatorFactory.factorizeBuffered(
         new ColumnSelectorFactory()
         {
@@ -122,4 +140,22 @@ public class DataSketchesHllBenchmark
     aggregator.init(buf, 0);
     return aggregatorFactory.deserialize(((HllSketch) aggregator.get(buf, 
0)).toCompactByteArray());
   }
+
+  @Setup(Level.Invocation)
+  public void setUpMerge()
+  {
+    Union u1 = new Union(MERGE_LG_K);
+    u1.update(HllSketch.heapify(mergeSketchBytes1));
+    mergeHolder1 = HllSketchHolder.of(u1);
+
+    Union u2 = new Union(MERGE_LG_K);
+    u2.update(HllSketch.heapify(mergeSketchBytes2));
+    mergeHolder2 = HllSketchHolder.of(u2);
+  }
+
+  @Benchmark
+  public HllSketchHolder mergeUnionHolders()
+  {
+    return mergeHolder1.merge(mergeHolder2);
+  }
 }
diff --git 
a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
 
b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
index df0b884eaae..4ba041f80a6 100644
--- 
a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
+++ 
b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
@@ -158,7 +158,16 @@ public class HllSketchHolder
         return other;
       }
     } else {
-      add(other.getSketch());
+      if (other.union != null && other.sketch == null) {
+        // Both holders have unions. Materialize the other union's result as 
HLL_8
+        // (the union's native internal type) to avoid the expensive HLL_8 to 
HLL_4
+        // conversion that getResult() would trigger by default. The receiving 
union
+        // accepts any HLL type, so HLL_8 is fine and copies as a cheap byte[] 
clone.
+        union.update(other.union.getResult(TgtHllType.HLL_8));
+        sketch = null;
+      } else {
+        add(other.getSketch());
+      }
       return this;
     }
   }
diff --git 
a/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolderTest.java
 
b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolderTest.java
new file mode 100644
index 00000000000..7b358c5bd36
--- /dev/null
+++ 
b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolderTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.aggregation.datasketches.hll;
+
+import org.apache.datasketches.hll.HllSketch;
+import org.apache.datasketches.hll.TgtHllType;
+import org.apache.datasketches.hll.Union;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class HllSketchHolderTest
+{
+  private static final int LG_K = 12;
+  private static final int NUM_VALUES = 100;
+
+  @Test
+  public void testMergeSketchWithSketch()
+  {
+    HllSketchHolder holder1 = makeSketchHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeSketchHolder(NUM_VALUES, NUM_VALUES * 2);
+
+    HllSketchHolder result = holder1.merge(holder2);
+
+    Assert.assertSame(holder1, result);
+    Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+  }
+
+  @Test
+  public void testMergeSketchWithUnion()
+  {
+    HllSketchHolder holder1 = makeSketchHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+    HllSketchHolder result = holder1.merge(holder2);
+
+    Assert.assertSame(holder2, result);
+    Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+  }
+
+  @Test
+  public void testMergeUnionWithSketch()
+  {
+    HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeSketchHolder(NUM_VALUES, NUM_VALUES * 2);
+
+    HllSketchHolder result = holder1.merge(holder2);
+
+    Assert.assertSame(holder1, result);
+    Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+  }
+
+  @Test
+  public void testMergeUnionWithUnion()
+  {
+    HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+    HllSketchHolder result = holder1.merge(holder2);
+
+    Assert.assertSame(holder1, result);
+    Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+  }
+
+  @Test
+  public void testMergeUnionWithUnionMatchesNaiveMerge()
+  {
+    HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+    HllSketchHolder naiveHolder1 = makeUnionHolder(0, NUM_VALUES);
+    HllSketchHolder naiveHolder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+    naiveHolder1.add(naiveHolder2.getSketch());
+
+    HllSketchHolder optimizedResult = holder1.merge(holder2);
+
+    Assert.assertEquals(naiveHolder1.getEstimate(), 
optimizedResult.getEstimate(), 0.0);
+  }
+
+  @Test
+  public void testMergeUnionWithUnionPreservesTargetType()
+  {
+    HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+    HllSketchHolder result = holder1.merge(holder2);
+
+    Assert.assertEquals(TgtHllType.HLL_4, result.getSketch().getTgtHllType());
+  }
+
+  @Test
+  public void testMergeWithOverlappingValues()
+  {
+    HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+    HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES / 2, NUM_VALUES + 
NUM_VALUES / 2);
+
+    HllSketchHolder result = holder1.merge(holder2);
+
+    double estimate = result.getEstimate();
+    double expected = NUM_VALUES + (double) NUM_VALUES / 2;
+    Assert.assertEquals(expected, estimate, expected * 0.01);
+  }
+
+  private static HllSketchHolder makeSketchHolder(int startValue, int endValue)
+  {
+    HllSketch sketch = new HllSketch(LG_K);
+    for (int i = startValue; i < endValue; i++) {
+      sketch.update(i);
+    }
+    return HllSketchHolder.of(sketch);
+  }
+
+  private static HllSketchHolder makeUnionHolder(int startValue, int endValue)
+  {
+    HllSketch sketch = new HllSketch(LG_K);
+    for (int i = startValue; i < endValue; i++) {
+      sketch.update(i);
+    }
+    Union union = new Union(LG_K);
+    union.update(sketch);
+    return HllSketchHolder.of(union);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to