This is an automated email from the ASF dual-hosted git repository.
gianm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git
The following commit(s) were added to refs/heads/master by this push:
new 4628e43081b perf: Optimize HLL sketch merge by avoiding unnecessary
HLL_8 to HLL_4 conversion (#19482)
4628e43081b is described below
commit 4628e43081bf0ecc5aec4f904e544e03bf4ab7d6
Author: Maytas Monsereenusorn <[email protected]>
AuthorDate: Mon May 18 22:26:06 2026 -0700
perf: Optimize HLL sketch merge by avoiding unnecessary HLL_8 to HLL_4
conversion (#19482)
---
.../druid/benchmark/DataSketchesHllBenchmark.java | 36 ++++++
.../datasketches/hll/HllSketchHolder.java | 11 +-
.../datasketches/hll/HllSketchHolderTest.java | 139 +++++++++++++++++++++
3 files changed, 185 insertions(+), 1 deletion(-)
diff --git
a/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
b/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
index 9b75df49c3f..a9e084b6547 100644
---
a/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
+++
b/benchmarks/src/test/java/org/apache/druid/benchmark/DataSketchesHllBenchmark.java
@@ -20,8 +20,10 @@
package org.apache.druid.benchmark;
import org.apache.datasketches.hll.HllSketch;
+import org.apache.datasketches.hll.Union;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.BufferAggregator;
+import org.apache.druid.query.aggregation.datasketches.hll.HllSketchHolder;
import
org.apache.druid.query.aggregation.datasketches.hll.HllSketchMergeAggregatorFactory;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.segment.ColumnSelectorFactory;
@@ -66,11 +68,27 @@ public class DataSketchesHllBenchmark
private final ByteBuffer buf =
ByteBuffer.allocateDirect(aggregatorFactory.getMaxIntermediateSize());
+ private static final int MERGE_LG_K = 12;
+ private static final int MERGE_NUM_VALUES = (1 << MERGE_LG_K) * 8;
+
private BufferAggregator aggregator;
+ private byte[] mergeSketchBytes1;
+ private byte[] mergeSketchBytes2;
+ private HllSketchHolder mergeHolder1;
+ private HllSketchHolder mergeHolder2;
+
@Setup(Level.Trial)
public void setUp()
{
+ HllSketch s1 = new HllSketch(MERGE_LG_K);
+ HllSketch s2 = new HllSketch(MERGE_LG_K);
+ for (int i = 0; i < MERGE_NUM_VALUES; i++) {
+ s1.update(i);
+ s2.update(MERGE_NUM_VALUES + i);
+ }
+ mergeSketchBytes1 = s1.toCompactByteArray();
+ mergeSketchBytes2 = s2.toCompactByteArray();
aggregator = aggregatorFactory.factorizeBuffered(
new ColumnSelectorFactory()
{
@@ -122,4 +140,22 @@ public class DataSketchesHllBenchmark
aggregator.init(buf, 0);
return aggregatorFactory.deserialize(((HllSketch) aggregator.get(buf,
0)).toCompactByteArray());
}
+
+ @Setup(Level.Invocation)
+ public void setUpMerge()
+ {
+ Union u1 = new Union(MERGE_LG_K);
+ u1.update(HllSketch.heapify(mergeSketchBytes1));
+ mergeHolder1 = HllSketchHolder.of(u1);
+
+ Union u2 = new Union(MERGE_LG_K);
+ u2.update(HllSketch.heapify(mergeSketchBytes2));
+ mergeHolder2 = HllSketchHolder.of(u2);
+ }
+
+ @Benchmark
+ public HllSketchHolder mergeUnionHolders()
+ {
+ return mergeHolder1.merge(mergeHolder2);
+ }
}
diff --git
a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
index df0b884eaae..4ba041f80a6 100644
---
a/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
+++
b/extensions-core/datasketches/src/main/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolder.java
@@ -158,7 +158,16 @@ public class HllSketchHolder
return other;
}
} else {
- add(other.getSketch());
+ if (other.union != null && other.sketch == null) {
+ // Both holders have unions. Materialize the other union's result as
HLL_8
+ // (the union's native internal type) to avoid the expensive HLL_8 to
HLL_4
+ // conversion that getResult() would trigger by default. The receiving
union
+ // accepts any HLL type, so HLL_8 is fine and copies as a cheap byte[]
clone.
+ union.update(other.union.getResult(TgtHllType.HLL_8));
+ sketch = null;
+ } else {
+ add(other.getSketch());
+ }
return this;
}
}
diff --git
a/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolderTest.java
b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolderTest.java
new file mode 100644
index 00000000000..7b358c5bd36
--- /dev/null
+++
b/extensions-core/datasketches/src/test/java/org/apache/druid/query/aggregation/datasketches/hll/HllSketchHolderTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.query.aggregation.datasketches.hll;
+
+import org.apache.datasketches.hll.HllSketch;
+import org.apache.datasketches.hll.TgtHllType;
+import org.apache.datasketches.hll.Union;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class HllSketchHolderTest
+{
+ private static final int LG_K = 12;
+ private static final int NUM_VALUES = 100;
+
+ @Test
+ public void testMergeSketchWithSketch()
+ {
+ HllSketchHolder holder1 = makeSketchHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeSketchHolder(NUM_VALUES, NUM_VALUES * 2);
+
+ HllSketchHolder result = holder1.merge(holder2);
+
+ Assert.assertSame(holder1, result);
+ Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+ }
+
+ @Test
+ public void testMergeSketchWithUnion()
+ {
+ HllSketchHolder holder1 = makeSketchHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+ HllSketchHolder result = holder1.merge(holder2);
+
+ Assert.assertSame(holder2, result);
+ Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+ }
+
+ @Test
+ public void testMergeUnionWithSketch()
+ {
+ HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeSketchHolder(NUM_VALUES, NUM_VALUES * 2);
+
+ HllSketchHolder result = holder1.merge(holder2);
+
+ Assert.assertSame(holder1, result);
+ Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+ }
+
+ @Test
+ public void testMergeUnionWithUnion()
+ {
+ HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+ HllSketchHolder result = holder1.merge(holder2);
+
+ Assert.assertSame(holder1, result);
+ Assert.assertEquals(NUM_VALUES * 2, result.getEstimate(), 0.01);
+ }
+
+ @Test
+ public void testMergeUnionWithUnionMatchesNaiveMerge()
+ {
+ HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+ HllSketchHolder naiveHolder1 = makeUnionHolder(0, NUM_VALUES);
+ HllSketchHolder naiveHolder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+ naiveHolder1.add(naiveHolder2.getSketch());
+
+ HllSketchHolder optimizedResult = holder1.merge(holder2);
+
+ Assert.assertEquals(naiveHolder1.getEstimate(),
optimizedResult.getEstimate(), 0.0);
+ }
+
+ @Test
+ public void testMergeUnionWithUnionPreservesTargetType()
+ {
+ HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES, NUM_VALUES * 2);
+
+ HllSketchHolder result = holder1.merge(holder2);
+
+ Assert.assertEquals(TgtHllType.HLL_4, result.getSketch().getTgtHllType());
+ }
+
+ @Test
+ public void testMergeWithOverlappingValues()
+ {
+ HllSketchHolder holder1 = makeUnionHolder(0, NUM_VALUES);
+ HllSketchHolder holder2 = makeUnionHolder(NUM_VALUES / 2, NUM_VALUES +
NUM_VALUES / 2);
+
+ HllSketchHolder result = holder1.merge(holder2);
+
+ double estimate = result.getEstimate();
+ double expected = NUM_VALUES + (double) NUM_VALUES / 2;
+ Assert.assertEquals(expected, estimate, expected * 0.01);
+ }
+
+ private static HllSketchHolder makeSketchHolder(int startValue, int endValue)
+ {
+ HllSketch sketch = new HllSketch(LG_K);
+ for (int i = startValue; i < endValue; i++) {
+ sketch.update(i);
+ }
+ return HllSketchHolder.of(sketch);
+ }
+
+ private static HllSketchHolder makeUnionHolder(int startValue, int endValue)
+ {
+ HllSketch sketch = new HllSketch(LG_K);
+ for (int i = startValue; i < endValue; i++) {
+ sketch.update(i);
+ }
+ Union union = new Union(LG_K);
+ union.update(sketch);
+ return HllSketchHolder.of(union);
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]