(druid) branch master updated: Speed up ClusterByStatisticsCollector tests. (#18927)

karan Mon, 19 Jan 2026 20:42:56 -0800

This is an automated email from the ASF dual-hosted git repository.

karan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git



The following commit(s) were added to refs/heads/master by this push:
     new 2df0f61290e Speed up ClusterByStatisticsCollector tests. (#18927)
2df0f61290e is described below

commit 2df0f61290e8aaf8a63d32e65787a4f50bbdeffd
Author: Gian Merlino <[email protected]>
AuthorDate: Mon Jan 19 20:42:41 2026 -0800

    Speed up ClusterByStatisticsCollector tests. (#18927)
    
    In QuantilesSketchKeyCollectorTest, DistinctKeyCollectorTest, and
    ClusterByStatisticsCollectorImplTest, lower the number of rows per test
    to 100k, 100k, and 150k respectively. Also lower MAX_BYTES and MAX_BUCKETS
    in ClusterByStatisticsCollectorImplTest. This provides a similar level
    of coverage while running significantly faster: a few seconds per test
    rather than nearly a minute.
---
 .../ClusterByStatisticsCollectorImplTest.java      | 32 +++++++++++-----------
 .../msq/statistics/DistinctKeyCollectorTest.java   |  2 +-
 .../QuantilesSketchKeyCollectorTest.java           |  4 +--
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java b/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java
index 13fda326267..4c65ca366c5 100644
--- a/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java
+++ b/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/ClusterByStatisticsCollectorImplTest.java
@@ -70,7 +70,7 @@ import java.util.stream.LongStream;
 
 public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlingTest
 {
-  private static final double PARTITION_SIZE_LEEWAY = 0.3;
+  private static final double PARTITION_SIZE_LEEWAY = 0.35;
 
   private static final RowSignature SIGNATURE = RowSignature.builder()
                                                             .add("x", 
ColumnType.LONG)
@@ -96,14 +96,14 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
       1
   );
 
-  // These numbers are roughly 10x lower than authentic production numbers. (See StageDefinition.)
-  private static final int MAX_BYTES = 1_000_000;
-  private static final int MAX_BUCKETS = 1000;
+  // These numbers are roughly 50x lower than authentic production numbers. (See StageDefinition.)
+  private static final int MAX_BYTES = 150_000;
+  private static final int MAX_BUCKETS = 200;
 
   @Test
   public void test_clusterByX_unique()
   {
-    final long numRows = 1_000_000;
+    final long numRows = 150_000;
     final boolean aggregate = false;
     final ClusterBy clusterBy = CLUSTER_BY_X;
     final Iterable<RowKey> keys = () ->
@@ -122,7 +122,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
           Assert.assertEquals(StringUtils.format("%s: tracked bucket count", 
testName), 1, trackedBuckets(collector));
           Assert.assertEquals(StringUtils.format("%s: tracked row count", 
testName), numRows, trackedRows(collector));
 
-          for (int targetPartitionWeight : new int[]{51111, 65432, (int) numRows + 10}) {
+          for (int targetPartitionWeight : new int[]{5111, 6543, (int) numRows + 10}) {
             verifyPartitionsWithTargetWeight(
                 StringUtils.format("%s: 
generatePartitionsWithTargetWeight(%d)", testName, targetPartitionWeight),
                 collector,
@@ -150,7 +150,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
   @Test
   public void test_clusterByX_everyKeyAppearsTwice()
   {
-    final long numRows = 1_000_000;
+    final long numRows = 150_000;
     final boolean aggregate = false;
     final ClusterBy clusterBy = CLUSTER_BY_X;
     final List<RowKey> keys = new ArrayList<>();
@@ -171,7 +171,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
           Assert.assertEquals(StringUtils.format("%s: tracked bucket count", 
testName), 1, trackedBuckets(collector));
           Assert.assertEquals(StringUtils.format("%s: tracked row count", 
testName), numRows, trackedRows(collector));
 
-          for (int targetPartitionWeight : new int[]{51111, 65432, (int) numRows + 10}) {
+          for (int targetPartitionWeight : new int[]{5111, 6543, (int) numRows + 10}) {
             verifyPartitionsWithTargetWeight(
                 StringUtils.format("%s: 
generatePartitionsWithTargetWeight(%d)", testName, targetPartitionWeight),
                 collector,
@@ -199,7 +199,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
   @Test
   public void test_clusterByX_everyKeyAppearsTwice_withAggregation()
   {
-    final long numRows = 1_000_000;
+    final long numRows = 150_000;
     final boolean aggregate = true;
     final ClusterBy clusterBy = CLUSTER_BY_X;
     final List<RowKey> keys = new ArrayList<>();
@@ -229,7 +229,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
               expectedNumRows * .05 // Acceptable estimation error
           );
 
-          for (int targetPartitionWeight : new int[]{51111, 65432, (int) numRows + 10}) {
+          for (int targetPartitionWeight : new int[]{5111, 6543, (int) numRows + 10}) {
             verifyPartitionsWithTargetWeight(
                 StringUtils.format("%s: 
generatePartitionsWithTargetWeight(%d)", testName, targetPartitionWeight),
                 collector,
@@ -259,7 +259,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
   {
     final int numBuckets = 3;
     final boolean aggregate = false;
-    final long numRows = 1_000_000;
+    final long numRows = 150_000;
     final ClusterBy clusterBy = CLUSTER_BY_XY_BUCKET_BY_X;
     final List<RowKey> keys = new ArrayList<>((int) numRows);
 
@@ -281,7 +281,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
           Assert.assertEquals(StringUtils.format("%s: bucket count", 
testName), numBuckets, trackedBuckets(collector));
           Assert.assertEquals(StringUtils.format("%s: row count", testName), 
numRows, trackedRows(collector));
 
-          for (int targetPartitionWeight : new int[]{17001, 23007}) {
+          for (int targetPartitionWeight : new int[]{5111, 6543}) {
             verifyPartitionsWithTargetWeight(
                 StringUtils.format("%s: 
generatePartitionsWithTargetWeight(%d)", testName, targetPartitionWeight),
                 collector,
@@ -323,7 +323,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
   {
     final int numBuckets = MAX_BUCKETS;
     final boolean aggregate = false;
-    final long numRows = 1_000_000;
+    final long numRows = 150_000;
     final ClusterBy clusterBy = CLUSTER_BY_XY_BUCKET_BY_X;
     final List<RowKey> keys = new ArrayList<>((int) numRows);
 
@@ -344,7 +344,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
         (testName, collector) -> {
           Assert.assertEquals(StringUtils.format("%s: bucket count", 
testName), numBuckets, trackedBuckets(collector));
 
-          for (int targetPartitionWeight : new int[]{17001, 23007}) {
+          for (int targetPartitionWeight : new int[]{1701, 2301}) {
             verifyPartitionsWithTargetWeight(
                 StringUtils.format("%s: 
generatePartitionsWithTargetWeight(%d)", testName, targetPartitionWeight),
                 collector,
@@ -394,7 +394,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
   {
     final int numBuckets = MAX_BUCKETS;
     final boolean aggregate = true;
-    final long numRows = 1_000_000;
+    final long numRows = 150_000;
     final ClusterBy clusterBy = CLUSTER_BY_XY_BUCKET_BY_X;
     final List<RowKey> keys = new ArrayList<>((int) numRows);
 
@@ -418,7 +418,7 @@ public class ClusterByStatisticsCollectorImplTest extends 
InitializedNullHandlin
           // trackedRows will equal numBuckets, because the collectors have 
been downsampled so much
           Assert.assertEquals(StringUtils.format("%s: row count", testName), 
numBuckets, trackedRows(collector));
 
-          for (int targetPartitionWeight : new int[]{17001, 23007}) {
+          for (int targetPartitionWeight : new int[]{1701, 2301}) {
             verifyPartitionsWithTargetWeight(
                 StringUtils.format("%s: 
generatePartitionsWithTargetWeight(%d)", testName, targetPartitionWeight),
                 collector,
diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/DistinctKeyCollectorTest.java b/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/DistinctKeyCollectorTest.java
index 943b3108c8f..765aead2694 100644
--- a/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/DistinctKeyCollectorTest.java
+++ b/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/DistinctKeyCollectorTest.java
@@ -46,7 +46,7 @@ public class DistinctKeyCollectorTest
   private final ClusterBy clusterBy = new ClusterBy(ImmutableList.of(new 
KeyColumn("x", KeyOrder.ASCENDING)), 0);
   private final RowSignature signature = RowSignature.builder().add("x", 
ColumnType.LONG).build();
   private final Comparator<RowKey> comparator = 
clusterBy.keyComparator(signature);
-  private final int numKeys = 500_000;
+  private final int numKeys = 100_000;
 
   @Test
   public void test_empty()
diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/QuantilesSketchKeyCollectorTest.java b/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/QuantilesSketchKeyCollectorTest.java
index c4c32becdb2..c53563aa68a 100644
--- a/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/QuantilesSketchKeyCollectorTest.java
+++ b/multi-stage-query/src/test/java/org/apache/druid/msq/statistics/QuantilesSketchKeyCollectorTest.java
@@ -45,7 +45,7 @@ public class QuantilesSketchKeyCollectorTest
   private final ClusterBy clusterBy = new ClusterBy(ImmutableList.of(new 
KeyColumn("x", KeyOrder.ASCENDING)), 0);
   private final RowSignature signature = RowSignature.builder().add("x", 
ColumnType.LONG).build();
   private final Comparator<RowKey> comparator = 
clusterBy.keyComparator(signature);
-  private final int numKeys = 500_000;
+  private final int numKeys = 100_000;
 
   @Test
   public void test_empty()
@@ -120,7 +120,7 @@ public class QuantilesSketchKeyCollectorTest
           }
 
           Assert.assertEquals(testName, 2, collector.getSketch().getK());
-          Assert.assertEquals(testName, 14, collector.estimatedRetainedKeys());
+          Assert.assertEquals(testName, 12, collector.estimatedRetainedKeys());
 
           // Don't use verifyCollector, since this collector is downsampled so 
aggressively that it can't possibly
           // hope to pass those tests. Grade on a curve.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(druid) branch master updated: Speed up ClusterByStatisticsCollector tests. (#18927)

Reply via email to