(hive) branch master updated: HIVE-29155: Improve performance of TestVectorAggregation (#6036)

dkuzmenko Thu, 28 Aug 2025 03:11:56 -0700

This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new b94375867fe HIVE-29155: Improve performance of TestVectorAggregation (#6036)
b94375867fe is described below

commit b94375867fe80152783ad66aa6a27dd24223b687
Author: Thomas Rebele <[email protected]>
AuthorDate: Thu Aug 28 12:11:34 2025 +0200

    HIVE-29155: Improve performance of TestVectorAggregation (#6036)
---
 .../vector/aggregation/TestVectorAggregation.java  | 48 +++++++++++++---------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/aggregation/TestVectorAggregation.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/aggregation/TestVectorAggregation.java
index e4e8c812316..5b5629bb016 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/aggregation/TestVectorAggregation.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/aggregation/TestVectorAggregation.java
@@ -18,28 +18,20 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.aggregation;
 
-import java.lang.reflect.Constructor;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Random;
 import java.util.Set;
-import java.sql.Timestamp;
 
 import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation;
 import org.apache.hadoop.hive.ql.exec.vector.VectorRandomBatchSource;
 import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource;
 import org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource.GenerationSpec;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount.GenericUDAFCountEvaluator;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFVariance;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector;
 import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
@@ -314,19 +306,35 @@ private boolean checkDecimal64(boolean tryDecimal64, TypeInfo typeInfo) {
     return result;
   }
 
+  /**
+   * Generate a random number according to a distribution with the following properties:
+   *
+   * <ul>
+   *   <li>the probability decreases linearly from 1 (most probable) to <code>maxSize</code> (least probable)</li>
+   *   <li>1 is <emph>maxSize</emph>-times more likely than <code>maxSize</code></li>
+   * </ul>
+   *
+   * @return a number from 1 to <code>maxSize</code> (both inclusive)
+   */
   public static int getLinearRandomNumber(Random random, int maxSize) {
-    //Get a linearly multiplied random number
-    int randomMultiplier = maxSize * (maxSize + 1) / 2;
-    int randomInt = random.nextInt(randomMultiplier);
-
-    //Linearly iterate through the possible values to find the correct one
-    int linearRandomNumber = 0;
-    for(int i=maxSize; randomInt >= 0; i--){
-        randomInt -= i;
-        linearRandomNumber++;
-    }
-
-    return linearRandomNumber;
+    // Explanatory example: maxSize is 4, then the numbers 1 to 4 are distributed according to
+    // 1:****, 2:***, 3:**, 4:*
+    // The number of stars is a triangular number, so 10 in the example
+    int triangularNumber = maxSize * (maxSize + 1) / 2;
+    // Pick a random star
+    int randomInt = random.nextInt(triangularNumber);
+
+    // Invert the problem: 1:*, 2:**, 3:***, 4:****
+    // So in the example, star index 0 becomes index 9 and vice versa
+    randomInt = triangularNumber - randomInt - 1;
+    // Use the formula for triangular numbers to convert from the star index to the number
+    // n*(n+1)/2 = s
+    // n*n + n - 2s = 0
+    // then use the larger solution of the quadratic formula
+    // n = ( -1 + sqrt(1-4*1*(-2s)) )/2
+    int n = (int)(-1 + Math.sqrt(1+8*randomInt))/2;
+    // Invert the result, so that we get the desired distribution
+    return maxSize - n;
   }
 
   private static final int TEST_ROW_COUNT = 100000;

(hive) branch master updated: HIVE-29155: Improve performance of TestVectorAggregation (#6036)

Reply via email to