This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new c4f6eb2f914 HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich) c4f6eb2f914 is described below commit c4f6eb2f91478152c89070a7455df9a1b8980c75 Author: okumin <g...@okumin.com> AuthorDate: Tue Jun 14 00:26:36 2022 +0900 HIVE-26184: COLLECT_SET with GROUP BY is very slow when some keys are highly skewed (#3253) (okumin reviewed by Zoltan Haindrich) --- .../udf/generic/GenericUDAFMkCollectionEvaluator.java | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java index b05023dd37a..cffc7f76510 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMkCollectionEvaluator.java @@ -95,11 +95,27 @@ public class GenericUDAFMkCollectionEvaluator extends GenericUDAFEvaluator throw new RuntimeException("Buffer type unknown"); } } + + private void reset() { + if (bufferType == BufferType.LIST) { + container.clear(); + } else if (bufferType == BufferType.SET) { + // Don't reuse a container because HashSet#clear can be very slow. The operation takes O(N) + // and N is the capacity of the internal hash table. The internal capacity grows based on + // the number of elements and it never shrinks. Thus, HashSet#clear takes O(N) every time + // once a skewed key appears. + // In order to avoid too many resizing in average cases, we set the initial capacity to the + // number of elements of the previous aggregation. + container = new LinkedHashSet<>(container.size()); + } else { + throw new RuntimeException("Buffer type unknown"); + } + } } @Override public void reset(AggregationBuffer agg) throws HiveException { - ((MkArrayAggregationBuffer) agg).container.clear(); + ((MkArrayAggregationBuffer) agg).reset(); } @Override