This is an automated email from the ASF dual-hosted git repository.
zzcclp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new d87c2b0c47 [GLUTEN-12172][CH] Fix group limit first array result offset (#12173)
d87c2b0c47 is described below
commit d87c2b0c470f5abff0bce8ff7094d7fb208d2cdd
Author: lgbo <[email protected]>
AuthorDate: Fri May 29 18:03:34 2026 +0800
[GLUTEN-12172][CH] Fix group limit first array result offset (#12173)
* [CH] Fix group limit first array result offset
RowNumGroupArraySorted writes aggregate results into a newly created
ColumnArray. For the first output row the array offsets vector can be empty,
but insertResultInto read result_array_offsets.back() before appending the
first offset. That is undefined behavior and can crash when aggregate top-k
writes its first result.
Treat an empty offsets vector as having previous offset 0 before appending
the next cumulative offset. Add a ClickHouse backend regression test that
forces row_number top-k through the aggregate group limit path and validates
the first array result row against vanilla Spark.
* [CH] Stabilize group limit empty offsets test
---
.../GlutenClickHouseTPCHSaltNullParquetSuite.scala | 24 ++++++++++++++++++++++
.../AggregateFunctions/GroupLimitFunctions.cpp | 3 ++-
2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
index 0539f721e6..4766c2fd24 100644
--- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
+++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseTPCHSaltNullParquetSuite.scala
@@ -3045,6 +3045,30 @@ class GlutenClickHouseTPCHSaltNullParquetSuite
}
+ test("row number aggregate topk handles first array result offset") {
+ withSQLConf(
+ (CHConfig.runtimeSettings("enable_window_group_limit_to_aggregate"), "true"),
+ (CHConfig.runtimeSettings("window.aggregate_topk_high_cardinality_threshold"), "2.0")
+ ) {
+ compareResultsAgainstVanillaSpark(
+ """
+ |select * from (
+ | select a, b, row_number() over (partition by a order by b) as r
+ | from (select * from values ('a', 2), ('a', 1) as t(a, b))
+ |) where r <= 1
+ |""".stripMargin,
+ compareResult = true,
+ df => {
+ val groupLimit = collectWithSubqueries(df.queryExecution.executedPlan) {
+ case e: CHAggregateGroupLimitExecTransformer => e
+ case wgl: CHWindowGroupLimitExecTransformer => wgl
+ }
+ assert(groupLimit.nonEmpty)
+ }
+ )
+ }
+ }
+
test("GLUTEN-7905 get topk of window by window") {
withSQLConf(
(CHConfig.runtimeSettings("enable_window_group_limit_to_aggregate"),
"true"),
diff --git a/cpp-ch/local-engine/AggregateFunctions/GroupLimitFunctions.cpp b/cpp-ch/local-engine/AggregateFunctions/GroupLimitFunctions.cpp
index 20ef632ee6..032bf6f686 100644
--- a/cpp-ch/local-engine/AggregateFunctions/GroupLimitFunctions.cpp
+++ b/cpp-ch/local-engine/AggregateFunctions/GroupLimitFunctions.cpp
@@ -142,7 +142,8 @@ public:
sortAndLimit(max_elements, sort_orders);
- result_array_offsets.push_back(result_array_offsets.back() + values.size());
+ const auto previous_offset = result_array_offsets.empty() ? 0 : result_array_offsets.back();
+ result_array_offsets.push_back(previous_offset + values.size());
if (values.empty())
return;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]