This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5f2d37e0b429 [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and refactor code 5f2d37e0b429 is described below commit 5f2d37e0b429a940dc52872c41ac71e4012f22dd Author: Mihailo Milosevic <mihailo.milose...@databricks.com> AuthorDate: Fri Mar 21 21:18:10 2025 +0800 [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and refactor code ### What changes were proposed in this pull request? It was noticed that some tests could be flaky for listagg. The flakiness mainly can come due to non-deterministic behaviour of grouping by a column or collations (i.e. utf8_lcase, unicode_rtrim...). This PR aims to make queries more deterministic by checking only the required info in the result of listagg queries. ### Why are the changes needed? We do not want to waste resources on flaky tests. Also, we want the tests to scope the minimum set of truth, which means we do not assume ordering in specific result, when there was an aggregation or grouping. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Test only change. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #50338 from mihailom-db/listaggFollowUp. Authored-by: Mihailo Milosevic <mihailo.milose...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../catalyst/expressions/aggregate/collect.scala | 12 +- .../analyzer-results/listagg-collations.sql.out | 56 ++- .../sql-tests/analyzer-results/listagg.sql.out | 417 ++++++++++++--------- .../sql-tests/inputs/listagg-collations.sql | 18 +- .../test/resources/sql-tests/inputs/listagg.sql | 51 +-- .../sql-tests/results/listagg-collations.sql.out | 44 +-- .../resources/sql-tests/results/listagg.sql.out | 156 ++++---- .../apache/spark/sql/DataFrameAggregateSuite.scala | 33 +- 8 files changed, 433 insertions(+), 354 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index 7789c23b50a4..015bd1e3e142 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -277,14 +277,14 @@ private[aggregate] object CollectTopK { @ExpressionDescription( usage = """ _FUNC_(expr[, delimiter])[ WITHIN GROUP (ORDER BY key [ASC | DESC] [,...])] - Returns - the concatenation of non-null input values, separated by the delimiter ordered by key. - If all values are null, null is returned. + the concatenation of non-NULL input values, separated by the delimiter ordered by key. + If all values are NULL, NULL is returned. """, arguments = """ Arguments: * expr - a string or binary expression to be concatenated. * delimiter - an optional string or binary foldable expression used to separate the input values. - If null, the concatenation will be performed without a delimiter. Default is null. + If NULL, the concatenation will be performed without a delimiter. Default is NULL. * key - an optional expression for ordering the input values. Multiple keys can be specified. If none are specified, the order of the rows in the result is non-deterministic. """, @@ -400,7 +400,7 @@ case class ListAgg( ) ) } else if (delimiter.dataType == NullType) { - // null is the default empty delimiter so type is not important + // Null is the default empty delimiter so type is not important TypeCheckSuccess } else { TypeUtils.checkForSameTypeInputExpr(child.dataType :: delimiter.dataType :: Nil, prettyName) @@ -451,7 +451,7 @@ case class ListAgg( } /** - * @return ordering by (orderValue0, orderValue1, ...) + * @return Ordering by (orderValue0, orderValue1, ...) * for InternalRow with format [childValue, orderValue0, orderValue1, ...] */ private[this] def bufferOrdering: Ordering[InternalRow] = { @@ -477,7 +477,7 @@ case class ListAgg( } /** - * @return delimiter value or default empty value if delimiter is null. Type respects [[dataType]] + * @return Delimiter value or default empty value if delimiter is null. Type respects [[dataType]] */ private[this] def getDelimiterValue: Either[UTF8String, Array[Byte]] = { val delimiterValue = delimiter.eval() diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out index ca471858a541..3d239f1bdb04 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out @@ -9,66 +9,82 @@ Aggregate [listagg(c1#x, null, collate(c1#x, utf8_binary) ASC NULLS FIRST, 0, 0) -- !query -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) -- !query analysis -Aggregate [listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)#x] +Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS listagg(DISTINCT collate(c1, utf8_binary), NULL)#x] +- SubqueryAlias t +- Project [col1#x AS c1#x] +- LocalRelation [col1#x] -- !query -SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t -- !query analysis -Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS listagg(DISTINCT collate(c1, utf8_binary), NULL)#x] -+- SubqueryAlias t - +- Project [col1#x AS c1#x] - +- LocalRelation [col1#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x AS c1#x] +: +- Aggregate [listagg(col1#x, null, col1#x ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x] ++- Project [replace(replace(c1#x, , ), +, $) AS replace(replace(c1, , ), +, $)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [c1#x], false, false, 1 -- !query -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) -- !query analysis -Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0) AS listagg(DISTINCT collate(c1, utf8_lcase), NULL)#x] +Aggregate [lower(listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0)) AS lower(listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST))#x] +- SubqueryAlias t +- Project [col1#x AS c1#x] +- LocalRelation [col1#x] -- !query -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) +SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) -- !query analysis -Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)#x] +Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0)) AS lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL))#x] +- SubqueryAlias t +- Project [col1#x AS c1#x] +- LocalRelation [col1#x] -- !query -SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('x'), ('abc')) AS t(c1) +SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) -- !query analysis -Aggregate [listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0) AS listagg(DISTINCT collate(c1, unicode_rtrim), NULL)#x] +Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0)) AS lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST))#x] +- SubqueryAlias t +- Project [col1#x AS c1#x] +- LocalRelation [col1#x] -- !query -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc '), ('xbc '), ('a'), ('xbc')) AS t(c1) -- !query analysis -Aggregate [listagg(c1#x, null, c1#x ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST)#x] +Aggregate [rtrim(listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0), None) AS rtrim(listagg(DISTINCT collate(c1, unicode_rtrim), NULL))#x] +- SubqueryAlias t +- Project [col1#x AS c1#x] +- LocalRelation [col1#x] -- !query -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t -- !query analysis -Aggregate [listagg(c1#x, null, collate(c1#x, unicode_rtrim) ASC NULLS FIRST, 0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim) ASC NULLS FIRST)#x] -+- SubqueryAlias t - +- Project [col1#x AS c1#x] - +- LocalRelation [col1#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY collate(col1, unicode_rtrim) ASC NULLS FIRST)#x AS c1#x] +: +- Aggregate [listagg(col1#x, null, collate(col1#x, unicode_rtrim) ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY collate(col1, unicode_rtrim) ASC NULLS FIRST)#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x] ++- Project [replace(replace(c1#x, , ), +, $) AS replace(replace(c1, , ), +, $)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [c1#x], false, false, 1 -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out index 71eb3f8ca76b..76aba15b4617 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out @@ -1,223 +1,274 @@ -- Automatically generated by SQLQueryTestSuite -- !query CREATE TEMP VIEW df AS -SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b) +SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) -- !query analysis -CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b), false, false, LocalTempView, UNSUPPORTED, true - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)), false, false, LocalTempView, UNSUPPORTED, true + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query CREATE TEMP VIEW df2 AS -SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b) +SELECT * FROM (VALUES (1, true), (2, false), (3, false)) -- !query analysis -CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b), false, false, LocalTempView, UNSUPPORTED, true - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3, false)), false, false, LocalTempView, UNSUPPORTED, true + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(b) FROM df GROUP BY a +WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query analysis -Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col2, NULL)#x AS col#x] +: +- Aggregate [col1#x], [listagg(col2#x, null, 0, 0) AS listagg(col2, NULL)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS regexp_count(col, d)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false -- !query -SELECT string_agg(b) FROM df GROUP BY a +WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query analysis -Aggregate [a#x], [string_agg(b#x, null, 0, 0) AS string_agg(b, NULL)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [string_agg(col2, NULL)#x AS col#x] +: +- Aggregate [col1#x], [string_agg(col2#x, null, 0, 0) AS string_agg(col2, NULL)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS regexp_count(col, d)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false -- !query -SELECT listagg(b, NULL) FROM df GROUP BY a +WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query analysis -Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col2, NULL)#x AS col#x] +: +- Aggregate [col1#x], [listagg(col2#x, null, 0, 0) AS listagg(col2, NULL)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS regexp_count(col, d)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false -- !query -SELECT listagg(b) FROM df WHERE 1 != 1 +SELECT listagg(col2) FROM df WHERE 1 != 1 -- !query analysis -Aggregate [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x] +Aggregate [listagg(col2#x, null, 0, 0) AS listagg(col2, NULL)#x] +- Filter NOT (1 = 1) +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(b, '|') FROM df GROUP BY a +WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query analysis -Aggregate [a#x], [listagg(b#x, |, 0, 0) AS listagg(b, |)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col2, |)#x AS col#x] +: +- Aggregate [col1#x], [listagg(col2#x, |, 0, 0) AS listagg(col2, |)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS regexp_count(col, d)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false -- !query -SELECT listagg(a) FROM df +WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b') FROM t -- !query analysis -Aggregate [listagg(a#x, null, 0, 0) AS listagg(a, NULL)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col1, NULL)#x AS col#x] +: +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false, 1 -- !query -SELECT listagg(DISTINCT a) FROM df +WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b') FROM t -- !query analysis -Aggregate [listagg(distinct a#x, null, 0, 0) AS listagg(DISTINCT a, NULL)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(DISTINCT col1, NULL)#x AS col#x] +: +- Aggregate [listagg(distinct col1#x, null, 0, 0) AS listagg(DISTINCT col1, NULL)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false, 1 -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df -- !query analysis -Aggregate [listagg(a#x, null, a#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a ASC NULLS FIRST)#x] +Aggregate [listagg(col1#x, null, col1#x ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df -- !query analysis -Aggregate [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST)#x] +Aggregate [listagg(col1#x, null, col1#x DESC NULLS LAST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST)#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY col2) FROM df -- !query analysis -Project [listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] -+- Project [a#x, b#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] - +- Window [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) windowspecdefinition(b#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [b#x] - +- Project [a#x, b#x] +Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] ++- Project [col1#x, col2#x, listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] + +- Window [listagg(col1#x, null, col1#x DESC NULLS LAST, 0, 0) windowspecdefinition(col2#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [col2#x] + +- Project [col1#x, col2#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df -- !query analysis -Aggregate [listagg(a#x, null, b#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b ASC NULLS FIRST)#x] +Aggregate [listagg(col1#x, null, col2#x ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 ASC NULLS FIRST)#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df +WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM df) SELECT (col == 'baba') || (col == 'bbaa') FROM t -- !query analysis -Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST)#x] -+- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x AS col#x] +: +- Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [concat(cast((col#x = baba) as string), cast((col#x = bbaa) as string)) AS concat((col = baba), (col = bbaa))#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false, 1 -- !query -SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df +WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t -- !query analysis -Aggregate [listagg(a#x, |, b#x DESC NULLS LAST, 0, 0) AS listagg(a, |) WITHIN GROUP (ORDER BY b DESC NULLS LAST)#x] +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x AS col#x] +: +- Aggregate [listagg(col1#x, |, col2#x DESC NULLS LAST, 0, 0) AS listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x] +: +- SubqueryAlias df +: +- View (`df`, [col1#x, col2#x]) +: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] +: +- Project [col1#x, col2#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x, col2#x] ++- Project [concat(cast((col#x = b|a|b|a) as string), cast((col#x = b|b|a|a) as string)) AS concat((col = b|a|b|a), (col = b|b|a|a))#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false, 1 + + +-- !query +SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df +-- !query analysis +Aggregate [listagg(col1#x, |, col2#x DESC NULLS LAST, 0, 0) AS listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df -- !query analysis -Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS FIRST)#x] +Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1 ASC NULLS FIRST)#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df -- !query analysis -Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS LAST)#x] +Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x DESC NULLS LAST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1 DESC NULLS LAST)#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query @@ -248,16 +299,15 @@ Aggregate [listagg(c1#x, 0x42, 0, 0) AS listagg(c1, X'42')#x] -- !query -SELECT listagg(a), listagg(b, ',') FROM df2 +SELECT listagg(col1), listagg(col2, ',') FROM df2 -- !query analysis -Aggregate [listagg(cast(a#x as string), null, 0, 0) AS listagg(a, NULL)#x, listagg(cast(b#x as string), ,, 0, 0) AS listagg(b, ,)#x] +Aggregate [listagg(cast(col1#x as string), null, 0, 0) AS listagg(col1, NULL)#x, listagg(cast(col2#x as string), ,, 0, 0) AS listagg(col2, ,)#x] +- SubqueryAlias df2 - +- View (`df2`, [a#x, b#x]) - +- Project [cast(a#x as int) AS a#x, cast(b#x as boolean) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df2`, [col1#x, col2#x]) + +- Project [cast(col1#x as int) AS col1#x, cast(col2#x as boolean) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query @@ -307,129 +357,128 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query -SELECT listagg(b, a) FROM df GROUP BY a +SELECT listagg(col2, col1) FROM df GROUP BY col1 -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", "sqlState" : "42K09", "messageParameters" : { - "inputExpr" : "\"a\"", + "inputExpr" : "\"col1\"", "inputName" : "`delimiter`", "inputType" : "\"STRING\"", - "sqlExpr" : "\"listagg(b, a)\"" + "sqlExpr" : "\"listagg(col2, col1)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 20, - "fragment" : "listagg(b, a)" + "stopIndex" : 26, + "fragment" : "listagg(col2, col1)" } ] } -- !query -SELECT listagg(a) OVER (ORDER BY a) FROM df +SELECT listagg(col1) OVER (ORDER BY col1) FROM df -- !query analysis -Project [listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x] -+- Project [a#x, listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x] - +- Window [listagg(a#x, null, 0, 0) windowspecdefinition(a#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x], [a#x ASC NULLS FIRST] - +- Project [a#x] +Project [listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x] ++- Project [col1#x, listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x] + +- Window [listagg(col1#x, null, 0, 0) windowspecdefinition(col1#x ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x], [col1#x ASC NULLS FIRST] + +- Project [col1#x] +- SubqueryAlias df - +- View (`df`, [a#x, b#x]) - +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] - +- Project [a#x, b#x] - +- SubqueryAlias t - +- Project [col1#x AS a#x, col2#x AS b#x] - +- LocalRelation [col1#x, col2#x] + +- View (`df`, [col1#x, col2#x]) + +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS col2#x] + +- Project [col1#x, col2#x] + +- SubqueryAlias __auto_generated_subquery_name + +- LocalRelation [col1#x, col2#x] -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", "sqlState" : "42601", "messageParameters" : { - "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 61, - "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + "stopIndex" : 70, + "fragment" : "listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1)" } ] } -- !query -SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", "sqlState" : "42601", "messageParameters" : { - "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 64, - "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + "stopIndex" : 73, + "fragment" : "string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1)" } ] } -- !query -SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df +SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED", "sqlState" : "0A000", "messageParameters" : { - "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"" + "windowExpr" : "\"listagg(DISTINCT col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 44, - "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)" + "stopIndex" : 50, + "fragment" : "listagg(DISTINCT col1) OVER (ORDER BY col1)" } ] } -- !query -SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df +SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", "sqlState" : "42K0K", "messageParameters" : { - "funcArg" : "\"a\"", + "funcArg" : "\"col1\"", "funcName" : "`listagg`", - "orderingExpr" : "\"b\"" + "orderingExpr" : "\"col2\"" } } -- !query -SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df +SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", "sqlState" : "42K0K", "messageParameters" : { - "funcArg" : "\"a\"", + "funcArg" : "\"col1\"", "funcName" : "`listagg`", - "orderingExpr" : "\"a\", \"b\"" + "orderingExpr" : "\"col1\", \"col2\"" } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql index 35f86183c37b..a1949cec1ebf 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql @@ -1,12 +1,14 @@ --- Test cases with collations +-- Test cases with utf8_binary SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1); -SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('x'), ('abc')) AS t(c1); -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1); -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1); +WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t; +-- Test cases with utf8_lcase. Lower expression added for determinism +SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); +SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1); +SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1); +-- Test cases with unicode_rtrim. +SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc '), ('xbc '), ('a'), ('xbc')) AS t(c1); +WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t; -- Error case with collations -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1); \ No newline at end of file +SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1); diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql index 573e0ddddf2f..7acac239cd9c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql @@ -2,39 +2,40 @@ -- Create temporary views CREATE TEMP VIEW df AS -SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b); +SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)); CREATE TEMP VIEW df2 AS -SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b); +SELECT * FROM (VALUES (1, true), (2, false), (3, false)); -- Test cases for listagg function -SELECT listagg(b) FROM df GROUP BY a; -SELECT string_agg(b) FROM df GROUP BY a; -SELECT listagg(b, NULL) FROM df GROUP BY a; -SELECT listagg(b) FROM df WHERE 1 != 1; -SELECT listagg(b, '|') FROM df GROUP BY a; -SELECT listagg(a) FROM df; -SELECT listagg(DISTINCT a) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df; -SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df; +WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t; +WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t; +WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t; +SELECT listagg(col2) FROM df WHERE 1 != 1; +WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t; +WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b') FROM t; +WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b') FROM t; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY col2) FROM df; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df; +WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM df) SELECT (col == 'baba') || (col == 'bbaa') FROM t; +WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t; +SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df; SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); -SELECT listagg(a), listagg(b, ',') FROM df2; +SELECT listagg(col1), listagg(col2, ',') FROM df2; -- Error cases SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1); SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); -SELECT listagg(b, a) FROM df GROUP BY a; -SELECT listagg(a) OVER (ORDER BY a) FROM df; -SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df; -SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df; -SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df; -SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df; -SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df; \ No newline at end of file +SELECT listagg(col2, col1) FROM df GROUP BY col1; +SELECT listagg(col1) OVER (ORDER BY col1) FROM df; +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df; +SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df; +SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df; +SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df; +SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df; diff --git a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out index cf3bac04f09c..7d5b054217b4 100644 --- a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out @@ -8,61 +8,61 @@ ABab -- !query -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) -- !query schema -struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST):string> +struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string> -- !query output aAbB -- !query -SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t -- !query schema -struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string> +struct<replace(replace(c1, , ), +, $):string> -- !query output -aAbB +abcabc$abcabcx -- !query -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) +SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) -- !query schema -struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL):string collate UTF8_LCASE> +struct<lower(listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)):string> -- !query output -ab +aabb -- !query -SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) +SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1) -- !query schema -struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST):string collate UTF8_LCASE> +struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL)):string collate UTF8_LCASE> -- !query output -aB +ab -- !query -SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('x'), ('abc')) AS t(c1) +SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1) -- !query schema -struct<listagg(DISTINCT collate(c1, unicode_rtrim), NULL):string collate UNICODE_RTRIM> +struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)):string collate UTF8_LCASE> -- !query output -abc x +ab -- !query -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc '), ('xbc '), ('a'), ('xbc')) AS t(c1) -- !query schema -struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST):string> +struct<rtrim(listagg(DISTINCT collate(c1, unicode_rtrim), NULL)):string collate UNICODE_RTRIM> -- !query output -abcabc -abc abc x +axbc -- !query -SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1) +WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' ', ''), '\n', '$') FROM t -- !query schema -struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim) ASC NULLS FIRST):string> +struct<replace(replace(c1, , ), +, $):string> -- !query output -abc abc abcabc -x +abcabcabc$x -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out index cc4b568c9e4e..de5432939031 100644 --- a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out @@ -1,7 +1,7 @@ -- Automatically generated by SQLQueryTestSuite -- !query CREATE TEMP VIEW df AS -SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b) +SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, NULL)) -- !query schema struct<> -- !query output @@ -10,7 +10,7 @@ struct<> -- !query CREATE TEMP VIEW df2 AS -SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b) +SELECT * FROM (VALUES (1, true), (2, false), (3, false)) -- !query schema struct<> -- !query output @@ -18,89 +18,89 @@ struct<> -- !query -SELECT listagg(b) FROM df GROUP BY a +WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query schema -struct<listagg(b, NULL):string> +struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int,regexp_count(col, c):int,regexp_count(col, d):int> -- !query output -NULL -bc -cd +2 0 0 1 1 +2 0 1 1 0 +NULL NULL NULL NULL NULL -- !query -SELECT string_agg(b) FROM df GROUP BY a +WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query schema -struct<string_agg(b, NULL):string> +struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int,regexp_count(col, c):int,regexp_count(col, d):int> -- !query output -NULL -bc -cd +2 0 0 1 1 +2 0 1 1 0 +NULL NULL NULL NULL NULL -- !query -SELECT listagg(b, NULL) FROM df GROUP BY a +WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query schema -struct<listagg(b, NULL):string> +struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int,regexp_count(col, c):int,regexp_count(col, d):int> -- !query output -NULL -bc -cd +2 0 0 1 1 +2 0 1 1 0 +NULL NULL NULL NULL NULL -- !query -SELECT listagg(b) FROM df WHERE 1 != 1 +SELECT listagg(col2) FROM df WHERE 1 != 1 -- !query schema -struct<listagg(b, NULL):string> +struct<listagg(col2, NULL):string> -- !query output NULL -- !query -SELECT listagg(b, '|') FROM df GROUP BY a +WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), regexp_count(col, 'd') FROM t -- !query schema -struct<listagg(b, |):string> +struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int,regexp_count(col, c):int,regexp_count(col, d):int> -- !query output -NULL -b|c -c|d +3 0 0 1 1 +3 0 1 1 0 +NULL NULL NULL NULL NULL -- !query -SELECT listagg(a) FROM df +WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b') FROM t -- !query schema -struct<listagg(a, NULL):string> +struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int> -- !query output -aabb +4 2 2 -- !query -SELECT listagg(DISTINCT a) FROM df +WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col), regexp_count(col, 'a'), regexp_count(col, 'b') FROM t -- !query schema -struct<listagg(DISTINCT a, NULL):string> +struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int> -- !query output -ab +2 1 1 -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a ASC NULLS FIRST):string> +struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST):string> -- !query output aabb -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST):string> +struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST):string> -- !query output bbaa -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY col2) FROM df -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING):string> +struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING):string> -- !query output NULL a @@ -110,41 +110,49 @@ ba -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b ASC NULLS FIRST):string> +struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 ASC NULLS FIRST):string> -- !query output aabb -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df +WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM df) SELECT (col == 'baba') || (col == 'bbaa') FROM t -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string> +struct<concat((col = baba), (col = bbaa)):string> -- !query output -baba +truefalse + + +-- !query +WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t +-- !query schema +struct<concat((col = b|a|b|a), (col = b|b|a|a)):string> +-- !query output +truefalse -- !query -SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df +SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df -- !query schema -struct<listagg(a, |) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string> +struct<listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST):string> -- !query output b|a|b|a -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS FIRST):string> +struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1 ASC NULLS FIRST):string> -- !query output baba -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df -- !query schema -struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS LAST):string> +struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1 DESC NULLS LAST):string> -- !query output bbaa @@ -174,9 +182,9 @@ DEAD42BEEF -- !query -SELECT listagg(a), listagg(b, ',') FROM df2 +SELECT listagg(col1), listagg(col2, ',') FROM df2 -- !query schema -struct<listagg(a, NULL):string,listagg(b, ,):string> +struct<listagg(col1, NULL):string,listagg(col2, ,):string> -- !query output 123 true,false,false @@ -232,7 +240,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query -SELECT listagg(b, a) FROM df GROUP BY a +SELECT listagg(col2, col1) FROM df GROUP BY col1 -- !query schema struct<> -- !query output @@ -241,25 +249,25 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", "sqlState" : "42K09", "messageParameters" : { - "inputExpr" : "\"a\"", + "inputExpr" : "\"col1\"", "inputName" : "`delimiter`", "inputType" : "\"STRING\"", - "sqlExpr" : "\"listagg(b, a)\"" + "sqlExpr" : "\"listagg(col2, col1)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 20, - "fragment" : "listagg(b, a)" + "stopIndex" : 26, + "fragment" : "listagg(col2, col1)" } ] } -- !query -SELECT listagg(a) OVER (ORDER BY a) FROM df +SELECT listagg(col1) OVER (ORDER BY col1) FROM df -- !query schema -struct<listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):string> +struct<listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):string> -- !query output NULL aa @@ -269,7 +277,7 @@ aabb -- !query -SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df -- !query schema struct<> -- !query output @@ -278,20 +286,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", "sqlState" : "42601", "messageParameters" : { - "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 61, - "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + "stopIndex" : 70, + "fragment" : "listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1)" } ] } -- !query -SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df +SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df -- !query schema struct<> -- !query output @@ -300,20 +308,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC", "sqlState" : "42601", "messageParameters" : { - "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\"" + "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 64, - "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)" + "stopIndex" : 73, + "fragment" : "string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1)" } ] } -- !query -SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df +SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df -- !query schema struct<> -- !query output @@ -322,20 +330,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED", "sqlState" : "0A000", "messageParameters" : { - "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"" + "windowExpr" : "\"listagg(DISTINCT col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\"" }, "queryContext" : [ { "objectType" : "", "objectName" : "", "startIndex" : 8, - "stopIndex" : 44, - "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)" + "stopIndex" : 50, + "fragment" : "listagg(DISTINCT col1) OVER (ORDER BY col1)" } ] } -- !query -SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df +SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df -- !query schema struct<> -- !query output @@ -344,15 +352,15 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", "sqlState" : "42K0K", "messageParameters" : { - "funcArg" : "\"a\"", + "funcArg" : "\"col1\"", "funcName" : "`listagg`", - "orderingExpr" : "\"b\"" + "orderingExpr" : "\"col2\"" } } -- !query -SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df +SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df -- !query schema struct<> -- !query output @@ -361,8 +369,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT", "sqlState" : "42K0K", "messageParameters" : { - "funcArg" : "\"a\"", + "funcArg" : "\"col1\"", "funcName" : "`listagg`", - "orderingExpr" : "\"a\", \"b\"" + "orderingExpr" : "\"col1\", \"col2\"" } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 141d6b219f2a..6ce0a657d5b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -624,36 +624,39 @@ class DataFrameAggregateSuite extends QueryTest } test("listagg function") { - // normal case - val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b") + // Normal case. + val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("col1", "col2") checkAnswer( - df.selectExpr("listagg(a)", "listagg(b)"), + df.selectExpr("listagg(col1)", "listagg(col2)"), Seq(Row("abc", "bcd")) ) checkAnswer( - df.select(listagg($"a"), listagg($"b")), + df.select(listagg($"col1"), listagg($"col2")), Seq(Row("abc", "bcd")) ) - // distinct case - val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("a", "b") + // Distinct case. + val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("col1", "col2") checkAnswer( - df2.select(listagg_distinct($"a"), listagg_distinct($"b")), + df2.select(listagg_distinct($"col1"), listagg_distinct($"col2")), Seq(Row("ab", "bd")) ) - // null case - val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null, null)).toDF("a", "b", "c") + // Null case. + val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null, null)) + .toDF("col1", "col2", "col3") checkAnswer( - df3.select(listagg_distinct($"a"), listagg($"a"), listagg_distinct($"b"), listagg($"b"), - listagg($"c")), - Seq(Row("a", "aa", "b", "bb", null)) + df3.select( + listagg_distinct($"col1"), listagg($"col1"), + listagg_distinct($"col2"), listagg($"col2"), + listagg_distinct($"col3"), listagg($"col3")), + Seq(Row("a", "aa", "b", "bb", null, null)) ) - // custom delimiter - val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b") + // Custom delimiter. + val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("col1", "col2") checkAnswer( - df4.selectExpr("listagg(a, '|')", "listagg(b, '|')"), + df4.selectExpr("listagg(col1, '|')", "listagg(col2, '|')"), Seq(Row("a|b|c", "b|c|d")) ) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org