(spark) branch master updated: [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and refactor code

wenchen Sat, 29 Mar 2025 03:20:47 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 5f2d37e0b429 [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and 
refactor code
5f2d37e0b429 is described below

commit 5f2d37e0b429a940dc52872c41ac71e4012f22dd
Author: Mihailo Milosevic <mihailo.milose...@databricks.com>
AuthorDate: Fri Mar 21 21:18:10 2025 +0800

    [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and refactor code
    
    ### What changes were proposed in this pull request?
    It was noticed that some tests could be flaky for listagg. The flakiness 
mainly can come due to non-deterministic behaviour of grouping by a column or 
collations (i.e. utf8_lcase, unicode_rtrim...). This PR aims to make queries 
more deterministic by checking only the required info in the result of listagg 
queries.
    
    ### Why are the changes needed?
    We do not want to waste resources on flaky tests. Also, we want the tests 
to scope the minimum set of truth,  which means we do not assume ordering in 
specific result, when there was an aggregation or grouping.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Test only change.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #50338 from mihailom-db/listaggFollowUp.
    
    Authored-by: Mihailo Milosevic <mihailo.milose...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../catalyst/expressions/aggregate/collect.scala   |  12 +-
 .../analyzer-results/listagg-collations.sql.out    |  56 ++-
 .../sql-tests/analyzer-results/listagg.sql.out     | 417 ++++++++++++---------
 .../sql-tests/inputs/listagg-collations.sql        |  18 +-
 .../test/resources/sql-tests/inputs/listagg.sql    |  51 +--
 .../sql-tests/results/listagg-collations.sql.out   |  44 +--
 .../resources/sql-tests/results/listagg.sql.out    | 156 ++++----
 .../apache/spark/sql/DataFrameAggregateSuite.scala |  33 +-
 8 files changed, 433 insertions(+), 354 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
index 7789c23b50a4..015bd1e3e142 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
@@ -277,14 +277,14 @@ private[aggregate] object CollectTopK {
 @ExpressionDescription(
   usage = """
     _FUNC_(expr[, delimiter])[ WITHIN GROUP (ORDER BY key [ASC | DESC] 
[,...])] - Returns
-    the concatenation of non-null input values, separated by the delimiter 
ordered by key.
-    If all values are null, null is returned.
+    the concatenation of non-NULL input values, separated by the delimiter 
ordered by key.
+    If all values are NULL, NULL is returned.
     """,
   arguments = """
     Arguments:
       * expr - a string or binary expression to be concatenated.
       * delimiter - an optional string or binary foldable expression used to 
separate the input values.
-        If null, the concatenation will be performed without a delimiter. 
Default is null.
+        If NULL, the concatenation will be performed without a delimiter. 
Default is NULL.
       * key - an optional expression for ordering the input values. Multiple 
keys can be specified.
         If none are specified, the order of the rows in the result is 
non-deterministic.
   """,
@@ -400,7 +400,7 @@ case class ListAgg(
         )
       )
     } else if (delimiter.dataType == NullType) {
-      // null is the default empty delimiter so type is not important
+      // Null is the default empty delimiter so type is not important
       TypeCheckSuccess
     } else {
       TypeUtils.checkForSameTypeInputExpr(child.dataType :: delimiter.dataType 
:: Nil, prettyName)
@@ -451,7 +451,7 @@ case class ListAgg(
   }
 
   /**
-   * @return ordering by (orderValue0, orderValue1, ...)
+   * @return Ordering by (orderValue0, orderValue1, ...)
    *         for InternalRow with format [childValue, orderValue0, 
orderValue1, ...]
    */
   private[this] def bufferOrdering: Ordering[InternalRow] = {
@@ -477,7 +477,7 @@ case class ListAgg(
   }
 
   /**
-   * @return delimiter value or default empty value if delimiter is null. Type 
respects [[dataType]]
+   * @return Delimiter value or default empty value if delimiter is null. Type 
respects [[dataType]]
    */
   private[this] def getDelimiterValue: Either[UTF8String, Array[Byte]] = {
     val delimiterValue = delimiter.eval()
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
index ca471858a541..3d239f1bdb04 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
@@ -9,66 +9,82 @@ Aggregate [listagg(c1#x, null, collate(c1#x, utf8_binary) ASC 
NULLS FIRST, 0, 0)
 
 
 -- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES 
('a'), ('A'), ('b'), ('B')) AS t(c1)
+SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
 -- !query analysis
-Aggregate [listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 
0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC 
NULLS FIRST)#x]
+Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS 
listagg(DISTINCT collate(c1, utf8_binary), NULL)#x]
 +- SubqueryAlias t
    +- Project [col1#x AS c1#x]
       +- LocalRelation [col1#x]
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES 
('abc  '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' 
', ''), '\n', '$') FROM t
 -- !query analysis
-Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS 
listagg(DISTINCT collate(c1, utf8_binary), NULL)#x]
-+- SubqueryAlias t
-   +- Project [col1#x AS c1#x]
-      +- LocalRelation [col1#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS 
FIRST)#x AS c1#x]
+:        +- Aggregate [listagg(col1#x, null, col1#x ASC NULLS FIRST, 0, 0) AS 
listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x]
+:           +- SubqueryAlias __auto_generated_subquery_name
+:              +- LocalRelation [col1#x]
++- Project [replace(replace(c1#x,  , ), 
+, $) AS replace(replace(c1,  , ), 
+, $)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [c1#x], false, false, 1
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
+SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM 
(VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1)
 -- !query analysis
-Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0) AS 
listagg(DISTINCT collate(c1, utf8_lcase), NULL)#x]
+Aggregate [lower(listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS 
FIRST, 0, 0)) AS lower(listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, 
utf8_lcase) ASC NULLS FIRST))#x]
 +- SubqueryAlias t
    +- Project [col1#x AS c1#x]
       +- LocalRelation [col1#x]
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), 
('A'), ('b'), ('B')) AS t(c1)
 -- !query analysis
-Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x, 
utf8_lcase) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT collate(c1, utf8_lcase), 
NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)#x]
+Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0)) AS 
lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL))#x]
 +- SubqueryAlias t
    +- Project [col1#x AS c1#x]
       +- LocalRelation [col1#x]
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc  '), 
('abc '), ('x'), ('abc')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
 -- !query analysis
-Aggregate [listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0) AS 
listagg(DISTINCT collate(c1, unicode_rtrim), NULL)#x]
+Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, 
collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0)) AS lower(listagg(DISTINCT 
collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) 
ASC NULLS FIRST))#x]
 +- SubqueryAlias t
    +- Project [col1#x AS c1#x]
       +- LocalRelation [col1#x]
 
 
 -- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc  '), ('abc 
'), ('abc\n'), ('abc'), ('x')) AS t(c1)
+SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc  
'), ('xbc '), ('a'), ('xbc')) AS t(c1)
 -- !query analysis
-Aggregate [listagg(c1#x, null, c1#x ASC NULLS FIRST, 0, 0) AS listagg(c1, 
NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST)#x]
+Aggregate [rtrim(listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0), 
None) AS rtrim(listagg(DISTINCT collate(c1, unicode_rtrim), NULL))#x]
 +- SubqueryAlias t
    +- Project [col1#x AS c1#x]
       +- LocalRelation [col1#x]
 
 
 -- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM 
(VALUES ('abc  '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE 
unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT 
replace(replace(c1, ' ', ''), '\n', '$') FROM t
 -- !query analysis
-Aggregate [listagg(c1#x, null, collate(c1#x, unicode_rtrim) ASC NULLS FIRST, 
0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim) 
ASC NULLS FIRST)#x]
-+- SubqueryAlias t
-   +- Project [col1#x AS c1#x]
-      +- LocalRelation [col1#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY collate(col1, 
unicode_rtrim) ASC NULLS FIRST)#x AS c1#x]
+:        +- Aggregate [listagg(col1#x, null, collate(col1#x, unicode_rtrim) 
ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY 
collate(col1, unicode_rtrim) ASC NULLS FIRST)#x]
+:           +- SubqueryAlias __auto_generated_subquery_name
+:              +- LocalRelation [col1#x]
++- Project [replace(replace(c1#x,  , ), 
+, $) AS replace(replace(c1,  , ), 
+, $)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [c1#x], false, false, 1
 
 
 -- !query
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
index 71eb3f8ca76b..76aba15b4617 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
@@ -1,223 +1,274 @@
 -- Automatically generated by SQLQueryTestSuite
 -- !query
 CREATE TEMP VIEW df AS
-SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL)) AS t(a, b)
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL))
 -- !query analysis
-CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 
'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b), false, false, LocalTempView, 
UNSUPPORTED, true
-   +- Project [a#x, b#x]
-      +- SubqueryAlias t
-         +- Project [col1#x AS a#x, col2#x AS b#x]
-            +- LocalRelation [col1#x, col2#x]
+CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 
'c'), ('b', 'd'), (NULL, NULL)), false, false, LocalTempView, UNSUPPORTED, true
+   +- Project [col1#x, col2#x]
+      +- SubqueryAlias __auto_generated_subquery_name
+         +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
 CREATE TEMP VIEW df2 AS
-SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b)
+SELECT * FROM (VALUES (1, true), (2, false), (3, false))
 -- !query analysis
-CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3, 
false)) AS t(a, b), false, false, LocalTempView, UNSUPPORTED, true
-   +- Project [a#x, b#x]
-      +- SubqueryAlias t
-         +- Project [col1#x AS a#x, col2#x AS b#x]
-            +- LocalRelation [col1#x, col2#x]
+CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3, 
false)), false, false, LocalTempView, UNSUPPORTED, true
+   +- Project [col1#x, col2#x]
+      +- SubqueryAlias __auto_generated_subquery_name
+         +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), 
regexp_count(col, 'd') FROM t
 -- !query analysis
-Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col2, NULL)#x AS col#x]
+:        +- Aggregate [col1#x], [listagg(col2#x, null, 0, 0) AS listagg(col2, 
NULL)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS 
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, 
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS 
regexp_count(col, d)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false
 
 
 -- !query
-SELECT string_agg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t
 -- !query analysis
-Aggregate [a#x], [string_agg(b#x, null, 0, 0) AS string_agg(b, NULL)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [string_agg(col2, NULL)#x AS col#x]
+:        +- Aggregate [col1#x], [string_agg(col2#x, null, 0, 0) AS 
string_agg(col2, NULL)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS 
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, 
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS 
regexp_count(col, d)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false
 
 
 -- !query
-SELECT listagg(b, NULL) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t
 -- !query analysis
-Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col2, NULL)#x AS col#x]
+:        +- Aggregate [col1#x], [listagg(col2#x, null, 0, 0) AS listagg(col2, 
NULL)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS 
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, 
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS 
regexp_count(col, d)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false
 
 
 -- !query
-SELECT listagg(b) FROM df WHERE 1 != 1
+SELECT listagg(col2) FROM df WHERE 1 != 1
 -- !query analysis
-Aggregate [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x]
+Aggregate [listagg(col2#x, null, 0, 0) AS listagg(col2, NULL)#x]
 +- Filter NOT (1 = 1)
    +- SubqueryAlias df
-      +- View (`df`, [a#x, b#x])
-         +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-            +- Project [a#x, b#x]
-               +- SubqueryAlias t
-                  +- Project [col1#x AS a#x, col2#x AS b#x]
-                     +- LocalRelation [col1#x, col2#x]
+      +- View (`df`, [col1#x, col2#x])
+         +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) 
AS col2#x]
+            +- Project [col1#x, col2#x]
+               +- SubqueryAlias __auto_generated_subquery_name
+                  +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(b, '|') FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t
 -- !query analysis
-Aggregate [a#x], [listagg(b#x, |, 0, 0) AS listagg(b, |)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col2, |)#x AS col#x]
+:        +- Aggregate [col1#x], [listagg(col2#x, |, 0, 0) AS listagg(col2, 
|)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS 
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x, 
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS 
regexp_count(col, d)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false
 
 
 -- !query
-SELECT listagg(a) FROM df
+WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
 -- !query analysis
-Aggregate [listagg(a#x, null, 0, 0) AS listagg(a, NULL)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col1, NULL)#x AS col#x]
+:        +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS 
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false, 1
 
 
 -- !query
-SELECT listagg(DISTINCT a) FROM df
+WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
 -- !query analysis
-Aggregate [listagg(distinct a#x, null, 0, 0) AS listagg(DISTINCT a, NULL)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(DISTINCT col1, NULL)#x AS col#x]
+:        +- Aggregate [listagg(distinct col1#x, null, 0, 0) AS 
listagg(DISTINCT col1, NULL)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS 
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false, 1
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df
 -- !query analysis
-Aggregate [listagg(a#x, null, a#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) 
WITHIN GROUP (ORDER BY a ASC NULLS FIRST)#x]
+Aggregate [listagg(col1#x, null, col1#x ASC NULLS FIRST, 0, 0) AS 
listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x]
 +- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df`, [col1#x, col2#x])
+      +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df
 -- !query analysis
-Aggregate [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) 
WITHIN GROUP (ORDER BY a DESC NULLS LAST)#x]
+Aggregate [listagg(col1#x, null, col1#x DESC NULLS LAST, 0, 0) AS 
listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST)#x]
 +- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df`, [col1#x, col2#x])
+      +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY 
col2) FROM df
 -- !query analysis
-Project [listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER 
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x]
-+- Project [a#x, b#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS 
LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING)#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER 
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x]
-   +- Window [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) 
windowspecdefinition(b#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), 
unboundedfollowing$())) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS 
LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING)#x], [b#x]
-      +- Project [a#x, b#x]
+Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER 
(PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x]
++- Project [col1#x, col2#x, listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 
DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING)#x, listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC 
NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING)#x]
+   +- Window [listagg(col1#x, null, col1#x DESC NULLS LAST, 0, 0) 
windowspecdefinition(col2#x, specifiedwindowframe(RowFrame, 
unboundedpreceding$(), unboundedfollowing$())) AS listagg(col1, NULL) WITHIN 
GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN 
UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [col2#x]
+      +- Project [col1#x, col2#x]
          +- SubqueryAlias df
-            +- View (`df`, [a#x, b#x])
-               +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS 
b#x]
-                  +- Project [a#x, b#x]
-                     +- SubqueryAlias t
-                        +- Project [col1#x AS a#x, col2#x AS b#x]
-                           +- LocalRelation [col1#x, col2#x]
+            +- View (`df`, [col1#x, col2#x])
+               +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+                  +- Project [col1#x, col2#x]
+                     +- SubqueryAlias __auto_generated_subquery_name
+                        +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df
 -- !query analysis
-Aggregate [listagg(a#x, null, b#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL) 
WITHIN GROUP (ORDER BY b ASC NULLS FIRST)#x]
+Aggregate [listagg(col1#x, null, col2#x ASC NULLS FIRST, 0, 0) AS 
listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 ASC NULLS FIRST)#x]
 +- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df`, [col1#x, col2#x])
+      +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df
+WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM 
df) SELECT (col == 'baba') || (col == 'bbaa') FROM t
 -- !query analysis
-Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL) 
WITHIN GROUP (ORDER BY b DESC NULLS LAST)#x]
-+- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS 
LAST)#x AS col#x]
+:        +- Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, 0, 0) AS 
listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [concat(cast((col#x = baba) as string), cast((col#x = bbaa) as 
string)) AS concat((col = baba), (col = bbaa))#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false, 1
 
 
 -- !query
-SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df
+WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) 
FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t
 -- !query analysis
-Aggregate [listagg(a#x, |, b#x DESC NULLS LAST, 0, 0) AS listagg(a, |) WITHIN 
GROUP (ORDER BY b DESC NULLS LAST)#x]
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias t
+:     +- Project [listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS 
LAST)#x AS col#x]
+:        +- Aggregate [listagg(col1#x, |, col2#x DESC NULLS LAST, 0, 0) AS 
listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x]
+:           +- SubqueryAlias df
+:              +- View (`df`, [col1#x, col2#x])
+:                 +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+:                    +- Project [col1#x, col2#x]
+:                       +- SubqueryAlias __auto_generated_subquery_name
+:                          +- LocalRelation [col1#x, col2#x]
++- Project [concat(cast((col#x = b|a|b|a) as string), cast((col#x = b|b|a|a) 
as string)) AS concat((col = b|a|b|a), (col = b|b|a|a))#x]
+   +- SubqueryAlias t
+      +- CTERelationRef xxxx, true, [col#x], false, false, 1
+
+
+-- !query
+SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df
+-- !query analysis
+Aggregate [listagg(col1#x, |, col2#x DESC NULLS LAST, 0, 0) AS listagg(col1, 
|) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x]
 +- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df`, [col1#x, col2#x])
+      +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df
 -- !query analysis
-Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x ASC NULLS FIRST, 0, 0) 
AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS 
FIRST)#x]
+Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x ASC NULLS 
FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS 
LAST, col1 ASC NULLS FIRST)#x]
 +- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df`, [col1#x, col2#x])
+      +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df
 -- !query analysis
-Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x DESC NULLS LAST, 0, 0) 
AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS 
LAST)#x]
+Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x DESC NULLS 
LAST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, 
col1 DESC NULLS LAST)#x]
 +- SubqueryAlias df
-   +- View (`df`, [a#x, b#x])
-      +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df`, [col1#x, col2#x])
+      +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
@@ -248,16 +299,15 @@ Aggregate [listagg(c1#x, 0x42, 0, 0) AS listagg(c1, 
X'42')#x]
 
 
 -- !query
-SELECT listagg(a), listagg(b, ',') FROM df2
+SELECT listagg(col1), listagg(col2, ',') FROM df2
 -- !query analysis
-Aggregate [listagg(cast(a#x as string), null, 0, 0) AS listagg(a, NULL)#x, 
listagg(cast(b#x as string), ,, 0, 0) AS listagg(b, ,)#x]
+Aggregate [listagg(cast(col1#x as string), null, 0, 0) AS listagg(col1, 
NULL)#x, listagg(cast(col2#x as string), ,, 0, 0) AS listagg(col2, ,)#x]
 +- SubqueryAlias df2
-   +- View (`df2`, [a#x, b#x])
-      +- Project [cast(a#x as int) AS a#x, cast(b#x as boolean) AS b#x]
-         +- Project [a#x, b#x]
-            +- SubqueryAlias t
-               +- Project [col1#x AS a#x, col2#x AS b#x]
-                  +- LocalRelation [col1#x, col2#x]
+   +- View (`df2`, [col1#x, col2#x])
+      +- Project [cast(col1#x as int) AS col1#x, cast(col2#x as boolean) AS 
col2#x]
+         +- Project [col1#x, col2#x]
+            +- SubqueryAlias __auto_generated_subquery_name
+               +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
@@ -307,129 +357,128 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 
 
 -- !query
-SELECT listagg(b, a) FROM df GROUP BY a
+SELECT listagg(col2, col1) FROM df GROUP BY col1
 -- !query analysis
 org.apache.spark.sql.catalyst.ExtendedAnalysisException
 {
   "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
   "sqlState" : "42K09",
   "messageParameters" : {
-    "inputExpr" : "\"a\"",
+    "inputExpr" : "\"col1\"",
     "inputName" : "`delimiter`",
     "inputType" : "\"STRING\"",
-    "sqlExpr" : "\"listagg(b, a)\""
+    "sqlExpr" : "\"listagg(col2, col1)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 20,
-    "fragment" : "listagg(b, a)"
+    "stopIndex" : 26,
+    "fragment" : "listagg(col2, col1)"
   } ]
 }
 
 
 -- !query
-SELECT listagg(a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) OVER (ORDER BY col1) FROM df
 -- !query analysis
-Project [listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW)#x]
-+- Project [a#x, listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE 
BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(a, NULL) OVER (ORDER BY 
a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x]
-   +- Window [listagg(a#x, null, 0, 0) windowspecdefinition(a#x ASC NULLS 
FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) 
AS listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW)#x], [a#x ASC NULLS FIRST]
-      +- Project [a#x]
+Project [listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW)#x]
++- Project [col1#x, listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST 
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(col1, NULL) OVER 
(ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT 
ROW)#x]
+   +- Window [listagg(col1#x, null, 0, 0) windowspecdefinition(col1#x ASC 
NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), 
currentrow$())) AS listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST 
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x], [col1#x ASC NULLS FIRST]
+      +- Project [col1#x]
          +- SubqueryAlias df
-            +- View (`df`, [a#x, b#x])
-               +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS 
b#x]
-                  +- Project [a#x, b#x]
-                     +- SubqueryAlias t
-                        +- Project [col1#x AS a#x, col2#x AS b#x]
-                           +- LocalRelation [col1#x, col2#x]
+            +- View (`df`, [col1#x, col2#x])
+               +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as 
string) AS col2#x]
+                  +- Project [col1#x, col2#x]
+                     +- SubqueryAlias __auto_generated_subquery_name
+                        +- LocalRelation [col1#x, col2#x]
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df
 -- !query analysis
 org.apache.spark.sql.catalyst.ExtendedAnalysisException
 {
   "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
   "sqlState" : "42601",
   "messageParameters" : {
-    "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+    "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 61,
-    "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+    "stopIndex" : 70,
+    "fragment" : "listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY 
col1)"
   } ]
 }
 
 
 -- !query
-SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM 
df
 -- !query analysis
 org.apache.spark.sql.catalyst.ExtendedAnalysisException
 {
   "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
   "sqlState" : "42601",
   "messageParameters" : {
-    "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+    "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 64,
-    "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+    "stopIndex" : 73,
+    "fragment" : "string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY 
col1)"
   } ]
 }
 
 
 -- !query
-SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df
+SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df
 -- !query analysis
 org.apache.spark.sql.catalyst.ExtendedAnalysisException
 {
   "errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED",
   "sqlState" : "0A000",
   "messageParameters" : {
-    "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS 
FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
+    "windowExpr" : "\"listagg(DISTINCT col1, NULL) OVER (ORDER BY col1 ASC 
NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 44,
-    "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)"
+    "stopIndex" : 50,
+    "fragment" : "listagg(DISTINCT col1) OVER (ORDER BY col1)"
   } ]
 }
 
 
 -- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df
 -- !query analysis
 org.apache.spark.sql.catalyst.ExtendedAnalysisException
 {
   "errorClass" : 
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
   "sqlState" : "42K0K",
   "messageParameters" : {
-    "funcArg" : "\"a\"",
+    "funcArg" : "\"col1\"",
     "funcName" : "`listagg`",
-    "orderingExpr" : "\"b\""
+    "orderingExpr" : "\"col2\""
   }
 }
 
 
 -- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df
 -- !query analysis
 org.apache.spark.sql.catalyst.ExtendedAnalysisException
 {
   "errorClass" : 
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
   "sqlState" : "42K0K",
   "messageParameters" : {
-    "funcArg" : "\"a\"",
+    "funcArg" : "\"col1\"",
     "funcName" : "`listagg`",
-    "orderingExpr" : "\"a\", \"b\""
+    "orderingExpr" : "\"col1\", \"col2\""
   }
 }
diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql 
b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
index 35f86183c37b..a1949cec1ebf 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
@@ -1,12 +1,14 @@
--- Test cases with collations
+-- Test cases with utf8_binary
 SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES 
('a'), ('A'), ('b'), ('B')) AS t(c1);
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES 
('a'), ('A'), ('b'), ('B')) AS t(c1);
 SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1);
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1);
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1);
-SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc  '), 
('abc '), ('x'), ('abc')) AS t(c1);
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc  '), ('abc 
'), ('abc\n'), ('abc'), ('x')) AS t(c1);
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM 
(VALUES ('abc  '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1);
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES 
('abc  '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' 
', ''), '\n', '$') FROM t;
+-- Test cases with utf8_lcase. Lower expression added for determinism
+SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM 
(VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1);
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), 
('A'), ('b'), ('B')) AS t(c1);
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1);
+-- Test cases with unicode_rtrim.
+SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc  
'), ('xbc '), ('a'), ('xbc')) AS t(c1);
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE 
unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT 
replace(replace(c1, ' ', ''), '\n', '$') FROM t;
 
 -- Error case with collations
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1);
\ No newline at end of file
+SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1);
diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql 
b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
index 573e0ddddf2f..7acac239cd9c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
@@ -2,39 +2,40 @@
 
 -- Create temporary views
 CREATE TEMP VIEW df AS
-SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL)) AS t(a, b);
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL));
 
 CREATE TEMP VIEW df2 AS
-SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b);
+SELECT * FROM (VALUES (1, true), (2, false), (3, false));
 
 -- Test cases for listagg function
-SELECT listagg(b) FROM df GROUP BY a;
-SELECT string_agg(b) FROM df GROUP BY a;
-SELECT listagg(b, NULL) FROM df GROUP BY a;
-SELECT listagg(b) FROM df WHERE 1 != 1;
-SELECT listagg(b, '|') FROM df GROUP BY a;
-SELECT listagg(a) FROM df;
-SELECT listagg(DISTINCT a) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df;
-SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df;
+WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), 
regexp_count(col, 'd') FROM t;
+WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t;
+WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t;
+SELECT listagg(col2) FROM df WHERE 1 != 1;
+WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t;
+WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t;
+WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY 
col2) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df;
+WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM 
df) SELECT (col == 'baba') || (col == 'bbaa') FROM t;
+WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) 
FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t;
+SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df;
 SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
 SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
 SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
-SELECT listagg(a), listagg(b, ',') FROM df2;
+SELECT listagg(col1), listagg(col2, ',') FROM df2;
 
 -- Error cases
 SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1);
 SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
-SELECT listagg(b, a) FROM df GROUP BY a;
-SELECT listagg(a) OVER (ORDER BY a) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df;
-SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df;
-SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df;
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df;
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df;
\ No newline at end of file
+SELECT listagg(col2, col1) FROM df GROUP BY col1;
+SELECT listagg(col1) OVER (ORDER BY col1) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df;
+SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM 
df;
+SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df;
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df;
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df;
diff --git 
a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out 
b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
index cf3bac04f09c..7d5b054217b4 100644
--- a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
@@ -8,61 +8,61 @@ ABab
 
 
 -- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES 
('a'), ('A'), ('b'), ('B')) AS t(c1)
+SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
 -- !query schema
-struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC 
NULLS FIRST):string>
+struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string>
 -- !query output
 aAbB
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES 
('abc  '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, ' 
', ''), '\n', '$') FROM t
 -- !query schema
-struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string>
+struct<replace(replace(c1,  , ), 
+, $):string>
 -- !query output
-aAbB
+abcabc$abcabcx
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'), 
('b'), ('B')) AS t(c1)
+SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM 
(VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1)
 -- !query schema
-struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL):string collate 
UTF8_LCASE>
+struct<lower(listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) 
ASC NULLS FIRST)):string>
 -- !query output
-ab
+aabb
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'), 
('A'), ('b'), ('B')) AS t(c1)
 -- !query schema
-struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY 
collate(c1, utf8_lcase) ASC NULLS FIRST):string collate UTF8_LCASE>
+struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL)):string collate 
UTF8_LCASE>
 -- !query output
-aB
+ab
 
 
 -- !query
-SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc  '), 
('abc '), ('x'), ('abc')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 
COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
 -- !query schema
-struct<listagg(DISTINCT collate(c1, unicode_rtrim), NULL):string collate 
UNICODE_RTRIM>
+struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP 
(ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)):string collate UTF8_LCASE>
 -- !query output
-abc  x
+ab
 
 
 -- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc  '), ('abc 
'), ('abc\n'), ('abc'), ('x')) AS t(c1)
+SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc  
'), ('xbc '), ('a'), ('xbc')) AS t(c1)
 -- !query schema
-struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST):string>
+struct<rtrim(listagg(DISTINCT collate(c1, unicode_rtrim), NULL)):string 
collate UNICODE_RTRIM>
 -- !query output
-abcabc
-abc abc  x
+axbc
 
 
 -- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM 
(VALUES ('abc  '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE 
unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT 
replace(replace(c1, ' ', ''), '\n', '$') FROM t
 -- !query schema
-struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim) ASC 
NULLS FIRST):string>
+struct<replace(replace(c1,  , ), 
+, $):string>
 -- !query output
-abc  abc abcabc
-x
+abcabcabc$x
 
 
 -- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out 
b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
index cc4b568c9e4e..de5432939031 100644
--- a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
@@ -1,7 +1,7 @@
 -- Automatically generated by SQLQueryTestSuite
 -- !query
 CREATE TEMP VIEW df AS
-SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL)) AS t(a, b)
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL, 
NULL))
 -- !query schema
 struct<>
 -- !query output
@@ -10,7 +10,7 @@ struct<>
 
 -- !query
 CREATE TEMP VIEW df2 AS
-SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b)
+SELECT * FROM (VALUES (1, true), (2, false), (3, false))
 -- !query schema
 struct<>
 -- !query output
@@ -18,89 +18,89 @@ struct<>
 
 
 -- !query
-SELECT listagg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'), 
regexp_count(col, 'd') FROM t
 -- !query schema
-struct<listagg(b, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, 
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
 -- !query output
-NULL
-bc
-cd
+2      0       0       1       1
+2      0       1       1       0
+NULL   NULL    NULL    NULL    NULL
 
 
 -- !query
-SELECT string_agg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t
 -- !query schema
-struct<string_agg(b, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, 
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
 -- !query output
-NULL
-bc
-cd
+2      0       0       1       1
+2      0       1       1       0
+NULL   NULL    NULL    NULL    NULL
 
 
 -- !query
-SELECT listagg(b, NULL) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t
 -- !query schema
-struct<listagg(b, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, 
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
 -- !query output
-NULL
-bc
-cd
+2      0       0       1       1
+2      0       1       1       0
+NULL   NULL    NULL    NULL    NULL
 
 
 -- !query
-SELECT listagg(b) FROM df WHERE 1 != 1
+SELECT listagg(col2) FROM df WHERE 1 != 1
 -- !query schema
-struct<listagg(b, NULL):string>
+struct<listagg(col2, NULL):string>
 -- !query output
 NULL
 
 
 -- !query
-SELECT listagg(b, '|') FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT 
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 
'c'), regexp_count(col, 'd') FROM t
 -- !query schema
-struct<listagg(b, |):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, 
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
 -- !query output
-NULL
-b|c
-c|d
+3      0       0       1       1
+3      0       1       1       0
+NULL   NULL    NULL    NULL    NULL
 
 
 -- !query
-SELECT listagg(a) FROM df
+WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
 -- !query schema
-struct<listagg(a, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int>
 -- !query output
-aabb
+4      2       2
 
 
 -- !query
-SELECT listagg(DISTINCT a) FROM df
+WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col), 
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
 -- !query schema
-struct<listagg(DISTINCT a, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int>
 -- !query output
-ab
+2      1       1
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a ASC NULLS FIRST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST):string>
 -- !query output
 aabb
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST):string>
 -- !query output
 bbaa
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY 
col2) FROM df
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER 
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER 
(PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING):string>
 -- !query output
 NULL
 a
@@ -110,41 +110,49 @@ ba
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b ASC NULLS FIRST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 ASC NULLS FIRST):string>
 -- !query output
 aabb
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df
+WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM 
df) SELECT (col == 'baba') || (col == 'bbaa') FROM t
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string>
+struct<concat((col = baba), (col = bbaa)):string>
 -- !query output
-baba
+truefalse
+
+
+-- !query
+WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) 
FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t
+-- !query schema
+struct<concat((col = b|a|b|a), (col = b|b|a|a)):string>
+-- !query output
+truefalse
 
 
 -- !query
-SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df
+SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df
 -- !query schema
-struct<listagg(a, |) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string>
+struct<listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST):string>
 -- !query output
 b|a|b|a
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS 
FIRST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1 
ASC NULLS FIRST):string>
 -- !query output
 baba
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df
 -- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS 
LAST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1 
DESC NULLS LAST):string>
 -- !query output
 bbaa
 
@@ -174,9 +182,9 @@ DEAD42BEEF
 
 
 -- !query
-SELECT listagg(a), listagg(b, ',') FROM df2
+SELECT listagg(col1), listagg(col2, ',') FROM df2
 -- !query schema
-struct<listagg(a, NULL):string,listagg(b, ,):string>
+struct<listagg(col1, NULL):string,listagg(col2, ,):string>
 -- !query output
 123    true,false,false
 
@@ -232,7 +240,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 
 
 -- !query
-SELECT listagg(b, a) FROM df GROUP BY a
+SELECT listagg(col2, col1) FROM df GROUP BY col1
 -- !query schema
 struct<>
 -- !query output
@@ -241,25 +249,25 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
   "sqlState" : "42K09",
   "messageParameters" : {
-    "inputExpr" : "\"a\"",
+    "inputExpr" : "\"col1\"",
     "inputName" : "`delimiter`",
     "inputType" : "\"STRING\"",
-    "sqlExpr" : "\"listagg(b, a)\""
+    "sqlExpr" : "\"listagg(col2, col1)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 20,
-    "fragment" : "listagg(b, a)"
+    "stopIndex" : 26,
+    "fragment" : "listagg(col2, col1)"
   } ]
 }
 
 
 -- !query
-SELECT listagg(a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) OVER (ORDER BY col1) FROM df
 -- !query schema
-struct<listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW):string>
+struct<listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW):string>
 -- !query output
 NULL
 aa
@@ -269,7 +277,7 @@ aabb
 
 
 -- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df
 -- !query schema
 struct<>
 -- !query output
@@ -278,20 +286,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
   "sqlState" : "42601",
   "messageParameters" : {
-    "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+    "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 61,
-    "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+    "stopIndex" : 70,
+    "fragment" : "listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY 
col1)"
   } ]
 }
 
 
 -- !query
-SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM 
df
 -- !query schema
 struct<>
 -- !query output
@@ -300,20 +308,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
   "sqlState" : "42601",
   "messageParameters" : {
-    "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+    "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 64,
-    "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+    "stopIndex" : 73,
+    "fragment" : "string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY 
col1)"
   } ]
 }
 
 
 -- !query
-SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df
+SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df
 -- !query schema
 struct<>
 -- !query output
@@ -322,20 +330,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED",
   "sqlState" : "0A000",
   "messageParameters" : {
-    "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS 
FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
+    "windowExpr" : "\"listagg(DISTINCT col1, NULL) OVER (ORDER BY col1 ASC 
NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
   },
   "queryContext" : [ {
     "objectType" : "",
     "objectName" : "",
     "startIndex" : 8,
-    "stopIndex" : 44,
-    "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)"
+    "stopIndex" : 50,
+    "fragment" : "listagg(DISTINCT col1) OVER (ORDER BY col1)"
   } ]
 }
 
 
 -- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df
 -- !query schema
 struct<>
 -- !query output
@@ -344,15 +352,15 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : 
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
   "sqlState" : "42K0K",
   "messageParameters" : {
-    "funcArg" : "\"a\"",
+    "funcArg" : "\"col1\"",
     "funcName" : "`listagg`",
-    "orderingExpr" : "\"b\""
+    "orderingExpr" : "\"col2\""
   }
 }
 
 
 -- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df
 -- !query schema
 struct<>
 -- !query output
@@ -361,8 +369,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : 
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
   "sqlState" : "42K0K",
   "messageParameters" : {
-    "funcArg" : "\"a\"",
+    "funcArg" : "\"col1\"",
     "funcName" : "`listagg`",
-    "orderingExpr" : "\"a\", \"b\""
+    "orderingExpr" : "\"col1\", \"col2\""
   }
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 141d6b219f2a..6ce0a657d5b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -624,36 +624,39 @@ class DataFrameAggregateSuite extends QueryTest
   }
 
   test("listagg function") {
-    // normal case
-    val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b")
+    // Normal case.
+    val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("col1", "col2")
     checkAnswer(
-      df.selectExpr("listagg(a)", "listagg(b)"),
+      df.selectExpr("listagg(col1)", "listagg(col2)"),
       Seq(Row("abc", "bcd"))
     )
     checkAnswer(
-      df.select(listagg($"a"), listagg($"b")),
+      df.select(listagg($"col1"), listagg($"col2")),
       Seq(Row("abc", "bcd"))
     )
 
-    // distinct case
-    val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("a", "b")
+    // Distinct case.
+    val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("col1", "col2")
     checkAnswer(
-      df2.select(listagg_distinct($"a"), listagg_distinct($"b")),
+      df2.select(listagg_distinct($"col1"), listagg_distinct($"col2")),
       Seq(Row("ab", "bd"))
     )
 
-    // null case
-    val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null, 
null)).toDF("a", "b", "c")
+    // Null case.
+    val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null, null))
+      .toDF("col1", "col2", "col3")
     checkAnswer(
-      df3.select(listagg_distinct($"a"), listagg($"a"), 
listagg_distinct($"b"), listagg($"b"),
-        listagg($"c")),
-      Seq(Row("a", "aa", "b", "bb", null))
+      df3.select(
+        listagg_distinct($"col1"), listagg($"col1"),
+        listagg_distinct($"col2"), listagg($"col2"),
+        listagg_distinct($"col3"), listagg($"col3")),
+      Seq(Row("a", "aa", "b", "bb", null, null))
     )
 
-    // custom delimiter
-    val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b")
+    // Custom delimiter.
+    val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("col1", "col2")
     checkAnswer(
-      df4.selectExpr("listagg(a, '|')", "listagg(b, '|')"),
+      df4.selectExpr("listagg(col1, '|')", "listagg(col2, '|')"),
       Seq(Row("a|b|c", "b|c|d"))
     )
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and refactor code

Reply via email to