This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 73eab92d30bf [SPARK-48841][SQL] Include `collationName` to `sql()` of `Collate` 73eab92d30bf is described below commit 73eab92d30bf78d39c80dd5af03b052fe9fc5211 Author: panbingkun <panbing...@baidu.com> AuthorDate: Fri Jul 12 10:16:11 2024 +0800 [SPARK-48841][SQL] Include `collationName` to `sql()` of `Collate` ### What changes were proposed in this pull request? In this PR, I propose to fix the `sql()` method of the `Collate` expression so that it also includes the `collationName` argument. ### Why are the changes needed? To distinguish column names when the `collationName` argument is used by `collate`. Before this change, the auto-generated column names could conflict, as in the example below, which could confuse users: ``` sql("CREATE TEMP VIEW tbl as (SELECT collate('A', 'UTF8_BINARY'), collate('A', 'UTF8_LCASE'))") ``` - Before: ``` [COLUMN_ALREADY_EXISTS] The column `collate(a)` already exists. Choose another name or rename the existing column. SQLSTATE: 42711 org.apache.spark.sql.AnalysisException: [COLUMN_ALREADY_EXISTS] The column `collate(a)` already exists. Choose another name or rename the existing column. SQLSTATE: 42711 at org.apache.spark.sql.errors.QueryCompilationErrors$.columnAlreadyExistsError(QueryCompilationErrors.scala:2595) at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:115) at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:97) ``` - After: ``` describe extended tbl; +-----------------------+-------------------------+-------+ |col_name |data_type |comment| +-----------------------+-------------------------+-------+ |collate(A, UTF8_BINARY)|string |NULL | |collate(A, UTF8_LCASE) |string collate UTF8_LCASE|NULL | +-----------------------+-------------------------+-------+ ``` ### Does this PR introduce _any_ user-facing change? Should not. ### How was this patch tested? Updated the existing unit tests. 
### Was this patch authored or co-authored using generative AI tooling? No. Closes #47265 from panbingkun/SPARK-48841. Authored-by: panbingkun <panbing...@baidu.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../explain-results/function_collate.explain | 2 +- .../expressions/collationExpressions.scala | 4 + .../sql-functions/sql-expression-schema.md | 2 +- .../sql-tests/analyzer-results/collations.sql.out | 60 ++--- .../resources/sql-tests/results/collations.sql.out | 50 ++-- .../spark/sql/CollationExpressionWalkerSuite.scala | 4 +- .../apache/spark/sql/CollationSQLRegexpSuite.scala | 292 ++++++++++++++++----- .../org/apache/spark/sql/CollationSuite.scala | 4 +- 8 files changed, 296 insertions(+), 122 deletions(-) diff --git a/connect/common/src/test/resources/query-tests/explain-results/function_collate.explain b/connect/common/src/test/resources/query-tests/explain-results/function_collate.explain index c736abf67b11..e4e6aedc34de 100644 --- a/connect/common/src/test/resources/query-tests/explain-results/function_collate.explain +++ b/connect/common/src/test/resources/query-tests/explain-results/function_collate.explain @@ -1,2 +1,2 @@ -Project [collate(g#0, UNICODE) AS collate(g)#0] +Project [collate(g#0, UNICODE) AS collate(g, UNICODE)#0] +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index c528b523c5e7..c7fbb39ea285 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -92,6 +92,10 @@ case class Collate(child: Expression, collationName: String) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = defineCodeGen(ctx, ev, (in) => in) + + 
override def sql: String = s"$prettyName(${child.sql}, $collationName)" + + override def toString: String = s"$prettyName($child, $collationName)" } // scalastyle:off line.contains.tab diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index cf218becdf1d..bf4622accf41 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -81,7 +81,7 @@ | org.apache.spark.sql.catalyst.expressions.Chr | char | SELECT char(65) | struct<char(65):string> | | org.apache.spark.sql.catalyst.expressions.Chr | chr | SELECT chr(65) | struct<chr(65):string> | | org.apache.spark.sql.catalyst.expressions.Coalesce | coalesce | SELECT coalesce(NULL, 1, NULL) | struct<coalesce(NULL, 1, NULL):int> | -| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate | SELECT COLLATION('Spark SQL' collate UTF8_LCASE) | struct<collation(collate(Spark SQL)):string> | +| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate | SELECT COLLATION('Spark SQL' collate UTF8_LCASE) | struct<collation(collate(Spark SQL, UTF8_LCASE)):string> | | org.apache.spark.sql.catalyst.expressions.Collation | collation | SELECT collation('Spark SQL') | struct<collation(Spark SQL):string> | | org.apache.spark.sql.catalyst.expressions.Concat | concat | SELECT concat('Spark', 'SQL') | struct<concat(Spark, SQL):string> | | org.apache.spark.sql.catalyst.expressions.ConcatWs | concat_ws | SELECT concat_ws(' ', 'Spark', 'SQL') | struct<concat_ws( , Spark, SQL):string> | diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out index e6409806bad7..6f9cb3b75971 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out @@ -159,9 +159,9 @@ DropTable false, false select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Except false -:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] +- LocalRelation [col1#x] @@ -169,9 +169,9 @@ Except false select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Except All true -:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] +- LocalRelation [col1#x] @@ -180,9 +180,9 @@ select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ( -- !query analysis Distinct +- Union false, false - :- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] + :- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] : +- LocalRelation [col1#x] - +- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] + +- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] +- LocalRelation [col1#x] @@ -190,9 +190,9 @@ Distinct select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Union false, false -:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, 
utf8_lcase) AS collate(col1, utf8_lcase)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] +- LocalRelation [col1#x] @@ -200,9 +200,9 @@ Union false, false select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query analysis Intersect false -:- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] +:- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] : +- LocalRelation [col1#x] -+- Project [collate(col1#x, utf8_lcase) AS collate(col1)#x] ++- Project [collate(col1#x, utf8_lcase) AS collate(col1, utf8_lcase)#x] +- LocalRelation [col1#x] @@ -254,138 +254,138 @@ DropTable false, false -- !query select array_contains(ARRAY('aaa' collate utf8_lcase),'AAA' collate utf8_lcase) -- !query analysis -Project [array_contains(array(collate(aaa, utf8_lcase)), collate(AAA, utf8_lcase)) AS array_contains(array(collate(aaa)), collate(AAA))#x] +Project [array_contains(array(collate(aaa, utf8_lcase)), collate(AAA, utf8_lcase)) AS array_contains(array(collate(aaa, utf8_lcase)), collate(AAA, utf8_lcase))#x] +- OneRowRelation -- !query select array_position(ARRAY('aaa' collate utf8_lcase, 'bbb' collate utf8_lcase),'BBB' collate utf8_lcase) -- !query analysis -Project [array_position(array(collate(aaa, utf8_lcase), collate(bbb, utf8_lcase)), collate(BBB, utf8_lcase)) AS array_position(array(collate(aaa), collate(bbb)), collate(BBB))#xL] +Project [array_position(array(collate(aaa, utf8_lcase), collate(bbb, utf8_lcase)), collate(BBB, utf8_lcase)) AS array_position(array(collate(aaa, utf8_lcase), collate(bbb, utf8_lcase)), collate(BBB, utf8_lcase))#xL] +- OneRowRelation -- !query select nullif('aaa' COLLATE utf8_lcase, 'AAA' COLLATE utf8_lcase) -- !query analysis -Project [nullif(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase)) AS 
nullif(collate(aaa), collate(AAA))#x] +Project [nullif(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase)) AS nullif(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase))#x] +- OneRowRelation -- !query select least('aaa' COLLATE utf8_lcase, 'AAA' collate utf8_lcase, 'a' collate utf8_lcase) -- !query analysis -Project [least(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase), collate(a, utf8_lcase)) AS least(collate(aaa), collate(AAA), collate(a))#x] +Project [least(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase), collate(a, utf8_lcase)) AS least(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase), collate(a, utf8_lcase))#x] +- OneRowRelation -- !query select arrays_overlap(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [arrays_overlap(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS arrays_overlap(array(collate(aaa)), array(collate(AAA)))#x] +Project [arrays_overlap(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS arrays_overlap(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase)))#x] +- OneRowRelation -- !query select array_distinct(array('aaa' collate utf8_lcase, 'AAA' collate utf8_lcase)) -- !query analysis -Project [array_distinct(array(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase))) AS array_distinct(array(collate(aaa), collate(AAA)))#x] +Project [array_distinct(array(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase))) AS array_distinct(array(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase)))#x] +- OneRowRelation -- !query select array_union(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [array_union(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_union(array(collate(aaa)), array(collate(AAA)))#x] +Project [array_union(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_union(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase)))#x] +- 
OneRowRelation -- !query select array_intersect(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [array_intersect(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_intersect(array(collate(aaa)), array(collate(AAA)))#x] +Project [array_intersect(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_intersect(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase)))#x] +- OneRowRelation -- !query select array_except(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query analysis -Project [array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_except(array(collate(aaa)), array(collate(AAA)))#x] +Project [array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))) AS array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase)))#x] +- OneRowRelation -- !query select 'a' collate unicode < 'A' -- !query analysis -Project [(collate(a, unicode) < cast(A as string collate UNICODE)) AS (collate(a) < A)#x] +Project [(collate(a, unicode) < cast(A as string collate UNICODE)) AS (collate(a, unicode) < A)#x] +- OneRowRelation -- !query select 'a' collate unicode_ci = 'A' -- !query analysis -Project [(collate(a, unicode_ci) = cast(A as string collate UNICODE_CI)) AS (collate(a) = A)#x] +Project [(collate(a, unicode_ci) = cast(A as string collate UNICODE_CI)) AS (collate(a, unicode_ci) = A)#x] +- OneRowRelation -- !query select 'a' collate unicode_ai = 'å' -- !query analysis -Project [(collate(a, unicode_ai) = cast(å as string collate UNICODE_AI)) AS (collate(a) = å)#x] +Project [(collate(a, unicode_ai) = cast(å as string collate UNICODE_AI)) AS (collate(a, unicode_ai) = å)#x] +- OneRowRelation -- !query select 'a' collate unicode_ci_ai = 'Å' -- !query analysis -Project [(collate(a, unicode_ci_ai) = cast(Å as string collate UNICODE_CI_AI)) AS (collate(a) = Å)#x] +Project [(collate(a, 
unicode_ci_ai) = cast(Å as string collate UNICODE_CI_AI)) AS (collate(a, unicode_ci_ai) = Å)#x] +- OneRowRelation -- !query select 'a' collate en < 'A' -- !query analysis -Project [(collate(a, en) < cast(A as string collate en)) AS (collate(a) < A)#x] +Project [(collate(a, en) < cast(A as string collate en)) AS (collate(a, en) < A)#x] +- OneRowRelation -- !query select 'a' collate en_ci = 'A' -- !query analysis -Project [(collate(a, en_ci) = cast(A as string collate en_CI)) AS (collate(a) = A)#x] +Project [(collate(a, en_ci) = cast(A as string collate en_CI)) AS (collate(a, en_ci) = A)#x] +- OneRowRelation -- !query select 'a' collate en_ai = 'å' -- !query analysis -Project [(collate(a, en_ai) = cast(å as string collate en_AI)) AS (collate(a) = å)#x] +Project [(collate(a, en_ai) = cast(å as string collate en_AI)) AS (collate(a, en_ai) = å)#x] +- OneRowRelation -- !query select 'a' collate en_ci_ai = 'Å' -- !query analysis -Project [(collate(a, en_ci_ai) = cast(Å as string collate en_CI_AI)) AS (collate(a) = Å)#x] +Project [(collate(a, en_ci_ai) = cast(Å as string collate en_CI_AI)) AS (collate(a, en_ci_ai) = Å)#x] +- OneRowRelation -- !query select 'Kypper' collate sv < 'Köpfe' -- !query analysis -Project [(collate(Kypper, sv) < cast(Köpfe as string collate sv)) AS (collate(Kypper) < Köpfe)#x] +Project [(collate(Kypper, sv) < cast(Köpfe as string collate sv)) AS (collate(Kypper, sv) < Köpfe)#x] +- OneRowRelation -- !query select 'Kypper' collate de > 'Köpfe' -- !query analysis -Project [(collate(Kypper, de) > cast(Köpfe as string collate de)) AS (collate(Kypper) > Köpfe)#x] +Project [(collate(Kypper, de) > cast(Köpfe as string collate de)) AS (collate(Kypper, de) > Köpfe)#x] +- OneRowRelation -- !query select 'I' collate tr_ci = 'ı' -- !query analysis -Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I) = ı)#x] +Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I, tr_ci) = ı)#x] +- OneRowRelation diff --git 
a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 89e6665df9d0..37fb6c4e114e 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -170,7 +170,7 @@ struct<> -- !query select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct<collate(col1):string collate UTF8_LCASE> +struct<collate(col1, utf8_lcase):string collate UTF8_LCASE> -- !query output zzz @@ -178,7 +178,7 @@ zzz -- !query select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct<collate(col1):string collate UTF8_LCASE> +struct<collate(col1, utf8_lcase):string collate UTF8_LCASE> -- !query output aaa bbb @@ -189,7 +189,7 @@ zzz -- !query select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct<collate(col1):string collate UTF8_LCASE> +struct<collate(col1, utf8_lcase):string collate UTF8_LCASE> -- !query output aaa bbb @@ -199,7 +199,7 @@ zzz -- !query select col1 collate utf8_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct<collate(col1):string collate UTF8_LCASE> +struct<collate(col1, utf8_lcase):string collate UTF8_LCASE> -- !query output AAA BBB @@ -214,7 +214,7 @@ zzz -- !query select col1 collate utf8_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_lcase from values ('aaa'), ('bbb') -- !query schema -struct<collate(col1):string collate UTF8_LCASE> +struct<collate(col1, 
utf8_lcase):string collate UTF8_LCASE> -- !query output aaa bbb @@ -272,7 +272,7 @@ struct<> -- !query select array_contains(ARRAY('aaa' collate utf8_lcase),'AAA' collate utf8_lcase) -- !query schema -struct<array_contains(array(collate(aaa)), collate(AAA)):boolean> +struct<array_contains(array(collate(aaa, utf8_lcase)), collate(AAA, utf8_lcase)):boolean> -- !query output true @@ -280,7 +280,7 @@ true -- !query select array_position(ARRAY('aaa' collate utf8_lcase, 'bbb' collate utf8_lcase),'BBB' collate utf8_lcase) -- !query schema -struct<array_position(array(collate(aaa), collate(bbb)), collate(BBB)):bigint> +struct<array_position(array(collate(aaa, utf8_lcase), collate(bbb, utf8_lcase)), collate(BBB, utf8_lcase)):bigint> -- !query output 2 @@ -288,7 +288,7 @@ struct<array_position(array(collate(aaa), collate(bbb)), collate(BBB)):bigint> -- !query select nullif('aaa' COLLATE utf8_lcase, 'AAA' COLLATE utf8_lcase) -- !query schema -struct<nullif(collate(aaa), collate(AAA)):string collate UTF8_LCASE> +struct<nullif(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase)):string collate UTF8_LCASE> -- !query output NULL @@ -296,7 +296,7 @@ NULL -- !query select least('aaa' COLLATE utf8_lcase, 'AAA' collate utf8_lcase, 'a' collate utf8_lcase) -- !query schema -struct<least(collate(aaa), collate(AAA), collate(a)):string collate UTF8_LCASE> +struct<least(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase), collate(a, utf8_lcase)):string collate UTF8_LCASE> -- !query output a @@ -304,7 +304,7 @@ a -- !query select arrays_overlap(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct<arrays_overlap(array(collate(aaa)), array(collate(AAA))):boolean> +struct<arrays_overlap(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))):boolean> -- !query output true @@ -312,7 +312,7 @@ true -- !query select array_distinct(array('aaa' collate utf8_lcase, 'AAA' collate utf8_lcase)) -- !query schema -struct<array_distinct(array(collate(aaa), 
collate(AAA))):array<string collate UTF8_LCASE>> +struct<array_distinct(array(collate(aaa, utf8_lcase), collate(AAA, utf8_lcase))):array<string collate UTF8_LCASE>> -- !query output ["aaa"] @@ -320,7 +320,7 @@ struct<array_distinct(array(collate(aaa), collate(AAA))):array<string collate UT -- !query select array_union(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct<array_union(array(collate(aaa)), array(collate(AAA))):array<string collate UTF8_LCASE>> +struct<array_union(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))):array<string collate UTF8_LCASE>> -- !query output ["aaa"] @@ -328,7 +328,7 @@ struct<array_union(array(collate(aaa)), array(collate(AAA))):array<string collat -- !query select array_intersect(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct<array_intersect(array(collate(aaa)), array(collate(AAA))):array<string collate UTF8_LCASE>> +struct<array_intersect(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))):array<string collate UTF8_LCASE>> -- !query output ["aaa"] @@ -336,7 +336,7 @@ struct<array_intersect(array(collate(aaa)), array(collate(AAA))):array<string co -- !query select array_except(array('aaa' collate utf8_lcase), array('AAA' collate utf8_lcase)) -- !query schema -struct<array_except(array(collate(aaa)), array(collate(AAA))):array<string collate UTF8_LCASE>> +struct<array_except(array(collate(aaa, utf8_lcase)), array(collate(AAA, utf8_lcase))):array<string collate UTF8_LCASE>> -- !query output [] @@ -344,7 +344,7 @@ struct<array_except(array(collate(aaa)), array(collate(AAA))):array<string colla -- !query select 'a' collate unicode < 'A' -- !query schema -struct<(collate(a) < A):boolean> +struct<(collate(a, unicode) < A):boolean> -- !query output true @@ -352,7 +352,7 @@ true -- !query select 'a' collate unicode_ci = 'A' -- !query schema -struct<(collate(a) = A):boolean> +struct<(collate(a, unicode_ci) = A):boolean> -- !query 
output true @@ -360,7 +360,7 @@ true -- !query select 'a' collate unicode_ai = 'å' -- !query schema -struct<(collate(a) = å):boolean> +struct<(collate(a, unicode_ai) = å):boolean> -- !query output true @@ -368,7 +368,7 @@ true -- !query select 'a' collate unicode_ci_ai = 'Å' -- !query schema -struct<(collate(a) = Å):boolean> +struct<(collate(a, unicode_ci_ai) = Å):boolean> -- !query output true @@ -376,7 +376,7 @@ true -- !query select 'a' collate en < 'A' -- !query schema -struct<(collate(a) < A):boolean> +struct<(collate(a, en) < A):boolean> -- !query output true @@ -384,7 +384,7 @@ true -- !query select 'a' collate en_ci = 'A' -- !query schema -struct<(collate(a) = A):boolean> +struct<(collate(a, en_ci) = A):boolean> -- !query output true @@ -392,7 +392,7 @@ true -- !query select 'a' collate en_ai = 'å' -- !query schema -struct<(collate(a) = å):boolean> +struct<(collate(a, en_ai) = å):boolean> -- !query output true @@ -400,7 +400,7 @@ true -- !query select 'a' collate en_ci_ai = 'Å' -- !query schema -struct<(collate(a) = Å):boolean> +struct<(collate(a, en_ci_ai) = Å):boolean> -- !query output true @@ -408,7 +408,7 @@ true -- !query select 'Kypper' collate sv < 'Köpfe' -- !query schema -struct<(collate(Kypper) < Köpfe):boolean> +struct<(collate(Kypper, sv) < Köpfe):boolean> -- !query output true @@ -416,7 +416,7 @@ true -- !query select 'Kypper' collate de > 'Köpfe' -- !query schema -struct<(collate(Kypper) > Köpfe):boolean> +struct<(collate(Kypper, de) > Köpfe):boolean> -- !query output true @@ -424,6 +424,6 @@ true -- !query select 'I' collate tr_ci = 'ı' -- !query schema -struct<(collate(I) = ı):boolean> +struct<(collate(I, tr_ci) = ı):boolean> -- !query output true diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala index a639367e8ca5..e4f4bb6e8557 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationExpressionWalkerSuite.scala @@ -568,8 +568,8 @@ class CollationExpressionWalkerSuite extends SparkFunSuite with SharedSparkSessi case st if utf8BinaryResultChecked != null && utf8BinaryLcaseResultChecked != null && hasStringType(st) => // scalastyle:off caselocale - assert(utf8BinaryResultChecked.getRows(1, 0).map(_.map(_.toLowerCase)) === - utf8BinaryLcaseResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))) + assert(utf8BinaryResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))(1) === + utf8BinaryLcaseResultChecked.getRows(1, 0).map(_.map(_.toLowerCase))(1)) // scalastyle:on caselocale case _ => assert(utf8BinaryResultChecked.getRows(1, 0)(1) === diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 885ed3709868..8ff7bed60bbc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -50,10 +50,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT like(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"collate(ABC, UNICODE_CI) LIKE %b%\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABC, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"like(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 47) + ) }) } @@ -132,10 +144,22 @@ 
class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"ilike(collate(ABC, UNICODE_CI), %b%)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABC, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"ilike(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 48) + ) }) } @@ -160,10 +184,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ALL ('${t.p.mkString("','")}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"likeall(collate(Foo, UNICODE_CI))\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(Foo, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"LIKE ALL ('${t.p.mkString("','")}')", + start = 36, + stop = 59) + ) }) } @@ -188,10 +224,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ALL ('${t.p.mkString("','")}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = 
intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"notlikeall(collate(Foo, UNICODE_CI))\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(Foo, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"NOT LIKE ALL ('${t.p.mkString("','")}')", + start = 36, + stop = 63) + ) }) } @@ -216,10 +264,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ANY ('${t.p.mkString("','")}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"likeany(collate(Foo, UNICODE_CI))\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(Foo, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"LIKE ANY ('${t.p.mkString("','")}')", + start = 36, + stop = 59) + ) }) } @@ -244,10 +304,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ANY ('${t.p.mkString("','")}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"notlikeany(collate(Foo, UNICODE_CI))\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(Foo, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> 
"\"STRING\""), + context = ExpectedContext( + fragment = s"NOT LIKE ANY ('${t.p.mkString("','")}')", + start = 36, + stop = 63) + ) }) } @@ -272,10 +344,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"RLIKE(collate(ABC, UNICODE_CI), .b.)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABC, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"rlike(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 48) + ) }) } @@ -300,10 +384,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT split(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"split(collate(ABC, UNICODE_CI), [b], -1)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABC, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"split(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 48) + ) }) } @@ -329,10 +425,13 @@ class CollationSQLRegexpSuite }) // Collation mismatch val (c1, c2) = ("UTF8_BINARY", "UTF8_LCASE") - val collationMismatch = intercept[AnalysisException] { - sql(s"SELECT 
regexp_replace(collate('ABCDE','$c1'), '.c.', collate('FFF','$c2'))") - } - assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") + checkError( + exception = intercept[AnalysisException] { + sql(s"SELECT regexp_replace(collate('ABCDE','$c1'), '.c.', collate('FFF','$c2'))") + }, + errorClass = "COLLATION_MISMATCH.EXPLICIT", + parameters = Map("explicitTypes" -> "`string`.`string collate UTF8_LCASE`") + ) // Unsupported collations case class RegExpReplaceTestFail(l: String, r: String, c: String) val failCases = Seq( @@ -341,10 +440,22 @@ class CollationSQLRegexpSuite failCases.foreach(t => { val query = s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', 'FFF')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"regexp_replace(collate(ABCDE, UNICODE_CI), .c., FFF, 1)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', 'FFF')", + start = 7, + stop = 66) + ) }) } @@ -371,10 +482,22 @@ class CollationSQLRegexpSuite failCases.foreach(t => { val query = s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), '${t.r}', 0)" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"regexp_extract(collate(ABCDE, UNICODE_CI), .c., 0)\"", + "paramIndex" -> "first", + "inputSql" -> 
"\"collate(ABCDE, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"regexp_extract(collate('${t.l}', '${t.c}'), '${t.r}', 0)", + start = 7, + stop = 62) + ) }) } @@ -401,10 +524,22 @@ class CollationSQLRegexpSuite failCases.foreach(t => { val query = s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), '${t.r}', 0)" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"regexp_extract_all(collate(ABCDE, UNICODE_CI), .c., 0)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"regexp_extract_all(collate('${t.l}', '${t.c}'), '${t.r}', 0)", + start = 7, + stop = 66) + ) }) } @@ -429,10 +564,22 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"regexp_count(collate(ABCDE, UNICODE_CI), .c.)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 57) + ) }) } @@ -457,10 +604,22 @@ class 
CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"regexp_substr(collate(ABCDE, UNICODE_CI), .c.)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 58) + ) }) } @@ -485,12 +644,23 @@ class CollationSQLRegexpSuite ) failCases.foreach(t => { val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')" - val unsupportedCollation = intercept[AnalysisException] { - sql(query) - } - assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + checkError( + exception = intercept[AnalysisException] { + sql(query) + }, + errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + parameters = Map( + "sqlExpr" -> "\"regexp_instr(collate(ABCDE, UNICODE_CI), .c., 0)\"", + "paramIndex" -> "first", + "inputSql" -> "\"collate(ABCDE, UNICODE_CI)\"", + "inputType" -> "\"STRING COLLATE UNICODE_CI\"", + "requiredType" -> "\"STRING\""), + context = ExpectedContext( + fragment = s"regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')", + start = 7, + stop = 57) + ) }) } - } // scalastyle:on nonascii diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index f662b86eaf81..af3169932bfc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -135,7 +135,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", sqlState = "42K09", parameters = Map( - "sqlExpr" -> "\"collate(1)\"", + "sqlExpr" -> "\"collate(1, UTF8_BINARY)\"", "paramIndex" -> "first", "inputSql" -> "\"1\"", "inputType" -> "\"INT\"", @@ -1026,7 +1026,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { exception = intercept[AnalysisException](sql(query)), errorClass = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", parameters = Map( - "sqlExpr" -> "\"map(collate(aaa), 1, collate(AAA), 2)[AaA]\"", + "sqlExpr" -> "\"map(collate(aaa, utf8_lcase), 1, collate(AAA, utf8_lcase), 2)[AaA]\"", "paramIndex" -> "second", "inputSql" -> "\"AaA\"", "inputType" -> toSQLType(StringType), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org