(spark) branch master updated: [SPARK-47423][SQL] Collations - Set operation support for strings with collations

maxgekk Fri, 15 Mar 2024 21:21:29 -0700

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 653ac5b729e2 [SPARK-47423][SQL] Collations - Set operation support for 
strings with collations
653ac5b729e2 is described below

commit 653ac5b729e2eba9bf097905b3fd136603b7a298
Author: Aleksandar Tomic <aleksandar.to...@databricks.com>
AuthorDate: Sat Mar 16 09:21:08 2024 +0500

    [SPARK-47423][SQL] Collations - Set operation support for strings with 
collations
    
    ### What changes were proposed in this pull request?
    
    This PR fixes support for set operations on strings with collations 
different from `UTF8_BINARY`. The fix is not strictly related to set operations 
and may resolve other problems in the collation space. The fix is to add a 
default value for `StringType` with collation. Previously, the pattern match 
would not catch the `StringType` with collation case; the fix is simply to 
pattern match on `st: StringType` instead of relying on a `StringType` match.
    
    ### Why are the changes needed?
    
    Fixing behaviour of set operations.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes - set operations on strings with collations different from `UTF8_BINARY`, which previously didn't work, are fixed.
    
    ### How was this patch tested?
    
    Golden file tests are added.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #45536 from dbatomic/collations_and_set_ops.
    
    Authored-by: Aleksandar Tomic <aleksandar.to...@databricks.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../spark/sql/catalyst/expressions/literals.scala  |  2 +-
 .../sql-tests/analyzer-results/collations.sql.out  | 51 +++++++++++++++++++++
 .../test/resources/sql-tests/inputs/collations.sql |  7 +++
 .../resources/sql-tests/results/collations.sql.out | 53 ++++++++++++++++++++++
 4 files changed, 112 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 9603647db06f..eadd4c04f4b3 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -195,7 +195,7 @@ object Literal {
     case TimestampNTZType => create(0L, TimestampNTZType)
     case it: DayTimeIntervalType => create(0L, it)
     case it: YearMonthIntervalType => create(0, it)
-    case StringType => Literal("")
+    case st: StringType => Literal(UTF8String.fromString(""), st)
     case BinaryType => Literal("".getBytes(StandardCharsets.UTF_8))
     case CalendarIntervalType => Literal(new CalendarInterval(0, 0, 0))
     case arr: ArrayType => create(Array(), arr)
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
index fff2d4eab717..6d9bb3470be6 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
@@ -149,6 +149,57 @@ DropTable false, false
 +- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1
 
 
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query analysis
+Except false
+:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+:  +- LocalRelation [col1#x]
++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+   +- LocalRelation [col1#x]
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query analysis
+Except All true
+:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+:  +- LocalRelation [col1#x]
++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+   +- LocalRelation [col1#x]
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query analysis
+Distinct
++- Union false, false
+   :- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+   :  +- LocalRelation [col1#x]
+   +- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+      +- LocalRelation [col1#x]
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query analysis
+Union false, false
+:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+:  +- LocalRelation [col1#x]
++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+   +- LocalRelation [col1#x]
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), 
('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values 
('aaa'), ('bbb')
+-- !query analysis
+Intersect false
+:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+:  +- LocalRelation [col1#x]
++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x]
+   +- LocalRelation [col1#x]
+
+
 -- !query
 create table t1 (c1 struct<utf8_binary: string collate utf8_binary, 
utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql 
b/sql/core/src/test/resources/sql-tests/inputs/collations.sql
index af87f7a321c2..52ce58b80823 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/collations.sql
@@ -39,6 +39,13 @@ select * from t1 anti join t2 on t1.utf8_binary_lcase = 
t2.utf8_binary_lcase;
 drop table t2;
 drop table t1;
 
+-- set operations
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb');
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb');
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb');
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb');
+select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), 
('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values 
('aaa'), ('bbb');
+
 -- create table with struct field
 create table t1 (c1 struct<utf8_binary: string collate utf8_binary, 
utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET;
 
diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out 
b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
index 70ea4058655a..7d7c054c2b08 100644
--- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -158,6 +158,59 @@ struct<>
 
 
 
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query schema
+struct<collate(col1):string collate UTF8_BINARY_LCASE>
+-- !query output
+zzz
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query schema
+struct<collate(col1):string collate UTF8_BINARY_LCASE>
+-- !query output
+aaa
+bbb
+zzz
+zzz
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query schema
+struct<collate(col1):string collate UTF8_BINARY_LCASE>
+-- !query output
+aaa
+bbb
+zzz
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), 
('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from 
values ('aaa'), ('bbb')
+-- !query schema
+struct<collate(col1):string collate UTF8_BINARY_LCASE>
+-- !query output
+AAA
+BBB
+ZZZ
+aaa
+aaa
+bbb
+bbb
+zzz
+
+
+-- !query
+select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), 
('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values 
('aaa'), ('bbb')
+-- !query schema
+struct<collate(col1):string collate UTF8_BINARY_LCASE>
+-- !query output
+aaa
+bbb
+
+
 -- !query
 create table t1 (c1 struct<utf8_binary: string collate utf8_binary, 
utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET
 -- !query schema


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-47423][SQL] Collations - Set operation support for strings with collations

Reply via email to