(spark) branch master updated: [SPARK-47340][SQL] Change "collate" in StringType typename to lowercase

maxgekk Tue, 12 Mar 2024 22:30:27 -0700

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 78531ebb8064 [SPARK-47340][SQL] Change "collate" in StringType typename to lowercase
78531ebb8064 is described below

commit 78531ebb8064e05f2cfd44b76af8b5d894e44f33
Author: Nikola Mandic <nikola.man...@databricks.com>
AuthorDate: Wed Mar 13 10:29:12 2024 +0500

    [SPARK-47340][SQL] Change "collate" in StringType typename to lowercase
    
    ### What changes were proposed in this pull request?
    
    Change `COLLATE` at
    https://github.com/apache/spark/blob/37bdf5179e01f6c8cb8c83445a155646794aabef/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala#L50
    to lowercase in order to follow convention. Example from DecimalType:
    https://github.com/apache/spark/blob/2a51242b1bdb4a1d469fc81dc0d21c4cde305753/sql/api/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L59-L63
    
    ### Why are the changes needed?
    
    To follow convention from other data types.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, the displayed type name of a collated string changes from
    `string COLLATE <collation_name>` to `string collate <collation_name>`.
    
    ### How was this patch tested?
    
    Update golden files and rerun tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #45475 from nikolamand-db/SPARK-47340.
    
    Authored-by: Nikola Mandic <nikola.man...@databricks.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../query-tests/queries/select_collated_string.json     |   2 +-
 .../queries/select_collated_string.proto.bin            | Bin 65 -> 65 bytes
 python/pyspark/sql/tests/test_types.py                  |   1 +
 python/pyspark/sql/types.py                             |   4 ++--
 .../scala/org/apache/spark/sql/types/DataType.scala     |   2 +-
 .../scala/org/apache/spark/sql/types/StringType.scala   |   2 +-
 .../sql-tests/analyzer-results/collations.sql.out       |  16 ++++++++--------
 .../test/resources/sql-tests/results/collations.sql.out |  12 ++++++------
 8 files changed, 20 insertions(+), 19 deletions(-)

diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
index 7bf1e0a7bb10..86595d46654c 100644
--- 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
@@ -8,7 +8,7 @@
         "planId": "0"
       },
       "localRelation": {
-        "schema": "struct\u003cs:string COLLATE UTF8_BINARY_LCASE\u003e"
+        "schema": "struct\u003cs:string collate UTF8_BINARY_LCASE\u003e"
       }
     },
     "expressions": [{
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
index bbe6d999c47c..30d816526cce 100644
Binary files 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
 and 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
 differ
diff --git a/python/pyspark/sql/tests/test_types.py 
b/python/pyspark/sql/tests/test_types.py
index f4a857ac3ee0..ac24978a59bc 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -862,6 +862,7 @@ class TypesTestsMixin:
             if k != "varchar" and k != "char":
                 self.assertEqual(t(), _parse_datatype_string(k))
         self.assertEqual(IntegerType(), _parse_datatype_string("int"))
+        self.assertEqual(StringType(), _parse_datatype_string("string collate 
UTF8_BINARY"))
         self.assertEqual(StringType(), _parse_datatype_string("string COLLATE 
UTF8_BINARY"))
         self.assertEqual(StringType(0), _parse_datatype_string("string"))
         self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE 
UTF8_BINARY"))
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 0b98ad346576..fda2823709ba 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -261,7 +261,7 @@ class StringType(AtomicType):
 
     def collationIdToName(self) -> str:
         return (
-            " COLLATE %s" % StringType.collationNames[self.collationId]
+            " collate %s" % StringType.collationNames[self.collationId]
             if self.collationId != 0
             else ""
         )
@@ -1486,7 +1486,7 @@ _all_complex_types: Dict[str, Type[Union[ArrayType, 
MapType, StructType]]] = dic
     (v.typeName(), v) for v in _complex_types
 )
 
-_COLLATED_STRING = re.compile(r"string\s+COLLATE\s+([\w_]+|`[\w_]`)")
+_COLLATED_STRING = re.compile(r"string\s+collate\s+([\w_]+|`[\w_]`)")
 _LENGTH_CHAR = re.compile(r"char\(\s*(\d+)\s*\)")
 _LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)")
 _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)")
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
index 1956edba77f1..b37924a6d353 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -117,7 +117,7 @@ object DataType {
   private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r
   private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r
   private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r
-  private val COLLATED_STRING_TYPE = 
"""string\s+COLLATE\s+([\w_]+|`[\w_]`)""".r
+  private val COLLATED_STRING_TYPE = 
"""string\s+collate\s+([\w_]+|`[\w_]`)""".r
 
   def fromDDL(ddl: String): DataType = {
     parseTypeWithFallback(
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
index 35dfbf758ad4..d046195bcfd1 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
@@ -47,7 +47,7 @@ class StringType private(val collationId: Int) extends 
AtomicType with Serializa
    */
   override def typeName: String =
     if (isDefaultCollation) "string"
-    else s"string COLLATE 
${CollationFactory.fetchCollation(collationId).collationName}"
+    else s"string collate 
${CollationFactory.fetchCollation(collationId).collationName}"
 
   override def equals(obj: Any): Boolean =
     obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId 
== collationId
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
index 102755f68ad1..fff2d4eab717 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
@@ -9,7 +9,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, 
false
 insert into t1 values('aaa', 'aaa')
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase]
-+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
    +- LocalRelation [col1#x, col2#x]
 
 
@@ -17,7 +17,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d
 insert into t1 values('AAA', 'AAA')
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase]
-+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
    +- LocalRelation [col1#x, col2#x]
 
 
@@ -25,7 +25,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d
 insert into t1 values('bbb', 'bbb')
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase]
-+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
    +- LocalRelation [col1#x, col2#x]
 
 
@@ -33,7 +33,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d
 insert into t1 values('BBB', 'BBB')
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase]
-+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
    +- LocalRelation [col1#x, col2#x]
 
 
@@ -112,7 +112,7 @@ CreateDataSourceTableCommand 
`spark_catalog`.`default`.`t2`, false
 insert into t2 values('aaa', 'aaa')
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_binary_lcase]
-+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
    +- LocalRelation [col1#x, col2#x]
 
 
@@ -120,7 +120,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d
 insert into t2 values('bbb', 'bbb')
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_binary_lcase]
-+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string 
collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x]
    +- LocalRelation [col1#x, col2#x]
 
 
@@ -159,7 +159,7 @@ CreateDataSourceTableCommand 
`spark_catalog`.`default`.`t1`, false
 INSERT INTO t1 VALUES (named_struct('utf8_binary', 'aaa', 'utf8_binary_lcase', 
'aaa'))
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t1), [c1]
-+- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, 
cast(col1#x.utf8_binary_lcase as string COLLATE UTF8_BINARY_LCASE)) AS c1#x]
++- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, 
cast(col1#x.utf8_binary_lcase as string collate UTF8_BINARY_LCASE)) AS c1#x]
    +- LocalRelation [col1#x]
 
 
@@ -167,7 +167,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d
 INSERT INTO t1 VALUES (named_struct('utf8_binary', 'AAA', 'utf8_binary_lcase', 
'AAA'))
 -- !query analysis
 InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in 
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included 
in comparison]/{warehouse_dir}/t1), [c1]
-+- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, 
cast(col1#x.utf8_binary_lcase as string COLLATE UTF8_BINARY_LCASE)) AS c1#x]
++- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, 
cast(col1#x.utf8_binary_lcase as string collate UTF8_BINARY_LCASE)) AS c1#x]
    +- LocalRelation [col1#x]
 
 
diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out 
b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
index d6e41bd2b0c9..70ea4058655a 100644
--- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -62,7 +62,7 @@ struct<count(1):bigint>
 -- !query
 select * from t1 where utf8_binary = 'aaa'
 -- !query schema
-struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE>
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE>
 -- !query output
 aaa    aaa
 
@@ -70,7 +70,7 @@ aaa   aaa
 -- !query
 select * from t1 where utf8_binary_lcase = 'aaa' collate utf8_binary_lcase
 -- !query schema
-struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE>
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE>
 -- !query output
 AAA    AAA
 aaa    aaa
@@ -79,7 +79,7 @@ aaa   aaa
 -- !query
 select * from t1 where utf8_binary < 'bbb'
 -- !query schema
-struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE>
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE>
 -- !query output
 AAA    AAA
 BBB    BBB
@@ -89,7 +89,7 @@ aaa   aaa
 -- !query
 select * from t1 where utf8_binary_lcase < 'bbb' collate utf8_binary_lcase
 -- !query schema
-struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE>
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE>
 -- !query output
 AAA    AAA
 aaa    aaa
@@ -98,7 +98,7 @@ aaa   aaa
 -- !query
 select l.utf8_binary, r.utf8_binary_lcase from t1 l join t1 r on 
l.utf8_binary_lcase = r.utf8_binary_lcase
 -- !query schema
-struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE>
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE>
 -- !query output
 AAA    AAA
 AAA    aaa
@@ -137,7 +137,7 @@ struct<>
 -- !query
 select * from t1 anti join t2 on t1.utf8_binary_lcase = t2.utf8_binary_lcase
 -- !query schema
-struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE>
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE>
 -- !query output
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-47340][SQL] Change "collate" in StringType typename to lowercase

Reply via email to