This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 78531ebb8064 [SPARK-47340][SQL] Change "collate" in StringType typename to lowercase 78531ebb8064 is described below commit 78531ebb8064e05f2cfd44b76af8b5d894e44f33 Author: Nikola Mandic <nikola.man...@databricks.com> AuthorDate: Wed Mar 13 10:29:12 2024 +0500 [SPARK-47340][SQL] Change "collate" in StringType typename to lowercase ### What changes were proposed in this pull request? Change the `COLLATE` keyword produced by `StringType.typeName` (https://github.com/apache/spark/blob/37bdf5179e01f6c8cb8c83445a155646794aabef/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala#L50) to lowercase `collate`, in order to follow the convention used by other data types. Example from DecimalType: https://github.com/apache/spark/blob/2a51242b1bdb4a1d469fc81dc0d21c4cde305753/sql/api/src/main/scala/org/apache/spark/sql/types/DecimalType.scala#L59-L63 ### Why are the changes needed? To follow the typename convention used by other data types. ### Does this PR introduce _any_ user-facing change? Yes, the displayed typename of a collated string changes from `string COLLATE <collation_name>` to `string collate <collation_name>`. ### How was this patch tested? Updated the golden files and reran the tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45475 from nikolamand-db/SPARK-47340. 
Authored-by: Nikola Mandic <nikola.man...@databricks.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../query-tests/queries/select_collated_string.json | 2 +- .../queries/select_collated_string.proto.bin | Bin 65 -> 65 bytes python/pyspark/sql/tests/test_types.py | 1 + python/pyspark/sql/types.py | 4 ++-- .../scala/org/apache/spark/sql/types/DataType.scala | 2 +- .../scala/org/apache/spark/sql/types/StringType.scala | 2 +- .../sql-tests/analyzer-results/collations.sql.out | 16 ++++++++-------- .../test/resources/sql-tests/results/collations.sql.out | 12 ++++++------ 8 files changed, 20 insertions(+), 19 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json index 7bf1e0a7bb10..86595d46654c 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json @@ -8,7 +8,7 @@ "planId": "0" }, "localRelation": { - "schema": "struct\u003cs:string COLLATE UTF8_BINARY_LCASE\u003e" + "schema": "struct\u003cs:string collate UTF8_BINARY_LCASE\u003e" } }, "expressions": [{ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin index bbe6d999c47c..30d816526cce 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin differ diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index f4a857ac3ee0..ac24978a59bc 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -862,6 +862,7 @@ class 
TypesTestsMixin: if k != "varchar" and k != "char": self.assertEqual(t(), _parse_datatype_string(k)) self.assertEqual(IntegerType(), _parse_datatype_string("int")) + self.assertEqual(StringType(), _parse_datatype_string("string collate UTF8_BINARY")) self.assertEqual(StringType(), _parse_datatype_string("string COLLATE UTF8_BINARY")) self.assertEqual(StringType(0), _parse_datatype_string("string")) self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE UTF8_BINARY")) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 0b98ad346576..fda2823709ba 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -261,7 +261,7 @@ class StringType(AtomicType): def collationIdToName(self) -> str: return ( - " COLLATE %s" % StringType.collationNames[self.collationId] + " collate %s" % StringType.collationNames[self.collationId] if self.collationId != 0 else "" ) @@ -1486,7 +1486,7 @@ _all_complex_types: Dict[str, Type[Union[ArrayType, MapType, StructType]]] = dic (v.typeName(), v) for v in _complex_types ) -_COLLATED_STRING = re.compile(r"string\s+COLLATE\s+([\w_]+|`[\w_]`)") +_COLLATED_STRING = re.compile(r"string\s+collate\s+([\w_]+|`[\w_]`)") _LENGTH_CHAR = re.compile(r"char\(\s*(\d+)\s*\)") _LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)") _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)") diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala index 1956edba77f1..b37924a6d353 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -117,7 +117,7 @@ object DataType { private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r - private val COLLATED_STRING_TYPE = 
"""string\s+COLLATE\s+([\w_]+|`[\w_]`)""".r + private val COLLATED_STRING_TYPE = """string\s+collate\s+([\w_]+|`[\w_]`)""".r def fromDDL(ddl: String): DataType = { parseTypeWithFallback( diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala index 35dfbf758ad4..d046195bcfd1 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -47,7 +47,7 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa */ override def typeName: String = if (isDefaultCollation) "string" - else s"string COLLATE ${CollationFactory.fetchCollation(collationId).collationName}" + else s"string collate ${CollationFactory.fetchCollation(collationId).collationName}" override def equals(obj: Any): Boolean = obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId == collationId diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out index 102755f68ad1..fff2d4eab717 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out @@ -9,7 +9,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false insert into t1 values('aaa', 'aaa') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] ++- Project [cast(col1#x 
as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -17,7 +17,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t1 values('AAA', 'AAA') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -25,7 +25,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t1 values('bbb', 'bbb') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -33,7 +33,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t1 values('BBB', 'BBB') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, 
[path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -112,7 +112,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false insert into t2 values('aaa', 'aaa') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] ++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -120,7 +120,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t2 values('bbb', 'bbb') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [utf8_binary, utf8_binary_lcase] -+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string COLLATE UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] ++- Project [cast(col1#x as string) AS 
utf8_binary#x, cast(col2#x as string collate UTF8_BINARY_LCASE) AS utf8_binary_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -159,7 +159,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false INSERT INTO t1 VALUES (named_struct('utf8_binary', 'aaa', 'utf8_binary_lcase', 'aaa')) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [c1] -+- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, cast(col1#x.utf8_binary_lcase as string COLLATE UTF8_BINARY_LCASE)) AS c1#x] ++- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, cast(col1#x.utf8_binary_lcase as string collate UTF8_BINARY_LCASE)) AS c1#x] +- LocalRelation [col1#x] @@ -167,7 +167,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d INSERT INTO t1 VALUES (named_struct('utf8_binary', 'AAA', 'utf8_binary_lcase', 'AAA')) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [c1] -+- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, cast(col1#x.utf8_binary_lcase as string COLLATE UTF8_BINARY_LCASE)) AS c1#x] ++- Project [named_struct(utf8_binary, col1#x.utf8_binary, utf8_binary_lcase, cast(col1#x.utf8_binary_lcase as string collate UTF8_BINARY_LCASE)) AS c1#x] +- LocalRelation [col1#x] diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 
d6e41bd2b0c9..70ea4058655a 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -62,7 +62,7 @@ struct<count(1):bigint> -- !query select * from t1 where utf8_binary = 'aaa' -- !query schema -struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE> +struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE> -- !query output aaa aaa @@ -70,7 +70,7 @@ aaa aaa -- !query select * from t1 where utf8_binary_lcase = 'aaa' collate utf8_binary_lcase -- !query schema -struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE> +struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE> -- !query output AAA AAA aaa aaa @@ -79,7 +79,7 @@ aaa aaa -- !query select * from t1 where utf8_binary < 'bbb' -- !query schema -struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE> +struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE> -- !query output AAA AAA BBB BBB @@ -89,7 +89,7 @@ aaa aaa -- !query select * from t1 where utf8_binary_lcase < 'bbb' collate utf8_binary_lcase -- !query schema -struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE> +struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE> -- !query output AAA AAA aaa aaa @@ -98,7 +98,7 @@ aaa aaa -- !query select l.utf8_binary, r.utf8_binary_lcase from t1 l join t1 r on l.utf8_binary_lcase = r.utf8_binary_lcase -- !query schema -struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE> +struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE> -- !query output AAA AAA AAA aaa @@ -137,7 +137,7 @@ struct<> -- !query select * from t1 anti join t2 on t1.utf8_binary_lcase = t2.utf8_binary_lcase -- !query schema -struct<utf8_binary:string,utf8_binary_lcase:string COLLATE UTF8_BINARY_LCASE> 
+struct<utf8_binary:string,utf8_binary_lcase:string collate UTF8_BINARY_LCASE> -- !query output --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org