This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-4.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push: new 14b3b82a8487 [SPARK-51525][SQL] Collation field for Desc As JSON StringType 14b3b82a8487 is described below commit 14b3b82a8487a96419ae86b81db4f9c2619486f6 Author: Amanda Liu <amanda....@databricks.com> AuthorDate: Tue Mar 18 08:39:41 2025 +0800 [SPARK-51525][SQL] Collation field for Desc As JSON StringType ### What changes were proposed in this pull request? Add a `collation` field to the type object that Desc As JSON emits for StringType. For example: ``` "columns":[{"name":"c1","type":{"name":"string","collation":"UNICODE_CI"},"nullable":true}] ``` or, for a string column declared without an explicit collation, the default collation value: ``` "columns":[{"name":"c1","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}] ``` ### Why are the changes needed? Desc As JSON previously reported every string type as just `{"name":"string"}`, so the collation of a string type was not visible in the JSON output. ### Does this PR introduce _any_ user-facing change? Yes, the Desc As JSON output for string types now includes a `collation` field. ### How was this patch tested? Added a test in DescribeTableSuite and updated the expected output in describe.sql.out. ### Was this patch authored or co-authored using generative AI tooling? No Closes #50290 from asl3/asl3/collation-descasjson. 
Authored-by: Amanda Liu <amanda....@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 513a080ba0924a8831a3aa011a9a9852afe012d2) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../command/DescribeRelationJsonCommand.scala | 6 ++ .../resources/sql-tests/results/describe.sql.out | 6 +- .../execution/command/v1/DescribeTableSuite.scala | 91 +++++++++++++++++----- 3 files changed, 82 insertions(+), 21 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala index 1aba10e40c36..d76f46629fbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala @@ -223,6 +223,12 @@ case class DescribeRelationJsonCommand( "end_unit" -> JString(getFieldName(dayTimeIntervalType.endField)) ) + case stringType: StringType => + JObject( + "name" -> JString("string"), + "collation" -> JString(stringType.collationName) + ) + case _ => JObject("name" -> JString(dataType.simpleString)) } diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index d5a3dd50dc7e..ca98bd730d45 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -76,7 +76,7 @@ DESCRIBE EXTENDED t AS JSON -- !query schema struct<json_metadata:string> -- !query output 
-{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not included in comparison]/{warehouse_dir}/t","storage_properties":{"a":"1","b":"2","passwo [...] +{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"d","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not included i [...] -- !query @@ -303,7 +303,7 @@ DESC EXTENDED t PARTITION (c='Us', d=1) AS JSON -- !query schema struct<json_metadata:string> -- !query output -{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not included in comparison]/{warehouse_dir}/t/c=Us/d=1","storage_properties":{"a":"1","b":"2","password":"******** [...] 
+{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"d","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not included in comparison]/{warehou [...] -- !query @@ -929,7 +929,7 @@ DESC TABLE EXTENDED f PARTITION (B='SPARK', C=TIMESTAMP'2018-11-17 13:33:33') AS -- !query schema struct<json_metadata:string> -- !query output -{"table_name":"f","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"A","type":{"name":"string"},"nullable":true},{"name":"B","type":{"name":"binary"},"nullable":true},{"name":"C","type":{"name":"timestamp_ltz"},"nullable":true}],"partition_values":{"B":"SPARK","C":"2018-11-17 13:33:33"},"location":"file:[not included in comparison]/{warehouse_dir}/f/B=SPARK/C=2018-11-17 13%3A33%3A33","partition_parameters":{"numFiles":"1","totalSize":"15", [...] +{"table_name":"f","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"A","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"B","type":{"name":"binary"},"nullable":true},{"name":"C","type":{"name":"timestamp_ltz"},"nullable":true}],"partition_values":{"B":"SPARK","C":"2018-11-17 13:33:33"},"location":"file:[not included in comparison]/{warehouse_dir}/f/B=SPARK/C=2018-11-17 13%3A33%3A33","partition_parameters":{"numFi [...] 
-- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala index b3767071ad9d..fce017f358d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala @@ -240,8 +240,8 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase schema_name = Some("ns"), columns = Some(List( TableColumn("employee_id", Type("int"), true), - TableColumn("employee_name", Type("string"), true), - TableColumn("department", Type("string"), true), + TableColumn("employee_name", Type("string", collation = Some("UTF8_BINARY")), true), + TableColumn("department", Type("string", collation = Some("UTF8_BINARY")), true), TableColumn("hire_date", Type("date"), true) )), last_access = Some("UNKNOWN"), @@ -305,9 +305,9 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase schema_name = Some("ns"), columns = Some(List( TableColumn("id", Type("int"), true), - TableColumn("name", Type("string"), true), - TableColumn("region", Type("string"), true), - TableColumn("category", Type("string"), true) + TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), true), + TableColumn("region", Type("string", collation = Some("UTF8_BINARY")), true), + TableColumn("category", Type("string", collation = Some("UTF8_BINARY")), true) )), last_access = Some("UNKNOWN"), created_by = Some(s"Spark $SPARK_VERSION"), @@ -369,9 +369,9 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase schema_name = Some("ns"), columns = Some(List( TableColumn("id", Type("int"), true), - TableColumn("name", Type("string"), true), - TableColumn("region", Type("string"), true), - TableColumn("category", Type("string"), true) + TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), 
true), + TableColumn("region", Type("string", collation = Some("UTF8_BINARY")), true), + TableColumn("category", Type("string", collation = Some("UTF8_BINARY")), true) )), last_access = Some("UNKNOWN"), created_by = Some(s"Spark $SPARK_VERSION"), @@ -399,6 +399,58 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase } } + test("DESCRIBE AS JSON collation") { + withNamespaceAndTable("ns", "table") { t => + val tableCreationStr = + s""" + |CREATE TABLE $t ( + | c1 STRING COLLATE UNICODE_CI, + | c2 STRING COLLATE UNICODE_RTRIM, + | c3 STRING COLLATE FR, + | c4 STRING, + | id INT + |) + |USING parquet COMMENT 'table_comment' + |""".stripMargin + spark.sql(tableCreationStr) + + val descriptionDf = spark.sql(s"DESC EXTENDED $t AS JSON") + val firstRow = descriptionDf.select("json_metadata").head() + val jsonValue = firstRow.getString(0) + val parsedOutput = parse(jsonValue).extract[DescribeTableJson] + + val expectedOutput = DescribeTableJson( + table_name = Some("table"), + catalog_name = Some("spark_catalog"), + namespace = Some(List("ns")), + schema_name = Some("ns"), + columns = Some(List( + TableColumn("c1", Type("string", collation = Some("UNICODE_CI"))), + TableColumn("c2", Type("string", collation = Some("UNICODE_RTRIM"))), + TableColumn("c3", Type("string", collation = Some("fr"))), + TableColumn("c4", Type("string", collation = Some("UTF8_BINARY"))), + TableColumn("id", Type("int")))), + last_access = Some("UNKNOWN"), + created_by = Some(s"Spark $SPARK_VERSION"), + `type` = Some("MANAGED"), + storage_properties = None, + provider = Some("parquet"), + bucket_columns = Some(Nil), + sort_columns = Some(Nil), + comment = Some("table_comment"), + serde_library = if (getProvider() == "hive") { + Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe") + } else { + None + }, + table_properties = None + ) + assert(parsedOutput.location.isDefined) + assert(iso8601Regex.matches(parsedOutput.created_time.get)) + assert(expectedOutput == 
parsedOutput.copy(location = None, created_time = None)) + } + } + test("DESCRIBE AS JSON default values") { withNamespaceAndTable("ns", "table") { t => val tableCreationStr = @@ -425,7 +477,8 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase schema_name = Some("ns"), columns = Some(List( TableColumn("id", Type("int"), default = Some("1")), - TableColumn("name", Type("string"), default = Some("'unknown'")), + TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), + default = Some("'unknown'")), TableColumn("created_at", Type("timestamp_ltz"), default = Some("CURRENT_TIMESTAMP")), TableColumn("is_active", Type("boolean"), default = Some("true")) )), @@ -478,7 +531,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase schema_name = if (isTemp) Some("session") else Some("default"), columns = Some(List( TableColumn("id", Type("int")), - TableColumn("name", Type("string")), + TableColumn("name", Type("string", collation = Some("UTF8_BINARY"))), TableColumn("created_at", Type("timestamp_ltz")) )), last_access = Some("UNKNOWN"), @@ -578,7 +631,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase fields = Some(List( Field( name = "name", - `type` = Type("string") + `type` = Type("string", collation = Some("UTF8_BINARY")) ), Field( name = "age", @@ -591,13 +644,13 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase fields = Some(List( Field( name = "email", - `type` = Type("string") + `type` = Type("string", collation = Some("UTF8_BINARY")) ), Field( name = "phone_numbers", `type` = Type( name = "array", - element_type = Some(Type("string")), + element_type = Some(Type("string", collation = Some("UTF8_BINARY"))), element_nullable = Some(true) ) ), @@ -610,11 +663,11 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase fields = Some(List( Field( name = "street", - `type` = Type("string") + `type` = Type("string", collation = Some("UTF8_BINARY")) ), Field( name = 
"city", - `type` = Type("string") + `type` = Type("string", collation = Some("UTF8_BINARY")) ), Field( name = "zip", @@ -636,10 +689,10 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase name = "preferences", `type` = Type( name = "map", - key_type = Some(Type("string")), + key_type = Some(Type("string", collation = Some("UTF8_BINARY"))), value_type = Some(Type( name = "array", - element_type = Some(Type("string")), + element_type = Some(Type("string", collation = Some("UTF8_BINARY"))), element_nullable = Some(true) )), value_nullable = Some(true) @@ -648,7 +701,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase ), TableColumn( name = "id", - `type` = Type("string"), + `type` = Type("string", collation = Some("UTF8_BINARY")), default = None ) )), @@ -811,6 +864,8 @@ case class TableColumn( case class Type( name: String, + collation: Option[String] = None, + length: Option[Int] = None, fields: Option[List[Field]] = None, `type`: Option[Type] = None, element_type: Option[Type] = None, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org