This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 14b3b82a8487 [SPARK-51525][SQL] Collation field for Desc As JSON StringType
14b3b82a8487 is described below

commit 14b3b82a8487a96419ae86b81db4f9c2619486f6
Author: Amanda Liu <amanda....@databricks.com>
AuthorDate: Tue Mar 18 08:39:41 2025 +0800

    [SPARK-51525][SQL] Collation field for Desc As JSON StringType
    
    ### What changes were proposed in this pull request?
    
    Add a `collation` field to the type object that `DESCRIBE TABLE ... AS JSON` emits for `StringType` columns.
    
    For example:
    
    ```
    "columns":[{"name":"c1","type":{"name":"string", "collation":"UNICODE_CI"}
    ```
    
    or the default collation value:
    
    ```
    "columns":[{"name":"c1","type":{"name":"string", "collation":"UTF8_BINARY"}
    ```
    
    ### Why are the changes needed?
    
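    `DESCRIBE TABLE ... AS JSON` previously reported string columns only as `{"name":"string"}`, so a column's collation was not visible in the JSON output.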
    
    ### Does this PR introduce _any_ user-facing change?
    
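    Yes. In the `DESCRIBE TABLE ... AS JSON` output, the type object of every string column now includes a `collation` field (`UTF8_BINARY` when the default collation is used).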
    
    ### How was this patch tested?
    
    Added test in DescribeTableSuite
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #50290 from asl3/asl3/collation-descasjson.
    
    Authored-by: Amanda Liu <amanda....@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit 513a080ba0924a8831a3aa011a9a9852afe012d2)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../command/DescribeRelationJsonCommand.scala      |  6 ++
 .../resources/sql-tests/results/describe.sql.out   |  6 +-
 .../execution/command/v1/DescribeTableSuite.scala  | 91 +++++++++++++++++-----
 3 files changed, 82 insertions(+), 21 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
index 1aba10e40c36..d76f46629fbb 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala
@@ -223,6 +223,12 @@ case class DescribeRelationJsonCommand(
           "end_unit" -> JString(getFieldName(dayTimeIntervalType.endField))
         )
 
+      case stringType: StringType =>
+        JObject(
+          "name" -> JString("string"),
+          "collation" -> JString(stringType.collationName)
+        )
+
       case _ =>
         JObject("name" -> JString(dataType.simpleString))
     }
diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out 
b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
index d5a3dd50dc7e..ca98bd730d45 100644
--- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out
@@ -76,7 +76,7 @@ DESCRIBE EXTENDED t AS JSON
 -- !query schema
 struct<json_metadata:string>
 -- !query output
-{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not
 included in 
comparison]/{warehouse_dir}/t","storage_properties":{"a":"1","b":"2","passwo 
[...]
+{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"d","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not
 included i [...]
 
 
 -- !query
@@ -303,7 +303,7 @@ DESC EXTENDED t PARTITION (c='Us', d=1) AS JSON
 -- !query schema
 struct<json_metadata:string>
 -- !query output
-{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not
 included in 
comparison]/{warehouse_dir}/t/c=Us/d=1","storage_properties":{"a":"1","b":"2","password":"********
 [...]
+{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"d","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not
 included in comparison]/{warehou [...]
 
 
 -- !query
@@ -929,7 +929,7 @@ DESC TABLE EXTENDED f PARTITION (B='SPARK', 
C=TIMESTAMP'2018-11-17 13:33:33') AS
 -- !query schema
 struct<json_metadata:string>
 -- !query output
-{"table_name":"f","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"A","type":{"name":"string"},"nullable":true},{"name":"B","type":{"name":"binary"},"nullable":true},{"name":"C","type":{"name":"timestamp_ltz"},"nullable":true}],"partition_values":{"B":"SPARK","C":"2018-11-17
 13:33:33"},"location":"file:[not included in 
comparison]/{warehouse_dir}/f/B=SPARK/C=2018-11-17 
13%3A33%3A33","partition_parameters":{"numFiles":"1","totalSize":"15", [...]
+{"table_name":"f","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"A","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"B","type":{"name":"binary"},"nullable":true},{"name":"C","type":{"name":"timestamp_ltz"},"nullable":true}],"partition_values":{"B":"SPARK","C":"2018-11-17
 13:33:33"},"location":"file:[not included in 
comparison]/{warehouse_dir}/f/B=SPARK/C=2018-11-17 
13%3A33%3A33","partition_parameters":{"numFi [...]
 
 
 -- !query
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
index b3767071ad9d..fce017f358d3 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala
@@ -240,8 +240,8 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
         schema_name = Some("ns"),
         columns = Some(List(
           TableColumn("employee_id", Type("int"), true),
-          TableColumn("employee_name", Type("string"), true),
-          TableColumn("department", Type("string"), true),
+          TableColumn("employee_name", Type("string", collation = 
Some("UTF8_BINARY")), true),
+          TableColumn("department", Type("string", collation = 
Some("UTF8_BINARY")), true),
           TableColumn("hire_date", Type("date"), true)
         )),
         last_access = Some("UNKNOWN"),
@@ -305,9 +305,9 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
         schema_name = Some("ns"),
         columns = Some(List(
           TableColumn("id", Type("int"), true),
-          TableColumn("name", Type("string"), true),
-          TableColumn("region", Type("string"), true),
-          TableColumn("category", Type("string"), true)
+          TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), 
true),
+          TableColumn("region", Type("string", collation = 
Some("UTF8_BINARY")), true),
+          TableColumn("category", Type("string", collation = 
Some("UTF8_BINARY")), true)
         )),
         last_access = Some("UNKNOWN"),
         created_by = Some(s"Spark $SPARK_VERSION"),
@@ -369,9 +369,9 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
         schema_name = Some("ns"),
         columns = Some(List(
           TableColumn("id", Type("int"), true),
-          TableColumn("name", Type("string"), true),
-          TableColumn("region", Type("string"), true),
-          TableColumn("category", Type("string"), true)
+          TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), 
true),
+          TableColumn("region", Type("string", collation = 
Some("UTF8_BINARY")), true),
+          TableColumn("category", Type("string", collation = 
Some("UTF8_BINARY")), true)
         )),
         last_access = Some("UNKNOWN"),
         created_by = Some(s"Spark $SPARK_VERSION"),
@@ -399,6 +399,58 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
     }
   }
 
+  test("DESCRIBE AS JSON collation") {
+    withNamespaceAndTable("ns", "table") { t =>
+      val tableCreationStr =
+        s"""
+           |CREATE TABLE $t (
+           |  c1 STRING COLLATE UNICODE_CI,
+           |  c2 STRING COLLATE UNICODE_RTRIM,
+           |  c3 STRING COLLATE FR,
+           |  c4 STRING,
+           |  id INT
+           |)
+           |USING parquet COMMENT 'table_comment'
+           |""".stripMargin
+      spark.sql(tableCreationStr)
+
+      val descriptionDf = spark.sql(s"DESC EXTENDED $t AS JSON")
+      val firstRow = descriptionDf.select("json_metadata").head()
+      val jsonValue = firstRow.getString(0)
+      val parsedOutput = parse(jsonValue).extract[DescribeTableJson]
+
+      val expectedOutput = DescribeTableJson(
+        table_name = Some("table"),
+        catalog_name = Some("spark_catalog"),
+        namespace = Some(List("ns")),
+        schema_name = Some("ns"),
+        columns = Some(List(
+          TableColumn("c1", Type("string", collation = Some("UNICODE_CI"))),
+          TableColumn("c2", Type("string", collation = Some("UNICODE_RTRIM"))),
+          TableColumn("c3", Type("string", collation = Some("fr"))),
+          TableColumn("c4", Type("string", collation = Some("UTF8_BINARY"))),
+          TableColumn("id", Type("int")))),
+        last_access = Some("UNKNOWN"),
+        created_by = Some(s"Spark $SPARK_VERSION"),
+        `type` = Some("MANAGED"),
+        storage_properties = None,
+        provider = Some("parquet"),
+        bucket_columns = Some(Nil),
+        sort_columns = Some(Nil),
+        comment = Some("table_comment"),
+        serde_library = if (getProvider() == "hive") {
+          Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
+        } else {
+          None
+        },
+        table_properties = None
+      )
+      assert(parsedOutput.location.isDefined)
+      assert(iso8601Regex.matches(parsedOutput.created_time.get))
+      assert(expectedOutput == parsedOutput.copy(location = None, created_time 
= None))
+    }
+  }
+
   test("DESCRIBE AS JSON default values") {
     withNamespaceAndTable("ns", "table") { t =>
       val tableCreationStr =
@@ -425,7 +477,8 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
         schema_name = Some("ns"),
         columns = Some(List(
           TableColumn("id", Type("int"), default = Some("1")),
-          TableColumn("name", Type("string"), default = Some("'unknown'")),
+          TableColumn("name", Type("string", collation = Some("UTF8_BINARY")),
+            default = Some("'unknown'")),
           TableColumn("created_at", Type("timestamp_ltz"), default = 
Some("CURRENT_TIMESTAMP")),
           TableColumn("is_active", Type("boolean"), default = Some("true"))
         )),
@@ -478,7 +531,7 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
             schema_name = if (isTemp) Some("session") else Some("default"),
             columns = Some(List(
               TableColumn("id", Type("int")),
-              TableColumn("name", Type("string")),
+              TableColumn("name", Type("string", collation = 
Some("UTF8_BINARY"))),
               TableColumn("created_at", Type("timestamp_ltz"))
             )),
             last_access = Some("UNKNOWN"),
@@ -578,7 +631,7 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
               fields = Some(List(
                 Field(
                   name = "name",
-                  `type` = Type("string")
+                  `type` = Type("string", collation = Some("UTF8_BINARY"))
                 ),
                 Field(
                   name = "age",
@@ -591,13 +644,13 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
                     fields = Some(List(
                       Field(
                         name = "email",
-                        `type` = Type("string")
+                        `type` = Type("string", collation = 
Some("UTF8_BINARY"))
                       ),
                       Field(
                         name = "phone_numbers",
                         `type` = Type(
                           name = "array",
-                          element_type = Some(Type("string")),
+                          element_type = Some(Type("string", collation = 
Some("UTF8_BINARY"))),
                           element_nullable = Some(true)
                         )
                       ),
@@ -610,11 +663,11 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
                             fields = Some(List(
                               Field(
                                 name = "street",
-                                `type` = Type("string")
+                                `type` = Type("string", collation = 
Some("UTF8_BINARY"))
                               ),
                               Field(
                                 name = "city",
-                                `type` = Type("string")
+                                `type` = Type("string", collation = 
Some("UTF8_BINARY"))
                               ),
                               Field(
                                 name = "zip",
@@ -636,10 +689,10 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
             name = "preferences",
             `type` = Type(
               name = "map",
-              key_type = Some(Type("string")),
+              key_type = Some(Type("string", collation = Some("UTF8_BINARY"))),
               value_type = Some(Type(
                 name = "array",
-                element_type = Some(Type("string")),
+                element_type = Some(Type("string", collation = 
Some("UTF8_BINARY"))),
                 element_nullable = Some(true)
               )),
               value_nullable = Some(true)
@@ -648,7 +701,7 @@ trait DescribeTableSuiteBase extends 
command.DescribeTableSuiteBase
           ),
           TableColumn(
             name = "id",
-            `type` = Type("string"),
+            `type` = Type("string", collation = Some("UTF8_BINARY")),
             default = None
           )
         )),
@@ -811,6 +864,8 @@ case class TableColumn(
 
 case class Type(
    name: String,
+   collation: Option[String] = None,
+   length: Option[Int] = None,
    fields: Option[List[Field]] = None,
    `type`: Option[Type] = None,
    element_type: Option[Type] = None,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to