This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 581b6a747923 [SPARK-46934][SQL] Read/write roundtrip for struct type with special characters with HMS
581b6a747923 is described below

commit 581b6a747923c11964db3304d7c5a33562db28a4
Author: Kent Yao <y...@apache.org>
AuthorDate: Mon Feb 19 11:54:08 2024 -0800

    [SPARK-46934][SQL] Read/write roundtrip for struct type with special characters with HMS

    ### What changes were proposed in this pull request?

    The Hive metastore API uses unquoted element names for struct types, so the Spark catalyst
    generates the catalog string to conform with it. For example, both `struct<x:int,y:int>` and
    `struct<x:int,y.z:int>` are valid cases in `FieldSchema`, while the original form of the 2nd
    one, which comes from the user API, is ```struct<x:int,`y.z`:int>```. Because we use
    `CatalystSqlParser.parseDataType` to verify the type, we need to convert the unquoted
    element names to quoted ones.

    - For verification
      - For the read path, a.k.a. getting an existing table from HMS: we need to convert
        `struct<x:int,y.z:int>` returned by HMS to ```struct<x:int,`y.z`:int>```.
      - For the write path, a.k.a. creating or altering a table in HMS: we need to convert
        `struct<x:int,y.z:int>` generated by Spark's catalogString to
        ```struct<x:int,`y.z`:int>```.
    - For HMS API calls
      - Keep `struct<x:int,y.z:int>` AS-IS to conform with HMS, as ```'`'``` cannot be
        recognized by Hive.

    ### Why are the changes needed?

    Bugfix.

    ### Does this PR introduce _any_ user-facing change?

    Yes. Users can now read/write/define/delete tables with struct columns that have special
    characters in their element names.

    ### How was this patch tested?

    New tests.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #45039 from yaooqinn/SPARK-46934.

    Authored-by: Kent Yao <y...@apache.org>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
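The core of the change is a single regex rewrite. Below is a minimal, self-contained sketch of it, runnable with plain Scala and no Spark on the classpath; only the `replaceAll` pattern is taken from the patch, while the surrounding object and `main` are illustrative scaffolding:

```scala
object QuoteStructElementNames {
  // Backquote each struct element name so the type string stored by the Hive
  // metastore becomes parseable by CatalystSqlParser.
  def quote(hiveTypeString: String): String =
    hiveTypeString.replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")

  def main(args: Array[String]): Unit = {
    Seq(
      "struct<x:int,y.z:int>",
      "array<struct<x:int,y.z:int>>",
      "map<string,struct<x:int,y.z:int>>"
    ).foreach(t => println(s"$t -> ${quote(t)}"))
    // struct<x:int,y.z:int> -> struct<`x`:int,`y.z`:int>
    // array<struct<x:int,y.z:int>> -> array<struct<`x`:int,`y.z`:int>>
    // map<string,struct<x:int,y.z:int>> -> map<string,struct<`x`:int,`y.z`:int>>
  }
}
```

The pattern relies on an unquoted element name containing none of `,`, `<`, or `:` and always being followed by `:`, so value types such as `map<int, string>` are left untouched.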
---
 .../spark/sql/hive/client/HiveClientImpl.scala     | 15 +++++++-
 .../spark/sql/hive/HiveMetastoreCatalogSuite.scala | 39 +++++++++++++++++++
 .../spark/sql/hive/execution/HiveDDLSuite.scala    | 45 ++++++++--------------
 3 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 8ad775c93c31..ce1d353298b2 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1056,11 +1056,22 @@ private[hive] object HiveClientImpl extends Logging {
 
   /** Get the Spark SQL native DataType from Hive's FieldSchema. */
   private def getSparkSQLDataType(hc: FieldSchema): DataType = {
+    // For struct types, the Hive metastore API uses unquoted element names, and the catalog
+    // string generated by Spark catalyst conforms with that.
+    // For example, both struct<x:int,y:int> and struct<x:int,y.z:int> are valid cases in
+    // `FieldSchema`, while the original form of the 2nd one, which comes from the user API,
+    // is struct<x:int,`y.z`:int>. Because we use `CatalystSqlParser.parseDataType` to verify
+    // the type, we need to convert the unquoted element names to quoted ones.
+    // Examples:
+    //   struct<x:int,y.z:int> -> struct<`x`:int,`y.z`:int>
+    //   array<struct<x:int,y.z:int>> -> array<struct<`x`:int,`y.z`:int>>
+    //   map<string,struct<x:int,y.z:int>> -> map<string,struct<`x`:int,`y.z`:int>>
+    val typeStr = hc.getType.replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
     try {
-      CatalystSqlParser.parseDataType(hc.getType)
+      CatalystSqlParser.parseDataType(typeStr)
     } catch {
       case e: ParseException =>
-        throw QueryExecutionErrors.cannotRecognizeHiveTypeError(e, hc.getType, hc.getName)
+        throw QueryExecutionErrors.cannotRecognizeHiveTypeError(e, typeStr, hc.getName)
     }
   }

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index c580fd0dfa58..f9a24f44b76c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -415,4 +415,43 @@ class DataSourceWithHiveMetastoreCatalogSuite
       })
     }
   }
+
+  test("SPARK-46934: Handle special characters in struct types") {
+    withTable("t") {
+      val schema =
+        "a struct<" +
+          "`a.a`:int," +
+          "`a.b`:struct<" +
+          " `a b b`:array<string>," +
+          " `a b c`:map<int, string>" +
+          " >" +
+          ">"
+      sql("CREATE TABLE t(" + schema + ")")
+      assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema(schema))
+    }
+  }
+
+  test("SPARK-46934: Handle special characters in struct types with CTAS") {
+    withTable("t") {
+      val schema = "`a.b` struct<`a.b.b`:array<string>, `a b c`:map<int, string>>"
+      sql("CREATE TABLE t AS " +
+        "SELECT named_struct('a.b.b', array('a'), 'a b c', map(1, 'a')) AS `a.b`")
+      assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema(schema))
+    }
+  }
+
+  test("SPARK-46934: Handle special characters in struct types with hive DDL") {
+    withTable("t") {
+      val schema =
+        "a struct<" +
+          "`a.a`:int," +
+          "`a.b`:struct<" +
+          " `a.b.b`:array<string>," +
+          " `a b c`:map<int, string>" +
+          " >" +
+          ">"
+      sparkSession.metadataHive.runSqlHive(s"CREATE TABLE t($schema)")
+      assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema(schema))
+    }
+  }
 }
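The new tests above exercise the roundtrip end to end. For a quicker sanity check of the parsing behavior itself, a hypothetical spark-shell snippet (assuming Spark's catalyst module on the classpath) shows why the quoting step is required:

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser

// The quoted form produced by the rewrite parses fine:
CatalystSqlParser.parseDataType("struct<`x`:int,`y.z`:int>")

// The raw, unquoted form stored by HMS throws a ParseException when parsed
// directly, which is what previously surfaced as CANNOT_RECOGNIZE_HIVE_TYPE
// (uncomment to observe the failure):
// CatalystSqlParser.parseDataType("struct<x:int,y.z:int>")
```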
"\"STRUCT<$A:STRING,B:INT>\"", "fieldName" -> "`q`") - ) + test("SPARK-46934: quote element name before parsing struct") { + withTable("t") { + sql("CREATE TABLE t USING hive AS SELECT STRUCT('a' AS `$a`, 1 AS b) q") + assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema( + "q STRUCT<`$a`: STRING, b: INT>")) + } - checkError( - exception = intercept[SparkException] { - spark.sql("CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING hive") - }, - errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE", - parameters = Map("fieldType" -> "\"STRUCT<$A:INT,COL2:STRING>\"", "fieldName" -> "`q`") - ) + withTable("t") { + sql("CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING hive") + assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema( + "q STRUCT<`$a`:INT, col2:STRING>, i1 INT")) + } withView("v") { spark.sql("CREATE VIEW v AS SELECT STRUCT('a' AS `a`, 1 AS b) q") @@ -245,19 +241,12 @@ class HiveDDLSuite } } - test("SPARK-22431: negative alter table tests with nested types") { + test("SPARK-46934: alter table tests with nested types") { withTable("t1") { - spark.sql("CREATE TABLE t1 (q STRUCT<col1:INT, col2:STRING>, i1 INT) USING hive") - checkError( - exception = intercept[SparkException] { - spark.sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") - }, - errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE", - parameters = Map( - "fieldType" -> "\"STRUCT<$COL1:STRING,COL2:INT>\"", - "fieldName" -> "`newcol1`" - ) - ) + sql("CREATE TABLE t1 (q STRUCT<col1:INT, col2:STRING>, i1 INT) USING hive") + sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") + assert(spark.table("t1").schema == CatalystSqlParser.parseTableSchema( + "q STRUCT<col1:INT, col2:STRING>, i1 INT,newcol1 STRUCT<`$col1`:STRING, col2:Int>")) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org