This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 581b6a747923 [SPARK-46934][SQL] Read/write roundtrip for struct type with special characters with HMS
581b6a747923 is described below

commit 581b6a747923c11964db3304d7c5a33562db28a4
Author: Kent Yao <y...@apache.org>
AuthorDate: Mon Feb 19 11:54:08 2024 -0800

    [SPARK-46934][SQL] Read/write roundtrip for struct type with special characters with HMS

    ### What changes were proposed in this pull request?

    The Hive metastore API uses unquoted element names for struct types, so the Spark catalyst
    generates the catalog string to conform with it. For example, both `struct<x:int,y:int>` and
    `struct<x:int,y.z:int>` are valid cases in `FieldSchema`, while the original form of the 2nd
    one, which comes from the user API, is ```struct<x:int,`y.z`:int>```. Because we use
    `CatalystSqlParser.parseDataType` to verify the type, we need to convert the unquoted
    element names to quoted ones.

    - For verification
      - For the read path, a.k.a. getting an existing table from HMS: we need to convert
        `struct<x:int,y.z:int>` returned by HMS to ```struct<x:int,`y.z`:int>```.
      - For the write path, a.k.a. creating or altering a table in HMS: we need to convert
        `struct<x:int,y.z:int>` generated by Spark's catalogString to
        ```struct<x:int,`y.z`:int>```.
    - For HMS API calls
      - Keep `struct<x:int,y.z:int>` AS-IS to conform with HMS, as ```'`'``` cannot be
        recognized by Hive.

    ### Why are the changes needed?

    Bugfix.

    ### Does this PR introduce _any_ user-facing change?

    Yes. Users can now read/write/define/delete tables with struct columns that have special
    characters in their element names.

    ### How was this patch tested?

    New tests.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #45039 from yaooqinn/SPARK-46934.

    Authored-by: Kent Yao <y...@apache.org>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
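The core of the change is a single regex rewrite. Below is a minimal, self-contained sketch of it, runnable with plain Scala and no Spark on the classpath; only the `replaceAll` pattern is taken from the patch, while the surrounding object and `main` are illustrative scaffolding:

```scala
object QuoteStructElementNames {
  // Backquote each struct element name so the type string stored by the Hive
  // metastore becomes parseable by CatalystSqlParser.
  def quote(hiveTypeString: String): String =
    hiveTypeString.replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")

  def main(args: Array[String]): Unit = {
    Seq(
      "struct<x:int,y.z:int>",
      "array<struct<x:int,y.z:int>>",
      "map<string,struct<x:int,y.z:int>>"
    ).foreach(t => println(s"$t -> ${quote(t)}"))
    // struct<x:int,y.z:int> -> struct<`x`:int,`y.z`:int>
    // array<struct<x:int,y.z:int>> -> array<struct<`x`:int,`y.z`:int>>
    // map<string,struct<x:int,y.z:int>> -> map<string,struct<`x`:int,`y.z`:int>>
  }
}
```

The pattern relies on an unquoted element name containing none of `,`, `<`, or `:` and always being followed by `:`, so value types such as `map<int, string>` are left untouched.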
---
 .../spark/sql/hive/client/HiveClientImpl.scala     | 15 +++++++-
 .../spark/sql/hive/HiveMetastoreCatalogSuite.scala | 39 +++++++++++++++++++
 .../spark/sql/hive/execution/HiveDDLSuite.scala    | 45 ++++++++--------------
 3 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 8ad775c93c31..ce1d353298b2 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1056,11 +1056,22 @@ private[hive] object HiveClientImpl extends Logging {
 
   /** Get the Spark SQL native DataType from Hive's FieldSchema. */
   private def getSparkSQLDataType(hc: FieldSchema): DataType = {
+    // For struct types, the Hive metastore API uses unquoted element names, and the catalog
+    // string generated by Spark catalyst conforms with that.
+    // For example, both struct<x:int,y:int> and struct<x:int,y.z:int> are valid cases in
+    // `FieldSchema`, while the original form of the 2nd one, which comes from the user API,
+    // is struct<x:int,`y.z`:int>. Because we use `CatalystSqlParser.parseDataType` to verify
+    // the type, we need to convert the unquoted element names to quoted ones.
+    // Examples:
+    //   struct<x:int,y.z:int> -> struct<`x`:int,`y.z`:int>
+    //   array<struct<x:int,y.z:int>> -> array<struct<`x`:int,`y.z`:int>>
+    //   map<string,struct<x:int,y.z:int>> -> map<string,struct<`x`:int,`y.z`:int>>
+    val typeStr = hc.getType.replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
     try {
-      CatalystSqlParser.parseDataType(hc.getType)
+      CatalystSqlParser.parseDataType(typeStr)
     } catch {
       case e: ParseException =>
-        throw QueryExecutionErrors.cannotRecognizeHiveTypeError(e, hc.getType, hc.getName)
+        throw QueryExecutionErrors.cannotRecognizeHiveTypeError(e, typeStr, hc.getName)
     }
   }

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index c580fd0dfa58..f9a24f44b76c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -415,4 +415,43 @@ class DataSourceWithHiveMetastoreCatalogSuite
       })
     }
   }
+
+  test("SPARK-46934: Handle special characters in struct types") {
+    withTable("t") {
+      val schema =
+        "a struct<" +
+          "`a.a`:int," +
+          "`a.b`:struct<" +
+          " `a b b`:array<string>," +
+          " `a b c`:map<int, string>" +
+          " >" +
+          ">"
+      sql("CREATE TABLE t(" + schema + ")")
+      assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema(schema))
+    }
+  }
+
+  test("SPARK-46934: Handle special characters in struct types with CTAS") {
+    withTable("t") {
+      val schema = "`a.b` struct<`a.b.b`:array<string>, `a b c`:map<int, string>>"
+      sql("CREATE TABLE t AS " +
+        "SELECT named_struct('a.b.b', array('a'), 'a b c', map(1, 'a')) AS `a.b`")
+      assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema(schema))
+    }
+  }
+
+  test("SPARK-46934: Handle special characters in struct types with hive DDL") {
+    withTable("t") {
+      val schema =
+        "a struct<" +
+          "`a.a`:int," +
+          "`a.b`:struct<" +
+          " `a.b.b`:array<string>," +
+          " `a b c`:map<int, string>" +
+          " >" +
+          ">"
+      sparkSession.metadataHive.runSqlHive(s"CREATE TABLE t($schema)")
+      assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema(schema))
+    }
+  }
 }
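The new tests above exercise the roundtrip end to end. For a quicker sanity check of the parsing behavior itself, a hypothetical spark-shell snippet (assuming Spark's catalyst module on the classpath) shows why the quoting step is required:

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser

// The quoted form produced by the rewrite parses fine:
CatalystSqlParser.parseDataType("struct<`x`:int,`y.z`:int>")

// The raw, unquoted form stored by HMS throws a ParseException when parsed
// directly, which is what previously surfaced as CANNOT_RECOGNIZE_HIVE_TYPE
// (uncomment to observe the failure):
// CatalystSqlParser.parseDataType("struct<x:int,y.z:int>")
```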
"\"STRUCT<$A:STRING,B:INT>\"", "fieldName" -> "`q`") - ) + test("SPARK-46934: quote element name before parsing struct") { + withTable("t") { + sql("CREATE TABLE t USING hive AS SELECT STRUCT('a' AS `$a`, 1 AS b) q") + assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema( + "q STRUCT<`$a`: STRING, b: INT>")) + } - checkError( - exception = intercept[SparkException] { - spark.sql("CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING hive") - }, - errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE", - parameters = Map("fieldType" -> "\"STRUCT<$A:INT,COL2:STRING>\"", "fieldName" -> "`q`") - ) + withTable("t") { + sql("CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING hive") + assert(spark.table("t").schema === CatalystSqlParser.parseTableSchema( + "q STRUCT<`$a`:INT, col2:STRING>, i1 INT")) + } withView("v") { spark.sql("CREATE VIEW v AS SELECT STRUCT('a' AS `a`, 1 AS b) q") @@ -245,19 +241,12 @@ class HiveDDLSuite } } - test("SPARK-22431: negative alter table tests with nested types") { + test("SPARK-46934: alter table tests with nested types") { withTable("t1") { - spark.sql("CREATE TABLE t1 (q STRUCT<col1:INT, col2:STRING>, i1 INT) USING hive") - checkError( - exception = intercept[SparkException] { - spark.sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") - }, - errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE", - parameters = Map( - "fieldType" -> "\"STRUCT<$COL1:STRING,COL2:INT>\"", - "fieldName" -> "`newcol1`" - ) - ) + sql("CREATE TABLE t1 (q STRUCT<col1:INT, col2:STRING>, i1 INT) USING hive") + sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") + assert(spark.table("t1").schema == CatalystSqlParser.parseTableSchema( + "q STRUCT<col1:INT, col2:STRING>, i1 INT,newcol1 STRUCT<`$col1`:STRING, col2:Int>")) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org