This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new bdb4d5e4da5 [SPARK-41180][SQL] Reuse `INVALID_SCHEMA` instead of `_LEGACY_ERROR_TEMP_1227`
bdb4d5e4da5 is described below

commit bdb4d5e4da558775df2be712dd8760d5f5f14747
Author: yangjie01 <yangji...@baidu.com>
AuthorDate: Mon Nov 28 20:26:27 2022 +0300

    [SPARK-41180][SQL] Reuse `INVALID_SCHEMA` instead of `_LEGACY_ERROR_TEMP_1227`

    ### What changes were proposed in this pull request?
    This PR aims to rename the error class `_LEGACY_ERROR_TEMP_1227` to `INVALID_SCHEMA`.

    ### Why are the changes needed?
    Proper names of error classes improve the user experience with Spark SQL.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    Pass GitHub Actions.

    Closes #38754 from LuciferYang/SPARK-41180.

    Lead-authored-by: yangjie01 <yangji...@baidu.com>
    Co-authored-by: YangJie <yangji...@baidu.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 core/src/main/resources/error/error-classes.json   | 23 ++++---
 project/MimaExcludes.scala                          |  5 +-
 .../spark/sql/catalyst/expressions/ExprUtils.scala |  2 +-
 .../spark/sql/errors/QueryCompilationErrors.scala  | 23 ++++---
 .../apache/spark/sql/errors/QueryErrorsBase.scala  |  4 ++
 .../org/apache/spark/sql/types/DataType.scala      | 13 ++--
 .../scala/org/apache/spark/sql/functions.scala     |  1 -
 .../sql-tests/results/csv-functions.sql.out        | 12 ++--
 .../sql-tests/results/json-functions.sql.out       | 12 ++--
 .../org/apache/spark/sql/CsvFunctionsSuite.scala   |  4 +-
 .../apache/spark/sql/DataFrameFunctionsSuite.scala |  4 +-
 .../org/apache/spark/sql/JsonFunctionsSuite.scala  | 73 ++++++++++++++------
 12 files changed, 115 insertions(+), 61 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
index 9f4337d0618..89728777201 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -780,8 +780,21 @@
   },
   "INVALID_SCHEMA" : {
     "message" : [
-      "The expression <expr> is not a valid schema string."
-    ]
+      "The input schema <inputSchema> is not a valid schema string."
+    ],
+    "subClass" : {
+      "NON_STRING_LITERAL" : {
+        "message" : [
+          "The input expression must be string literal and not null."
+        ]
+      },
+      "PARSE_ERROR" : {
+        "message" : [
+          "Cannot parse the schema:",
+          "<reason>"
+        ]
+      }
+    }
   },
   "INVALID_SQL_SYNTAX" : {
     "message" : [
@@ -2844,12 +2857,6 @@
       "The SQL config '<configName>' was removed in the version <version>. <comment>"
     ]
   },
-  "_LEGACY_ERROR_TEMP_1227" : {
-    "message" : [
-      "<msg><e1>",
-      "Failed fallback parsing: <e2>"
-    ]
-  },
   "_LEGACY_ERROR_TEMP_1228" : {
     "message" : [
       "Decimal scale (<scale>) cannot be greater than precision (<precision>)."
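For illustration only (not part of the patch): a minimal Scala snippet that hits both
new sub-classes through the DataFrame API. The local session setup is an assumption,
and the exact rendered message text depends on the Spark build.

    import org.apache.spark.sql.{AnalysisException, SparkSession}
    import org.apache.spark.sql.functions.{from_json, lit}

    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    import spark.implicits._
    val df = Seq("""{"a":1}""").toDF("json")

    // A non-literal schema argument fails fast:
    try df.select(from_json($"json", lit(1))).collect()
    catch { case e: AnalysisException => println(e.getErrorClass) }
    // INVALID_SCHEMA.NON_STRING_LITERAL

    // A schema string that neither the JSON nor the DDL parser accepts:
    try df.select(from_json($"json", "MAP<INT, cow>", Map.empty[String, String])).collect()
    catch { case e: AnalysisException => println(e.getErrorClass) }
    // INVALID_SCHEMA.PARSE_ERROR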
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index d8f87a504fa..eed79d1f204 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -120,7 +120,10 @@ object MimaExcludes {
     ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.storage.ShuffleBlockFetcherIterator#FetchRequest.apply"),
 
     // [SPARK-41072][SS] Add the error class STREAM_FAILED to StreamingQueryException
-    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryException.this")
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryException.this"),
+
+    // [SPARK-41180][SQL] Reuse INVALID_SCHEMA instead of _LEGACY_ERROR_TEMP_1227
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.types.DataType.parseTypeWithFallback")
   )
 
   // Defulat exclude rules
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
index 3e10b820aa6..e9084442b22 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
@@ -35,7 +35,7 @@ object ExprUtils extends QueryErrorsBase {
       case s: UTF8String if s != null =>
         val dataType = DataType.fromDDL(s.toString)
         CharVarcharUtils.failIfHasCharVarchar(dataType)
-      case _ => throw QueryCompilationErrors.invalidSchemaStringError(exp)
+      case _ => throw QueryCompilationErrors.unexpectedSchemaTypeError(exp)
     }
   } else {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 486bd21b844..ce99bf4aa47 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -993,10 +993,20 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase {
       messageParameters = Map("key" -> key, "details" -> details))
   }
 
-  def invalidSchemaStringError(exp: Expression): Throwable = {
+  def schemaFailToParseError(schema: String, e: Throwable): Throwable = {
     new AnalysisException(
-      errorClass = "INVALID_SCHEMA",
-      messageParameters = Map("expr" -> toSQLExpr(exp)))
+      errorClass = "INVALID_SCHEMA.PARSE_ERROR",
+      messageParameters = Map(
+        "inputSchema" -> toSQLSchema(schema),
+        "reason" -> e.getMessage
+      ),
+      cause = Some(e))
+  }
+
+  def unexpectedSchemaTypeError(exp: Expression): Throwable = {
+    new AnalysisException(
+      errorClass = "INVALID_SCHEMA.NON_STRING_LITERAL",
+      messageParameters = Map("inputSchema" -> toSQLExpr(exp)))
   }
 
   def schemaNotFoldableError(exp: Expression): Throwable = {
@@ -2229,13 +2239,6 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase {
         "comment" -> comment))
   }
 
-  def failedFallbackParsingError(msg: String, e1: Throwable, e2: Throwable): Throwable = {
-    new AnalysisException(
-      errorClass = "_LEGACY_ERROR_TEMP_1227",
-      messageParameters = Map("msg" -> msg, "e1" -> e1.getMessage, "e2" -> e2.getMessage),
-      cause = Some(e1.getCause))
-  }
-
   def decimalCannotGreaterThanPrecisionError(scale: Int, precision: Int): Throwable = {
     new AnalysisException(
       errorClass = "_LEGACY_ERROR_TEMP_1228",
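End to end, the filled-in PARSE_ERROR template looks roughly like the snippet below
prints; this is a sketch, a local session is assumed, and the reason text comes from
whichever parser failed first.

    import org.apache.spark.sql.{AnalysisException, SparkSession}
    import org.apache.spark.sql.functions.from_json

    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    import spark.implicits._

    try Seq("""{"a":1}""").toDF("json")
      .select(from_json($"json", "x INT, a cow", Map.empty[String, String])).collect()
    catch {
      case e: AnalysisException =>
        // Expected shape: [INVALID_SCHEMA.PARSE_ERROR] The input schema "x INT, a cow"
        // is not a valid schema string. Cannot parse the schema: <parser message>
        println(e.getMessage)
    }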
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
index 560c7d9beb9..5460de77a14 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryErrorsBase.scala
@@ -104,6 +104,10 @@ private[sql] trait QueryErrorsBase {
     quoteByDefault(toPrettySQL(e))
   }
 
+  def toSQLSchema(schema: String): String = {
+    quoteByDefault(schema)
+  }
+
   def getSummary(sqlContext: SQLQueryContext): String = {
     if (sqlContext == null) "" else sqlContext.summary
   }
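The new helper mirrors the quoting conventions already in this trait. Assuming
quoteByDefault simply wraps its argument in double quotes (consistent with the test
expectations further below), it behaves like this minimal stand-in, shown for
illustration only:

    // Hypothetical re-implementation; the real helper delegates to the private quoteByDefault.
    def toSQLSchema(schema: String): String = "\"" + schema + "\""

    assert(toSQLSchema("x INT, a cow") == "\"x INT, a cow\"")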
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
index ef7f1553be9..ecd1cc77515 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -27,6 +27,7 @@ import org.json4s.JsonAST.JValue
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
+import org.apache.spark.SparkThrowable
 import org.apache.spark.annotation.Stable
 import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.expressions.{Cast, Expression}
@@ -133,7 +134,6 @@ object DataType {
     parseTypeWithFallback(
       ddl,
       CatalystSqlParser.parseDataType,
-      "Cannot parse the data type: ",
       fallbackParser = str => CatalystSqlParser.parseTableSchema(str))
   }
 
@@ -144,24 +144,25 @@
    *
    * @param schema The schema string to parse by `parser` or `fallbackParser`.
    * @param parser The function that should be invoke firstly.
-   * @param errorMsg The error message for `parser`.
    * @param fallbackParser The function that is called when `parser` fails.
    * @return The data type parsed from the `schema` schema.
    */
   def parseTypeWithFallback(
       schema: String,
       parser: String => DataType,
-      errorMsg: String,
       fallbackParser: String => DataType): DataType = {
     try {
       parser(schema)
     } catch {
-      case NonFatal(e1) =>
+      case NonFatal(e) =>
         try {
           fallbackParser(schema)
         } catch {
-          case NonFatal(e2) =>
-            throw QueryCompilationErrors.failedFallbackParsingError(errorMsg, e1, e2)
+          case NonFatal(_) =>
+            if (e.isInstanceOf[SparkThrowable]) {
+              throw e
+            }
+            throw QueryCompilationErrors.schemaFailToParseError(schema, e)
         }
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index f38f24920fa..dc47e358341 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4563,7 +4563,6 @@ object functions {
     val dataType = parseTypeWithFallback(
       schema,
       DataType.fromJson,
-      "Cannot parse the schema in JSON format: ",
       fallbackParser = DataType.fromDDL)
     from_json(e, dataType, options)
   }
diff --git a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out
index 200ddd837e1..bddd8ccc37c 100644
--- a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out
@@ -22,9 +22,9 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "INVALID_SCHEMA",
+  "errorClass" : "INVALID_SCHEMA.NON_STRING_LITERAL",
   "messageParameters" : {
-    "expr" : "\"1\""
+    "inputSchema" : "\"1\""
   },
   "queryContext" : [ {
     "objectType" : "",
@@ -43,11 +43,11 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "_LEGACY_ERROR_TEMP_1227",
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42000",
   "messageParameters" : {
-    "e1" : "\n[PARSE_SYNTAX_ERROR] Syntax error at or near 'InvalidType': extra input 'InvalidType'(line 1, pos 2)\n\n== SQL ==\na InvalidType\n--^^^\n",
-    "e2" : "\nDataType invalidtype is not supported.(line 1, pos 2)\n\n== SQL ==\na InvalidType\n--^^^\n",
-    "msg" : "Cannot parse the data type: "
+    "error" : "'InvalidType'",
+    "hint" : ": extra input 'InvalidType'"
   },
   "queryContext" : [ {
     "objectType" : "",
diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
index a9c4dd0b9fd..c8c8bee3925 100644
--- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
@@ -148,9 +148,9 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "INVALID_SCHEMA",
+  "errorClass" : "INVALID_SCHEMA.NON_STRING_LITERAL",
   "messageParameters" : {
-    "expr" : "\"1\""
+    "inputSchema" : "\"1\""
   },
   "queryContext" : [ {
     "objectType" : "",
@@ -169,11 +169,11 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "_LEGACY_ERROR_TEMP_1227",
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42000",
   "messageParameters" : {
-    "e1" : "\n[PARSE_SYNTAX_ERROR] Syntax error at or near 'InvalidType': extra input 'InvalidType'(line 1, pos 2)\n\n== SQL ==\na InvalidType\n--^^^\n",
-    "e2" : "\nDataType invalidtype is not supported.(line 1, pos 2)\n\n== SQL ==\na InvalidType\n--^^^\n",
-    "msg" : "Cannot parse the data type: "
+    "error" : "'InvalidType'",
+    "hint" : ": extra input 'InvalidType'"
   },
   "queryContext" : [ {
     "objectType" : "",
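The net effect of the `DataType.parseTypeWithFallback` change: when both parsers fail,
the primary parser's error is rethrown as-is if it is already a proper Spark error
(hence PARSE_SYNTAX_ERROR in the .sql.out results above), and is otherwise wrapped as
INVALID_SCHEMA.PARSE_ERROR; the fallback parser's error is no longer reported. A
standalone sketch of that control flow, with String standing in for DataType and a
plain RuntimeException standing in for the wrapped Spark error:

    import scala.util.control.NonFatal
    import org.apache.spark.SparkThrowable

    def parseWithFallbackSketch(
        schema: String,
        parser: String => String,
        fallbackParser: String => String): String = {
      try {
        parser(schema)
      } catch {
        case NonFatal(e) =>
          try {
            fallbackParser(schema)
          } catch {
            case NonFatal(_) =>
              // Only the primary parser's error survives; the fallback's is dropped.
              if (e.isInstanceOf[SparkThrowable]) throw e
              throw new RuntimeException(s"""Cannot parse the schema "$schema"""", e)
          }
      }
    }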
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
index 2a3058d9395..940eaaed6ac 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
@@ -367,8 +367,8 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
       exception = intercept[AnalysisException] {
         Seq("1").toDF("csv").select(from_csv($"csv", lit(1), options)).collect()
       },
-      errorClass = "INVALID_SCHEMA",
-      parameters = Map("expr" -> "\"1\"")
+      errorClass = "INVALID_SCHEMA.NON_STRING_LITERAL",
+      parameters = Map("inputSchema" -> "\"1\"")
     )
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index e9455ea51c1..2b747735f18 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -5226,9 +5226,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
       exception = intercept[AnalysisException] {
         sql("select from_json('{\"a\":1}', 1)")
       },
-      errorClass = "INVALID_SCHEMA",
+      errorClass = "INVALID_SCHEMA.NON_STRING_LITERAL",
       parameters = Map(
-        "expr" -> "\"1\""
+        "inputSchema" -> "\"1\""
      ),
       context = ExpectedContext(
         fragment = "from_json('{\"a\":1}', 1)",
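The suites now assert on the error class and message parameters via checkError rather
than substring-matching getMessage. For illustration only (not part of the patch; a
local session is assumed, and the Scala 2.13 scala.jdk converters are used), the CSV
path can be exercised the same way outside the test harness:

    import scala.jdk.CollectionConverters._
    import org.apache.spark.sql.{AnalysisException, SparkSession}
    import org.apache.spark.sql.functions.{from_csv, lit}

    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    import spark.implicits._

    // from_csv's Column-schema overload takes a java.util.Map of options.
    try Seq("1").toDF("csv")
      .select(from_csv($"csv", lit(1), Map.empty[String, String].asJava)).collect()
    catch {
      case e: AnalysisException =>
        assert(e.getErrorClass == "INVALID_SCHEMA.NON_STRING_LITERAL")
    }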
field""")) + val invalidJsonSchemaReason = "Failed to convert the JSON string '{\"a\":123}' to a field." + checkError( + exception = intercept[AnalysisException] { + df.select(from_json($"json", invalidJsonSchema, Map.empty[String, String])).collect() + }, + errorClass = "INVALID_SCHEMA.PARSE_ERROR", + parameters = Map( + "inputSchema" -> "\"{\"fields\": [{\"a\":123}], \"type\": \"struct\"}\"", + "reason" -> invalidJsonSchemaReason + ) + ) val invalidDataType = "MAP<INT, cow>" - val errMsg2 = intercept[AnalysisException] { - df.select(from_json($"json", invalidDataType, Map.empty[String, String])).collect() - }.getMessage - assert(errMsg2.contains("DataType cow is not supported")) + val invalidDataTypeReason = "Unrecognized token 'MAP': " + + "was expecting (JSON String, Number, Array, Object or token 'null', 'true' or 'false')\n " + + "at [Source: (String)\"MAP<INT, cow>\"; line: 1, column: 4]" + checkError( + exception = intercept[AnalysisException] { + df.select(from_json($"json", invalidDataType, Map.empty[String, String])).collect() + }, + errorClass = "INVALID_SCHEMA.PARSE_ERROR", + parameters = Map( + "inputSchema" -> "\"MAP<INT, cow>\"", + "reason" -> invalidDataTypeReason + ) + ) val invalidTableSchema = "x INT, a cow" - val errMsg3 = intercept[AnalysisException] { - df.select(from_json($"json", invalidTableSchema, Map.empty[String, String])).collect() - }.getMessage - assert(errMsg3.contains("DataType cow is not supported")) + val invalidTableSchemaReason = "Unrecognized token 'x': " + + "was expecting (JSON String, Number, Array, Object or token 'null', 'true' or 'false')\n" + + " at [Source: (String)\"x INT, a cow\"; line: 1, column: 2]" + checkError( + exception = intercept[AnalysisException] { + df.select(from_json($"json", invalidTableSchema, Map.empty[String, String])).collect() + }, + errorClass = "INVALID_SCHEMA.PARSE_ERROR", + parameters = Map( + "inputSchema" -> "\"x INT, a cow\"", + "reason" -> invalidTableSchemaReason + ) + ) } test("SPARK-33907: bad json input with json pruning optimization: GetStructField") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org