MaxGekk commented on a change in pull request #33297: URL: https://github.com/apache/spark/pull/33297#discussion_r667964651
########## File path: sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala ########## @@ -595,6 +595,31 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("[SPARK-36069] from_json invalid json schema - check field name and field value") { + withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { + val schema = new StructType() + .add("a", IntegerType) + .add("b", IntegerType) + .add("_unparsed", StringType) + val badRec = """{"a": "1", "b": 11}""" + val df = Seq(badRec, """{"a": 2, "b": 12}""").toDS() + + checkAnswer( + df.select(from_json($"value", schema, Map("mode" -> "PERMISSIVE"))), + Row(Row(null, 11, badRec)) :: Row(Row(2, 12, null)) :: Nil) + + val exception = intercept[SparkException] { + df.select(from_json($"value", schema, Map("mode" -> "FAILFAST"))).collect() + } + exception.printStackTrace() Review comment: Could you remove this, please. ########## File path: sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala ########## @@ -595,6 +595,31 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("[SPARK-36069] from_json invalid json schema - check field name and field value") { + withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { + val schema = new StructType() + .add("a", IntegerType) + .add("b", IntegerType) + .add("_unparsed", StringType) + val badRec = """{"a": "1", "b": 11}""" + val df = Seq(badRec, """{"a": 2, "b": 12}""").toDS() + + checkAnswer( + df.select(from_json($"value", schema, Map("mode" -> "PERMISSIVE"))), + Row(Row(null, 11, badRec)) :: Row(Row(2, 12, null)) :: Nil) + + val exception = intercept[SparkException] { + df.select(from_json($"value", schema, Map("mode" -> "FAILFAST"))).collect() + } + exception.printStackTrace() + assert(exception.getMessage.contains( Review comment: Store `exception.getMessage` in a val, and reuse it in both asserts. 
For example: ```scala val errMsg = intercept[SparkException] { df.select(from_json($"value", schema, Map("mode" -> "FAILFAST"))).collect() }. getMessage ``` ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala ########## @@ -958,18 +958,25 @@ object QueryExecutionErrors { new RuntimeException("Parsing JSON arrays as structs is forbidden.") } - def cannotParseStringAsDataTypeError(str: String, dataType: DataType): Throwable = { - new RuntimeException(s"Cannot parse $str as ${dataType.catalogString}.") + def cannotParseStringAsDataTypeError(parser: JsonParser, token: JsonToken, dataType: DataType) + : Throwable = { + new RuntimeException( + s"Cannot parse field name [${parser.getCurrentName}], " + + s"field value [${parser.getText}], " + Review comment: Any reason to output field name and values in such form? Could you check other places in Spark code base, and look for other forms of field names and values in error messages. ########## File path: sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala ########## @@ -595,6 +595,31 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("[SPARK-36069] from_json invalid json schema - check field name and field value") { Review comment: ```suggestion test("SPARK-36069: from_json invalid json schema - check field name and field value") { ``` ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala ########## @@ -958,18 +958,25 @@ object QueryExecutionErrors { new RuntimeException("Parsing JSON arrays as structs is forbidden.") } - def cannotParseStringAsDataTypeError(str: String, dataType: DataType): Throwable = { - new RuntimeException(s"Cannot parse $str as ${dataType.catalogString}.") + def cannotParseStringAsDataTypeError(parser: JsonParser, token: JsonToken, dataType: DataType) + : Throwable = { + new RuntimeException( + s"Cannot parse field name [${parser.getCurrentName}], " + 
+ s"field value [${parser.getText}], " + + s"[${token.toString}] as target spark data type [${dataType}].") } def failToParseEmptyStringForDataTypeError(dataType: DataType): Throwable = { new RuntimeException( s"Failed to parse an empty string for data type ${dataType.catalogString}") } - def failToParseValueForDataTypeError(dataType: DataType, token: JsonToken): Throwable = { + def failToParseValueForDataTypeError(parser: JsonParser, token: JsonToken, dataType: DataType) + : Throwable = { new RuntimeException( - s"Failed to parse a value for data type ${dataType.catalogString} (current token: $token).") + s"Failed to parse field name [${parser.getCurrentName}], " + + s"field value [${parser.getText}], " + + s"[${token.toString}] to target spark data type [${dataType}].") Review comment: ```suggestion s"[$token] to target spark data type [$dataType].") ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org