MaxGekk commented on a change in pull request #33297: URL: https://github.com/apache/spark/pull/33297#discussion_r667964651
########## File path: sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala ########## @@ -595,6 +595,31 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("[SPARK-36069] from_json invalid json schema - check field name and field value") { + withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { + val schema = new StructType() + .add("a", IntegerType) + .add("b", IntegerType) + .add("_unparsed", StringType) + val badRec = """{"a": "1", "b": 11}""" + val df = Seq(badRec, """{"a": 2, "b": 12}""").toDS() + + checkAnswer( + df.select(from_json($"value", schema, Map("mode" -> "PERMISSIVE"))), + Row(Row(null, 11, badRec)) :: Row(Row(2, 12, null)) :: Nil) + + val exception = intercept[SparkException] { + df.select(from_json($"value", schema, Map("mode" -> "FAILFAST"))).collect() + } + exception.printStackTrace() Review comment: Could you remove this, please. ########## File path: sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala ########## @@ -595,6 +595,31 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("[SPARK-36069] from_json invalid json schema - check field name and field value") { + withSQLConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD.key -> "_unparsed") { + val schema = new StructType() + .add("a", IntegerType) + .add("b", IntegerType) + .add("_unparsed", StringType) + val badRec = """{"a": "1", "b": 11}""" + val df = Seq(badRec, """{"a": 2, "b": 12}""").toDS() + + checkAnswer( + df.select(from_json($"value", schema, Map("mode" -> "PERMISSIVE"))), + Row(Row(null, 11, badRec)) :: Row(Row(2, 12, null)) :: Nil) + + val exception = intercept[SparkException] { + df.select(from_json($"value", schema, Map("mode" -> "FAILFAST"))).collect() + } + exception.printStackTrace() + assert(exception.getMessage.contains( Review comment: Store `exception.getMessage` in a val, and reuse it in both asserts. 
For example: ```scala val errMsg = intercept[SparkException] { df.select(from_json($"value", schema, Map("mode" -> "FAILFAST"))).collect() }. getMessage ``` ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala ########## @@ -958,18 +958,25 @@ object QueryExecutionErrors { new RuntimeException("Parsing JSON arrays as structs is forbidden.") } - def cannotParseStringAsDataTypeError(str: String, dataType: DataType): Throwable = { - new RuntimeException(s"Cannot parse $str as ${dataType.catalogString}.") + def cannotParseStringAsDataTypeError(parser: JsonParser, token: JsonToken, dataType: DataType) + : Throwable = { + new RuntimeException( + s"Cannot parse field name [${parser.getCurrentName}], " + + s"field value [${parser.getText}], " + Review comment: Any reason to output field name and values in such form? Could you check other places in Spark code base, and look for other forms of field names and values in error messages. ########## File path: sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala ########## @@ -595,6 +595,31 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } } + test("[SPARK-36069] from_json invalid json schema - check field name and field value") { Review comment: ```suggestion test("SPARK-36069: from_json invalid json schema - check field name and field value") { ``` ########## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala ########## @@ -958,18 +958,25 @@ object QueryExecutionErrors { new RuntimeException("Parsing JSON arrays as structs is forbidden.") } - def cannotParseStringAsDataTypeError(str: String, dataType: DataType): Throwable = { - new RuntimeException(s"Cannot parse $str as ${dataType.catalogString}.") + def cannotParseStringAsDataTypeError(parser: JsonParser, token: JsonToken, dataType: DataType) + : Throwable = { + new RuntimeException( + s"Cannot parse field name [${parser.getCurrentName}], " + 
+ s"field value [${parser.getText}], " + + s"[${token.toString}] as target spark data type [${dataType}].") } def failToParseEmptyStringForDataTypeError(dataType: DataType): Throwable = { new RuntimeException( s"Failed to parse an empty string for data type ${dataType.catalogString}") } - def failToParseValueForDataTypeError(dataType: DataType, token: JsonToken): Throwable = { + def failToParseValueForDataTypeError(parser: JsonParser, token: JsonToken, dataType: DataType) + : Throwable = { new RuntimeException( - s"Failed to parse a value for data type ${dataType.catalogString} (current token: $token).") + s"Failed to parse field name [${parser.getCurrentName}], " + + s"field value [${parser.getText}], " + + s"[${token.toString}] to target spark data type [${dataType}].") Review comment: ```suggestion s"[$token] to target spark data type [$dataType].") ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org