(spark) branch master updated: [SPARK-46820][PYTHON] Fix error message regression by restoring `new_msg`

gurwls223 Mon, 19 Feb 2024 18:06:16 -0800

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 9389c6b00f5f [SPARK-46820][PYTHON] Fix error message regression by 
restoring `new_msg`
9389c6b00f5f is described below

commit 9389c6b00f5f186fd4e9c3cc01bf9b3e2153f01d
Author: Haejoon Lee <haejoon....@databricks.com>
AuthorDate: Tue Feb 20 11:05:59 2024 +0900

    [SPARK-46820][PYTHON] Fix error message regression by restoring `new_msg`
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to fix an error message regression by restoring `new_msg`.
    
    ### Why are the changes needed?
    
    In the past few PRs, we mistakenly removed `new_msg`, which introduced an 
error message regression.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No API change, but the user-facing error message is improved.
    
    **Before**
    ```python
    >>> from pyspark.sql.types import StructType, StructField, StringType, 
IntegerType
    >>> schema = StructType([
    ...     StructField("name", StringType(), nullable=True),
    ...     StructField("age", IntegerType(), nullable=False)
    ... ])
    >>> df = spark.createDataFrame([(["asd", None])], schema)
    pyspark.errors.exceptions.base.PySparkValueError: [CANNOT_BE_NONE] Argument 
`obj` cannot be None.
    ```
    
    **After**
    ```python
    >>> from pyspark.sql.types import StructType, StructField, StringType, 
IntegerType
    >>> schema = StructType([
    ...     StructField("name", StringType(), nullable=True),
    ...     StructField("age", IntegerType(), nullable=False)
    ... ])
    >>> df = spark.createDataFrame([(["asd", None])], schema)
    pyspark.errors.exceptions.base.PySparkValueError: [FIELD_NOT_NULLABLE_WITH_NAME] field age: This field is not nullable, but got None.
    ```
    
    ### How was this patch tested?
    
    The existing CI should pass.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #44859 from itholic/SPARK-46820.
    
    Authored-by: Haejoon Lee <haejoon....@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/errors/error_classes.py     | 45 +++++++++++++++--
 python/pyspark/sql/tests/test_dataframe.py |  4 +-
 python/pyspark/sql/tests/test_types.py     | 11 +++--
 python/pyspark/sql/types.py                | 77 +++++++++++++++++++++++-------
 4 files changed, 108 insertions(+), 29 deletions(-)

diff --git a/python/pyspark/errors/error_classes.py 
b/python/pyspark/errors/error_classes.py
index a933a5de5c61..c16041939093 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -286,6 +286,46 @@ ERROR_CLASSES_JSON = '''
       "An error occurred while calling <func_name>: <error_msg>."
     ]
   },
+  "FIELD_DATA_TYPE_UNACCEPTABLE": {
+    "message": [
+      "<data_type> can not accept object <obj> in type <obj_type>."
+    ]
+  },
+  "FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME": {
+    "message": [
+      "<field_name>: <data_type> can not accept object <obj> in type 
<obj_type>."
+    ]
+  },
+  "FIELD_NOT_NULLABLE": {
+    "message": [
+      "Field is not nullable, but got None."
+    ]
+  },
+  "FIELD_NOT_NULLABLE_WITH_NAME": {
+    "message": [
+      "<field_name>: This field is not nullable, but got None."
+    ]
+  },
+  "FIELD_STRUCT_LENGTH_MISMATCH": {
+    "message": [
+      "Length of object (<object_length>) does not match with length of fields 
(<field_length>)."
+    ]
+  },
+  "FIELD_STRUCT_LENGTH_MISMATCH_WITH_NAME": {
+    "message": [
+      "<field_name>: Length of object (<object_length>) does not match with 
length of fields (<field_length>)."
+    ]
+  },
+  "FIELD_TYPE_MISMATCH": {
+    "message": [
+      "<obj> is not an instance of type <data_type>."
+    ]
+  },
+  "FIELD_TYPE_MISMATCH_WITH_NAME": {
+    "message": [
+      "<field_name>: <obj> is not an instance of type <data_type>."
+    ]
+  },
   "HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN": {
     "message": [
       "Function `<func_name>` should return Column, got <return_type>."
@@ -612,11 +652,6 @@ ERROR_CLASSES_JSON = '''
       "<feature> is not implemented."
     ]
   },
-  "NOT_INSTANCE_OF": {
-    "message": [
-      "<value> is not an instance of type <type>."
-    ]
-  },
   "NOT_INT": {
     "message": [
       "Argument `<arg_name>` should be an int, got <arg_type>."
diff --git a/python/pyspark/sql/tests/test_dataframe.py 
b/python/pyspark/sql/tests/test_dataframe.py
index 2134c1fe4615..6b790bc568da 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -1271,13 +1271,13 @@ class DataFrameTestsMixin:
 
         # number of fields must match.
         self.assertRaisesRegex(
-            Exception, "LENGTH_SHOULD_BE_THE_SAME", lambda: rdd.toDF("key: 
int").collect()
+            Exception, "FIELD_STRUCT_LENGTH_MISMATCH", lambda: rdd.toDF("key: 
int").collect()
         )
 
         # field types mismatch will cause exception at runtime.
         self.assertRaisesRegex(
             Exception,
-            "CANNOT_ACCEPT_OBJECT_IN_TYPE",
+            "FIELD_DATA_TYPE_UNACCEPTABLE",
             lambda: rdd.toDF("key: float, value: string").collect(),
         )
 
diff --git a/python/pyspark/sql/tests/test_types.py 
b/python/pyspark/sql/tests/test_types.py
index 4316e4962c9d..b0242033b051 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -1458,9 +1458,9 @@ class DataTypeVerificationTests(unittest.TestCase, 
PySparkErrorTestUtils):
 
         self.check_error(
             exception=pe.exception,
-            error_class="CANNOT_BE_NONE",
+            error_class="FIELD_NOT_NULLABLE_WITH_NAME",
             message_parameters={
-                "arg_name": "obj",
+                "field_name": "test_name",
             },
         )
 
@@ -1470,11 +1470,12 @@ class DataTypeVerificationTests(unittest.TestCase, 
PySparkErrorTestUtils):
 
         self.check_error(
             exception=pe.exception,
-            error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+            error_class="FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME",
             message_parameters={
                 "data_type": "IntegerType()",
-                "obj_name": "data",
-                "obj_type": "str",
+                "field_name": "field b in field a",
+                "obj": "'data'",
+                "obj_type": "<class 'str'>",
             },
         )
 
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index df254ac42379..72c7b62bb2ce 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -2196,9 +2196,16 @@ def _make_type_verifier(
             if nullable:
                 return True
             else:
+                if name is not None:
+                    raise PySparkValueError(
+                        error_class="FIELD_NOT_NULLABLE_WITH_NAME",
+                        message_parameters={
+                            "field_name": str(name),
+                        },
+                    )
                 raise PySparkValueError(
-                    error_class="CANNOT_BE_NONE",
-                    message_parameters={"arg_name": "obj"},
+                    error_class="FIELD_NOT_NULLABLE",
+                    message_parameters={},
                 )
         else:
             return False
@@ -2213,12 +2220,22 @@ def _make_type_verifier(
     def verify_acceptable_types(obj: Any) -> None:
         # subclass of them can not be fromInternal in JVM
         if type(obj) not in _acceptable_types[_type]:
+            if name is not None:
+                raise PySparkTypeError(
+                    error_class="FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME",
+                    message_parameters={
+                        "field_name": str(name),
+                        "data_type": str(dataType),
+                        "obj": repr(obj),
+                        "obj_type": str(type(obj)),
+                    },
+                )
             raise PySparkTypeError(
-                error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+                error_class="FIELD_DATA_TYPE_UNACCEPTABLE",
                 message_parameters={
                     "data_type": str(dataType),
-                    "obj_name": str(obj),
-                    "obj_type": type(obj).__name__,
+                    "obj": repr(obj),
+                    "obj_type": str(type(obj)),
                 },
             )
 
@@ -2232,11 +2249,20 @@ def _make_type_verifier(
 
         def verify_udf(obj: Any) -> None:
             if not (hasattr(obj, "__UDT__") and obj.__UDT__ == dataType):
+                if name is not None:
+                    raise PySparkValueError(
+                        error_class="FIELD_TYPE_MISMATCH_WITH_NAME",
+                        message_parameters={
+                            "field_name": str(name),
+                            "obj": str(obj),
+                            "data_type": str(dataType),
+                        },
+                    )
                 raise PySparkValueError(
-                    error_class="NOT_INSTANCE_OF",
+                    error_class="FIELD_TYPE_MISMATCH",
                     message_parameters={
-                        "value": str(obj),
-                        "type": str(dataType),
+                        "obj": str(obj),
+                        "data_type": str(dataType),
                     },
                 )
             verifier(dataType.toInternal(obj))
@@ -2365,13 +2391,20 @@ def _make_type_verifier(
                     verifier(obj.get(f))
             elif isinstance(obj, (tuple, list)):
                 if len(obj) != len(verifiers):
+                    if name is not None:
+                        raise PySparkValueError(
+                            
error_class="FIELD_STRUCT_LENGTH_MISMATCH_WITH_NAME",
+                            message_parameters={
+                                "field_name": str(name),
+                                "object_length": str(len(obj)),
+                                "field_length": str(len(verifiers)),
+                            },
+                        )
                     raise PySparkValueError(
-                        error_class="LENGTH_SHOULD_BE_THE_SAME",
+                        error_class="FIELD_STRUCT_LENGTH_MISMATCH",
                         message_parameters={
-                            "arg1": "obj",
-                            "arg2": "fields",
-                            "arg1_length": str(len(obj)),
-                            "arg2_length": str(len(verifiers)),
+                            "object_length": str(len(obj)),
+                            "field_length": str(len(verifiers)),
                         },
                     )
                 for v, (_, verifier) in zip(obj, verifiers):
@@ -2381,12 +2414,22 @@ def _make_type_verifier(
                 for f, verifier in verifiers:
                     verifier(d.get(f))
             else:
+                if name is not None:
+                    raise PySparkTypeError(
+                        error_class="FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME",
+                        message_parameters={
+                            "field_name": str(name),
+                            "data_type": str(dataType),
+                            "obj": repr(obj),
+                            "obj_type": str(type(obj)),
+                        },
+                    )
                 raise PySparkTypeError(
-                    error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE",
+                    error_class="FIELD_DATA_TYPE_UNACCEPTABLE",
                     message_parameters={
-                        "data_type": "StructType",
-                        "obj_name": str(obj),
-                        "obj_type": type(obj).__name__,
+                        "data_type": str(dataType),
+                        "obj": repr(obj),
+                        "obj_type": str(type(obj)),
                     },
                 )
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-46820][PYTHON] Fix error message regression by restoring `new_msg`

Reply via email to