This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 9389c6b00f5f [SPARK-46820][PYTHON] Fix error message regression by restoring `new_msg` 9389c6b00f5f is described below commit 9389c6b00f5f186fd4e9c3cc01bf9b3e2153f01d Author: Haejoon Lee <haejoon....@databricks.com> AuthorDate: Tue Feb 20 11:05:59 2024 +0900 [SPARK-46820][PYTHON] Fix error message regression by restoring `new_msg` ### What changes were proposed in this pull request? This PR proposes to fix an error message regression by restoring `new_msg`. ### Why are the changes needed? In the past few PRs, we mistakenly removed `new_msg`, which introduced an error message regression. ### Does this PR introduce _any_ user-facing change? No API change, but the user-facing error message is improved. **Before** ```python >>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType >>> schema = StructType([ ... StructField("name", StringType(), nullable=True), ... StructField("age", IntegerType(), nullable=False) ... ]) >>> df = spark.createDataFrame([(["asd", None])], schema) pyspark.errors.exceptions.base.PySparkValueError: [CANNOT_BE_NONE] Argument `obj` cannot be None. ``` **After** ```python >>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType >>> schema = StructType([ ... StructField("name", StringType(), nullable=True), ... StructField("age", IntegerType(), nullable=False) ... ]) >>> df = spark.createDataFrame([(["asd", None])], schema) pyspark.errors.exceptions.base.PySparkValueError: field age: This field is not nullable, but got None ``` ### How was this patch tested? The existing CI should pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44859 from itholic/SPARK-46820. 
Authored-by: Haejoon Lee <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/errors/error_classes.py | 45 +++++++++++++++-- python/pyspark/sql/tests/test_dataframe.py | 4 +- python/pyspark/sql/tests/test_types.py | 11 +++-- python/pyspark/sql/types.py | 77 +++++++++++++++++++++++------- 4 files changed, 108 insertions(+), 29 deletions(-) diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py index a933a5de5c61..c16041939093 100644 --- a/python/pyspark/errors/error_classes.py +++ b/python/pyspark/errors/error_classes.py @@ -286,6 +286,46 @@ ERROR_CLASSES_JSON = ''' "An error occurred while calling <func_name>: <error_msg>." ] }, + "FIELD_DATA_TYPE_UNACCEPTABLE": { + "message": [ + "<data_type> can not accept object <obj> in type <obj_type>." + ] + }, + "FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME": { + "message": [ + "<field_name>: <data_type> can not accept object <obj> in type <obj_type>." + ] + }, + "FIELD_NOT_NULLABLE": { + "message": [ + "Field is not nullable, but got None." + ] + }, + "FIELD_NOT_NULLABLE_WITH_NAME": { + "message": [ + "<field_name>: This field is not nullable, but got None." + ] + }, + "FIELD_STRUCT_LENGTH_MISMATCH": { + "message": [ + "Length of object (<object_length>) does not match with length of fields (<field_length>)." + ] + }, + "FIELD_STRUCT_LENGTH_MISMATCH_WITH_NAME": { + "message": [ + "<field_name>: Length of object (<object_length>) does not match with length of fields (<field_length>)." + ] + }, + "FIELD_TYPE_MISMATCH": { + "message": [ + "<obj> is not an instance of type <data_type>." + ] + }, + "FIELD_TYPE_MISMATCH_WITH_NAME": { + "message": [ + "<field_name>: <obj> is not an instance of type <data_type>." + ] + }, "HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN": { "message": [ "Function `<func_name>` should return Column, got <return_type>." @@ -612,11 +652,6 @@ ERROR_CLASSES_JSON = ''' "<feature> is not implemented." 
] }, - "NOT_INSTANCE_OF": { - "message": [ - "<value> is not an instance of type <type>." - ] - }, "NOT_INT": { "message": [ "Argument `<arg_name>` should be an int, got <arg_type>." diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 2134c1fe4615..6b790bc568da 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -1271,13 +1271,13 @@ class DataFrameTestsMixin: # number of fields must match. self.assertRaisesRegex( - Exception, "LENGTH_SHOULD_BE_THE_SAME", lambda: rdd.toDF("key: int").collect() + Exception, "FIELD_STRUCT_LENGTH_MISMATCH", lambda: rdd.toDF("key: int").collect() ) # field types mismatch will cause exception at runtime. self.assertRaisesRegex( Exception, - "CANNOT_ACCEPT_OBJECT_IN_TYPE", + "FIELD_DATA_TYPE_UNACCEPTABLE", lambda: rdd.toDF("key: float, value: string").collect(), ) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 4316e4962c9d..b0242033b051 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -1458,9 +1458,9 @@ class DataTypeVerificationTests(unittest.TestCase, PySparkErrorTestUtils): self.check_error( exception=pe.exception, - error_class="CANNOT_BE_NONE", + error_class="FIELD_NOT_NULLABLE_WITH_NAME", message_parameters={ - "arg_name": "obj", + "field_name": "test_name", }, ) @@ -1470,11 +1470,12 @@ class DataTypeVerificationTests(unittest.TestCase, PySparkErrorTestUtils): self.check_error( exception=pe.exception, - error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE", + error_class="FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME", message_parameters={ "data_type": "IntegerType()", - "obj_name": "data", - "obj_type": "str", + "field_name": "field b in field a", + "obj": "'data'", + "obj_type": "<class 'str'>", }, ) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index df254ac42379..72c7b62bb2ce 100644 --- 
a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -2196,9 +2196,16 @@ def _make_type_verifier( if nullable: return True else: + if name is not None: + raise PySparkValueError( + error_class="FIELD_NOT_NULLABLE_WITH_NAME", + message_parameters={ + "field_name": str(name), + }, + ) raise PySparkValueError( - error_class="CANNOT_BE_NONE", - message_parameters={"arg_name": "obj"}, + error_class="FIELD_NOT_NULLABLE", + message_parameters={}, ) else: return False @@ -2213,12 +2220,22 @@ def _make_type_verifier( def verify_acceptable_types(obj: Any) -> None: # subclass of them can not be fromInternal in JVM if type(obj) not in _acceptable_types[_type]: + if name is not None: + raise PySparkTypeError( + error_class="FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME", + message_parameters={ + "field_name": str(name), + "data_type": str(dataType), + "obj": repr(obj), + "obj_type": str(type(obj)), + }, + ) raise PySparkTypeError( - error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE", + error_class="FIELD_DATA_TYPE_UNACCEPTABLE", message_parameters={ "data_type": str(dataType), - "obj_name": str(obj), - "obj_type": type(obj).__name__, + "obj": repr(obj), + "obj_type": str(type(obj)), }, ) @@ -2232,11 +2249,20 @@ def _make_type_verifier( def verify_udf(obj: Any) -> None: if not (hasattr(obj, "__UDT__") and obj.__UDT__ == dataType): + if name is not None: + raise PySparkValueError( + error_class="FIELD_TYPE_MISMATCH_WITH_NAME", + message_parameters={ + "field_name": str(name), + "obj": str(obj), + "data_type": str(dataType), + }, + ) raise PySparkValueError( - error_class="NOT_INSTANCE_OF", + error_class="FIELD_TYPE_MISMATCH", message_parameters={ - "value": str(obj), - "type": str(dataType), + "obj": str(obj), + "data_type": str(dataType), }, ) verifier(dataType.toInternal(obj)) @@ -2365,13 +2391,20 @@ def _make_type_verifier( verifier(obj.get(f)) elif isinstance(obj, (tuple, list)): if len(obj) != len(verifiers): + if name is not None: + raise PySparkValueError( + 
error_class="FIELD_STRUCT_LENGTH_MISMATCH_WITH_NAME", + message_parameters={ + "field_name": str(name), + "object_length": str(len(obj)), + "field_length": str(len(verifiers)), + }, + ) raise PySparkValueError( - error_class="LENGTH_SHOULD_BE_THE_SAME", + error_class="FIELD_STRUCT_LENGTH_MISMATCH", message_parameters={ - "arg1": "obj", - "arg2": "fields", - "arg1_length": str(len(obj)), - "arg2_length": str(len(verifiers)), + "object_length": str(len(obj)), + "field_length": str(len(verifiers)), }, ) for v, (_, verifier) in zip(obj, verifiers): @@ -2381,12 +2414,22 @@ def _make_type_verifier( for f, verifier in verifiers: verifier(d.get(f)) else: + if name is not None: + raise PySparkTypeError( + error_class="FIELD_DATA_TYPE_UNACCEPTABLE_WITH_NAME", + message_parameters={ + "field_name": str(name), + "data_type": str(dataType), + "obj": repr(obj), + "obj_type": str(type(obj)), + }, + ) raise PySparkTypeError( - error_class="CANNOT_ACCEPT_OBJECT_IN_TYPE", + error_class="FIELD_DATA_TYPE_UNACCEPTABLE", message_parameters={ - "data_type": "StructType", - "obj_name": str(obj), - "obj_type": type(obj).__name__, + "data_type": str(dataType), + "obj": repr(obj), + "obj_type": str(type(obj)), }, ) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org