This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d574dbcc859 [SPARK-44413][PYTHON] Clarify error for unsupported arg data type in assertDataFrameEqual d574dbcc859 is described below commit d574dbcc85965df4a48d608230e591cc23adb525 Author: Amanda Liu <amanda....@databricks.com> AuthorDate: Tue Jul 18 08:50:49 2023 +0900 [SPARK-44413][PYTHON] Clarify error for unsupported arg data type in assertDataFrameEqual ### What changes were proposed in this pull request? This PR adds an error class, `INVALID_TYPE_DF_EQUALITY_ARG`, to clarify the error message for unsupported argument data types when calling `assertDataFrameEqual`. ### Why are the changes needed? The fix helps clarify why an error is thrown and what is wrong when a user passes unsupported arg types into the `assertDataFrameEqual` util function. ### Does this PR introduce any user-facing change? Yes, the PR modifies the error message seen by users. ### How was this patch tested? Modified tests in `python/pyspark/sql/tests/test_utils.py` and `python/pyspark/sql/tests/connect/test_utils.py`. Closes #42027 from asl3/datatype-error-clarify. Authored-by: Amanda Liu <amanda....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/errors/error_classes.py | 5 +++++ python/pyspark/sql/tests/test_utils.py | 33 +++++++++++++++++++++++-------- python/pyspark/testing/utils.py | 36 ++++++++++++++++++++++++---------- 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py index 2cecee4da44..e45bc0797c9 100644 --- a/python/pyspark/errors/error_classes.py +++ b/python/pyspark/errors/error_classes.py @@ -263,6 +263,11 @@ ERROR_CLASSES_JSON = """ "StructField does not have typeName. Use typeName on its type explicitly instead." ] }, + "INVALID_TYPE_DF_EQUALITY_ARG" : { + "message" : [ + "Expected type <expected_type> for `<arg_name>` but got type <actual_type>." 
+ ] + }, "INVALID_UDF_EVAL_TYPE" : { "message" : [ "Eval type for UDF must be <eval_type>." diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py index 9c31eb4d6bd..a1cefe7c840 100644 --- a/python/pyspark/sql/tests/test_utils.py +++ b/python/pyspark/sql/tests/test_utils.py @@ -39,6 +39,7 @@ from pyspark.sql.types import ( IntegerType, BooleanType, ) +from pyspark.sql.dataframe import DataFrame import difflib @@ -633,8 +634,12 @@ class UtilsTestsMixin: self.check_error( exception=pe.exception, - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": pd.DataFrame}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": DataFrame, + "arg_name": "df", + "actual_type": pd.DataFrame, + }, ) with self.assertRaises(PySparkAssertionError) as pe: @@ -642,8 +647,12 @@ class UtilsTestsMixin: self.check_error( exception=pe.exception, - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": pd.DataFrame}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": DataFrame, + "arg_name": "df", + "actual_type": pd.DataFrame, + }, ) def test_assert_error_non_pyspark_df(self): @@ -655,8 +664,12 @@ class UtilsTestsMixin: self.check_error( exception=pe.exception, - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(dict1)}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": DataFrame, + "arg_name": "df", + "actual_type": type(dict1), + }, ) with self.assertRaises(PySparkAssertionError) as pe: @@ -664,8 +677,12 @@ class UtilsTestsMixin: self.check_error( exception=pe.exception, - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(dict1)}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": DataFrame, + "arg_name": "df", + "actual_type": type(dict1), + }, ) def test_row_order_ignored(self): diff --git 
a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py index acbfb522f69..b8977b6fffd 100644 --- a/python/pyspark/testing/utils.py +++ b/python/pyspark/testing/utils.py @@ -35,7 +35,7 @@ from itertools import zip_longest from pyspark import SparkContext, SparkConf from pyspark.errors import PySparkAssertionError, PySparkException from pyspark.find_spark_home import _find_spark_home -from pyspark.sql.dataframe import DataFrame as DataFrame +from pyspark.sql.dataframe import DataFrame from pyspark.sql import Row from pyspark.sql.types import StructType, AtomicType, StructField @@ -322,7 +322,7 @@ def assertDataFrameEqual( ): r""" A util function to assert equality between `actual` (DataFrame) and `expected` - (either DataFrame or list of Rows), with optional parameter `checkRowOrder`. + (DataFrame or list of Rows), with optional parameters `checkRowOrder`, `rtol`, and `atol`. .. versionadded:: 3.5.0 @@ -401,8 +401,12 @@ def assertDataFrameEqual( if not isinstance(actual, DataFrame) and not isinstance(actual, ConnectDataFrame): raise PySparkAssertionError( - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(actual)}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": DataFrame, + "arg_name": "df", + "actual_type": type(actual), + }, ) elif ( not isinstance(expected, DataFrame) @@ -410,19 +414,31 @@ def assertDataFrameEqual( and not isinstance(expected, List) ): raise PySparkAssertionError( - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(expected)}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, List[Row]], + "arg_name": "expected", + "actual_type": type(expected), + }, ) except Exception: if not isinstance(actual, DataFrame): raise PySparkAssertionError( - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(actual)}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + 
message_parameters={ + "expected_type": DataFrame, + "arg_name": "df", + "actual_type": type(actual), + }, ) elif not isinstance(expected, DataFrame) and not isinstance(expected, List): raise PySparkAssertionError( - error_class="UNSUPPORTED_DATA_TYPE", - message_parameters={"data_type": type(expected)}, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, List[Row]], + "arg_name": "expected", + "actual_type": type(expected), + }, ) # special cases: empty datasets, datasets with 0 columns --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org