This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 16b031eb144 [SPARK-44652] Raise error when only one df is None 16b031eb144 is described below commit 16b031eb144f6ba1c1103be5dcf00d6209adaa85 Author: Amanda Liu <amanda....@databricks.com> AuthorDate: Fri Aug 4 08:42:39 2023 +0900 [SPARK-44652] Raise error when only one df is None ### What changes were proposed in this pull request? Adds a "raise PySparkAssertionError" for the case when one of `actual` or `expected` is None, instead of just returning False. ### Why are the changes needed? The PR ensures that an error is thrown in the assertion for the edge case when one of `actual` or `expected` is None. ### Does this PR introduce _any_ user-facing change? Yes, the PR affects the user-facing API `assertDataFrameEqual`. ### How was this patch tested? Added tests to `python/pyspark/sql/tests/test_utils.py`. Closes #42314 from asl3/raise-none-error. Authored-by: Amanda Liu <amanda....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/tests/test_utils.py | 82 ++++++++++++++++++++++++++++++---- python/pyspark/testing/utils.py | 32 ++++++++++--- 2 files changed, 99 insertions(+), 15 deletions(-) diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py index 76d397e3ade..93895465de7 100644 --- a/python/pyspark/sql/tests/test_utils.py +++ b/python/pyspark/sql/tests/test_utils.py @@ -41,6 +41,7 @@ from pyspark.sql.types import ( BooleanType, ) from pyspark.sql.dataframe import DataFrame +import pyspark.pandas as ps import difflib from typing import List, Union @@ -672,9 +673,79 @@ class UtilsTestsMixin: assertDataFrameEqual(df1, df2, checkRowOrder=False) assertDataFrameEqual(df1, df2, checkRowOrder=True) - def test_assert_equal_exact_pandas_df(self): - import pyspark.pandas as ps + def test_assert_unequal_null_actual(self): + df1 = None + df2 = self.spark.createDataFrame( + data=[ + ("1", 1000), + ("2", 3000), + ], 
schema=["id", "amount"], + ) + + with self.assertRaises(PySparkAssertionError) as pe: + assertDataFrameEqual(df1, df2) + + self.check_error( + exception=pe.exception, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, ps.DataFrame, List[Row]], + "arg_name": "actual", + "actual_type": None, + }, + ) + + with self.assertRaises(PySparkAssertionError) as pe: + assertDataFrameEqual(df1, df2, checkRowOrder=True) + + self.check_error( + exception=pe.exception, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, ps.DataFrame, List[Row]], + "arg_name": "actual", + "actual_type": None, + }, + ) + + def test_assert_unequal_null_expected(self): + df1 = self.spark.createDataFrame( + data=[ + ("1", 1000), + ("2", 3000), + ], + schema=["id", "amount"], + ) + df2 = None + + with self.assertRaises(PySparkAssertionError) as pe: + assertDataFrameEqual(df1, df2) + self.check_error( + exception=pe.exception, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, ps.DataFrame, List[Row]], + "arg_name": "expected", + "actual_type": None, + }, + ) + + with self.assertRaises(PySparkAssertionError) as pe: + assertDataFrameEqual(df1, df2, checkRowOrder=True) + + self.check_error( + exception=pe.exception, + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, ps.DataFrame, List[Row]], + "arg_name": "expected", + "actual_type": None, + }, + ) + + def test_assert_equal_exact_pandas_df(self): df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"]) df2 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"]) @@ -682,16 +753,12 @@ class UtilsTestsMixin: assertDataFrameEqual(df1, df2, checkRowOrder=True) def test_assert_equal_exact_pandas_df(self): - import pyspark.pandas as ps - df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"]) df2 = ps.DataFrame(data=[30, 20, 10], 
columns=["Numbers"]) assertDataFrameEqual(df1, df2) def test_assert_equal_approx_pandas_df(self): - import pyspark.pandas as ps - df1 = ps.DataFrame(data=[10.0001, 20.32, 30.1], columns=["Numbers"]) df2 = ps.DataFrame(data=[10.0, 20.32, 30.1], columns=["Numbers"]) @@ -699,7 +766,6 @@ class UtilsTestsMixin: assertDataFrameEqual(df1, df2, checkRowOrder=True) def test_assert_error_pandas_pyspark_df(self): - import pyspark.pandas as ps import pandas as pd df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"]) @@ -742,8 +808,6 @@ class UtilsTestsMixin: ) def test_assert_error_non_pyspark_df(self): - import pyspark.pandas as ps - dict1 = {"a": 1, "b": 2} dict2 = {"a": 1, "b": 2} diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py index 5461577fad1..8e02803efe5 100644 --- a/python/pyspark/testing/utils.py +++ b/python/pyspark/testing/utils.py @@ -292,6 +292,7 @@ def assertSchemaEqual(actual: StructType, expected: StructType): >>> s1 = StructType([StructField("names", ArrayType(DoubleType(), True), True)]) >>> s2 = StructType([StructField("names", ArrayType(DoubleType(), True), True)]) >>> assertSchemaEqual(s1, s2) # pass, schemas are identical + >>> df1 = spark.createDataFrame(data=[(1, 1000), (2, 3000)], schema=["id", "number"]) >>> df2 = spark.createDataFrame(data=[("1", 1000), ("2", 5000)], schema=["id", "amount"]) >>> assertSchemaEqual(df1.schema, df2.schema) # doctest: +IGNORE_EXCEPTION_DETAIL @@ -414,16 +415,20 @@ def assertDataFrameEqual( >>> df1 = spark.createDataFrame(data=[("1", 1000), ("2", 3000)], schema=["id", "amount"]) >>> df2 = spark.createDataFrame(data=[("1", 1000), ("2", 3000)], schema=["id", "amount"]) >>> assertDataFrameEqual(df1, df2) # pass, DataFrames are identical + >>> df1 = spark.createDataFrame(data=[("1", 0.1), ("2", 3.23)], schema=["id", "amount"]) >>> df2 = spark.createDataFrame(data=[("1", 0.109), ("2", 3.23)], schema=["id", "amount"]) >>> assertDataFrameEqual(df1, df2, rtol=1e-1) # pass, DataFrames are approx 
equal by rtol + >>> df1 = spark.createDataFrame(data=[(1, 1000), (2, 3000)], schema=["id", "amount"]) >>> list_of_rows = [Row(1, 1000), Row(2, 3000)] >>> assertDataFrameEqual(df1, list_of_rows) # pass, actual and expected data are equal + >>> import pyspark.pandas as ps >>> df1 = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) >>> df2 = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) >>> assertDataFrameEqual(df1, df2) # pass, pandas-on-Spark DataFrames are equal + >>> df1 = spark.createDataFrame( ... data=[("1", 1000.00), ("2", 3000.00), ("3", 2000.00)], schema=["id", "amount"]) >>> df2 = spark.createDataFrame( @@ -436,20 +441,35 @@ def assertDataFrameEqual( ! Row(id='1', amount=1000.0) Row(id='2', amount=3000.0) ! Row(id='3', amount=2000.0) - *** expected *** ! Row(id='1', amount=1001.0) Row(id='2', amount=3000.0) ! Row(id='3', amount=2003.0) """ - if actual is None and expected is None: - return True - elif actual is None or expected is None: - return False - import pyspark.pandas as ps from pyspark.testing.pandasutils import assertPandasOnSparkEqual + if actual is None and expected is None: + return True + elif actual is None: + raise PySparkAssertionError( + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, ps.DataFrame, List[Row]], + "arg_name": "actual", + "actual_type": None, + }, + ) + elif expected is None: + raise PySparkAssertionError( + error_class="INVALID_TYPE_DF_EQUALITY_ARG", + message_parameters={ + "expected_type": Union[DataFrame, ps.DataFrame, List[Row]], + "arg_name": "expected", + "actual_type": None, + }, + ) + try: # If Spark Connect dependencies are available, allow Spark Connect DataFrame from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org