This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new aa688106df0 [SPARK-44453][PYTHON] Use difflib to display errors in assertDataFrameEqual aa688106df0 is described below commit aa688106df02c9d31e3c93be4c5e28a8e8aec92b Author: Amanda Liu <amanda....@databricks.com> AuthorDate: Tue Jul 18 08:21:35 2023 +0900 [SPARK-44453][PYTHON] Use difflib to display errors in assertDataFrameEqual ### What changes were proposed in this pull request? This PR uses the built-in Python library, difflib, to display errors in the testing util `assertDataFrameEqual`. ### Why are the changes needed? The change makes the error message output more user-friendly, as well as consistent with `assertSchemaEqual`. ### Does this PR introduce _any_ user-facing change? Yes, the PR changes the test util output for the user-facing util function `assertDataFrameEqual`. ### How was this patch tested? Existing tests in `python/pyspark/sql/tests/test_utils.py` and `python/pyspark/sql/tests/connect/test_utils.py`. Example output: <img width="891" alt="Screenshot 2023-07-16 at 8 20 31 PM" src="https://github.com/apache/spark/assets/68875504/2d7a9d02-bb9e-4c21-b330-5ec01b2e9ec8"> <img width="868" alt="Screenshot 2023-07-16 at 8 20 41 PM" src="https://github.com/apache/spark/assets/68875504/eba9f3e8-e147-491c-934b-34e8351df012"> Closes #42031 from asl3/difflib-assertdfequal. 
Authored-by: Amanda Liu <amanda....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/tests/test_utils.py | 177 ++++++++++++--------------------- python/pyspark/testing/utils.py | 33 +++--- 2 files changed, 82 insertions(+), 128 deletions(-) diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py index eae3f528504..9c31eb4d6bd 100644 --- a/python/pyspark/sql/tests/test_utils.py +++ b/python/pyspark/sql/tests/test_utils.py @@ -151,19 +151,14 @@ class UtilsTestsMixin: expected_error_message = "Results do not match: " percent_diff = (1 / 2) * 100 expected_error_message += "( %.5f %% )" % percent_diff - diff_msg = ( - "[actual]" - + "\n" - + str(df1.collect()[1]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[1]) - + "\n\n" - + "********************" - + "\n\n" - ) - expected_error_message += "\n" + diff_msg + + generated_diff = difflib.ndiff( + str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines() + ) + diff_msg = "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + + expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg with self.assertRaises(PySparkAssertionError) as pe: assertDataFrameEqual(df1, df2) @@ -294,19 +289,14 @@ class UtilsTestsMixin: expected_error_message = "Results do not match: " percent_diff = (1 / 2) * 100 expected_error_message += "( %.5f %% )" % percent_diff - diff_msg = ( - "[actual]" - + "\n" - + str(df1.collect()[1]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[1]) - + "\n\n" - + "********************" - + "\n\n" - ) - expected_error_message += "\n" + diff_msg + + generated_diff = difflib.ndiff( + str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines() + ) + diff_msg = "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + + expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg with 
self.assertRaises(PySparkAssertionError) as pe: assertDataFrameEqual(df1, df2) @@ -598,19 +588,14 @@ class UtilsTestsMixin: expected_error_message = "Results do not match: " percent_diff = (1 / 2) * 100 expected_error_message += "( %.5f %% )" % percent_diff - diff_msg = ( - "[actual]" - + "\n" - + str(df1.collect()[1]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[1]) - + "\n\n" - + "********************" - + "\n\n" - ) - expected_error_message += "\n" + diff_msg + + generated_diff = difflib.ndiff( + str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines() + ) + diff_msg = "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + + expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg with self.assertRaises(PySparkAssertionError) as pe: assertDataFrameEqual(df1, df2) @@ -722,31 +707,19 @@ class UtilsTestsMixin: expected_error_message = "Results do not match: " percent_diff = (2 / 2) * 100 expected_error_message += "( %.5f %% )" % percent_diff - diff_msg = ( - "[actual]" - + "\n" - + str(df1.collect()[0]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[0]) - + "\n\n" - + "********************" - + "\n\n" - ) - diff_msg += ( - "[actual]" - + "\n" - + str(df1.collect()[1]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[1]) - + "\n\n" - + "********************" - + "\n\n" - ) - expected_error_message += "\n" + diff_msg + + generated_diff = difflib.ndiff( + str(df1.collect()[0]).splitlines(), str(df2.collect()[0]).splitlines() + ) + diff_msg = "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + generated_diff = difflib.ndiff( + str(df1.collect()[1]).splitlines(), str(df2.collect()[1]).splitlines() + ) + diff_msg += "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + + expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg with self.assertRaises(PySparkAssertionError) as pe: 
assertDataFrameEqual(df1, df2, checkRowOrder=True) @@ -829,31 +802,19 @@ class UtilsTestsMixin: expected_error_message = "Results do not match: " percent_diff = (2 / 3) * 100 expected_error_message += "( %.5f %% )" % percent_diff - diff_msg = ( - "[actual]" - + "\n" - + str(df1.collect()[0]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[0]) - + "\n\n" - + "********************" - + "\n\n" - ) - diff_msg += ( - "[actual]" - + "\n" - + str(df1.collect()[2]) - + "\n\n" - + "[expected]" - + "\n" - + str(df2.collect()[2]) - + "\n\n" - + "********************" - + "\n\n" - ) - expected_error_message += "\n" + diff_msg + + generated_diff = difflib.ndiff( + str(df1.collect()[0]).splitlines(), str(df2.collect()[0]).splitlines() + ) + diff_msg = "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + generated_diff = difflib.ndiff( + str(df1.collect()[2]).splitlines(), str(df2.collect()[2]).splitlines() + ) + diff_msg += "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + + expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg with self.assertRaises(PySparkAssertionError) as pe: assertDataFrameEqual(df1, df2) @@ -1197,31 +1158,19 @@ class UtilsTestsMixin: expected_error_message = "Results do not match: " percent_diff = (2 / 2) * 100 expected_error_message += "( %.5f %% )" % percent_diff - diff_msg = ( - "[actual]" - + "\n" - + str(df1.collect()[0]) - + "\n\n" - + "[expected]" - + "\n" - + str(list_of_rows[0]) - + "\n\n" - + "********************" - + "\n\n" - ) - diff_msg += ( - "[actual]" - + "\n" - + str(df1.collect()[1]) - + "\n\n" - + "[expected]" - + "\n" - + str(list_of_rows[1]) - + "\n\n" - + "********************" - + "\n\n" - ) - expected_error_message += "\n" + diff_msg + + generated_diff = difflib.ndiff( + str(df1.collect()[0]).splitlines(), str(list_of_rows[0]).splitlines() + ) + diff_msg = "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + 
"\n" + generated_diff = difflib.ndiff( + str(df1.collect()[1]).splitlines(), str(list_of_rows[1]).splitlines() + ) + diff_msg += "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" + + expected_error_message += "\n" + "--- actual\n+++ expected\n" + diff_msg with self.assertRaises(PySparkAssertionError) as pe: assertDataFrameEqual(df1, list_of_rows) diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py index 14db9264209..acbfb522f69 100644 --- a/python/pyspark/testing/utils.py +++ b/python/pyspark/testing/utils.py @@ -346,6 +346,9 @@ def assertDataFrameEqual( Notes ----- + When assertDataFrameEqual fails, the error message uses the Python `difflib` library to display + a diff log of each row that differs in `actual` and `expected`. + For checkRowOrder, note that PySpark DataFrame ordering is non-deterministic, unless explicitly sorted. @@ -374,15 +377,18 @@ def assertDataFrameEqual( >>> assertDataFrameEqual(df1, df2) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... - PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 66.667 % ) - [actual] - Row(id='1', amount=1000.0) - [expected] - Row(id='1', amount=1001.0) - [actual] - Row(id='3', amount=2000.0) - [expected] - Row(id='3', amount=2003.0) + PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 66.66667 % ) + --- actual + +++ expected + - Row(id='1', amount=1000.0) + ? ^ + + Row(id='1', amount=1001.0) + ? ^ + - Row(id='3', amount=2000.0) + ? ^ + + Row(id='3', amount=2003.0) + ? 
^ + """ if actual is None and expected is None: return True @@ -471,15 +477,14 @@ def assertDataFrameEqual( if not compare_rows(r1, r2): rows_equal = False diff_rows_cnt += 1 - diff_msg += ( - "[actual]" + "\n" + str(r1) + "\n\n" + "[expected]" + "\n" + str(r2) + "\n\n" - ) - diff_msg += "********************" + "\n\n" + generated_diff = difflib.ndiff(str(r1).splitlines(), str(r2).splitlines()) + diff_msg += "\n" + "\n".join(generated_diff) + "\n" + diff_msg += "********************" + "\n" if not rows_equal: percent_diff = (diff_rows_cnt / len(zipped)) * 100 error_msg += "( %.5f %% )" % percent_diff - error_msg += "\n" + diff_msg + error_msg += "\n" + "--- actual\n+++ expected\n" + diff_msg raise PySparkAssertionError( error_class="DIFFERENT_ROWS", message_parameters={"error_msg": error_msg}, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org