This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 9accb5c539f [SPARK-44363][PYTHON] Display percent of unequal rows in 
DataFrame comparison
9accb5c539f is described below

commit 9accb5c539f6783c3e9e0147f2199ea370af26c4
Author: Amanda Liu <amanda....@databricks.com>
AuthorDate: Tue Jul 11 12:49:10 2023 +0900

    [SPARK-44363][PYTHON] Display percent of unequal rows in DataFrame 
comparison
    
    ### What changes were proposed in this pull request?
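    This PR fixes how the percentage of unequal rows is displayed in the error 
message raised for unequal DataFrames by the `assertDataFrameEqual` util 
function. The value was previously shown as a fraction (e.g. `0.66667 %`) 
instead of a percentage.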
    
    ### Why are the changes needed?
    The correction is needed to provide accurate error message output.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, the PR modifies the user-facing error message of the 
`assertDataFrameEqual` util function.
    
    ### How was this patch tested?
    Modified existing tests in `python/pyspark/sql/tests/test_utils.py` 
and `python/pyspark/sql/tests/connect/test_utils.py`
    
    Closes #41926 from asl3/fix-percent-diff.
    
    Authored-by: Amanda Liu <amanda....@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/tests/test_utils.py | 8 ++++----
 python/pyspark/testing/utils.py        | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/sql/tests/test_utils.py 
b/python/pyspark/sql/tests/test_utils.py
index 6666fa64858..1757d8dd2e1 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -142,7 +142,7 @@ class UtilsTestsMixin:
         )
 
         expected_error_message = "Results do not match: "
-        percent_diff = 1 / 2
+        percent_diff = (1 / 2) * 100
         expected_error_message += "( %.5f %% )" % percent_diff
         diff_msg = (
             "[df]"
@@ -457,7 +457,7 @@ class UtilsTestsMixin:
         )
 
         expected_error_message = "Results do not match: "
-        percent_diff = 1 / 2
+        percent_diff = (1 / 2) * 100
         expected_error_message += "( %.5f %% )" % percent_diff
         diff_msg = (
             "[df]"
@@ -553,7 +553,7 @@ class UtilsTestsMixin:
         )
 
         expected_error_message = "Results do not match: "
-        percent_diff = 2 / 2
+        percent_diff = (2 / 2) * 100
         expected_error_message += "( %.5f %% )" % percent_diff
         diff_msg = (
             "[df]"
@@ -641,7 +641,7 @@ class UtilsTestsMixin:
         )
 
         expected_error_message = "Results do not match: "
-        percent_diff = 2 / 3
+        percent_diff = (2 / 3) * 100
         expected_error_message += "( %.5f %% )" % percent_diff
         diff_msg = (
             "[df]"
diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
index c6ec6adc8af..651d57bb11d 100644
--- a/python/pyspark/testing/utils.py
+++ b/python/pyspark/testing/utils.py
@@ -239,7 +239,7 @@ def assertDataFrameEqual(df: DataFrame, expected: 
DataFrame, check_row_order: bo
         The expected result of the operation, for comparison with the actual 
result.
 
     check_row_order : bool, optional
-        A flag indicates whether the order of rows should be considered in the 
comparison.
+        A flag indicating whether the order of rows should be considered in 
the comparison.
         If set to `False` (default), the row order is not taken into account.
         If set to `True`, the order of rows is important and will be checked 
during comparison.
 
@@ -258,7 +258,7 @@ def assertDataFrameEqual(df: DataFrame, expected: 
DataFrame, check_row_order: bo
     >>> assertDataFrameEqual(df1, df2) # fail  # doctest: 
+IGNORE_EXCEPTION_DETAIL
     Traceback (most recent call last):
     ...
-    PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 0.66667 % )
+    PySparkAssertionError: [DIFFERENT_ROWS] Results do not match: ( 66.667 % )
     [df]
     Row(id='1', amount=1000.0)
     <BLANKLINE>
@@ -370,7 +370,7 @@ def assertDataFrameEqual(df: DataFrame, expected: 
DataFrame, check_row_order: bo
                 diff_msg += "********************" + "\n\n"
 
         if not rows_equal:
-            percent_diff = diff_rows_cnt / len(zipped)
+            percent_diff = (diff_rows_cnt / len(zipped)) * 100
             error_msg += "( %.5f %% )" % percent_diff
             error_msg += "\n" + diff_msg
             raise PySparkAssertionError(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to