[spark] branch branch-3.5 updated: [SPARK-44652] Raise error when only one df is None

gurwls223 Thu, 03 Aug 2023 16:43:24 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new 1d0d8f5fa6c [SPARK-44652] Raise error when only one df is None
1d0d8f5fa6c is described below

commit 1d0d8f5fa6cd1ebf35301443d5119de752ccd18f
Author: Amanda Liu <amanda....@databricks.com>
AuthorDate: Fri Aug 4 08:42:39 2023 +0900

    [SPARK-44652] Raise error when only one df is None
    
    ### What changes were proposed in this pull request?
    Adds a "raise PySparkAssertionError" for the case when one of `actual` or 
`expected` is None, instead of just returning False.
    
    ### Why are the changes needed?
    The PR ensures that an error is thrown in the assertion for the edge case 
when one of `actual` or `expected` is None
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, the PR affects the user-facing API `assertDataFrameEqual`
    
    ### How was this patch tested?
    Added tests to `python/pyspark/sql/tests/test_utils.py`
    
    Closes #42314 from asl3/raise-none-error.
    
    Authored-by: Amanda Liu <amanda....@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
    (cherry picked from commit 16b031eb144f6ba1c1103be5dcf00d6209adaa85)
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/tests/test_utils.py | 82 ++++++++++++++++++++++++++++++----
 python/pyspark/testing/utils.py        | 32 ++++++++++---
 2 files changed, 99 insertions(+), 15 deletions(-)

diff --git a/python/pyspark/sql/tests/test_utils.py 
b/python/pyspark/sql/tests/test_utils.py
index 76d397e3ade..93895465de7 100644
--- a/python/pyspark/sql/tests/test_utils.py
+++ b/python/pyspark/sql/tests/test_utils.py
@@ -41,6 +41,7 @@ from pyspark.sql.types import (
     BooleanType,
 )
 from pyspark.sql.dataframe import DataFrame
+import pyspark.pandas as ps
 
 import difflib
 from typing import List, Union
@@ -672,9 +673,79 @@ class UtilsTestsMixin:
         assertDataFrameEqual(df1, df2, checkRowOrder=False)
         assertDataFrameEqual(df1, df2, checkRowOrder=True)
 
-    def test_assert_equal_exact_pandas_df(self):
-        import pyspark.pandas as ps
+    def test_assert_unequal_null_actual(self):
+        df1 = None
+        df2 = self.spark.createDataFrame(
+            data=[
+                ("1", 1000),
+                ("2", 3000),
+            ],
+            schema=["id", "amount"],
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
+            message_parameters={
+                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "arg_name": "actual",
+                "actual_type": None,
+            },
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2, checkRowOrder=True)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
+            message_parameters={
+                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "arg_name": "actual",
+                "actual_type": None,
+            },
+        )
+
+    def test_assert_unequal_null_expected(self):
+        df1 = self.spark.createDataFrame(
+            data=[
+                ("1", 1000),
+                ("2", 3000),
+            ],
+            schema=["id", "amount"],
+        )
+        df2 = None
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2)
 
+        self.check_error(
+            exception=pe.exception,
+            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
+            message_parameters={
+                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "arg_name": "expected",
+                "actual_type": None,
+            },
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2, checkRowOrder=True)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
+            message_parameters={
+                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "arg_name": "expected",
+                "actual_type": None,
+            },
+        )
+
+    def test_assert_equal_exact_pandas_df(self):
         df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
         df2 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
 
@@ -682,16 +753,12 @@ class UtilsTestsMixin:
         assertDataFrameEqual(df1, df2, checkRowOrder=True)
 
     def test_assert_equal_exact_pandas_df(self):
-        import pyspark.pandas as ps
-
         df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
         df2 = ps.DataFrame(data=[30, 20, 10], columns=["Numbers"])
 
         assertDataFrameEqual(df1, df2)
 
     def test_assert_equal_approx_pandas_df(self):
-        import pyspark.pandas as ps
-
         df1 = ps.DataFrame(data=[10.0001, 20.32, 30.1], columns=["Numbers"])
         df2 = ps.DataFrame(data=[10.0, 20.32, 30.1], columns=["Numbers"])
 
@@ -699,7 +766,6 @@ class UtilsTestsMixin:
         assertDataFrameEqual(df1, df2, checkRowOrder=True)
 
     def test_assert_error_pandas_pyspark_df(self):
-        import pyspark.pandas as ps
         import pandas as pd
 
         df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
@@ -742,8 +808,6 @@ class UtilsTestsMixin:
         )
 
     def test_assert_error_non_pyspark_df(self):
-        import pyspark.pandas as ps
-
         dict1 = {"a": 1, "b": 2}
         dict2 = {"a": 1, "b": 2}
 
diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
index 5461577fad1..8e02803efe5 100644
--- a/python/pyspark/testing/utils.py
+++ b/python/pyspark/testing/utils.py
@@ -292,6 +292,7 @@ def assertSchemaEqual(actual: StructType, expected: 
StructType):
     >>> s1 = StructType([StructField("names", ArrayType(DoubleType(), True), 
True)])
     >>> s2 = StructType([StructField("names", ArrayType(DoubleType(), True), 
True)])
     >>> assertSchemaEqual(s1, s2)  # pass, schemas are identical
+
     >>> df1 = spark.createDataFrame(data=[(1, 1000), (2, 3000)], schema=["id", 
"number"])
     >>> df2 = spark.createDataFrame(data=[("1", 1000), ("2", 5000)], 
schema=["id", "amount"])
     >>> assertSchemaEqual(df1.schema, df2.schema)  # doctest: 
+IGNORE_EXCEPTION_DETAIL
@@ -414,16 +415,20 @@ def assertDataFrameEqual(
     >>> df1 = spark.createDataFrame(data=[("1", 1000), ("2", 3000)], 
schema=["id", "amount"])
     >>> df2 = spark.createDataFrame(data=[("1", 1000), ("2", 3000)], 
schema=["id", "amount"])
     >>> assertDataFrameEqual(df1, df2)  # pass, DataFrames are identical
+
     >>> df1 = spark.createDataFrame(data=[("1", 0.1), ("2", 3.23)], 
schema=["id", "amount"])
     >>> df2 = spark.createDataFrame(data=[("1", 0.109), ("2", 3.23)], 
schema=["id", "amount"])
     >>> assertDataFrameEqual(df1, df2, rtol=1e-1)  # pass, DataFrames are 
approx equal by rtol
+
     >>> df1 = spark.createDataFrame(data=[(1, 1000), (2, 3000)], schema=["id", 
"amount"])
     >>> list_of_rows = [Row(1, 1000), Row(2, 3000)]
     >>> assertDataFrameEqual(df1, list_of_rows)  # pass, actual and expected 
data are equal
+
     >>> import pyspark.pandas as ps
     >>> df1 = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
     >>> df2 = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
     >>> assertDataFrameEqual(df1, df2)  # pass, pandas-on-Spark DataFrames are 
equal
+
     >>> df1 = spark.createDataFrame(
     ...     data=[("1", 1000.00), ("2", 3000.00), ("3", 2000.00)], 
schema=["id", "amount"])
     >>> df2 = spark.createDataFrame(
@@ -436,20 +441,35 @@ def assertDataFrameEqual(
     ! Row(id='1', amount=1000.0)
     Row(id='2', amount=3000.0)
     ! Row(id='3', amount=2000.0)
-
     *** expected ***
     ! Row(id='1', amount=1001.0)
     Row(id='2', amount=3000.0)
     ! Row(id='3', amount=2003.0)
     """
-    if actual is None and expected is None:
-        return True
-    elif actual is None or expected is None:
-        return False
-
     import pyspark.pandas as ps
     from pyspark.testing.pandasutils import assertPandasOnSparkEqual
 
+    if actual is None and expected is None:
+        return True
+    elif actual is None:
+        raise PySparkAssertionError(
+            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
+            message_parameters={
+                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "arg_name": "actual",
+                "actual_type": None,
+            },
+        )
+    elif expected is None:
+        raise PySparkAssertionError(
+            error_class="INVALID_TYPE_DF_EQUALITY_ARG",
+            message_parameters={
+                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "arg_name": "expected",
+                "actual_type": None,
+            },
+        )
+
     try:
         # If Spark Connect dependencies are available, allow Spark Connect 
DataFrame
         from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.5 updated: [SPARK-44652] Raise error when only one df is None

Reply via email to