[GitHub] [spark] asl3 commented on a diff in pull request #42332: [SPARK-44665][PYTHON] Add support for pandas DataFrame assertDataFrameEqual

via GitHub Mon, 07 Aug 2023 06:36:02 -0700


asl3 commented on code in PR #42332:
URL: https://github.com/apache/spark/pull/42332#discussion_r1285877854



##########
python/pyspark/sql/tests/test_utils.py:
##########
@@ -739,33 +738,154 @@ def test_assert_unequal_null_expected(self):
             exception=pe.exception,
             error_class="INVALID_TYPE_DF_EQUALITY_ARG",
             message_parameters={
-                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "expected_type": "Union[DataFrame, ps.DataFrame, List[Row]]",
                 "arg_name": "expected",
                 "actual_type": None,
             },
         )
 
+    @unittest.skipIf(not have_pandas, "no pandas dependency")
     def test_assert_equal_exact_pandas_df(self):
+        import pandas as pd
+        import numpy as np
+
+        df1 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)]), columns=["a", 
"b", "c"]
+        )
+        df2 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)]), columns=["a", 
"b", "c"]
+        )
+
+        assertDataFrameEqual(df1, df2, checkRowOrder=False)
+        assertDataFrameEqual(df1, df2, checkRowOrder=True)
+
+    @unittest.skipIf(not have_pandas, "no pandas dependency")
+    def test_assert_approx_equal_exact_pandas_df(self):
+        import pandas as pd
+        import numpy as np
+
+        # test that asserts close enough equality for pandas df
+        df1 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (7, 8, 59)]), columns=["a", 
"b", "c"]
+        )
+        df2 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (7, 8, 59.0001)]), 
columns=["a", "b", "c"]
+        )
+
+        assertDataFrameEqual(df1, df2, checkRowOrder=False)
+        assertDataFrameEqual(df1, df2, checkRowOrder=True)
+
+    @unittest.skipIf(not have_pandas, "no pandas dependency")
+    def test_assert_unequal_pandas_df(self):
+        import pandas as pd
+        import numpy as np
+
+        df1 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (6, 5, 4)]), columns=["a", 
"b", "c"]
+        )
+        df2 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)]), columns=["a", 
"b", "c"]
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2, checkRowOrder=False)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="DIFFERENT_PANDAS_DATAFRAME",
+            message_parameters={
+                "left": df1.to_string(),
+                "left_dtype": str(df1.dtypes),
+                "right": df2.to_string(),
+                "right_dtype": str(df2.dtypes),
+            },
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2, checkRowOrder=True)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="DIFFERENT_PANDAS_DATAFRAME",
+            message_parameters={
+                "left": df1.to_string(),
+                "left_dtype": str(df1.dtypes),
+                "right": df2.to_string(),
+                "right_dtype": str(df2.dtypes),
+            },
+        )
+
+    @unittest.skipIf(not have_pandas, "no pandas dependency")
+    def test_assert_type_error_pandas_df(self):
+        import pyspark.pandas as ps
+        import pandas as pd
+        import numpy as np
+
+        df1 = ps.DataFrame(data=[10, 20, 30], columns=["Numbers"])
+        df2 = pd.DataFrame(
+            data=np.array([(1, 2, 3), (4, 5, 6), (6, 5, 4)]), columns=["a", 
"b", "c"]
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2, checkRowOrder=False)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="DIFFERENT_PANDAS_DATAFRAME",
+            message_parameters={
+                "left": df1.to_string(),
+                "left_dtype": str(df1.dtypes),
+                "right": df2.to_string(),
+                "right_dtype": str(df2.dtypes),
+            },
+        )
+
+        with self.assertRaises(PySparkAssertionError) as pe:
+            assertDataFrameEqual(df1, df2, checkRowOrder=True)
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="DIFFERENT_PANDAS_DATAFRAME",
+            message_parameters={
+                "left": df1.to_string(),
+                "left_dtype": str(df1.dtypes),
+                "right": df2.to_string(),
+                "right_dtype": str(df2.dtypes),
+            },
+        )
+
+    @unittest.skipIf(not have_pandas, "no pandas dependency")

Review Comment:
   checking here since `pyspark.pandas` uses pandas as a dependency



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] asl3 commented on a diff in pull request #42332: [SPARK-44665][PYTHON] Add support for pandas DataFrame assertDataFrameEqual

Reply via email to