allisonwang-db commented on code in PR #42332:
URL: https://github.com/apache/spark/pull/42332#discussion_r1287428596


##########
python/pyspark/testing/utils.py:
##########
@@ -464,23 +467,42 @@ def assertDataFrameEqual(
         raise PySparkAssertionError(
             error_class="INVALID_TYPE_DF_EQUALITY_ARG",
             message_parameters={
-                "expected_type": Union[DataFrame, ps.DataFrame, List[Row]],
+                "expected_type": "Union[DataFrame, ps.DataFrame, List[Row]]",
                 "arg_name": "expected",
                 "actual_type": None,
             },
         )
 
+    has_pandas = False
     try:
-        # If Spark Connect dependencies are available, allow Spark Connect 
DataFrame
-        from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
+        # If pandas dependencies are available, allow pandas or 
pandas-on-Spark DataFrame
+        import pyspark.pandas as ps
+        import pandas as pd
+        from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
-        if isinstance(actual, ps.DataFrame) or isinstance(expected, 
ps.DataFrame):
+        has_pandas = True
+    except Exception:
+        # no pandas, so we won't call pandasutils functions
+        pass
+
+    if has_pandas:
+        if (
+            isinstance(actual, pd.DataFrame)
+            or isinstance(expected, pd.DataFrame)
+            or isinstance(actual, ps.DataFrame)
+            or isinstance(expected, ps.DataFrame)
+        ):
             # handle pandas DataFrames
             # assert approximate equality for float data
-            return assertPandasOnSparkEqual(
-                actual, expected, checkExact=False, checkRowOrder=checkRowOrder
+            return PandasOnSparkTestUtils().assert_eq(
+                actual, expected, almost=True, rtol=rtol, atol=atol, 
check_row_order=checkRowOrder
             )
-        elif not isinstance(actual, (DataFrame, ConnectDataFrame, list)):
+
+    try:
+        # If Spark Connect dependencies are available, allow Spark Connect 
DataFrame
+        from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
+
+        if not isinstance(actual, (DataFrame, ConnectDataFrame, list)):

Review Comment:
   One thing I find useful is comparing a regular Spark DataFrame with a Spark 
Connect DataFrame; for example, see this test case:
   
https://github.com/apache/spark/blob/418bba5ad6053449a141f3c9c31ed3ad998995b8/python/pyspark/sql/tests/connect/test_connect_function.py#L2346-L2355
   
   Just wondering: if we can convert both DataFrames into pandas DataFrames to 
compare them, can we also support this in `assertDataFrameEqual`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to