This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ebc24e06cd0d [SPARK-55363][PS][TESTS] Make ops tests with "decimal_nan" columns ignore NaN vs. None
ebc24e06cd0d is described below

commit ebc24e06cd0d6c89ef6f717e94f2d81dfe9c94fb
Author: Takuya Ueshin <[email protected]>
AuthorDate: Thu Feb 5 10:57:52 2026 +0800

    [SPARK-55363][PS][TESTS] Make ops tests with "decimal_nan" columns ignore NaN vs. None
    
    ### What changes were proposed in this pull request?
    
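    Makes the data-type ops tests ignore the `NaN` vs. `None` difference when
    comparing results for the "decimal_nan" column. The relaxation is applied
    only when running against pandas 3.0 or later.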
    
    ### Why are the changes needed?
    
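    pandas 3 made `assert_frame_equal` treat `NaN` and `None` as mismatched
    values and raise an `AssertionError`; earlier versions only emitted a
    `FutureWarning`. A `Decimal` NaN in pandas is returned as `None` by
    pandas-on-Spark, so the comparison now fails: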
    
    ```py
    >>> pdf = pd.DataFrame([decimal.Decimal(np.nan)])
    >>> psdf = ps.from_pandas(pdf)
    >>>
    >>> pdf
         0
    0  NaN
    >>> psdf
          0
    0  None
    ```
    
    - pandas < 3
    
    ```py
    >>> pd.__version__
    '2.3.3'
    >>> assert_frame_equal(pdf, psdf.to_pandas())
    <stdin>:1: FutureWarning: Mismatched null-like values NaN and None found. In a future version, pandas equality-testing functions (e.g. assert_frame_equal) will consider these not-matching and raise.
    ```
    
    - pandas == 3
    
    ```py
    >>> pd.__version__
    '3.0.0'
    >>> assert_frame_equal(pdf, psdf.to_pandas())
    Traceback (most recent call last):
    ...
    AssertionError: DataFrame.iloc[:, 0] (column name="0") are different
    
    DataFrame.iloc[:, 0] (column name="0") values are different (100.0 %)
    [index]: [0]
    [left]:  [NaN]
    [right]: [None]
    At positional index 0, first diff: NaN != None
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Updated the related tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #54146 from ueshin/issues/SPARK-55363/ignore_null.
    
    Authored-by: Takuya Ueshin <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 .../pandas/tests/data_type_ops/test_boolean_ops.py | 20 ++++++++--
 .../tests/data_type_ops/test_num_arithmetic.py     | 46 ++++++++++++++++------
 .../pandas/tests/data_type_ops/test_num_mod.py     | 15 +++++--
 .../pandas/tests/data_type_ops/test_num_ops.py     | 11 ++++--
 .../pandas/tests/data_type_ops/test_num_reverse.py | 28 +++++++------
 .../pandas/tests/data_type_ops/testing_utils.py    |  4 ++
 6 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index 05ddc6587ac7..66ab8437d300 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -55,7 +55,10 @@ class BooleanOpsTestsMixin:
 
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(b_pser + pser, b_psser + psser, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(
+                b_pser + pser, b_psser + psser, check_exact=False, 
ignore_null=ignore_null
+            )
         for col in self.non_numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
             if col == "bool":
@@ -74,7 +77,10 @@ class BooleanOpsTestsMixin:
         self.assertRaises(TypeError, lambda: b_psser - True)
 
         for col in self.numeric_df_cols:
-            self.assert_eq(b_pser - pdf[col], b_psser - psdf[col], 
check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(
+                b_pser - pdf[col], b_psser - psdf[col], check_exact=False, 
ignore_null=ignore_null
+            )
 
         for col in self.non_numeric_df_cols:
             self.assertRaises(TypeError, lambda: b_psser - psdf[col])
@@ -91,7 +97,10 @@ class BooleanOpsTestsMixin:
         self.assert_eq(b_pser * False, b_psser * False)
 
         for col in self.numeric_df_cols:
-            self.assert_eq(b_pser * pdf[col], b_psser * psdf[col], 
check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(
+                b_pser * pdf[col], b_psser * psdf[col], check_exact=False, 
ignore_null=ignore_null
+            )
 
         for col in self.non_numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
@@ -149,7 +158,10 @@ class BooleanOpsTestsMixin:
         self.assertRaises(TypeError, lambda: b_psser % True)
 
         for col in self.numeric_df_cols:
-            self.assert_eq(b_pser % pdf[col], b_psser % psdf[col], 
check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(
+                b_pser % pdf[col], b_psser % psdf[col], check_exact=False, 
ignore_null=ignore_null
+            )
 
         for col in self.non_numeric_df_cols:
             self.assertRaises(TypeError, lambda: b_psser % psdf[col])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py 
b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
index ab50fa4e5e6a..5d56b8275c4c 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -44,15 +44,26 @@ class ArithmeticTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser + pser, psser + psser, check_exact=False)
-            self.assert_eq(pser + 1, psser + 1, check_exact=False)
-            self.assert_eq(pser + pser.astype(bool), psser + 
psser.astype(bool), check_exact=False)
-            self.assert_eq(pser + True, psser + True, check_exact=False)
-            self.assert_eq(pser + False, psser + False, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(pser + pser, psser + psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(pser + 1, psser + 1, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(
+                pser + pser.astype(bool),
+                psser + psser.astype(bool),
+                check_exact=False,
+                ignore_null=ignore_null,
+            )
+            self.assert_eq(pser + True, psser + True, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(pser + False, psser + False, check_exact=False, 
ignore_null=ignore_null)
 
             for n_col in self.non_numeric_df_cols:
                 if n_col == "bool":
-                    self.assert_eq(pser + pdf[n_col], psser + psdf[n_col], 
check_exact=False)
+                    self.assert_eq(
+                        pser + pdf[n_col],
+                        psser + psdf[n_col],
+                        check_exact=False,
+                        ignore_null=ignore_null,
+                    )
                 else:
                     self.assertRaises(TypeError, lambda: psser + psdf[n_col])
 
@@ -63,15 +74,26 @@ class ArithmeticTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser - pser, psser - psser, check_exact=False)
-            self.assert_eq(pser - 1, psser - 1, check_exact=False)
-            self.assert_eq(pser - pser.astype(bool), psser - 
psser.astype(bool), check_exact=False)
-            self.assert_eq(pser - True, psser - True, check_exact=False)
-            self.assert_eq(pser - False, psser - False, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(pser - pser, psser - psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(pser - 1, psser - 1, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(
+                pser - pser.astype(bool),
+                psser - psser.astype(bool),
+                check_exact=False,
+                ignore_null=ignore_null,
+            )
+            self.assert_eq(pser - True, psser - True, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(pser - False, psser - False, check_exact=False, 
ignore_null=ignore_null)
 
             for n_col in self.non_numeric_df_cols:
                 if n_col == "bool":
-                    self.assert_eq(pser - pdf[n_col], psser - psdf[n_col], 
check_exact=False)
+                    self.assert_eq(
+                        pser - pdf[n_col],
+                        psser - psdf[n_col],
+                        check_exact=False,
+                        ignore_null=ignore_null,
+                    )
                 else:
                     self.assertRaises(TypeError, lambda: psser - psdf[n_col])
 
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py 
b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
index d222db2d7a38..d40b83ffea8d 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
@@ -37,10 +37,17 @@ class NumModTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser % pser, psser % psser, check_exact=False)
-            self.assert_eq(pser % pser.astype(bool), psser % 
psser.astype(bool), check_exact=False)
-            self.assert_eq(pser % True, psser % True, check_exact=False)
-            self.assert_eq(pser % 1, psser % 1, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(pser % pser, psser % psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(
+                pser % pser.astype(bool),
+                psser % psser.astype(bool),
+                check_exact=False,
+                ignore_null=ignore_null,
+            )
+            self.assert_eq(pser % True, psser % True, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(pser % 1, psser % 1, check_exact=False, 
ignore_null=ignore_null)
+
             if not col.startswith("decimal"):
                 self.assert_eq(pser % 0, psser % 0, check_exact=False)
             if col in ["int", "int32"]:
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index a0a04c385303..eaf27fbe709a 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -102,7 +102,8 @@ class NumOpsTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(pser, psser._to_pandas(), check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(pser, psser._to_pandas(), check_exact=False, 
ignore_null=ignore_null)
             self.assert_eq(ps.from_pandas(pser), psser)
 
     def test_isnull(self):
@@ -113,12 +114,16 @@ class NumOpsTestsMixin:
     def test_neg(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
-            self.assert_eq(-pdf[col], -psdf[col], check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(-pdf[col], -psdf[col], check_exact=False, 
ignore_null=ignore_null)
 
     def test_abs(self):
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
-            self.assert_eq(abs(pdf[col]), abs(psdf[col]), check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(
+                abs(pdf[col]), abs(psdf[col]), check_exact=False, 
ignore_null=ignore_null
+            )
 
     def test_invert(self):
         pdf, psdf = self.pdf, self.psdf
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py 
b/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
index 3a073c81f3aa..d3e1f74ff5a2 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
@@ -44,11 +44,12 @@ class ReverseTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 + pser, 1 + psser, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(1 + pser, 1 + psser, check_exact=False, 
ignore_null=ignore_null)
             # self.assert_eq(0.1 + pser, 0.1 + psser)
             self.assertRaises(TypeError, lambda: "x" + psser)
-            self.assert_eq(True + pser, True + psser, check_exact=False)
-            self.assert_eq(False + pser, False + psser, check_exact=False)
+            self.assert_eq(True + pser, True + psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(False + pser, False + psser, check_exact=False, 
ignore_null=ignore_null)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + 
psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) 
+ psser)
 
@@ -56,11 +57,12 @@ class ReverseTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 - pser, 1 - psser, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(1 - pser, 1 - psser, check_exact=False, 
ignore_null=ignore_null)
             # self.assert_eq(0.1 - pser, 0.1 - psser)
             self.assertRaises(TypeError, lambda: "x" - psser)
-            self.assert_eq(True - pser, True - psser, check_exact=False)
-            self.assert_eq(False - pser, False - psser, check_exact=False)
+            self.assert_eq(True - pser, True - psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(False - pser, False - psser, check_exact=False, 
ignore_null=ignore_null)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - 
psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) 
- psser)
 
@@ -68,11 +70,12 @@ class ReverseTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 * pser, 1 * psser, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(1 * pser, 1 * psser, check_exact=False, 
ignore_null=ignore_null)
             # self.assert_eq(0.1 * pser, 0.1 * psser)
             self.assertRaises(TypeError, lambda: "x" * psser)
-            self.assert_eq(True * pser, True * psser, check_exact=False)
-            self.assert_eq(False * pser, False * psser, check_exact=False)
+            self.assert_eq(True * pser, True * psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(False * pser, False * psser, check_exact=False, 
ignore_null=ignore_null)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) * 
psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) 
* psser)
 
@@ -116,10 +119,11 @@ class ReverseTestsMixin:
         pdf, psdf = self.pdf, self.psdf
         for col in self.numeric_df_cols:
             pser, psser = pdf[col], psdf[col]
-            self.assert_eq(1 % pser, 1 % psser, check_exact=False)
+            ignore_null = self.ignore_null(col)
+            self.assert_eq(1 % pser, 1 % psser, check_exact=False, 
ignore_null=ignore_null)
             # self.assert_eq(0.1 % pser, 0.1 % psser)
-            self.assert_eq(True % pser, True % psser, check_exact=False)
-            self.assert_eq(False % pser, False % psser, check_exact=False)
+            self.assert_eq(True % pser, True % psser, check_exact=False, 
ignore_null=ignore_null)
+            self.assert_eq(False % pser, False % psser, check_exact=False, 
ignore_null=ignore_null)
             self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % 
psser)
             self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) 
% psser)
 
diff --git a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py 
b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
index 04d03a05e02d..358178060029 100644
--- a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
+++ b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
@@ -22,6 +22,7 @@ import numpy as np
 import pandas as pd
 
 import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.pandas.typedef.typehints import (
     extension_dtypes_available,
     extension_float_dtypes_available,
@@ -219,3 +220,6 @@ class OpsTestBase:
         pandas versions. Please refer to 
https://github.com/pandas-dev/pandas/issues/39410.
         """
         self.assert_eq(left, right)
+
+    def ignore_null(self, col):
+        return LooseVersion(pd.__version__) >= LooseVersion("3.0") and col == 
"decimal_nan"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to