This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ebc24e06cd0d [SPARK-55363][PS][TESTS] Make ops tests with
"decimal_nan" columns ignore NaN vs. None
ebc24e06cd0d is described below
commit ebc24e06cd0d6c89ef6f717e94f2d81dfe9c94fb
Author: Takuya Ueshin <[email protected]>
AuthorDate: Thu Feb 5 10:57:52 2026 +0800
[SPARK-55363][PS][TESTS] Make ops tests with "decimal_nan" columns ignore
NaN vs. None
### What changes were proposed in this pull request?
Makes the ops tests that exercise the "decimal_nan" column ignore the difference between `NaN` and `None` when comparing pandas and pandas-on-Spark results.
### Why are the changes needed?
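pandas 3 made `assert_frame_equal` treat `NaN` and `None` as different values: the mismatch that only emitted a `FutureWarning` on pandas < 3 now raises an `AssertionError`. A decimal NaN stays `NaN` in pandas but becomes `None` in pandas-on-Spark: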
```py
>>> pdf = pd.DataFrame([decimal.Decimal(np.nan)])
>>> psdf = ps.from_pandas(pdf)
>>>
>>> pdf
     0
0  NaN
>>> psdf
      0
0  None
```
- pandas < 3
```py
>>> pd.__version__
'2.3.3'
>>> assert_frame_equal(pdf, psdf.to_pandas())
<stdin>:1: FutureWarning: Mismatched null-like values NaN and None found.
In a future version, pandas equality-testing functions (e.g.
assert_frame_equal) will consider these not-matching and raise.
```
- pandas == 3
```py
>>> pd.__version__
'3.0.0'
>>> assert_frame_equal(pdf, psdf.to_pandas())
Traceback (most recent call last):
...
AssertionError: DataFrame.iloc[:, 0] (column name="0") are different
DataFrame.iloc[:, 0] (column name="0") values are different (100.0 %)
[index]: [0]
[left]: [NaN]
[right]: [None]
At positional index 0, first diff: NaN != None
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Updated the related tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54146 from ueshin/issues/SPARK-55363/ignore_null.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../pandas/tests/data_type_ops/test_boolean_ops.py | 20 ++++++++--
.../tests/data_type_ops/test_num_arithmetic.py | 46 ++++++++++++++++------
.../pandas/tests/data_type_ops/test_num_mod.py | 15 +++++--
.../pandas/tests/data_type_ops/test_num_ops.py | 11 ++++--
.../pandas/tests/data_type_ops/test_num_reverse.py | 28 +++++++------
.../pandas/tests/data_type_ops/testing_utils.py | 4 ++
6 files changed, 89 insertions(+), 35 deletions(-)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index 05ddc6587ac7..66ab8437d300 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -55,7 +55,10 @@ class BooleanOpsTestsMixin:
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(b_pser + pser, b_psser + psser, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(
+ b_pser + pser, b_psser + psser, check_exact=False,
ignore_null=ignore_null
+ )
for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
if col == "bool":
@@ -74,7 +77,10 @@ class BooleanOpsTestsMixin:
self.assertRaises(TypeError, lambda: b_psser - True)
for col in self.numeric_df_cols:
- self.assert_eq(b_pser - pdf[col], b_psser - psdf[col],
check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(
+ b_pser - pdf[col], b_psser - psdf[col], check_exact=False,
ignore_null=ignore_null
+ )
for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser - psdf[col])
@@ -91,7 +97,10 @@ class BooleanOpsTestsMixin:
self.assert_eq(b_pser * False, b_psser * False)
for col in self.numeric_df_cols:
- self.assert_eq(b_pser * pdf[col], b_psser * psdf[col],
check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(
+ b_pser * pdf[col], b_psser * psdf[col], check_exact=False,
ignore_null=ignore_null
+ )
for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
@@ -149,7 +158,10 @@ class BooleanOpsTestsMixin:
self.assertRaises(TypeError, lambda: b_psser % True)
for col in self.numeric_df_cols:
- self.assert_eq(b_pser % pdf[col], b_psser % psdf[col],
check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(
+ b_pser % pdf[col], b_psser % psdf[col], check_exact=False,
ignore_null=ignore_null
+ )
for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser % psdf[col])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
index ab50fa4e5e6a..5d56b8275c4c 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -44,15 +44,26 @@ class ArithmeticTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(pser + pser, psser + psser, check_exact=False)
- self.assert_eq(pser + 1, psser + 1, check_exact=False)
- self.assert_eq(pser + pser.astype(bool), psser +
psser.astype(bool), check_exact=False)
- self.assert_eq(pser + True, psser + True, check_exact=False)
- self.assert_eq(pser + False, psser + False, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(pser + pser, psser + psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(pser + 1, psser + 1, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(
+ pser + pser.astype(bool),
+ psser + psser.astype(bool),
+ check_exact=False,
+ ignore_null=ignore_null,
+ )
+ self.assert_eq(pser + True, psser + True, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(pser + False, psser + False, check_exact=False,
ignore_null=ignore_null)
for n_col in self.non_numeric_df_cols:
if n_col == "bool":
- self.assert_eq(pser + pdf[n_col], psser + psdf[n_col],
check_exact=False)
+ self.assert_eq(
+ pser + pdf[n_col],
+ psser + psdf[n_col],
+ check_exact=False,
+ ignore_null=ignore_null,
+ )
else:
self.assertRaises(TypeError, lambda: psser + psdf[n_col])
@@ -63,15 +74,26 @@ class ArithmeticTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(pser - pser, psser - psser, check_exact=False)
- self.assert_eq(pser - 1, psser - 1, check_exact=False)
- self.assert_eq(pser - pser.astype(bool), psser -
psser.astype(bool), check_exact=False)
- self.assert_eq(pser - True, psser - True, check_exact=False)
- self.assert_eq(pser - False, psser - False, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(pser - pser, psser - psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(pser - 1, psser - 1, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(
+ pser - pser.astype(bool),
+ psser - psser.astype(bool),
+ check_exact=False,
+ ignore_null=ignore_null,
+ )
+ self.assert_eq(pser - True, psser - True, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(pser - False, psser - False, check_exact=False,
ignore_null=ignore_null)
for n_col in self.non_numeric_df_cols:
if n_col == "bool":
- self.assert_eq(pser - pdf[n_col], psser - psdf[n_col],
check_exact=False)
+ self.assert_eq(
+ pser - pdf[n_col],
+ psser - psdf[n_col],
+ check_exact=False,
+ ignore_null=ignore_null,
+ )
else:
self.assertRaises(TypeError, lambda: psser - psdf[n_col])
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
index d222db2d7a38..d40b83ffea8d 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
@@ -37,10 +37,17 @@ class NumModTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(pser % pser, psser % psser, check_exact=False)
- self.assert_eq(pser % pser.astype(bool), psser %
psser.astype(bool), check_exact=False)
- self.assert_eq(pser % True, psser % True, check_exact=False)
- self.assert_eq(pser % 1, psser % 1, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(pser % pser, psser % psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(
+ pser % pser.astype(bool),
+ psser % psser.astype(bool),
+ check_exact=False,
+ ignore_null=ignore_null,
+ )
+ self.assert_eq(pser % True, psser % True, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(pser % 1, psser % 1, check_exact=False,
ignore_null=ignore_null)
+
if not col.startswith("decimal"):
self.assert_eq(pser % 0, psser % 0, check_exact=False)
if col in ["int", "int32"]:
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index a0a04c385303..eaf27fbe709a 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -102,7 +102,8 @@ class NumOpsTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(pser, psser._to_pandas(), check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(pser, psser._to_pandas(), check_exact=False,
ignore_null=ignore_null)
self.assert_eq(ps.from_pandas(pser), psser)
def test_isnull(self):
@@ -113,12 +114,16 @@ class NumOpsTestsMixin:
def test_neg(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
- self.assert_eq(-pdf[col], -psdf[col], check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(-pdf[col], -psdf[col], check_exact=False,
ignore_null=ignore_null)
def test_abs(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
- self.assert_eq(abs(pdf[col]), abs(psdf[col]), check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(
+ abs(pdf[col]), abs(psdf[col]), check_exact=False,
ignore_null=ignore_null
+ )
def test_invert(self):
pdf, psdf = self.pdf, self.psdf
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
b/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
index 3a073c81f3aa..d3e1f74ff5a2 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
@@ -44,11 +44,12 @@ class ReverseTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(1 + pser, 1 + psser, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(1 + pser, 1 + psser, check_exact=False,
ignore_null=ignore_null)
# self.assert_eq(0.1 + pser, 0.1 + psser)
self.assertRaises(TypeError, lambda: "x" + psser)
- self.assert_eq(True + pser, True + psser, check_exact=False)
- self.assert_eq(False + pser, False + psser, check_exact=False)
+ self.assert_eq(True + pser, True + psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(False + pser, False + psser, check_exact=False,
ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) +
psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1)
+ psser)
@@ -56,11 +57,12 @@ class ReverseTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(1 - pser, 1 - psser, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(1 - pser, 1 - psser, check_exact=False,
ignore_null=ignore_null)
# self.assert_eq(0.1 - pser, 0.1 - psser)
self.assertRaises(TypeError, lambda: "x" - psser)
- self.assert_eq(True - pser, True - psser, check_exact=False)
- self.assert_eq(False - pser, False - psser, check_exact=False)
+ self.assert_eq(True - pser, True - psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(False - pser, False - psser, check_exact=False,
ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) -
psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1)
- psser)
@@ -68,11 +70,12 @@ class ReverseTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(1 * pser, 1 * psser, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(1 * pser, 1 * psser, check_exact=False,
ignore_null=ignore_null)
# self.assert_eq(0.1 * pser, 0.1 * psser)
self.assertRaises(TypeError, lambda: "x" * psser)
- self.assert_eq(True * pser, True * psser, check_exact=False)
- self.assert_eq(False * pser, False * psser, check_exact=False)
+ self.assert_eq(True * pser, True * psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(False * pser, False * psser, check_exact=False,
ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) *
psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1)
* psser)
@@ -116,10 +119,11 @@ class ReverseTestsMixin:
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
- self.assert_eq(1 % pser, 1 % psser, check_exact=False)
+ ignore_null = self.ignore_null(col)
+ self.assert_eq(1 % pser, 1 % psser, check_exact=False,
ignore_null=ignore_null)
# self.assert_eq(0.1 % pser, 0.1 % psser)
- self.assert_eq(True % pser, True % psser, check_exact=False)
- self.assert_eq(False % pser, False % psser, check_exact=False)
+ self.assert_eq(True % pser, True % psser, check_exact=False,
ignore_null=ignore_null)
+ self.assert_eq(False % pser, False % psser, check_exact=False,
ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) %
psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1)
% psser)
diff --git a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
index 04d03a05e02d..358178060029 100644
--- a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
+++ b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py
@@ -22,6 +22,7 @@ import numpy as np
import pandas as pd
import pyspark.pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.pandas.typedef.typehints import (
extension_dtypes_available,
extension_float_dtypes_available,
@@ -219,3 +220,6 @@ class OpsTestBase:
pandas versions. Please refer to
https://github.com/pandas-dev/pandas/issues/39410.
"""
self.assert_eq(left, right)
+
+ def ignore_null(self, col):
+ return LooseVersion(pd.__version__) >= LooseVersion("3.0") and col ==
"decimal_nan"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]