This is an automated email from the ASF dual-hosted git repository.
xinrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new adef6f30fe34 [SPARK-53479][PS] Align `==` behavior with pandas when
comparing against scalar under ANSI
adef6f30fe34 is described below
commit adef6f30fe34634f2e7f0bdd2a51ca6243a50cca
Author: Xinrong Meng <[email protected]>
AuthorDate: Mon Sep 8 17:37:50 2025 -0700
[SPARK-53479][PS] Align `==` behavior with pandas when comparing against
scalar under ANSI
### What changes were proposed in this pull request?
- Ensure `==` returns a nameless Series when comparing with another
Series/Index, but preserves the name for scalar comparisons.
- Add test cases to compare with `np.nan`
### Why are the changes needed?
Part of https://issues.apache.org/jira/browse/SPARK-53389
### Does this PR introduce _any_ user-facing change?
No, the feature is not released yet.
For example,
Before
```py
>>> psdf['int'] == 'x'
0 False
1 False
dtype: bool
```
After
```py
>>> psdf['int'] == 'x'
0 False
1 False
Name: int, dtype: bool
```
which matches the behavior of native pandas:
```py
>>> pdf['int'] == 'x'
0 False
1 False
Name: int, dtype: bool
```
### How was this patch tested?
Unit tests
Commands below passed
```sh
SPARK_ANSI_SQL_MODE=true ./python/run-tests \
  --python-executables=python3.11 --testnames \
  "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_comparison_dtype_compatibility"
SPARK_ANSI_SQL_MODE=false ./python/run-tests \
  --python-executables=python3.11 --testnames \
  "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_comparison_dtype_compatibility"
SPARK_ANSI_SQL_MODE=true ./python/run-tests \
  --python-executables=python3.11 --testnames \
  "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_eq"
SPARK_ANSI_SQL_MODE=false ./python/run-tests \
  --python-executables=python3.11 --testnames \
  "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_eq"
SPARK_ANSI_SQL_MODE=false ./python/run-tests \
  --python-executables=python3.11 --testnames \
  "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_ne"
SPARK_ANSI_SQL_MODE=true ./python/run-tests \
  --python-executables=python3.11 --testnames \
  "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_ne"
```
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #52224 from xinrong-meng/cmp_op_test.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
---
python/pyspark/pandas/data_type_ops/num_ops.py | 10 ++++++++--
python/pyspark/pandas/tests/data_type_ops/test_num_ops.py | 3 +++
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py
b/python/pyspark/pandas/data_type_ops/num_ops.py
index 54fc06a7901d..022a5114f8f0 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -17,7 +17,7 @@
import decimal
import numbers
-from typing import Any, Union, Callable
+from typing import Any, Union, Callable, cast
import numpy as np
import pandas as pd
@@ -275,7 +275,13 @@ class NumericOps(DataTypeOps):
if is_ansi_mode_enabled(left._internal.spark_frame.sparkSession):
if _should_return_all_false(left, right):
left_scol = left._with_new_scol(F.lit(False))
- return left_scol.rename(None) # type: ignore[attr-defined]
+ if isinstance(right, IndexOpsMixin):
+ # When comparing with another Series/Index, drop the name
+ # to align with pandas behavior
+ return left_scol.rename(None) # type: ignore[attr-defined]
+ else:
+ # When comparing with scalar-like, keep the name of left operand
+ return cast(SeriesOrIndex, left_scol)
if _is_boolean_type(right): # numeric vs. bool
right = transform_boolean_operand_to_numeric(
right, spark_type=left.spark.data_type
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index 00fc04e36231..c19f39d541f7 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -140,16 +140,19 @@ class NumOpsTestsMixin:
if is_ansi_mode_test: # TODO: match non-ansi behavior with pandas
self.assert_eq(pdf["int"] == pdf["str"], psdf["int"] ==
psdf["str"])
self.assert_eq(pdf["float"] == pdf["bool"], psdf["float"] ==
psdf["bool"])
+ self.assert_eq(pdf["str"] == "x", psdf["str"] == "x")
def test_eq(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
self.assert_eq(pdf[col] == pdf[col], psdf[col] == psdf[col])
+ self.assert_eq(pdf[col] == np.nan, psdf[col] == np.nan)
def test_ne(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
self.assert_eq(pdf[col] != pdf[col], psdf[col] != psdf[col])
+ self.assert_eq(pdf[col] != np.nan, psdf[col] != np.nan)
def test_lt(self):
pdf, psdf = self.pdf, self.psdf
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]