This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0bcf9971652a [SPARK-55376][PS] Make numeric_only argument in groupby functions accept only boolean with pandas 3
0bcf9971652a is described below
commit 0bcf9971652afa13c50b320453f405400547c7a5
Author: Takuya Ueshin <[email protected]>
AuthorDate: Fri Feb 6 13:16:12 2026 +0800
[SPARK-55376][PS] Make numeric_only argument in groupby functions accept only boolean with pandas 3
### What changes were proposed in this pull request?
Makes the `numeric_only` argument in `groupby` functions accept only boolean values
when running with pandas 3.
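The check is gated on the installed pandas version, so behavior with pandas < 3 is unchanged.
A minimal sketch of the guard, written here as a standalone helper for illustration (the
patch actually repeats the check inline in each affected `groupby` function, as the diff
below shows):

```python
import pandas as pd

from pyspark.loose_version import LooseVersion


def _check_numeric_only(numeric_only):
    # Hypothetical helper for illustration only; the patch inlines this
    # check at the top of each affected groupby statistical function.
    if LooseVersion(pd.__version__) >= "3.0.0":
        if not isinstance(numeric_only, bool):
            raise ValueError("numeric_only accepts only Boolean values")
```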
### Why are the changes needed?
pandas 3 no longer accepts non-boolean values, such as `None`, for the `numeric_only`
argument in `groupby` functions.
We should follow this behavior when working with pandas 3.
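For reference, the updated tests below assert that plain pandas 3 itself raises
`ValueError` in this case, e.g. (illustrative data):

```python
import pandas as pd

pdf = pd.DataFrame({"A": [1, 1, 2], "B": [3.0, 4.0, 5.0]})

pdf.groupby("A").sum(numeric_only=True)  # accepted on any supported pandas version
pdf.groupby("A").sum(numeric_only=None)  # raises ValueError with pandas 3
```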
Note: some functions in `DataFrame` and `Series` also have a `numeric_only`
argument, but pandas 3 doesn't raise an exception or even show a warning for them,
so they are left as they are for now.
### Does this PR introduce _any_ user-facing change?
Yes. With pandas 3, the `numeric_only` argument in `groupby` functions no longer
accepts non-boolean values.
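An illustrative example of the change in pandas API on Spark with pandas 3 installed
(made-up data; the error message comes from the new check in this patch):

```python
from pyspark import pandas as ps

psdf = ps.DataFrame({"A": [1, 1, 2], "B": [3.0, 4.0, 5.0]})

psdf.groupby("A").sum(numeric_only=True)  # unchanged
psdf.groupby("A").sum(numeric_only=None)
# ValueError: numeric_only accepts only Boolean values
```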
### How was this patch tested?
Updated the related tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54162 from ueshin/issues/SPARK-55376/numeric_only.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/groupby.py | 30 +++++++++++++
python/pyspark/pandas/tests/groupby/test_stat.py | 52 ++++++++++++++++++----
.../pyspark/pandas/tests/groupby/test_stat_adv.py | 12 +++--
.../pyspark/pandas/tests/groupby/test_stat_func.py | 29 +++++++++---
.../pyspark/pandas/tests/groupby/test_stat_prod.py | 10 ++++-
5 files changed, 116 insertions(+), 17 deletions(-)
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index b66f079897dc..cdc6aed99f88 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -492,6 +492,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
a 1.0 True 3.0
b NaN None NaN
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
@@ -562,6 +565,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
a 2.0 True 4.0
b NaN None NaN
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
@@ -626,6 +632,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
a 2.0 True 4.0
b NaN None NaN
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
@@ -672,6 +681,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
1 3.0 1.333333 0.333333
2 4.0 1.500000 1.000000
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
self._validate_agg_columns(numeric_only=numeric_only, function_name="median")
return self._reduce_for_stat_function(
@@ -802,6 +814,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
a 1.0 False 3.0
b NaN None NaN
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
@@ -919,6 +934,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if numeric_only is not None and not isinstance(numeric_only, bool):
raise TypeError("numeric_only must be None or bool")
if not isinstance(min_count, int):
@@ -980,6 +998,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(ddof, int):
raise TypeError("ddof must be integer")
@@ -1248,6 +1269,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
1 NaN 2.0 0.0
2 NaN NaN NaN
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
@@ -3495,6 +3519,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
3.0 7.0
Name: b, dtype: float64
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if not isinstance(accuracy, int):
raise TypeError(
"accuracy must be an integer; however, got [%s]" %
type(accuracy).__name__
@@ -4053,6 +4080,9 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
2 B 1.000000 NaN
C NaN 1.000000
"""
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if not isinstance(numeric_only, bool):
+ raise ValueError("numeric_only accepts only Boolean values")
if method not in ["pearson", "spearman", "kendall"]:
raise ValueError(f"Invalid method {method}")
diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py
index c2c15ef7c6e4..c95cde1742fc 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat.py
@@ -14,18 +14,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+from typing import Optional, Type
import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
+using_pandas3 = LooseVersion(pd.__version__) >= "3.0.0"
+
+
class GroupbyStatTestingFuncMixin:
# TODO: All statistical functions should leverage this utility
- def _test_stat_func(self, func, check_exact=True):
+ def _test_stat_func(
+ self, func, check_exact=True, expected_error: Optional[Type[Exception]] = None
+ ):
pdf, psdf = self.pdf, self.psdf
for p_groupby_obj, ps_groupby_obj in [
# Against DataFrameGroupBy
@@ -35,11 +42,17 @@ class GroupbyStatTestingFuncMixin:
# Against SeriesGroupBy
(pdf.groupby("A")["B"], psdf.groupby("A")["B"]),
]:
- self.assert_eq(
- func(p_groupby_obj).sort_index(),
- func(ps_groupby_obj).sort_index(),
- check_exact=check_exact,
- )
+ if expected_error is None:
+ self.assert_eq(
+ func(p_groupby_obj).sort_index(),
+ func(ps_groupby_obj).sort_index(),
+ check_exact=check_exact,
+ )
+ else:
+ with self.assertRaises(expected_error):
+ func(p_groupby_obj)
+ with self.assertRaises(expected_error):
+ func(ps_groupby_obj)
class GroupbyStatMixin(GroupbyStatTestingFuncMixin):
@@ -60,6 +73,12 @@ class GroupbyStatMixin(GroupbyStatTestingFuncMixin):
def test_mean(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.mean(numeric_only=True))
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ # pandas < 3 raises an error when numeric_only is False or None
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.mean(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
psdf = self.psdf
with self.assertRaises(TypeError):
psdf.groupby("A")["C"].mean()
@@ -67,14 +86,20 @@ class GroupbyStatMixin(GroupbyStatTestingFuncMixin):
def test_min(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.min())
self._test_stat_func(lambda groupby_obj: groupby_obj.min(min_count=2))
- self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=None))
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.min(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True))
self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True, min_count=2))
def test_max(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.max())
self._test_stat_func(lambda groupby_obj: groupby_obj.max(min_count=2))
- self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=None))
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.max(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=True))
self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=True, min_count=2))
@@ -96,6 +121,10 @@ class GroupbyStatMixin(GroupbyStatTestingFuncMixin):
pdf.groupby("A").sum(min_count=3).sort_index(),
psdf.groupby("A").sum(min_count=3).sort_index(),
)
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.sum(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
def test_median(self):
psdf = ps.DataFrame(
@@ -121,6 +150,13 @@ class GroupbyStatMixin(GroupbyStatTestingFuncMixin):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
psdf.groupby("a").median(accuracy="a")
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ # pandas < 3 raises an error when numeric_only is False or None
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.median(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
+
class GroupbyStatTests(
GroupbyStatMixin,
diff --git a/python/pyspark/pandas/tests/groupby/test_stat_adv.py b/python/pyspark/pandas/tests/groupby/test_stat_adv.py
index 70b9b8797ef9..e3f17d22b11c 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat_adv.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat_adv.py
@@ -21,7 +21,7 @@ import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
-from pyspark.pandas.tests.groupby.test_stat import GroupbyStatTestingFuncMixin
+from pyspark.pandas.tests.groupby.test_stat import GroupbyStatTestingFuncMixin, using_pandas3
class GroupbyStatAdvMixin(GroupbyStatTestingFuncMixin):
@@ -84,7 +84,10 @@ class GroupbyStatAdvMixin(GroupbyStatTestingFuncMixin):
def test_first(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.first())
- self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=None))
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.first(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=True))
pdf = pd.DataFrame(
@@ -108,7 +111,10 @@ class GroupbyStatAdvMixin(GroupbyStatTestingFuncMixin):
def test_last(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.last())
- self._test_stat_func(lambda groupby_obj: groupby_obj.last(numeric_only=None))
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.last(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
self._test_stat_func(lambda groupby_obj: groupby_obj.last(numeric_only=True))
pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/groupby/test_stat_func.py b/python/pyspark/pandas/tests/groupby/test_stat_func.py
index e2a914e57e24..7eca1e53918a 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat_func.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat_func.py
@@ -19,8 +19,9 @@
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
-from pyspark.pandas.tests.groupby.test_stat import GroupbyStatTestingFuncMixin
+from pyspark.pandas.tests.groupby.test_stat import GroupbyStatTestingFuncMixin, using_pandas3
class FuncTestsMixin(GroupbyStatTestingFuncMixin):
@@ -43,6 +44,12 @@ class FuncTestsMixin(GroupbyStatTestingFuncMixin):
self._test_stat_func(
lambda groupby_obj: groupby_obj.var(numeric_only=True), check_exact=False
)
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ # pandas < 3 raises an error when numeric_only is False or None
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.var(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
pdf, psdf = self.pdf, self.psdf
@@ -54,10 +61,16 @@ class FuncTestsMixin(GroupbyStatTestingFuncMixin):
psdf.groupby("A").median().sort_index(),
expected,
)
- self.assert_eq(
- psdf.groupby("A").median(numeric_only=None).sort_index(),
- expected,
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(
+ psdf.groupby("A").median(numeric_only=None).sort_index(),
+ expected,
+ )
+ else:
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.median(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
self.assert_eq(
psdf.groupby("A").median(numeric_only=False).sort_index(),
expected,
@@ -96,6 +109,12 @@ class FuncTestsMixin(GroupbyStatTestingFuncMixin):
pdf.groupby("A").sum().sort_index(),
check_exact=False,
)
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ # pandas < 3 raises an error when numeric_only is False or None
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.sum(numeric_only=None),
+ expected_error=ValueError if using_pandas3 else None,
+ )
class FuncTests(
diff --git a/python/pyspark/pandas/tests/groupby/test_stat_prod.py b/python/pyspark/pandas/tests/groupby/test_stat_prod.py
index 12e73a700708..c4564cc0949a 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat_prod.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat_prod.py
@@ -20,8 +20,9 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
-from pyspark.pandas.tests.groupby.test_stat import GroupbyStatTestingFuncMixin
+from pyspark.pandas.tests.groupby.test_stat import GroupbyStatTestingFuncMixin, using_pandas3
class ProdTestsMixin(GroupbyStatTestingFuncMixin):
@@ -64,6 +65,13 @@ class ProdTestsMixin(GroupbyStatTestingFuncMixin):
psdf.groupby("A").prod(min_count=n).sort_index(),
almost=True,
)
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ # pandas < 3 raises an error when numeric_only is False or None
+ self._test_stat_func(
+ lambda groupby_obj: groupby_obj.prod(numeric_only=None, min_count=n),
+ check_exact=False,
+ expected_error=ValueError if using_pandas3 else None,
+ )
class ProdTests(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]