This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1d09e7b [SPARK-38704][PYTHON] Support string `inclusive` parameter of `Series.between` 1d09e7b is described below commit 1d09e7be3aedd43a0b8beb44f17b7e79b9e9d402 Author: Xinrong Meng <xinrong.m...@databricks.com> AuthorDate: Fri Apr 1 13:38:15 2022 +0900 [SPARK-38704][PYTHON] Support string `inclusive` parameter of `Series.between` ### What changes were proposed in this pull request? Support string `inclusive` parameter of `Series.between` ### Why are the changes needed? To reach parity with Pandas. ### Does this PR introduce _any_ user-facing change? Yes. String `inclusive` is supported now as below. ```py >>> s = ps.Series([2, 0, 4, 8, np.nan]) With `inclusive` set to "both" boundary values are included: >>> s.between(0, 4, inclusive="both") 0 True 1 True 2 True 3 False 4 False dtype: bool With `inclusive` set to "neither" boundary values are excluded: >>> s.between(0, 4, inclusive="neither") 0 True 1 False 2 False 3 False 4 False dtype: bool With `inclusive` set to "right" only right boundary value is included: >>> s.between(0, 4, inclusive="right") 0 True 1 False 2 True 3 False 4 False dtype: bool With `inclusive` set to "left" only left boundary value is included: >>> s.between(0, 4, inclusive="left") 0 True 1 True 2 False 3 False 4 False dtype: bool ``` ### How was this patch tested? Unit tests. Closes #36015 from xinrong-databricks/series.between. Authored-by: Xinrong Meng <xinrong.m...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/pandas/series.py | 61 +++++++++++++++++++++++++----- python/pyspark/pandas/tests/test_series.py | 24 ++++++++++++ 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index a73ea1e..5c195da 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -22,6 +22,7 @@ import datetime import re import inspect import sys +import warnings from collections.abc import Mapping from functools import partial, reduce from typing import ( @@ -853,7 +854,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): """ return self.rfloordiv(other), self.rmod(other) - def between(self, left: Any, right: Any, inclusive: bool = True) -> "Series": + def between(self, left: Any, right: Any, inclusive: Union[bool, str] = "both") -> "Series": """ Return boolean Series equivalent to left <= series <= right. This function returns a boolean vector containing `True` wherever the @@ -866,8 +867,9 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Left boundary. right : scalar or list-like Right boundary. - inclusive : bool, default True - Include boundaries. + inclusive : {"both", "neither", "left", "right"} or boolean. "both" by default. + Include boundaries. Whether to set each bound as closed or open. + Booleans are deprecated in favour of `both` or `neither`. Returns ------- @@ -890,19 +892,39 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Boundary values are included by default: - >>> s.between(1, 4) + >>> s.between(0, 4) 0 True - 1 False + 1 True 2 True 3 False 4 False dtype: bool - With `inclusive` set to ``False`` boundary values are excluded: + With `inclusive` set to "neither" boundary values are excluded: + + >>> s.between(0, 4, inclusive="neither") + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + With `inclusive` set to "right" only right boundary value is included: - >>> s.between(1, 4, inclusive=False) + >>> s.between(0, 4, inclusive="right") 0 True 1 False + 2 True + 3 False + 4 False + dtype: bool + + With `inclusive` set to "left" only left boundary value is included: + + >>> s.between(0, 4, inclusive="left") + 0 True + 1 True 2 False 3 False 4 False @@ -918,12 +940,33 @@ class Series(Frame, IndexOpsMixin, Generic[T]): 3 False dtype: bool """ - if inclusive: + if inclusive is True or inclusive is False: + warnings.warn( + "Boolean inputs to the `inclusive` argument are deprecated in " + "favour of `both` or `neither`.", + FutureWarning, + ) + if inclusive: + inclusive = "both" + else: + inclusive = "neither" + + if inclusive == "both": lmask = self >= left rmask = self <= right - else: + elif inclusive == "left": + lmask = self >= left + rmask = self < right + elif inclusive == "right": + lmask = self > left + rmask = self <= right + elif inclusive == "neither": lmask = self > left rmask = self < right + else: + raise ValueError( + "Inclusive has to be either string of 'both'," "'left', 'right', or 'neither'." + ) return lmask & rmask diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 3ed8866..dafb519 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -2970,6 +2970,30 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pser.rpow(np.nan), psser.rpow(np.nan)) self.assert_eq(1 ** pser, 1 ** psser) + def test_between(self): + pser = pd.Series([np.nan, 1, 2, 3, 4]) + psser = ps.from_pandas(pser) + self.assert_eq(psser.between(1, 4), pser.between(1, 4)) + self.assert_eq(psser.between(1, 4, inclusive="both"), pser.between(1, 4, inclusive="both")) + self.assert_eq( + psser.between(1, 4, inclusive="neither"), pser.between(1, 4, inclusive="neither") + ) + self.assert_eq(psser.between(1, 4, inclusive="left"), pser.between(1, 4, inclusive="left")) + self.assert_eq( + psser.between(1, 4, inclusive="right"), pser.between(1, 4, inclusive="right") + ) + expected_err_msg = ( + "Inclusive has to be either string of 'both'," "'left', 'right', or 'neither'" + ) + with self.assertRaisesRegex(ValueError, expected_err_msg): + psser.between(1, 4, inclusive="middle") + + # Test for backward compatibility + self.assert_eq(psser.between(1, 4, inclusive=True), pser.between(1, 4, inclusive=True)) + self.assert_eq(psser.between(1, 4, inclusive=False), pser.between(1, 4, inclusive=False)) + with self.assertWarns(FutureWarning): + psser.between(1, 4, inclusive=True) + def test_between_time(self): idx = pd.date_range("2018-04-09", periods=4, freq="1D20min") pser = pd.Series([1, 2, 3, 4], index=idx) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org