This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
min_count : int, default -1
versionadded:: 3.4.0 See Also @@ -663,7 +670,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): Examples -------- >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True], - ... "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]}) + ... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]}) >>> df.groupby("A").min().sort_index() B C D A @@ -677,9 +684,37 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): A 1 False 3 2 False 4 + + >>> df.groupby("D").min().sort_index() + A B C + D + a 1 False 3 + b 1 False 3 + + + >>> df.groupby("D").min(min_count=3).sort_index() + A B C + D + a 1.0 False 3.0 + b NaN None NaN """ + if not isinstance(min_count, int): + raise TypeError("min_count must be integer") + + if min_count > 0: + + def min(col: Column) -> Column: + return F.when( + F.count(F.when(~F.isnull(col), F.lit(0))) < min_count, F.lit(None) + ).otherwise(F.min(col)) + + else: + + def min(col: Column) -> Column: + return F.min(col) + return self._reduce_for_stat_function( - F.min, accepted_spark_types=(NumericType, BooleanType) if numeric_only else None + min, accepted_spark_types=(NumericType, BooleanType) if numeric_only else None ) # TODO: sync the doc. 
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 4a57a3421df..f0b3a04be17 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -1401,8 +1401,10 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): def test_min(self): self._test_stat_func(lambda groupby_obj: groupby_obj.min()) + self._test_stat_func(lambda groupby_obj: groupby_obj.min(min_count=2)) self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=None)) self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True)) + self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True, min_count=2)) def test_max(self): self._test_stat_func(lambda groupby_obj: groupby_obj.max()) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org