This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new dde94a68b85 [SPARK-40305][PS] Implement Groupby.sem
dde94a68b85 is described below

commit dde94a68b850b23df8fca3531350a7b5643b3cd1
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Sun Sep 4 14:25:47 2022 +0800

    [SPARK-40305][PS] Implement Groupby.sem

    ### What changes were proposed in this pull request?
    Implement Groupby.sem

    ### Why are the changes needed?
    to increase API coverage

    ### Does this PR introduce _any_ user-facing change?
    yes, new API

    ### How was this patch tested?
    added UT

    Closes #37756 from zhengruifeng/ps_groupby_sem.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 .../source/reference/pyspark.pandas/groupby.rst |  1 +
 python/pyspark/pandas/generic.py                |  2 +-
 python/pyspark/pandas/groupby.py                | 80 ++++++++++++++++++++--
 python/pyspark/pandas/missing/groupby.py        |  2 -
 python/pyspark/pandas/tests/test_groupby.py     | 21 ++++++
 5 files changed, 97 insertions(+), 9 deletions(-)

diff --git a/python/docs/source/reference/pyspark.pandas/groupby.rst b/python/docs/source/reference/pyspark.pandas/groupby.rst
index 6d8eed8e684..b331a49b683 100644
--- a/python/docs/source/reference/pyspark.pandas/groupby.rst
+++ b/python/docs/source/reference/pyspark.pandas/groupby.rst
@@ -74,6 +74,7 @@ Computations / Descriptive Stats
    GroupBy.median
    GroupBy.min
    GroupBy.rank
+   GroupBy.sem
    GroupBy.std
    GroupBy.sum
    GroupBy.var
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index bd2b68da51f..8ce3061b7cd 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -2189,7 +2189,7 @@ class Frame(object, metaclass=ABCMeta):
                 return F.stddev_samp(spark_column)
 
         def sem(psser: "Series") -> Column:
-            return std(psser) / pow(Frame._count_expr(psser), 0.5)
+            return std(psser) / F.sqrt(Frame._count_expr(psser))
 
         return self._reduce_for_stat_function(
             sem,
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 4377ad6a5c9..84a5a3377f3 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -650,12 +650,11 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
         assert ddof in (0, 1)
 
         # Raise the TypeError when all aggregation columns are of unaccepted data types
-        all_unaccepted = True
-        for _agg_col in self._agg_columns:
-            if isinstance(_agg_col.spark.data_type, (NumericType, BooleanType)):
-                all_unaccepted = False
-                break
-        if all_unaccepted:
+        any_accepted = any(
+            isinstance(_agg_col.spark.data_type, (NumericType, BooleanType))
+            for _agg_col in self._agg_columns
+        )
+        if not any_accepted:
             raise TypeError(
                 "Unaccepted data types of aggregation columns; numeric or bool expected."
             )
@@ -827,6 +826,75 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
 
         return self._prepare_return(DataFrame(internal))
 
+    def sem(self, ddof: int = 1) -> FrameLike:
+        """
+        Compute standard error of the mean of groups, excluding missing values.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+
+        Examples
+        --------
+        >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],
"C": [3, None, 3, 4], "D": ["a", "b", "b", "a"]}) + + >>> df.groupby("A").sem() + B C + A + 1 0.333333 0.333333 + 2 NaN NaN + + >>> df.groupby("D").sem(ddof=1) + A B C + D + a 0.0 0.0 0.5 + b 0.5 0.0 NaN + + >>> df.B.groupby(df.A).sem() + A + 1 0.333333 + 2 NaN + Name: B, dtype: float64 + + See Also + -------- + pyspark.pandas.Series.sem + pyspark.pandas.DataFrame.sem + """ + if ddof not in [0, 1]: + raise TypeError("ddof must be 0 or 1") + + # Raise the TypeError when all aggregation columns are of unaccepted data types + any_accepted = any( + isinstance(_agg_col.spark.data_type, (NumericType, BooleanType)) + for _agg_col in self._agg_columns + ) + if not any_accepted: + raise TypeError( + "Unaccepted data types of aggregation columns; numeric or bool expected." + ) + + if ddof == 0: + + def sem(col: Column) -> Column: + return F.stddev_pop(col) / F.sqrt(F.count(col)) + + else: + + def sem(col: Column) -> Column: + return F.stddev_samp(col) / F.sqrt(F.count(col)) + + return self._reduce_for_stat_function( + sem, + accepted_spark_types=(NumericType, BooleanType), + bool_to_numeric=True, + ) + def all(self, skipna: bool = True) -> FrameLike: """ Returns True if all values in the group are truthful, else False. diff --git a/python/pyspark/pandas/missing/groupby.py b/python/pyspark/pandas/missing/groupby.py index ce61b1df1e1..8ae8a68b5fe 100644 --- a/python/pyspark/pandas/missing/groupby.py +++ b/python/pyspark/pandas/missing/groupby.py @@ -65,7 +65,6 @@ class MissingPandasLikeDataFrameGroupBy: pipe = _unsupported_function("pipe") prod = _unsupported_function("prod") resample = _unsupported_function("resample") - sem = _unsupported_function("sem") class MissingPandasLikeSeriesGroupBy: @@ -100,4 +99,3 @@ class MissingPandasLikeSeriesGroupBy: pipe = _unsupported_function("pipe") prod = _unsupported_function("prod") resample = _unsupported_function("resample") - sem = _unsupported_function("sem") diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index cff2ce706d8..d1a1c9afcc7 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -1321,11 +1321,21 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): ): psdf.groupby("A")[["C"]].std() + with self.assertRaisesRegex( + TypeError, "Unaccepted data types of aggregation columns; numeric or bool expected." + ): + psdf.groupby("A")[["C"]].sem() + self.assert_eq( psdf.groupby("A").std().sort_index(), pdf.groupby("A").std().sort_index(), check_exact=False, ) + self.assert_eq( + psdf.groupby("A").sem().sort_index(), + pdf.groupby("A").sem().sort_index(), + check_exact=False, + ) # TODO: fix bug of `sum` and re-enable the test below # self._test_stat_func(lambda groupby_obj: groupby_obj.sum(), check_exact=False) @@ -3055,6 +3065,17 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): psdf.groupby("a")["b"].var(ddof=ddof).sort_index(), check_exact=False, ) + # sem + self.assert_eq( + pdf.groupby("a").sem(ddof=ddof).sort_index(), + psdf.groupby("a").sem(ddof=ddof).sort_index(), + check_exact=False, + ) + self.assert_eq( + pdf.groupby("a")["b"].sem(ddof=ddof).sort_index(), + psdf.groupby("a")["b"].sem(ddof=ddof).sort_index(), + check_exact=False, + ) def test_getitem(self): psdf = ps.DataFrame( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org