This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new d165de8c04c [SPARK-39246][PS] Implement Groupby.skew
d165de8c04c is described below

commit d165de8c04c41de5c67925cf670b2d7211c4da68
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Sun May 22 13:42:12 2022 +0800

    [SPARK-39246][PS] Implement Groupby.skew

    ### What changes were proposed in this pull request?
    Implement Groupby.skew

    ### Why are the changes needed?
    for api coverage

    ### Does this PR introduce _any_ user-facing change?
    yes, new api added
    ```
    In [4]: df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True], "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})

    In [5]: df.groupby("A").skew()
    Out[5]:
              B         C
    A
    1 -1.732051  1.732051
    2       NaN       NaN

    ```

    ### How was this patch tested?
    added UT

    Closes #36624 from zhengruifeng/ps_groupby_skew_kurt.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/pandas/groupby.py            | 34 +++++++++++++++++++++++++++++
 python/pyspark/pandas/missing/groupby.py    |  2 --
 python/pyspark/pandas/tests/test_groupby.py |  2 +-
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index d1cff8e960d..03e6a038232 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -54,6 +54,7 @@ else:
 
     _builtin_table = SelectionMixin._builtin_table  # type: ignore[attr-defined]
 
+from pyspark import SparkContext
 from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions as F
 from pyspark.sql.types import (
     BooleanType,
@@ -725,6 +726,39 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
             bool_to_numeric=True,
         )
 
+    def skew(self) -> FrameLike:
+        """
+        Compute skewness of groups, excluding missing values.
+
+        .. versionadded:: 3.4.0
+
+        Examples
+        --------
+        >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],
+        ...                    "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
+
+        >>> df.groupby("A").skew()
+                  B         C
+        A
+        1 -1.732051  1.732051
+        2       NaN       NaN
+
+        See Also
+        --------
+        pyspark.pandas.Series.groupby
+        pyspark.pandas.DataFrame.groupby
+        """
+
+        def skew(scol: Column) -> Column:
+            sql_utils = SparkContext._active_spark_context._jvm.PythonSQLUtils
+            return Column(sql_utils.pandasSkewness(scol._jc))
+
+        return self._reduce_for_stat_function(
+            skew,
+            accepted_spark_types=(NumericType,),
+            bool_to_numeric=True,
+        )
+
     # TODO: skipna should be implemented.
     def all(self) -> FrameLike:
         """
diff --git a/python/pyspark/pandas/missing/groupby.py b/python/pyspark/pandas/missing/groupby.py
index d0867e4982f..3ea443ebd6e 100644
--- a/python/pyspark/pandas/missing/groupby.py
+++ b/python/pyspark/pandas/missing/groupby.py
@@ -52,7 +52,6 @@ class MissingPandasLikeDataFrameGroupBy:
     ngroups = _unsupported_property("ngroups")
     plot = _unsupported_property("plot")
     quantile = _unsupported_property("quantile")
-    skew = _unsupported_property("skew")
     tshift = _unsupported_property("tshift")
 
     # Deprecated properties
@@ -87,7 +86,6 @@ class MissingPandasLikeSeriesGroupBy:
     ngroups = _unsupported_property("ngroups")
     plot = _unsupported_property("plot")
     quantile = _unsupported_property("quantile")
-    skew = _unsupported_property("skew")
     tshift = _unsupported_property("tshift")
 
     # Deprecated properties
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 1375d7a9bc0..045cbaf5274 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -266,7 +266,7 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
         funcs = [
             ((True, False), ["sum", "min", "max", "count", "first", "last"]),
             ((True, True), ["mean"]),
-            ((False, False), ["var", "std"]),
+            ((False, False), ["var", "std", "skew"]),
         ]
         funcs = [(check_exact, almost, f) for (check_exact, almost), fs in funcs for f in fs]
 
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org