This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new 548184079bc [SPARK-39186][PYTHON] Make pandas-on-Spark's skew consistent with pandas 548184079bc is described below commit 548184079bc0131f235fc65540911996f7aa6c86 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Sun May 15 09:30:55 2022 +0900 [SPARK-39186][PYTHON] Make pandas-on-Spark's skew consistent with pandas The logic for computing skewness differs between Spark SQL and pandas: Spark SQL: [`sqrt(n) * m3 / sqrt(m2 * m2 * m2)`](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L304) pandas: [`(count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)`](https://github.com/pandas-dev/pandas/blob/main/pandas/core/nanops.py#L1221) Why the change is needed: to make skew consistent with pandas. User-facing change: yes, the logic to compute skew was changed. Testing: added UT. Closes #36549 from zhengruifeng/adjust_pandas_skew. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 7e4519c9a8ba35958ef6d408be3ca4e97917c965) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit 386c75693b5b9dd5e3b2147d49f0284badaa7d6d) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/pandas/generic.py | 11 ++++++++++- python/pyspark/pandas/tests/test_stats.py | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 2fc79f6eb5f..ad3fcd74e57 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -1459,7 +1459,16 @@ class Frame(object, metaclass=ABCMeta): spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() ) ) - return F.skewness(spark_column) + + count_scol = F.count(F.when(~spark_column.isNull(), 1).otherwise(None)) + # refer to the Pandas implementation 'nanskew' + # 
https://github.com/pandas-dev/pandas/blob/main/pandas/core/nanops.py#L1152 + return F.when( + count_scol > 2, + F.skewness(spark_column) + * F.sqrt(1 - 1 / count_scol) + * (count_scol / (count_scol - 2)), + ).otherwise(None) return self._reduce_for_stat_function( skew, name="skew", axis=axis, numeric_only=numeric_only diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 21366954e32..667b43d0c96 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -183,6 +183,7 @@ class StatsTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1)) self.assert_eq(psdf.product(axis=1), pdf.product(axis=1)) self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1)) + self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True) self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1)) self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1)) self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1)) @@ -220,6 +221,11 @@ class StatsTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq( psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, numeric_only=True) ) + self.assert_eq( + psdf.skew(axis=0, numeric_only=True), + pdf.skew(axis=0, numeric_only=True), + almost=True, + ) self.assert_eq( psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, numeric_only=True) ) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org