This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 26ed4fbc00d [SPARK-43873][PS] Enabling `FrameDescribeTests` 26ed4fbc00d is described below commit 26ed4fbc00dd9331807f747dd4e8ed7993c2497f Author: itholic <haejoon....@databricks.com> AuthorDate: Fri Aug 4 10:35:06 2023 +0900 [SPARK-43873][PS] Enabling `FrameDescribeTests` ### What changes were proposed in this pull request? This PR proposes to enable the test `FrameDescribeTests`. ### Why are the changes needed? To increase test coverage for pandas API on Spark with pandas 2.0.0 and above. ### Does this PR introduce _any_ user-facing change? No, it's test-only. ### How was this patch tested? Enabling the existing test. Closes #42319 from itholic/pandas_describe. Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../pandas/tests/computation/test_describe.py | 39 +++++----------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/python/pyspark/pandas/tests/computation/test_describe.py b/python/pyspark/pandas/tests/computation/test_describe.py index af98d2869da..bbee9654eae 100644 --- a/python/pyspark/pandas/tests/computation/test_describe.py +++ b/python/pyspark/pandas/tests/computation/test_describe.py @@ -39,10 +39,6 @@ class FrameDescribeMixin: psdf = ps.from_pandas(pdf) return pdf, psdf - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.", - ) def test_describe(self): pdf, psdf = self.df_pair @@ -78,19 +74,10 @@ class FrameDescribeMixin: } ) pdf = psdf._to_pandas() - # NOTE: Set `datetime_is_numeric=True` for pandas: - # FutureWarning: Treating datetime data as categorical rather than numeric in - # `.describe` is deprecated and will be removed in a future version of pandas. - # Specify `datetime_is_numeric=True` to silence this - # warning and adopt the future behavior now.
- # NOTE: Compare the result except percentiles, since we use approximate percentile - # so the result is different from pandas. if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): self.assert_eq( psdf.describe().loc[["count", "mean", "min", "max"]], - pdf.describe(datetime_is_numeric=True) - .astype(str) - .loc[["count", "mean", "min", "max"]], + pdf.describe().astype(str).loc[["count", "mean", "min", "max"]], ) else: self.assert_eq( @@ -136,17 +123,13 @@ class FrameDescribeMixin: if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): self.assert_eq( psdf.describe().loc[["count", "mean", "min", "max"]], - pdf.describe(datetime_is_numeric=True) - .astype(str) - .loc[["count", "mean", "min", "max"]], + pdf.describe().astype(str).loc[["count", "mean", "min", "max"]], ) psdf.A += psdf.A pdf.A += pdf.A self.assert_eq( psdf.describe().loc[["count", "mean", "min", "max"]], - pdf.describe(datetime_is_numeric=True) - .astype(str) - .loc[["count", "mean", "min", "max"]], + pdf.describe().astype(str).loc[["count", "mean", "min", "max"]], ) else: expected_result = ps.DataFrame( @@ -187,7 +170,7 @@ class FrameDescribeMixin: ) pdf = psdf._to_pandas() if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): - pandas_result = pdf.describe(datetime_is_numeric=True) + pandas_result = pdf.describe() pandas_result.B = pandas_result.B.astype(str) self.assert_eq( psdf.describe().loc[["count", "mean", "min", "max"]], @@ -195,7 +178,7 @@ class FrameDescribeMixin: ) psdf.A += psdf.A pdf.A += pdf.A - pandas_result = pdf.describe(datetime_is_numeric=True) + pandas_result = pdf.describe() pandas_result.B = pandas_result.B.astype(str) self.assert_eq( psdf.describe().loc[["count", "mean", "min", "max"]], @@ -252,7 +235,7 @@ class FrameDescribeMixin: ) pdf = psdf._to_pandas() if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): - pandas_result = pdf.describe(datetime_is_numeric=True) + pandas_result = pdf.describe() pandas_result.b = pandas_result.b.astype(str) self.assert_eq( 
psdf.describe().loc[["count", "mean", "min", "max"]], @@ -288,10 +271,6 @@ class FrameDescribeMixin: with self.assertRaisesRegex(ValueError, msg): psdf.describe() - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.", - ) def test_describe_empty(self): # Empty DataFrame psdf = ps.DataFrame(columns=["A", "B"]) @@ -328,7 +307,7 @@ class FrameDescribeMixin: # For timestamp type, we should convert NaT to None in pandas result # since pandas API on Spark doesn't support the NaT for object type. if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): - pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True) + pdf_result = pdf[pdf.a != pdf.a].describe() self.assert_eq( psdf[psdf.a != psdf.a].describe(), pdf_result.where(pdf_result.notnull(), None).astype(str), @@ -367,7 +346,7 @@ class FrameDescribeMixin: ) pdf = psdf._to_pandas() if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): - pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True) + pdf_result = pdf[pdf.a != pdf.a].describe() pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str) self.assert_eq( psdf[psdf.a != psdf.a].describe(), @@ -417,7 +396,7 @@ class FrameDescribeMixin: ) pdf = psdf._to_pandas() if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): - pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True) + pdf_result = pdf[pdf.a != pdf.a].describe() self.assert_eq( psdf[psdf.a != psdf.a].describe(), pdf_result.where(pdf_result.notnull(), None).astype(str), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org