This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ef05fb632424 [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests` ef05fb632424 is described below commit ef05fb632424f8f121bcd4518ab3a8815c295c85 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Tue Dec 5 10:19:41 2023 -0800 [SPARK-46268][PS][CONNECT][TESTS] Re-organize `StatsTests` ### What changes were proposed in this pull request? Re-organize `StatsTests` ### Why are the changes needed? break the big test file by grouping test cases by topics ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44185 from zhengruifeng/ps_reorg_test_stats. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- dev/sparktestsupport/modules.py | 8 +- .../pyspark/pandas/tests/computation/test_corr.py | 222 ++++++++++++++++ .../pandas/tests/{ => computation}/test_stats.py | 278 +-------------------- .../test_parity_corr.py} | 7 +- .../connect/{ => computation}/test_parity_stats.py | 4 +- .../test_parity_axis.py} | 6 +- python/pyspark/pandas/tests/frame/test_axis.py | 135 ++++++++++ 7 files changed, 373 insertions(+), 287 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 9bbe86baa1dc..900329d07c00 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -789,6 +789,7 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.computation.test_binary_ops", "pyspark.pandas.tests.computation.test_combine", "pyspark.pandas.tests.computation.test_compute", + "pyspark.pandas.tests.computation.test_corr", "pyspark.pandas.tests.computation.test_corrwith", "pyspark.pandas.tests.computation.test_cov", "pyspark.pandas.tests.computation.test_cumulative", @@ -797,7 +798,9 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.computation.test_melt", 
"pyspark.pandas.tests.computation.test_missing_data", "pyspark.pandas.tests.computation.test_pivot", + "pyspark.pandas.tests.computation.test_stats", "pyspark.pandas.tests.frame.test_attrs", + "pyspark.pandas.tests.frame.test_axis", "pyspark.pandas.tests.frame.test_constructor", "pyspark.pandas.tests.frame.test_conversion", "pyspark.pandas.tests.frame.test_reindexing", @@ -841,7 +844,6 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.series.test_series", "pyspark.pandas.tests.series.test_sort", "pyspark.pandas.tests.series.test_stat", - "pyspark.pandas.tests.test_stats", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and @@ -1014,6 +1016,7 @@ pyspark_pandas_connect_part0 = Module( "pyspark.pandas.tests.connect.computation.test_parity_combine", "pyspark.pandas.tests.connect.computation.test_parity_compute", "pyspark.pandas.tests.connect.computation.test_parity_cov", + "pyspark.pandas.tests.connect.computation.test_parity_corr", "pyspark.pandas.tests.connect.computation.test_parity_corrwith", "pyspark.pandas.tests.connect.computation.test_parity_cumulative", "pyspark.pandas.tests.connect.computation.test_parity_describe", @@ -1021,6 +1024,7 @@ pyspark_pandas_connect_part0 = Module( "pyspark.pandas.tests.connect.computation.test_parity_melt", "pyspark.pandas.tests.connect.groupby.test_parity_stat", "pyspark.pandas.tests.connect.frame.test_parity_attrs", + "pyspark.pandas.tests.connect.frame.test_parity_axis", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series", ], @@ -1075,7 +1079,6 @@ pyspark_pandas_connect_part1 = Module( "pyspark.pandas.tests.connect.series.test_parity_stat", "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic", "pyspark.pandas.tests.connect.test_parity_reshape", - "pyspark.pandas.tests.connect.test_parity_stats", 
"pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_expanding", ], excluded_python_implementations=[ @@ -1094,6 +1097,7 @@ pyspark_pandas_connect_part2 = Module( python_test_goals=[ # pandas-on-Spark unittests "pyspark.pandas.tests.connect.computation.test_parity_pivot", + "pyspark.pandas.tests.connect.computation.test_parity_stats", "pyspark.pandas.tests.connect.indexes.test_parity_base_slow", "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property", "pyspark.pandas.tests.connect.test_parity_frame_interpolate", diff --git a/python/pyspark/pandas/tests/computation/test_corr.py b/python/pyspark/pandas/tests/computation/test_corr.py new file mode 100644 index 000000000000..a7b06aa2928a --- /dev/null +++ b/python/pyspark/pandas/tests/computation/test_corr.py @@ -0,0 +1,222 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED +from pyspark.testing.sqlutils import SQLTestUtils + + +class FrameCorrMixin: + def test_dataframe_corr(self): + pdf = pd.DataFrame( + index=[ + "".join( + np.random.choice( + list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10 + ) + ) + for _ in range(30) + ], + columns=list("ABCD"), + dtype="float64", + ) + psdf = ps.from_pandas(pdf) + + with self.assertRaisesRegex(ValueError, "Invalid method"): + psdf.corr("std") + with self.assertRaisesRegex(TypeError, "Invalid min_periods type"): + psdf.corr(min_periods="3") + + for method in ["pearson", "spearman", "kendall"]: + self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), check_exact=False) + self.assert_eq( + psdf.corr(method=method, min_periods=1), + pdf.corr(method=method, min_periods=1), + check_exact=False, + ) + self.assert_eq( + psdf.corr(method=method, min_periods=3), + pdf.corr(method=method, min_periods=3), + check_exact=False, + ) + self.assert_eq( + (psdf + 1).corr(method=method, min_periods=2), + (pdf + 1).corr(method=method, min_periods=2), + check_exact=False, + ) + + # multi-index columns + columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")]) + pdf.columns = columns + psdf.columns = columns + + for method in ["pearson", "spearman", "kendall"]: + self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), check_exact=False) + self.assert_eq( + psdf.corr(method=method, min_periods=1), + pdf.corr(method=method, min_periods=1), + check_exact=False, + ) + self.assert_eq( + psdf.corr(method=method, min_periods=3), + pdf.corr(method=method, min_periods=3), + check_exact=False, + ) + self.assert_eq( + (psdf + 1).corr(method=method, min_periods=2), + (pdf + 1).corr(method=method, min_periods=2), + check_exact=False, + ) + + # test with identical values 
+ pdf = pd.DataFrame( + { + "a": [0, 1, 1, 1, 0], + "b": [2, 2, -1, 1, np.nan], + "c": [3, 3, 3, 3, 3], + "d": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + psdf = ps.from_pandas(pdf) + + for method in ["pearson", "spearman", "kendall"]: + self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), check_exact=False) + self.assert_eq( + psdf.corr(method=method, min_periods=1), + pdf.corr(method=method, min_periods=1), + check_exact=False, + ) + self.assert_eq( + psdf.corr(method=method, min_periods=3), + pdf.corr(method=method, min_periods=3), + check_exact=False, + ) + + def test_series_corr(self): + pdf = pd.DataFrame( + index=[ + "".join( + np.random.choice( + list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10 + ) + ) + for _ in range(30) + ], + columns=list("ABCD"), + dtype="float64", + ) + pser1 = pdf.A + pser2 = pdf.B + psdf = ps.from_pandas(pdf) + psser1 = psdf.A + psser2 = psdf.B + + with self.assertRaisesRegex(ValueError, "Invalid method"): + psser1.corr(psser2, method="std") + with self.assertRaisesRegex(TypeError, "Invalid min_periods type"): + psser1.corr(psser2, min_periods="3") + + for method in ["pearson", "spearman", "kendall"]: + self.assert_eq( + psser1.corr(psser2, method=method), + pser1.corr(pser2, method=method), + almost=True, + ) + self.assert_eq( + psser1.corr(psser2, method=method, min_periods=1), + pser1.corr(pser2, method=method, min_periods=1), + almost=True, + ) + self.assert_eq( + psser1.corr(psser2, method=method, min_periods=3), + pser1.corr(pser2, method=method, min_periods=3), + almost=True, + ) + self.assert_eq( + (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2), + (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2), + almost=True, + ) + + # different anchors + psser1 = ps.from_pandas(pser1) + psser2 = ps.from_pandas(pser2) + + with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): + psser1.corr(psser2) + + for method in ["pearson", "spearman", 
"kendall"]: + with ps.option_context("compute.ops_on_diff_frames", True): + self.assert_eq( + psser1.corr(psser2, method=method), + pser1.corr(pser2, method=method), + almost=True, + ) + self.assert_eq( + psser1.corr(psser2, method=method, min_periods=1), + pser1.corr(pser2, method=method, min_periods=1), + almost=True, + ) + self.assert_eq( + psser1.corr(psser2, method=method, min_periods=3), + pser1.corr(pser2, method=method, min_periods=3), + almost=True, + ) + self.assert_eq( + (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2), + (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2), + almost=True, + ) + + def test_cov_corr_meta(self): + # Disable arrow execution since corr() is using UDT internally which is not supported. + with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): + pdf = pd.DataFrame( + { + "a": np.array([1, 2, 3], dtype="i1"), + "b": np.array([1, 2, 3], dtype="i2"), + "c": np.array([1, 2, 3], dtype="i4"), + "d": np.array([1, 2, 3]), + "e": np.array([1.0, 2.0, 3.0], dtype="f4"), + "f": np.array([1.0, 2.0, 3.0]), + "g": np.array([True, False, True]), + "h": np.array(list("abc")), + }, + index=pd.Index([1, 2, 3], name="myindex"), + ) + psdf = ps.from_pandas(pdf) + self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True), check_exact=False) + + +class FrameCorrTests(FrameCorrMixin, PandasOnSparkTestCase, SQLTestUtils): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.computation.test_corr import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/computation/test_stats.py similarity index 53% rename from python/pyspark/pandas/tests/test_stats.py rename to python/pyspark/pandas/tests/computation/test_stats.py index bdc83ad7d5f5..c18c489617c2 100644 --- 
a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/computation/test_stats.py @@ -15,13 +15,10 @@ # limitations under the License. # -import unittest - import numpy as np import pandas as pd from pyspark import pandas as ps -from pyspark.pandas.config import option_context from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED from pyspark.testing.sqlutils import SQLTestUtils @@ -160,99 +157,6 @@ class StatsTestsMixin: ): psdf.D.abs() - def test_axis_on_dataframe(self): - # The number of each count is intentionally big - # because when data is small, it executes a shortcut. - # Less than 'compute.shortcut_limit' will execute a shortcut - # by using collected pandas dataframe directly. - # now we set the 'compute.shortcut_limit' as 1000 explicitly - with option_context("compute.shortcut_limit", 1000): - pdf = pd.DataFrame( - { - "A": [1, -2, 3, -4, 5] * 300, - "B": [1.0, -2, 3, -4, 5] * 300, - "C": [-6.0, -7, -8, -9, 10] * 300, - "D": [True, False, True, False, False] * 300, - }, - index=range(10, 15001, 10), - ) - # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas regression is fixed - # There is a regression in Pandas 2.1.0, - # so we should manually cast to float until the regression is fixed. - # See https://github.com/pandas-dev/pandas/issues/55194. 
- pdf = pdf.astype(float) - psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.count(axis=1), pdf.count(axis=1)) - self.assert_eq(psdf.var(axis=1), pdf.var(axis=1)) - self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0)) - self.assert_eq(psdf.std(axis=1), pdf.std(axis=1)) - self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0)) - self.assert_eq(psdf.max(axis=1), pdf.max(axis=1)) - self.assert_eq(psdf.min(axis=1), pdf.min(axis=1)) - self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1)) - self.assert_eq(psdf.product(axis=1), pdf.product(axis=1)) - self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0), almost=True) - self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1)) - self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True) - self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1)) - self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1)) - self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1)) - self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0)) - - self.assert_eq( - psdf.count(axis=1, numeric_only=True), pdf.count(axis=1, numeric_only=True) - ) - self.assert_eq(psdf.var(axis=1, numeric_only=True), pdf.var(axis=1, numeric_only=True)) - self.assert_eq( - psdf.var(axis=1, ddof=0, numeric_only=True), - pdf.var(axis=1, ddof=0, numeric_only=True), - ) - self.assert_eq(psdf.std(axis=1, numeric_only=True), pdf.std(axis=1, numeric_only=True)) - self.assert_eq( - psdf.std(axis=1, ddof=0, numeric_only=True), - pdf.std(axis=1, ddof=0, numeric_only=True), - ) - self.assert_eq( - psdf.max(axis=1, numeric_only=True), - pdf.max(axis=1, numeric_only=True).astype(float), - ) - self.assert_eq( - psdf.min(axis=1, numeric_only=True), - pdf.min(axis=1, numeric_only=True).astype(float), - ) - self.assert_eq( - psdf.sum(axis=1, numeric_only=True), - pdf.sum(axis=1, numeric_only=True).astype(float), - ) - self.assert_eq( - psdf.product(axis=1, numeric_only=True), - pdf.product(axis=1, numeric_only=True).astype(float), - ) - self.assert_eq( - 
psdf.kurtosis(axis=0, numeric_only=True), - pdf.kurtosis(axis=0, numeric_only=True), - almost=True, - ) - self.assert_eq( - psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, numeric_only=True) - ) - self.assert_eq( - psdf.skew(axis=0, numeric_only=True), - pdf.skew(axis=0, numeric_only=True), - almost=True, - ) - self.assert_eq( - psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, numeric_only=True) - ) - self.assert_eq( - psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1, numeric_only=True) - ) - self.assert_eq(psdf.sem(axis=1, numeric_only=True), pdf.sem(axis=1, numeric_only=True)) - self.assert_eq( - psdf.sem(axis=1, ddof=0, numeric_only=True), - pdf.sem(axis=1, ddof=0, numeric_only=True), - ) - def test_skew_kurt_numerical_stability(self): pdf = pd.DataFrame( { @@ -268,186 +172,6 @@ class StatsTestsMixin: self.assert_eq(psdf.skew(), pdf.skew(), almost=True) self.assert_eq(psdf.kurt(), pdf.kurt(), almost=True) - def test_dataframe_corr(self): - pdf = pd.DataFrame( - index=[ - "".join( - np.random.choice( - list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10 - ) - ) - for _ in range(30) - ], - columns=list("ABCD"), - dtype="float64", - ) - psdf = ps.from_pandas(pdf) - - with self.assertRaisesRegex(ValueError, "Invalid method"): - psdf.corr("std") - with self.assertRaisesRegex(TypeError, "Invalid min_periods type"): - psdf.corr(min_periods="3") - - for method in ["pearson", "spearman", "kendall"]: - self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), check_exact=False) - self.assert_eq( - psdf.corr(method=method, min_periods=1), - pdf.corr(method=method, min_periods=1), - check_exact=False, - ) - self.assert_eq( - psdf.corr(method=method, min_periods=3), - pdf.corr(method=method, min_periods=3), - check_exact=False, - ) - self.assert_eq( - (psdf + 1).corr(method=method, min_periods=2), - (pdf + 1).corr(method=method, min_periods=2), - check_exact=False, - ) - - # multi-index columns - columns = 
pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")]) - pdf.columns = columns - psdf.columns = columns - - for method in ["pearson", "spearman", "kendall"]: - self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), check_exact=False) - self.assert_eq( - psdf.corr(method=method, min_periods=1), - pdf.corr(method=method, min_periods=1), - check_exact=False, - ) - self.assert_eq( - psdf.corr(method=method, min_periods=3), - pdf.corr(method=method, min_periods=3), - check_exact=False, - ) - self.assert_eq( - (psdf + 1).corr(method=method, min_periods=2), - (pdf + 1).corr(method=method, min_periods=2), - check_exact=False, - ) - - # test with identical values - pdf = pd.DataFrame( - { - "a": [0, 1, 1, 1, 0], - "b": [2, 2, -1, 1, np.nan], - "c": [3, 3, 3, 3, 3], - "d": [np.nan, np.nan, np.nan, np.nan, np.nan], - } - ) - psdf = ps.from_pandas(pdf) - - for method in ["pearson", "spearman", "kendall"]: - self.assert_eq(psdf.corr(method=method), pdf.corr(method=method), check_exact=False) - self.assert_eq( - psdf.corr(method=method, min_periods=1), - pdf.corr(method=method, min_periods=1), - check_exact=False, - ) - self.assert_eq( - psdf.corr(method=method, min_periods=3), - pdf.corr(method=method, min_periods=3), - check_exact=False, - ) - - def test_series_corr(self): - pdf = pd.DataFrame( - index=[ - "".join( - np.random.choice( - list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10 - ) - ) - for _ in range(30) - ], - columns=list("ABCD"), - dtype="float64", - ) - pser1 = pdf.A - pser2 = pdf.B - psdf = ps.from_pandas(pdf) - psser1 = psdf.A - psser2 = psdf.B - - with self.assertRaisesRegex(ValueError, "Invalid method"): - psser1.corr(psser2, method="std") - with self.assertRaisesRegex(TypeError, "Invalid min_periods type"): - psser1.corr(psser2, min_periods="3") - - for method in ["pearson", "spearman", "kendall"]: - self.assert_eq( - psser1.corr(psser2, method=method), - pser1.corr(pser2, method=method), - 
almost=True, - ) - self.assert_eq( - psser1.corr(psser2, method=method, min_periods=1), - pser1.corr(pser2, method=method, min_periods=1), - almost=True, - ) - self.assert_eq( - psser1.corr(psser2, method=method, min_periods=3), - pser1.corr(pser2, method=method, min_periods=3), - almost=True, - ) - self.assert_eq( - (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2), - (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2), - almost=True, - ) - - # different anchors - psser1 = ps.from_pandas(pser1) - psser2 = ps.from_pandas(pser2) - - with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - psser1.corr(psser2) - - for method in ["pearson", "spearman", "kendall"]: - with ps.option_context("compute.ops_on_diff_frames", True): - self.assert_eq( - psser1.corr(psser2, method=method), - pser1.corr(pser2, method=method), - almost=True, - ) - self.assert_eq( - psser1.corr(psser2, method=method, min_periods=1), - pser1.corr(pser2, method=method, min_periods=1), - almost=True, - ) - self.assert_eq( - psser1.corr(psser2, method=method, min_periods=3), - pser1.corr(pser2, method=method, min_periods=3), - almost=True, - ) - self.assert_eq( - (psser1 + 1).corr(psser2 - 2, method=method, min_periods=2), - (pser1 + 1).corr(pser2 - 2, method=method, min_periods=2), - almost=True, - ) - - def test_cov_corr_meta(self): - # Disable arrow execution since corr() is using UDT internally which is not supported. 
- with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}): - pdf = pd.DataFrame( - { - "a": np.array([1, 2, 3], dtype="i1"), - "b": np.array([1, 2, 3], dtype="i2"), - "c": np.array([1, 2, 3], dtype="i4"), - "d": np.array([1, 2, 3]), - "e": np.array([1.0, 2.0, 3.0], dtype="f4"), - "f": np.array([1.0, 2.0, 3.0]), - "g": np.array([True, False, True]), - "h": np.array(list("abc")), - }, - index=pd.Index([1, 2, 3], name="myindex"), - ) - psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True), check_exact=False) - def test_stats_on_boolean_dataframe(self): pdf = pd.DataFrame({"A": [True, False, True], "B": [False, False, True]}) psdf = ps.from_pandas(pdf) @@ -588,7 +312,7 @@ class StatsTests(StatsTestsMixin, PandasOnSparkTestCase, SQLTestUtils): if __name__ == "__main__": import unittest - from pyspark.pandas.tests.test_stats import * # noqa: F401 + from pyspark.pandas.tests.computation.test_stats import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py b/python/pyspark/pandas/tests/connect/computation/test_parity_corr.py similarity index 81% copy from python/pyspark/pandas/tests/connect/test_parity_stats.py copy to python/pyspark/pandas/tests/connect/computation/test_parity_corr.py index 7eddc4c15d4f..acf36b07829a 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_stats.py +++ b/python/pyspark/pandas/tests/connect/computation/test_parity_corr.py @@ -16,17 +16,18 @@ # import unittest -from pyspark.pandas.tests.test_stats import StatsTestsMixin +from pyspark import pandas as ps +from pyspark.pandas.tests.computation.test_corr import FrameCorrMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): +class FrameParityCorrTests(FrameCorrMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): pass if __name__ == 
"__main__": - from pyspark.pandas.tests.connect.test_parity_stats import * # noqa: F401 + from pyspark.pandas.tests.connect.computation.test_parity_corr import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py b/python/pyspark/pandas/tests/connect/computation/test_parity_stats.py similarity index 88% copy from python/pyspark/pandas/tests/connect/test_parity_stats.py copy to python/pyspark/pandas/tests/connect/computation/test_parity_stats.py index 7eddc4c15d4f..14d37949590b 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_stats.py +++ b/python/pyspark/pandas/tests/connect/computation/test_parity_stats.py @@ -16,7 +16,7 @@ # import unittest -from pyspark.pandas.tests.test_stats import StatsTestsMixin +from pyspark.pandas.tests.computation.test_stats import StatsTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils @@ -26,7 +26,7 @@ class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils, ReusedConnectTes if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_stats import * # noqa: F401 + from pyspark.pandas.tests.connect.computation.test_parity_stats import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_stats.py b/python/pyspark/pandas/tests/connect/frame/test_parity_axis.py similarity index 83% rename from python/pyspark/pandas/tests/connect/test_parity_stats.py rename to python/pyspark/pandas/tests/connect/frame/test_parity_axis.py index 7eddc4c15d4f..804ed97fa31c 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_stats.py +++ b/python/pyspark/pandas/tests/connect/frame/test_parity_axis.py @@ -16,17 +16,17 @@ # import unittest -from pyspark.pandas.tests.test_stats import StatsTestsMixin +from pyspark.pandas.tests.frame.test_axis import FrameAxisMixin from pyspark.testing.connectutils 
import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class StatsParityTests(StatsTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): +class FrameParityAxisTests(FrameAxisMixin, PandasOnSparkTestUtils, ReusedConnectTestCase): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_stats import * # noqa: F401 + from pyspark.pandas.tests.connect.frame.test_parity_axis import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/frame/test_axis.py b/python/pyspark/pandas/tests/frame/test_axis.py new file mode 100644 index 000000000000..ee67cf1b55ed --- /dev/null +++ b/python/pyspark/pandas/tests/frame/test_axis.py @@ -0,0 +1,135 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import ComparisonTestBase +from pyspark.testing.sqlutils import SQLTestUtils + + +class FrameAxisMixin: + def test_axis_on_dataframe(self): + # The number of each count is intentionally big + # because when data is small, it executes a shortcut. 
+ # Less than 'compute.shortcut_limit' will execute a shortcut + # by using collected pandas dataframe directly. + # now we set the 'compute.shortcut_limit' as 1000 explicitly + with ps.option_context("compute.shortcut_limit", 1000): + pdf = pd.DataFrame( + { + "A": [1, -2, 3, -4, 5] * 300, + "B": [1.0, -2, 3, -4, 5] * 300, + "C": [-6.0, -7, -8, -9, 10] * 300, + "D": [True, False, True, False, False] * 300, + }, + index=range(10, 15001, 10), + ) + # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas regression is fixed + # There is a regression in Pandas 2.1.0, + # so we should manually cast to float until the regression is fixed. + # See https://github.com/pandas-dev/pandas/issues/55194. + pdf = pdf.astype(float) + psdf = ps.from_pandas(pdf) + self.assert_eq(psdf.count(axis=1), pdf.count(axis=1)) + self.assert_eq(psdf.var(axis=1), pdf.var(axis=1)) + self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0)) + self.assert_eq(psdf.std(axis=1), pdf.std(axis=1)) + self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0)) + self.assert_eq(psdf.max(axis=1), pdf.max(axis=1)) + self.assert_eq(psdf.min(axis=1), pdf.min(axis=1)) + self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1)) + self.assert_eq(psdf.product(axis=1), pdf.product(axis=1)) + self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0), almost=True) + self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1)) + self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True) + self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1)) + self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1)) + self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1)) + self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0)) + + self.assert_eq( + psdf.count(axis=1, numeric_only=True), pdf.count(axis=1, numeric_only=True) + ) + self.assert_eq(psdf.var(axis=1, numeric_only=True), pdf.var(axis=1, numeric_only=True)) + self.assert_eq( + psdf.var(axis=1, ddof=0, numeric_only=True), + pdf.var(axis=1, ddof=0, 
numeric_only=True), + ) + self.assert_eq(psdf.std(axis=1, numeric_only=True), pdf.std(axis=1, numeric_only=True)) + self.assert_eq( + psdf.std(axis=1, ddof=0, numeric_only=True), + pdf.std(axis=1, ddof=0, numeric_only=True), + ) + self.assert_eq( + psdf.max(axis=1, numeric_only=True), + pdf.max(axis=1, numeric_only=True).astype(float), + ) + self.assert_eq( + psdf.min(axis=1, numeric_only=True), + pdf.min(axis=1, numeric_only=True).astype(float), + ) + self.assert_eq( + psdf.sum(axis=1, numeric_only=True), + pdf.sum(axis=1, numeric_only=True).astype(float), + ) + self.assert_eq( + psdf.product(axis=1, numeric_only=True), + pdf.product(axis=1, numeric_only=True).astype(float), + ) + self.assert_eq( + psdf.kurtosis(axis=0, numeric_only=True), + pdf.kurtosis(axis=0, numeric_only=True), + almost=True, + ) + self.assert_eq( + psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, numeric_only=True) + ) + self.assert_eq( + psdf.skew(axis=0, numeric_only=True), + pdf.skew(axis=0, numeric_only=True), + almost=True, + ) + self.assert_eq( + psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, numeric_only=True) + ) + self.assert_eq( + psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1, numeric_only=True) + ) + self.assert_eq(psdf.sem(axis=1, numeric_only=True), pdf.sem(axis=1, numeric_only=True)) + self.assert_eq( + psdf.sem(axis=1, ddof=0, numeric_only=True), + pdf.sem(axis=1, ddof=0, numeric_only=True), + ) + + +class FrameAxisTests(FrameAxisMixin, ComparisonTestBase, SQLTestUtils): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.frame.test_axis import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-help@spark.apache.org