This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 9b583809dd94 [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests` 9b583809dd94 is described below commit 9b583809dd9494ee8ed3c2e50356230e1ffae218 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Thu Dec 21 08:58:25 2023 +0900 [SPARK-46462][PS][TESTS] Reorganize `OpsOnDiffFramesGroupByRollingTests` ### What changes were proposed in this pull request? Reorganize `OpsOnDiffFramesGroupByRollingTests` ### Why are the changes needed? for parallelism ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44420 from zhengruifeng/ps_test_diff_group_roll. Lead-authored-by: Ruifeng Zheng <ruife...@apache.org> Co-authored-by: Hyukjin Kwon <gurwls...@gmail.com> Co-authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- dev/sparktestsupport/modules.py | 8 ++- .../test_parity_groupby_rolling.py} | 13 ++--- .../test_parity_groupby_rolling_adv.py} | 13 ++--- .../test_parity_groupby_rolling_count.py} | 13 ++--- .../test_groupby_rolling.py} | 66 ++++++---------------- .../diff_frames_ops/test_groupby_rolling_adv.py | 61 ++++++++++++++++++++ .../test_groupby_rolling_count.py} | 62 +++----------------- 7 files changed, 107 insertions(+), 129 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 0388f1812b0d..cbd3b35c0015 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -745,7 +745,9 @@ pyspark_pandas = Module( "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding", "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_adv", "pyspark.pandas.tests.diff_frames_ops.test_groupby_expanding_count", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling", + "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling", + "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv", + "pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count", "pyspark.pandas.tests.test_repr", "pyspark.pandas.tests.resample.test_on", "pyspark.pandas.tests.resample.test_error", @@ -1170,7 +1172,9 @@ pyspark_pandas_connect_part2 = Module( "pyspark.pandas.tests.connect.window.test_parity_expanding_error", "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding", "pyspark.pandas.tests.connect.window.test_parity_groupby_expanding_adv", - "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling", + "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling", + "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv", + "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count", "pyspark.pandas.tests.connect.computation.test_parity_missing_data", "pyspark.pandas.tests.connect.groupby.test_parity_index", "pyspark.pandas.tests.connect.groupby.test_parity_describe", diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py similarity index 75% copy from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py copy to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py index 4a52bb0748f5..c8255d6f9c42 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling.py @@ -16,24 +16,21 @@ # import unittest -from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import ( - OpsOnDiffFramesGroupByRollingTestsMixin, -) +from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import GroupByRollingMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class OpsOnDiffFramesGroupByRollingParityTests( - OpsOnDiffFramesGroupByRollingTestsMixin, +class GroupByRollingParityTests( + GroupByRollingMixin, PandasOnSparkTestUtils, - TestUtils, ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import * + from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling import * # noqa try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py similarity index 75% copy from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py copy to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py index 4a52bb0748f5..f1793a1f8d82 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_adv.py @@ -16,24 +16,21 @@ # import unittest -from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import ( - OpsOnDiffFramesGroupByRollingTestsMixin, -) +from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv import GroupByRollingAdvMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class OpsOnDiffFramesGroupByRollingParityTests( - OpsOnDiffFramesGroupByRollingTestsMixin, +class GroupByRollingAdvParityTests( + GroupByRollingAdvMixin, PandasOnSparkTestUtils, - TestUtils, ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import * + from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_adv import * # noqa try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py similarity index 75% rename from python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py rename to python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py index 4a52bb0748f5..b0316401700e 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_ops_on_diff_frames_groupby_rolling.py +++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_groupby_rolling_count.py @@ -16,24 +16,21 @@ # import unittest -from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import ( - OpsOnDiffFramesGroupByRollingTestsMixin, -) +from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count import GroupByRollingCountMixin from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class OpsOnDiffFramesGroupByRollingParityTests( - OpsOnDiffFramesGroupByRollingTestsMixin, +class GroupByRollingCountParityTests( + GroupByRollingCountMixin, PandasOnSparkTestUtils, - TestUtils, ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling import * + from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_rolling_count import * # noqa try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py similarity index 62% copy from python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py copy to python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py index 676eafa74eed..ea1489ad55cd 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling.py @@ -19,20 +19,11 @@ import pandas as pd from pyspark import pandas as ps from pyspark.pandas.config import set_option, reset_option -from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils -class OpsOnDiffFramesGroupByRollingTestsMixin: - @classmethod - def setUpClass(cls): - super().setUpClass() - set_option("compute.ops_on_diff_frames", True) - - @classmethod - def tearDownClass(cls): - reset_option("compute.ops_on_diff_frames") - super().tearDownClass() - +class GroupByRollingTestingFuncMixin: def _test_groupby_rolling_func(self, f): pser = pd.Series([1, 2, 3], name="a") pkey = pd.Series([1, 2, 3], name="a") @@ -63,35 +54,17 @@ class OpsOnDiffFramesGroupByRollingTestsMixin: getattr(pdf.groupby(pkey)[["b"]].rolling(2), f)().sort_index(), ) - def test_groupby_rolling_count(self): - pser = pd.Series([1, 2, 3], name="a") - pkey = pd.Series([1, 2, 3], name="a") - psser = ps.from_pandas(pser) - kkey = ps.from_pandas(pkey) - # TODO(SPARK-43432): Fix `min_periods` for Rolling.count() to work same as pandas - self.assert_eq( - psser.groupby(kkey).rolling(2).count().sort_index(), - pser.groupby(pkey).rolling(2, min_periods=1).count().sort_index(), - ) - - pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) - pkey = pd.Series([1, 2, 3, 2], name="a") - psdf = ps.from_pandas(pdf) - kkey = ps.from_pandas(pkey) +class GroupByRollingMixin(GroupByRollingTestingFuncMixin): + @classmethod + def setUpClass(cls): + super().setUpClass() + set_option("compute.ops_on_diff_frames", True) - self.assert_eq( - psdf.groupby(kkey).rolling(2).count().sort_index(), - pdf.groupby(pkey).rolling(2, min_periods=1).count().sort_index(), - ) - self.assert_eq( - psdf.groupby(kkey)["b"].rolling(2).count().sort_index(), - pdf.groupby(pkey)["b"].rolling(2, min_periods=1).count().sort_index(), - ) - self.assert_eq( - psdf.groupby(kkey)[["b"]].rolling(2).count().sort_index(), - pdf.groupby(pkey)[["b"]].rolling(2, min_periods=1).count().sort_index(), - ) + @classmethod + def tearDownClass(cls): + reset_option("compute.ops_on_diff_frames") + super().tearDownClass() def test_groupby_rolling_min(self): self._test_groupby_rolling_func("min") @@ -105,23 +78,18 @@ class OpsOnDiffFramesGroupByRollingTestsMixin: def test_groupby_rolling_sum(self): self._test_groupby_rolling_func("sum") - def test_groupby_rolling_std(self): - # TODO: `std` now raise error in pandas 1.0.0 - self._test_groupby_rolling_func("std") - - def test_groupby_rolling_var(self): - self._test_groupby_rolling_func("var") - -class OpsOnDiffFramesGroupByRollingTests( - OpsOnDiffFramesGroupByRollingTestsMixin, PandasOnSparkTestCase, TestUtils +class GroupByRollingTests( + GroupByRollingMixin, + PandasOnSparkTestCase, + SQLTestUtils, ): pass if __name__ == "__main__": import unittest - from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import * # noqa: F401 + from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import * # noqa try: import xmlrunner diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py new file mode 100644 index 000000000000..48d75fbcaf89 --- /dev/null +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_adv.py @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark.pandas.config import set_option, reset_option +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling import GroupByRollingTestingFuncMixin + + +class GroupByRollingAdvMixin(GroupByRollingTestingFuncMixin): + @classmethod + def setUpClass(cls): + super().setUpClass() + set_option("compute.ops_on_diff_frames", True) + + @classmethod + def tearDownClass(cls): + reset_option("compute.ops_on_diff_frames") + super().tearDownClass() + + def test_groupby_rolling_std(self): + # TODO: `std` now raise error in pandas 1.0.0 + self._test_groupby_rolling_func("std") + + def test_groupby_rolling_var(self): + self._test_groupby_rolling_func("var") + + +class GroupByRollingAdvTests( + GroupByRollingAdvMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + import unittest + from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_adv import * # noqa + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py similarity index 56% rename from python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py rename to python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py index 676eafa74eed..41ac982db745 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_rolling.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_groupby_rolling_count.py @@ -19,10 +19,11 @@ import pandas as pd from pyspark import pandas as ps from pyspark.pandas.config import set_option, reset_option -from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils -class OpsOnDiffFramesGroupByRollingTestsMixin: +class GroupByRollingCountMixin: @classmethod def setUpClass(cls): super().setUpClass() @@ -33,36 +34,6 @@ class OpsOnDiffFramesGroupByRollingTestsMixin: reset_option("compute.ops_on_diff_frames") super().tearDownClass() - def _test_groupby_rolling_func(self, f): - pser = pd.Series([1, 2, 3], name="a") - pkey = pd.Series([1, 2, 3], name="a") - psser = ps.from_pandas(pser) - kkey = ps.from_pandas(pkey) - - self.assert_eq( - getattr(psser.groupby(kkey).rolling(2), f)().sort_index(), - getattr(pser.groupby(pkey).rolling(2), f)().sort_index(), - ) - - pdf = pd.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) - pkey = pd.Series([1, 2, 3, 2], name="a") - psdf = ps.from_pandas(pdf) - kkey = ps.from_pandas(pkey) - - self.assert_eq( - getattr(psdf.groupby(kkey).rolling(2), f)().sort_index(), - getattr(pdf.groupby(pkey).rolling(2), f)().sort_index(), - ) - - self.assert_eq( - getattr(psdf.groupby(kkey)["b"].rolling(2), f)().sort_index(), - getattr(pdf.groupby(pkey)["b"].rolling(2), f)().sort_index(), - ) - self.assert_eq( - getattr(psdf.groupby(kkey)[["b"]].rolling(2), f)().sort_index(), - getattr(pdf.groupby(pkey)[["b"]].rolling(2), f)().sort_index(), - ) - def test_groupby_rolling_count(self): pser = pd.Series([1, 2, 3], name="a") pkey = pd.Series([1, 2, 3], name="a") @@ -93,35 +64,18 @@ class OpsOnDiffFramesGroupByRollingTestsMixin: pdf.groupby(pkey)[["b"]].rolling(2, min_periods=1).count().sort_index(), ) - def test_groupby_rolling_min(self): - self._test_groupby_rolling_func("min") - - def test_groupby_rolling_max(self): - self._test_groupby_rolling_func("max") - - def test_groupby_rolling_mean(self): - self._test_groupby_rolling_func("mean") - - def test_groupby_rolling_sum(self): - self._test_groupby_rolling_func("sum") - - def test_groupby_rolling_std(self): - # TODO: `std` now raise error in pandas 1.0.0 - self._test_groupby_rolling_func("std") - - def test_groupby_rolling_var(self): - self._test_groupby_rolling_func("var") - -class OpsOnDiffFramesGroupByRollingTests( - OpsOnDiffFramesGroupByRollingTestsMixin, PandasOnSparkTestCase, TestUtils +class GroupByRollingCountTests( + GroupByRollingCountMixin, + PandasOnSparkTestCase, + SQLTestUtils, ): pass if __name__ == "__main__": import unittest - from pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling import * # noqa: F401 + from pyspark.pandas.tests.diff_frames_ops.test_groupby_rolling_count import * # noqa try: import xmlrunner --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org