This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 514ecc6fc183 [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests` 514ecc6fc183 is described below commit 514ecc6fc183d7222b9dc299af4df328c71966d1 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Thu Jan 11 10:52:32 2024 +0800 [SPARK-46656][PS][TESTS] Split `GroupbyParitySplitApplyTests` ### What changes were proposed in this pull request? Split `GroupbyParitySplitApplyTests` ### Why are the changes needed? To improve testing parallelism. This test normally takes about 10 minutes: ``` Starting test(python3.9): pyspark.pandas.tests.connect.groupby.test_parity_split_apply (temp output: /__w/spark/spark/python/target/fb71133e-7d03-4c9b-8a64-10e1d02d6bb6/python3.9__pyspark.pandas.tests.connect.groupby.test_parity_split_apply__6wojkexo.log) Finished test(python3.9): pyspark.pandas.tests.connect.groupby.test_parity_split_apply (598s) ``` ### Does this PR introduce _any_ user-facing change? No, test-only. ### How was this patch tested? CI. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44664 from zhengruifeng/ps_test_split_apply. 
Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- dev/sparktestsupport/modules.py | 6 ++++ .../connect/groupby/test_parity_split_apply.py | 4 ++- ...lit_apply.py => test_parity_split_apply_adv.py} | 10 ++++--- ...t_apply.py => test_parity_split_apply_basic.py} | 10 ++++--- ...apply.py => test_parity_split_apply_min_max.py} | 10 ++++--- .../pandas/tests/groupby/test_split_apply.py | 32 ++++++++++++++++------ .../test_split_apply_adv.py} | 24 +++++++++++----- .../test_split_apply_basic.py} | 24 +++++++++++----- .../test_split_apply_min_max.py} | 24 +++++++++++----- 9 files changed, 102 insertions(+), 42 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index b8ae23613688..abeb1aa5666a 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -888,6 +888,9 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.groupby.test_rank", "pyspark.pandas.tests.groupby.test_size", "pyspark.pandas.tests.groupby.test_split_apply", + "pyspark.pandas.tests.groupby.test_split_apply_adv", + "pyspark.pandas.tests.groupby.test_split_apply_basic", + "pyspark.pandas.tests.groupby.test_split_apply_min_max", "pyspark.pandas.tests.groupby.test_stat", "pyspark.pandas.tests.groupby.test_stat_adv", "pyspark.pandas.tests.groupby.test_stat_ddof", @@ -1174,6 +1177,9 @@ pyspark_pandas_connect_part1 = Module( "pyspark.pandas.tests.connect.groupby.test_parity_cumulative", "pyspark.pandas.tests.connect.groupby.test_parity_missing_data", "pyspark.pandas.tests.connect.groupby.test_parity_split_apply", + "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv", + "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic", + "pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow", 
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov", diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py index 895fe984be27..b5678f91ab02 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py @@ -22,7 +22,9 @@ from pyspark.testing.pandasutils import PandasOnSparkTestUtils class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase + GroupbySplitApplyMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, ): pass diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py similarity index 84% copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py index 895fe984be27..f8ddd8b8c9ab 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_adv.py @@ -16,19 +16,21 @@ # import unittest -from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin +from pyspark.pandas.tests.groupby.test_split_apply_adv import GroupbySplitApplyAdvMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class GroupbySplitApplyAdvParityTests( + GroupbySplitApplyAdvMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401 + from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_adv 
import * # noqa try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py similarity index 83% copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py index 895fe984be27..2964213ab484 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_basic.py @@ -16,19 +16,21 @@ # import unittest -from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin +from pyspark.pandas.tests.groupby.test_split_apply_basic import GroupbySplitApplyBasicMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class GroupbySplitApplyBasicParityTests( + GroupbySplitApplyBasicMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401 + from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_basic import * # noqa try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py similarity index 83% copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py copy to python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py index 895fe984be27..1d0e2eb2957c 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply_min_max.py 
@@ -16,19 +16,21 @@ # import unittest -from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin +from pyspark.pandas.tests.groupby.test_split_apply_min_max import GroupbySplitApplyMMMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils -class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class GroupbySplitApplyMMParityTests( + GroupbySplitApplyMMMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401 + from pyspark.pandas.tests.connect.groupby.test_parity_split_apply_min_max import * # noqa try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/groupby/test_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply.py index 13cdae94c0e3..8251ba0e9bb2 100644 --- a/python/pyspark/pandas/tests/groupby/test_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply.py @@ -19,12 +19,12 @@ import unittest import pandas as pd from pyspark import pandas as ps -from pyspark.testing.pandasutils import ComparisonTestBase +from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils -class GroupbySplitApplyMixin: - def test_split_apply_combine_on_series(self): +class GroupbySplitApplyTestingFuncMixin: + def _test_split_apply_func(self, funcs): # TODO(SPARK-45228): Enabling string type columns for `test_split_apply_combine_on_series` # when Pandas regression is fixed # There is a regression in Pandas 2.1.0, @@ -42,11 +42,14 @@ class GroupbySplitApplyMixin: psdf = ps.from_pandas(pdf) funcs = [ - ((True, False), ["sum", "min", "max", "count", "first", "last"]), - ((True, True), ["mean"]), - ((False, False), ["var", "std", "skew"]), + ( + check_exact, + almost, + f, + ) + for (check_exact, 
almost), fs in funcs + for f in fs ] - funcs = [(check_exact, almost, f) for (check_exact, almost), fs in funcs for f in fs] for as_index in [True, False]: if as_index: @@ -155,7 +158,20 @@ class GroupbySplitApplyMixin: ) -class GroupbySplitApplyTests(GroupbySplitApplyMixin, ComparisonTestBase, SQLTestUtils): +class GroupbySplitApplyMixin(GroupbySplitApplyTestingFuncMixin): + def test_split_apply_combine_on_series(self): + funcs = [ + ((True, False), ["sum"]), + ((True, True), ["mean"]), + ] + self._test_split_apply_func(funcs) + + +class GroupbySplitApplyTests( + GroupbySplitApplyMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): pass diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_adv.py similarity index 64% copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py copy to python/pyspark/pandas/tests/groupby/test_split_apply_adv.py index 895fe984be27..abce6d5ed4f0 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply_adv.py @@ -16,22 +16,32 @@ # import unittest -from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin -from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin -class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class GroupbySplitApplyAdvMixin(GroupbySplitApplyTestingFuncMixin): + def test_split_apply_combine_on_series(self): + funcs = [ + ((False, False), ["var", "std", "skew"]), + ] + self._test_split_apply_func(funcs) + + +class GroupbySplitApplyAdvTests( + GroupbySplitApplyAdvMixin, + 
PandasOnSparkTestCase, + SQLTestUtils, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401 + from pyspark.pandas.tests.groupby.test_split_apply_adv import * # noqa: F401 try: - import xmlrunner # type: ignore[import] + import xmlrunner testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_basic.py similarity index 64% copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py copy to python/pyspark/pandas/tests/groupby/test_split_apply_basic.py index 895fe984be27..17c6179d19ac 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply_basic.py @@ -16,22 +16,32 @@ # import unittest -from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin -from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin -class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class GroupbySplitApplyBasicMixin(GroupbySplitApplyTestingFuncMixin): + def test_split_apply_combine_on_series(self): + funcs = [ + ((True, False), ["count", "first", "last"]), + ] + self._test_split_apply_func(funcs) + + +class GroupbySplitApplyBasicTests( + GroupbySplitApplyBasicMixin, + PandasOnSparkTestCase, + SQLTestUtils, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401 + from pyspark.pandas.tests.groupby.test_split_apply_basic import 
* # noqa: F401 try: - import xmlrunner # type: ignore[import] + import xmlrunner testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: diff --git a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py b/python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py similarity index 65% copy from python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py copy to python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py index 895fe984be27..c16c23323a8c 100644 --- a/python/pyspark/pandas/tests/connect/groupby/test_parity_split_apply.py +++ b/python/pyspark/pandas/tests/groupby/test_split_apply_min_max.py @@ -16,22 +16,32 @@ # import unittest -from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyMixin -from pyspark.testing.connectutils import ReusedConnectTestCase -from pyspark.testing.pandasutils import PandasOnSparkTestUtils +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils +from pyspark.pandas.tests.groupby.test_split_apply import GroupbySplitApplyTestingFuncMixin -class GroupbyParitySplitApplyTests( - GroupbySplitApplyMixin, PandasOnSparkTestUtils, ReusedConnectTestCase +class GroupbySplitApplyMMMixin(GroupbySplitApplyTestingFuncMixin): + def test_split_apply_combine_on_series(self): + funcs = [ + ((True, False), ["min", "max"]), + ] + self._test_split_apply_func(funcs) + + +class GroupbySplitApplyMMTests( + GroupbySplitApplyMMMixin, + PandasOnSparkTestCase, + SQLTestUtils, ): pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.groupby.test_parity_split_apply import * # noqa: F401 + from pyspark.pandas.tests.groupby.test_split_apply_min_max import * # noqa: F401 try: - import xmlrunner # type: ignore[import] + import xmlrunner testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) except ImportError: 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org