This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 29acdf755cd  [SPARK-45153][BUILD][PS] Rebalance testing time for `pyspark-pandas-connect-part1`
29acdf755cd is described below

commit 29acdf755cd0a5c88c5cc5c8c16947a5b8e840f9
Author: Haejoon Lee <haejoon....@databricks.com>
AuthorDate: Mon Sep 18 12:02:28 2023 -0700

    [SPARK-45153][BUILD][PS] Rebalance testing time for `pyspark-pandas-connect-part1`

    ### What changes were proposed in this pull request?

    This PR proposes to rebalance the tests for `pyspark-pandas-connect-part1`.

    ### Why are the changes needed?

    We rebalanced the CI by splitting the slow tests into multiple parts, but `pyspark-pandas-connect-part1` takes almost an hour longer than the other split `pyspark-pandas-connect-partx` jobs, as shown below:

    |pyspark-pandas-connect-part0|pyspark-pandas-connect-part1|pyspark-pandas-connect-part2|
    |--|--|--|
    |1h 51m|2h 55m|1h 54m|

    ### Does this PR introduce _any_ user-facing change?

    No. This PR only improves the build infra.

    ### How was this patch tested?

    We should manually check the CI on GitHub Actions after the PR is opened.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #42909 from itholic/SPARK-45153.

    Authored-by: Haejoon Lee <haejoon....@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .github/workflows/build_and_test.yml |  2 ++
 dev/sparktestsupport/modules.py      | 46 ++++++++++++++++++++++++------------
 dev/sparktestsupport/utils.py        | 14 +++++------
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9c5d25d30af..39fd2796015 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -372,6 +372,8 @@ jobs:
             pyspark-pandas-connect-part1
           - >-
             pyspark-pandas-connect-part2
+          - >-
+            pyspark-pandas-connect-part3
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       HADOOP_PROFILE: ${{ inputs.hadoop }}
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 0a751052491..a3bfa288383 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -965,7 +965,6 @@ pyspark_pandas_connect_part0 = Module(
         "pyspark.pandas.tests.connect.test_parity_utils",
         "pyspark.pandas.tests.connect.test_parity_window",
         "pyspark.pandas.tests.connect.indexes.test_parity_base",
-        "pyspark.pandas.tests.connect.indexes.test_parity_datetime",
         "pyspark.pandas.tests.connect.indexes.test_parity_align",
         "pyspark.pandas.tests.connect.indexes.test_parity_indexing",
         "pyspark.pandas.tests.connect.indexes.test_parity_reindex",
@@ -982,7 +981,10 @@ pyspark_pandas_connect_part0 = Module(
         "pyspark.pandas.tests.connect.computation.test_parity_describe",
         "pyspark.pandas.tests.connect.computation.test_parity_eval",
         "pyspark.pandas.tests.connect.computation.test_parity_melt",
-        "pyspark.pandas.tests.connect.computation.test_parity_pivot",
+        "pyspark.pandas.tests.connect.groupby.test_parity_stat",
+        "pyspark.pandas.tests.connect.frame.test_parity_attrs",
+        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame",
+        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
@@ -999,7 +1001,6 @@ pyspark_pandas_connect_part1 = Module(
     ],
     python_test_goals=[
         # pandas-on-Spark unittests
-        "pyspark.pandas.tests.connect.frame.test_parity_attrs",
         "pyspark.pandas.tests.connect.frame.test_parity_constructor",
         "pyspark.pandas.tests.connect.frame.test_parity_conversion",
         "pyspark.pandas.tests.connect.frame.test_parity_reindexing",
@@ -1012,21 +1013,12 @@ pyspark_pandas_connect_part1 = Module(
         "pyspark.pandas.tests.connect.groupby.test_parity_aggregate",
         "pyspark.pandas.tests.connect.groupby.test_parity_apply_func",
         "pyspark.pandas.tests.connect.groupby.test_parity_cumulative",
-        "pyspark.pandas.tests.connect.groupby.test_parity_describe",
-        "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
-        "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
-        "pyspark.pandas.tests.connect.groupby.test_parity_index",
         "pyspark.pandas.tests.connect.groupby.test_parity_missing_data",
         "pyspark.pandas.tests.connect.groupby.test_parity_split_apply",
-        "pyspark.pandas.tests.connect.groupby.test_parity_stat",
         "pyspark.pandas.tests.connect.test_parity_indexing",
-        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames",
-        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov_corrwith",
-        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_frame",
-        "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_dot_series",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_index",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_series",
         "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_frame",
@@ -1043,7 +1035,10 @@ pyspark_pandas_connect_part1 = Module(
         "pyspark.pandas.tests.connect.series.test_parity_series",
         "pyspark.pandas.tests.connect.series.test_parity_sort",
         "pyspark.pandas.tests.connect.series.test_parity_stat",
+        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
+        "pyspark.pandas.tests.connect.test_parity_reshape",
         "pyspark.pandas.tests.connect.test_parity_stats",
+        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_expanding",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
@@ -1060,20 +1055,41 @@ pyspark_pandas_connect_part2 = Module(
     ],
     python_test_goals=[
         # pandas-on-Spark unittests
+        "pyspark.pandas.tests.connect.computation.test_parity_pivot",
         "pyspark.pandas.tests.connect.indexes.test_parity_base_slow",
         "pyspark.pandas.tests.connect.indexes.test_parity_datetime_property",
         "pyspark.pandas.tests.connect.test_parity_frame_interpolate",
         "pyspark.pandas.tests.connect.test_parity_series_interpolate",
         "pyspark.pandas.tests.connect.test_parity_frame_resample",
         "pyspark.pandas.tests.connect.test_parity_series_resample",
-        "pyspark.pandas.tests.connect.test_parity_reshape",
-        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
         "pyspark.pandas.tests.connect.test_parity_ewm",
         "pyspark.pandas.tests.connect.test_parity_rolling",
         "pyspark.pandas.tests.connect.test_parity_expanding",
-        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_expanding",
         "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby_rolling",
         "pyspark.pandas.tests.connect.computation.test_parity_missing_data",
+        "pyspark.pandas.tests.connect.groupby.test_parity_index",
+        "pyspark.pandas.tests.connect.groupby.test_parity_describe",
+        "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
+        "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
+    ],
+    excluded_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
+        # they aren't available there
+    ],
+)
+
+
+pyspark_pandas_connect_part3 = Module(
+    name="pyspark-pandas-connect-part3",
+    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
+    source_file_regexes=[
+        "python/pyspark/pandas",
+    ],
+    python_test_goals=[
+        # pandas-on-Spark unittests
+        "pyspark.pandas.tests.connect.indexes.test_parity_datetime",
+        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames",
+        "pyspark.pandas.tests.connect.test_parity_ops_on_diff_frames_groupby",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index c21ab7b384a..441ae3cf8be 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -114,16 +114,16 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
     ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
     'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
     'pyspark-pandas', 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
-    'pyspark-pandas-connect-part2', 'pyspark-pandas-slow', 'pyspark-sql', 'pyspark-testing',
-    'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
+    'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ...     [modules.sparkr, modules.sql], deduplicated=False)])
     ...     # doctest: +NORMALIZE_WHITESPACE
     ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
     'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
     'pyspark-pandas', 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
-    'pyspark-pandas-connect-part2', 'pyspark-pandas-slow', 'pyspark-sql', 'pyspark-testing',
-    'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
+    'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ...     [modules.sql, modules.core], deduplicated=False)])
     ...     # doctest: +NORMALIZE_WHITESPACE
@@ -131,9 +131,9 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
     'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
     'pyspark-core', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib', 'pyspark-pandas',
     'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1', 'pyspark-pandas-connect-part2',
-    'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming',
-    'pyspark-testing', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10', 'streaming',
-    'streaming-kafka-0-10', 'streaming-kinesis-asl']
+    'pyspark-pandas-connect-part3', 'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql',
+    'pyspark-streaming', 'pyspark-testing', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10',
+    'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
     """
     modules_to_test = set()
    for module in changed_modules:
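Reviewer note (not part of the commit): one way to sanity-check the rebalanced split locally is to print how many test goals land in each `pyspark-pandas-connect-part*` module. The sketch below assumes the Spark repository root as the working directory and relies only on the `name` and `python_test_goals` attributes of the `pyspark_pandas_connect_part*` Module objects shown in the dev/sparktestsupport/modules.py hunks above.

    # Sketch only: count the test goals assigned to each rebalanced module.
    # Assumes it is run from the Spark repository root; module and attribute
    # names come from the dev/sparktestsupport/modules.py hunks in this commit.
    import sys

    sys.path.insert(0, "dev")  # make the sparktestsupport package importable

    from sparktestsupport import modules

    parts = [
        modules.pyspark_pandas_connect_part0,
        modules.pyspark_pandas_connect_part1,
        modules.pyspark_pandas_connect_part2,
        modules.pyspark_pandas_connect_part3,
    ]
    for part in parts:
        print(f"{part.name}: {len(part.python_test_goals)} test goals")

A roughly even goal count per part is only a proxy for wall-clock time, so the actual GitHub Actions durations should still be compared once the CI has run.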