This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 8e60a04d19e  [SPARK-44667][INFRA] Uninstall large ML libraries for non-ML jobs
8e60a04d19e is described below

commit 8e60a04d19ed7b1d340eb7fb068df365f7969b43
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Sat Aug 5 08:05:29 2023 +0800

    [SPARK-44667][INFRA] Uninstall large ML libraries for non-ML jobs
    
    ### What changes were proposed in this pull request?
    Uninstall large ML libraries for non-ML jobs.
    
    ### Why are the changes needed?
    ML is integrating external frameworks: torch and deepspeed (and maybe xgboost in the future). Those libraries are huge and are not needed in other jobs. This PR uninstalls torch, which saves ~1.3 GB:
    
    ![image](https://github.com/apache/spark/assets/7322292/e8181924-ca30-4e1e-8808-659f6a75c1d1)
    
    ### Does this PR introduce _any_ user-facing change?
    No, infra-only.
    
    ### How was this patch tested?
    Updated CI.
    
    Closes #42334 from zhengruifeng/infra_uninstall_torch.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 .github/workflows/build_and_test.yml | 14 +++++++++++---
 dev/sparktestsupport/modules.py      | 18 ++++++++++++++++--
 dev/sparktestsupport/utils.py        | 23 +++++++++++++----------
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ea0c8e1d7fd..04585481a9c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -350,9 +350,11 @@ jobs:
           - >-
             pyspark-errors
           - >-
-            pyspark-sql, pyspark-mllib, pyspark-resource, pyspark-testing
+            pyspark-sql, pyspark-resource, pyspark-testing
           - >-
-            pyspark-core, pyspark-streaming, pyspark-ml
+            pyspark-core, pyspark-streaming
+          - >-
+            pyspark-mllib, pyspark-ml, pyspark-ml-connect
           - >-
             pyspark-pandas
           - >-
@@ -411,7 +413,13 @@ jobs:
           restore-keys: |
             pyspark-coursier-
       - name: Free up disk space
-        run: ./dev/free_disk_space_container
+        shell: 'script -q -e -c "bash {0}"'
+        run: |
+          if [[ "$MODULES_TO_TEST" != *"pyspark-ml"* ]]; then
+            # uninstall libraries dedicated for ML testing
+            python3.9 -m pip uninstall -y torch torchvision torcheval torchtnt tensorboard mlflow
+          fi
+          ./dev/free_disk_space_container
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v3
         with:
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 9e45e0facef..b2f978c47ea 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -817,10 +817,9 @@ pyspark_pandas_slow = Module(
 
 pyspark_connect = Module(
     name="pyspark-connect",
-    dependencies=[pyspark_sql, pyspark_ml, connect],
+    dependencies=[pyspark_sql, connect],
     source_file_regexes=[
         "python/pyspark/sql/connect",
-        "python/pyspark/ml/connect",
     ],
     python_test_goals=[
         # sql doctests
@@ -871,6 +870,21 @@ pyspark_connect = Module(
         "pyspark.sql.tests.connect.test_parity_pandas_udf_scalar",
         "pyspark.sql.tests.connect.test_parity_pandas_udf_grouped_agg",
         "pyspark.sql.tests.connect.test_parity_pandas_udf_window",
+    ],
+    excluded_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
+        # they aren't available there
+    ],
+)
+
+
+pyspark_ml_connect = Module(
+    name="pyspark-ml-connect",
+    dependencies=[pyspark_connect, pyspark_ml],
+    source_file_regexes=[
+        "python/pyspark/ml/connect",
+    ],
+    python_test_goals=[
         # ml doctests
         "pyspark.ml.connect.functions",
         # ml unittests
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index 816c982bd60..e79d864c320 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -112,25 +112,28 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
     >>> sorted([x.name for x in determine_modules_to_test([modules.sql])])
     ... # doctest: +NORMALIZE_WHITESPACE
     ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
-    'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
-    'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-pandas-slow-connect', 'pyspark-sql',
-    'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
+    'pyspark-pandas', 'pyspark-pandas-connect', 'pyspark-pandas-slow',
+    'pyspark-pandas-slow-connect', 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql',
+    'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ...     [modules.sparkr, modules.sql], deduplicated=False)])
     ... # doctest: +NORMALIZE_WHITESPACE
     ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
-    'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
-    'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-pandas-slow-connect', 'pyspark-sql',
-    'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
+    'pyspark-pandas', 'pyspark-pandas-connect', 'pyspark-pandas-slow',
+    'pyspark-pandas-slow-connect', 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql',
+    'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ...     [modules.sql, modules.core], deduplicated=False)])
     ... # doctest: +NORMALIZE_WHITESPACE
     ['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
     'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
-    'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-connect',
-    'pyspark-pandas-slow', 'pyspark-pandas-slow-connect', 'pyspark-resource', 'pyspark-sql',
-    'pyspark-streaming', 'pyspark-testing', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10',
-    'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
+    'pyspark-core', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib', 'pyspark-pandas',
+    'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-pandas-slow-connect',
+    'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'pyspark-testing', 'repl',
+    'root', 'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
+    'streaming-kinesis-asl']
     """
     modules_to_test = set()
    for module in changed_modules:

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
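As a quick illustration of how the new `pyspark-ml-connect` test module can be inspected locally, here is a minimal Python sketch. It is not part of the commit; it assumes a Spark checkout that already contains this change, run from the repository root with `dev/` added to `sys.path`, and it assumes `Module` exposes its constructor arguments (`name`, `dependencies`, `python_test_goals`) as attributes, as the doctests in dev/sparktestsupport/utils.py suggest.

```python
# Illustrative sketch only (not part of the commit above).
# Assumption: run from the root of a Spark checkout containing this change.
import sys

sys.path.insert(0, "dev")  # make sparktestsupport importable from the repo root

from sparktestsupport import modules

# The new module object introduced by this commit in dev/sparktestsupport/modules.py.
ml_connect = modules.pyspark_ml_connect

print(ml_connect.name)                                 # pyspark-ml-connect
print([dep.name for dep in ml_connect.dependencies])   # ['pyspark-connect', 'pyspark-ml']
print(ml_connect.python_test_goals[:3])                # first few pyspark.ml.connect test goals
```

Because `pyspark-ml-connect` depends on `pyspark-connect` (and therefore transitively on `pyspark-sql`), `determine_modules_to_test` now also schedules it when SQL changes, which is why it appears in the updated doctest output above.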