This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1904dee475d7 [SPARK-48116][INFRA] Run `pyspark-pandas*` only in PR builder and Daily Python CIs 1904dee475d7 is described below commit 1904dee475d735533ff5d0d2d3580e4e83b7520b Author: Dongjoon Hyun <dh...@apple.com> AuthorDate: Fri May 3 21:15:21 2024 -0700 [SPARK-48116][INFRA] Run `pyspark-pandas*` only in PR builder and Daily Python CIs ### What changes were proposed in this pull request? This PR aims to run `pyspark-pandas*` only in PR builder and Daily Python CIs. In other words, only the commit builder will skip it by default. Please note that all PR builders are not consuming ASF resources and they provide lots of test coverage every day. - https://github.com/apache/spark/actions/workflows/build_python.yml ### Why are the changes needed? To reduce GitHub Action usage to meet ASF INFRA policy. - https://infra.apache.org/github-actions-policy.html > All workflows MUST have a job concurrency level less than or equal to 20. This means a workflow cannot have more than 20 jobs running at the same time across all matrices. Although `pandas` is an **optional** package in PySpark, this is essential for PySpark users and we have **6 test pipelines** which require lots of resources. We need to optimize the job concurrency level to `less than or equal to 20` while keeping the test capability as much as possible. https://github.com/apache/spark/blob/f450272a9aac812d735eb5f741eec1f6cf1c837c/dev/requirements.txt#L4-L8 - pyspark-pandas - pyspark-pandas-slow - pyspark-pandas-connect-part0 - pyspark-pandas-connect-part1 - pyspark-pandas-connect-part2 - pyspark-pandas-connect-part3 ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual review. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46367 from dongjoon-hyun/SPARK-48116. 
Authored-by: Dongjoon Hyun <dh...@apple.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .github/workflows/build_and_test.yml | 137 ++++++++++++++++++++++++++++++++++- .github/workflows/build_python.yml | 3 +- 2 files changed, 137 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0dc217570ba0..8568cd539f03 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -79,6 +79,11 @@ jobs: pyspark=true; sparkr=true; pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` pyspark=`./dev/is-changed.py -m $pyspark_modules` + if [ "${{ github.repository != 'apache/spark' }}" ]; then + pandas=$pyspark + else + pandas=false + fi sparkr=`./dev/is-changed.py -m sparkr` kubernetes=`./dev/is-changed.py -m kubernetes` # 'build' is always true for now. @@ -87,6 +92,7 @@ jobs: { \"build\": \"true\", \"pyspark\": \"$pyspark\", + \"pyspark-pandas\": \"$pandas\", \"sparkr\": \"$sparkr\", \"tpcds-1g\": \"false\", \"docker-integration-tests\": \"false\", @@ -347,12 +353,139 @@ jobs: pyspark-core, pyspark-errors, pyspark-streaming - >- pyspark-mllib, pyspark-ml, pyspark-ml-connect + - >- + pyspark-connect + env: + MODULES_TO_TEST: ${{ matrix.modules }} + PYTHON_TO_TEST: 'python3.11' + HADOOP_PROFILE: ${{ inputs.hadoop }} + HIVE_PROFILE: hive2.3 + GITHUB_PREV_SHA: ${{ github.event.before }} + SPARK_LOCAL_IP: localhost + SKIP_UNIDOC: true + SKIP_MIMA: true + SKIP_PACKAGING: true + METASPACE_SIZE: 1g + BRANCH: ${{ inputs.branch }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v4 + # In order to fetch changed files + with: + fetch-depth: 0 + repository: apache/spark + ref: ${{ inputs.branch }} + - name: Add GITHUB_WORKSPACE to git trust safe.directory + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Sync the current 
branch with the latest in Apache Spark + if: github.repository != 'apache/spark' + run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} + git -c user.name='Apache Spark Test Account' -c user.email='sparktest...@gmail.com' merge --no-commit --progress --squash FETCH_HEAD + git -c user.name='Apache Spark Test Account' -c user.email='sparktest...@gmail.com' commit -m "Merged commit" --allow-empty + # Cache local repositories. Note that GitHub Actions cache has a 10G limit. + - name: Cache SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + pyspark-coursier- + - name: Free up disk space + shell: 'script -q -e -c "bash {0}"' + run: | + if [ -f ./dev/free_disk_space_container ]; then + ./dev/free_disk_space_container + fi + - name: Install Java ${{ matrix.java }} + uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: ${{ matrix.java }} + - name: List Python packages (${{ env.PYTHON_TO_TEST }}) + env: ${{ fromJSON(inputs.envs) }} + shell: 'script -q -e -c "bash {0}"' + run: | + for py in $(echo $PYTHON_TO_TEST | tr "," "\n") + do + echo $py + $py -m pip list + done + - name: Install Conda for pip packaging test + if: contains(matrix.modules, 'pyspark-errors') + run: | + curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + rm miniconda.sh + # Run the tests. 
+ - name: Run tests + env: ${{ fromJSON(inputs.envs) }} + shell: 'script -q -e -c "bash {0}"' + run: | + if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then + export PATH=$PATH:$HOME/miniconda/bin + export SKIP_PACKAGING=false + echo "Python Packaging Tests Enabled!" + fi + if [ ! -z "$PYTHON_TO_TEST" ]; then + ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" + else + # For branch-3.5 and below, it uses the default Python versions. + ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" + fi + - name: Upload coverage to Codecov + if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' + uses: codecov/codecov-action@v4 + with: + files: ./python/coverage.xml + flags: unittests + name: PySpark + - name: Upload test results to report + env: ${{ fromJSON(inputs.envs) }} + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} + path: "**/target/test-reports/*.xml" + - name: Upload unit tests log files + env: ${{ fromJSON(inputs.envs) }} + if: ${{ !success() }} + uses: actions/upload-artifact@v4 + with: + name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} + path: "**/target/unit-tests.log" + + pyspark-pandas: + needs: [precondition, infra-image] + # always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job) + if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true' + name: "Build modules: ${{ matrix.modules }}" + runs-on: ubuntu-latest + timeout-minutes: 180 + container: + image: ${{ needs.precondition.outputs.image_url }} + strategy: + fail-fast: false + matrix: + java: + - ${{ inputs.java }} + modules: - >- pyspark-pandas - >- pyspark-pandas-slow - - >- - pyspark-connect - >- pyspark-pandas-connect-part0 - >- diff --git a/.github/workflows/build_python.yml 
b/.github/workflows/build_python.yml index 761fd20f0c79..3354fb726368 100644 --- a/.github/workflows/build_python.yml +++ b/.github/workflows/build_python.yml @@ -44,5 +44,6 @@ jobs: } jobs: >- { - "pyspark": "true" + "pyspark": "true", + "pyspark-pandas": "true" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org