This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5b965f70c057 [SPARK-48239][INFRA] Update the release docker image to follow what we use in Github Action jobs 5b965f70c057 is described below commit 5b965f70c057cb478896feea2456fc59267596df Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Mon May 13 08:26:52 2024 +0900 [SPARK-48239][INFRA] Update the release docker image to follow what we use in Github Action jobs ### What changes were proposed in this pull request? We have GitHub Actions jobs to test package building and doc generation, but their execution environment is different from the one we use for the release process. This PR updates the release docker image to match what we use in GitHub Actions: https://github.com/apache/spark/blob/master/dev/infra/Dockerfile Note: it's not exactly the same, as I had to make some modifications to make it usable for the release process. In the future we should have a better way to unify these two docker files. ### Why are the changes needed? to enable us to release ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? manually ### Was this patch authored or co-authored using generative AI tooling? no Closes #46534 from cloud-fan/re. 
Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- dev/create-release/release-build.sh | 3 + dev/create-release/spark-rm/Dockerfile | 170 +++++++++++++++++++++------------ 2 files changed, 112 insertions(+), 61 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index b720a8fc9386..0fb16aafcbaa 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -80,6 +80,9 @@ done export LC_ALL=C.UTF-8 export LANG=C.UTF-8 +export PYSPARK_PYTHON=/usr/local/bin/python +export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python + # Commit ref to checkout when building GIT_REF=${GIT_REF:-master} diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 13f4112ca03d..adaa4df3f579 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -15,74 +15,122 @@ # limitations under the License. # -# Image for building Spark releases. Based on Ubuntu 20.04. -# -# Includes: -# * Java 17 -# * Ivy -# * Python (3.8.5) -# * R-base/R-base-dev (4.0.3) -# * Ruby (2.7.0) -# -# You can test it as below: -# cd dev/create-release/spark-rm -# docker build -t spark-rm --build-arg UID=$UID . +# Image for building Spark releases. Based on Ubuntu 22.04. +FROM ubuntu:jammy-20240227 -FROM ubuntu:20.04 +ENV FULL_REFRESH_DATE 20240318 -# For apt to be noninteractive ENV DEBIAN_FRONTEND noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN true -# These arguments are just for reuse and not really meant to be customized. 
-ARG APT_INSTALL="apt-get install --no-install-recommends -y" +RUN apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + gfortran \ + git \ + subversion \ + gnupg \ + libcurl4-openssl-dev \ + libfontconfig1-dev \ + libfreetype6-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + libpng-dev \ + libpython3-dev \ + libssl-dev \ + libtiff5-dev \ + libxml2-dev \ + nodejs \ + npm \ + openjdk-17-jdk-headless \ + pandoc \ + pkg-config \ + python3.10 \ + python3-psutil \ + texlive-latex-base \ + texlive \ + texlive-fonts-extra \ + texinfo \ + texlive-latex-extra \ + qpdf \ + r-base \ + ruby \ + ruby-dev \ + software-properties-common \ + wget \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* -ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==2.0.3 pyarrow==10.0.1 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 grpcio-status==1.62.0 googleapis-common-protos==1.56.4" -ARG GEM_PKGS="bundler:2.4.22" -# Install extra needed repos and refresh. -# - CRAN repo -# - Ruby repo (for doc generation) -# -# This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch -# the most current package versions (instead of potentially using old versions cached by docker). 
-RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ - echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> /etc/apt/sources.list && \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ - gpg -a --export E084DAB9 | apt-key add - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - apt-get clean && \ - apt-get update && \ - $APT_INSTALL software-properties-common && \ - apt-get update && \ - # Install openjdk 17. - $APT_INSTALL openjdk-17-jdk && \ - update-alternatives --set java $(ls /usr/lib/jvm/java-17-openjdk-*/bin/java) && \ - # Install build / source control tools - $APT_INSTALL curl wget git maven ivy subversion make gcc lsof libffi-dev \ - pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev && \ - curl -sL https://deb.nodesource.com/setup_12.x | bash && \ - $APT_INSTALL nodejs && \ - # Install needed python packages. Use pip for installing packages (for consistency). - $APT_INSTALL python-is-python3 python3-pip python3-setuptools && \ - # qpdf is required for CRAN checks to pass. - $APT_INSTALL qpdf jq && \ - pip3 install $PIP_PKGS && \ - # Install R packages and dependencies used when building. - # R depends on pandoc*, libssl (which are installed above). 
- # Note that PySpark doc generation also needs pandoc due to nbsphinx - $APT_INSTALL r-base r-base-dev && \ - $APT_INSTALL libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev && \ - $APT_INSTALL texlive-latex-base texlive texlive-fonts-extra texinfo qpdf texlive-latex-extra && \ - $APT_INSTALL libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev && \ - Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" && \ - Rscript -e "devtools::install_github('jimhester/lintr')" && \ - Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ - Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" && \ - # Install tools needed to build the documentation. - $APT_INSTALL ruby2.7 ruby2.7-dev && \ - gem install --no-document $GEM_PKGS +RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list +RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN gpg -a --export E084DAB9 | apt-key add - +RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' + +# See more in SPARK-39959, roxygen2 < 7.2.1 +RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', \ + 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', \ + 'ggplot2', 'mvtnorm', 'statmod', 'xml2'), repos='https://cloud.r-project.org/')" && \ + Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" && \ + Rscript -e 
"devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" + +# See more in SPARK-39735 +ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library" + + +RUN add-apt-repository ppa:pypy/ppa +RUN mkdir -p /usr/local/pypy/pypy3.9 && \ + curl -sqL https://downloads.python.org/pypy/pypy3.9-v7.3.16-linux64.tar.bz2 | tar xjf - -C /usr/local/pypy/pypy3.9 --strip-components=1 && \ + ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.8 && \ + ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3 +RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.2' scipy coverage matplotlib lxml + + +ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.2 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2" +# Python deps for Spark Connect +ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4" + +# Install Python 3.10 packages +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +RUN python3.10 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.10 -m pip install --ignore-installed 'six==1.16.0' # Avoid `python3-six` installation +RUN python3.10 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.10 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.10 -m pip install deepspeed torcheval && \ + python3.10 -m pip cache purge + +# Install Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && apt-get install -y \ + python3.9 python3.9-distutils \ + && rm -rf /var/lib/apt/lists/* +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 +RUN python3.9 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs this +RUN python3.9 -m pip install --force $BASIC_PIP_PKGS 
unittest-xml-reporting $CONNECT_PIP_PKGS && \ + python3.9 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \ + python3.9 -m pip install torcheval && \ + python3.9 -m pip cache purge + +# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 +# See 'ipython_genutils' in SPARK-38517 +# See 'docutils<0.18.0' in SPARK-39421 +RUN python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ +ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ +'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ +'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ +'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' +RUN python3.9 -m pip list + +RUN gem install --no-document "bundler:2.4.22" +RUN ln -s "$(which python3.9)" "/usr/local/bin/python" WORKDIR /opt/spark-rm/output --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org