This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5b965f70c057 [SPARK-48239][INFRA] Update the release docker image to 
follow what we use in Github Action jobs
5b965f70c057 is described below

commit 5b965f70c057cb478896feea2456fc59267596df
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Mon May 13 08:26:52 2024 +0900

    [SPARK-48239][INFRA] Update the release docker image to follow what we use 
in Github Action jobs
    
    ### What changes were proposed in this pull request?
    
    We have GitHub Actions jobs that test package building and doc generation, but 
their execution environment differs from the one we use for the release process.
    
    This PR updates the release docker image to follow what we use in GitHub 
Actions: https://github.com/apache/spark/blob/master/dev/infra/Dockerfile
    
    Note: it is not exactly the same, as I had to make some modifications so it is 
usable for the release process. In the future we should find a better way to 
unify these two Dockerfiles.
    
    ### Why are the changes needed?
    
    To enable us to perform releases.
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    manually
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #46534 from cloud-fan/re.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 dev/create-release/release-build.sh    |   3 +
 dev/create-release/spark-rm/Dockerfile | 170 +++++++++++++++++++++------------
 2 files changed, 112 insertions(+), 61 deletions(-)

diff --git a/dev/create-release/release-build.sh 
b/dev/create-release/release-build.sh
index b720a8fc9386..0fb16aafcbaa 100755
--- a/dev/create-release/release-build.sh
+++ b/dev/create-release/release-build.sh
@@ -80,6 +80,9 @@ done
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
+export PYSPARK_PYTHON=/usr/local/bin/python
+export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python
+
 # Commit ref to checkout when building
 GIT_REF=${GIT_REF:-master}
 
diff --git a/dev/create-release/spark-rm/Dockerfile 
b/dev/create-release/spark-rm/Dockerfile
index 13f4112ca03d..adaa4df3f579 100644
--- a/dev/create-release/spark-rm/Dockerfile
+++ b/dev/create-release/spark-rm/Dockerfile
@@ -15,74 +15,122 @@
 # limitations under the License.
 #
 
-# Image for building Spark releases. Based on Ubuntu 20.04.
-#
-# Includes:
-# * Java 17
-# * Ivy
-# * Python (3.8.5)
-# * R-base/R-base-dev (4.0.3)
-# * Ruby (2.7.0)
-#
-# You can test it as below:
-#   cd dev/create-release/spark-rm
-#   docker build -t spark-rm --build-arg UID=$UID .
+# Image for building Spark releases. Based on Ubuntu 22.04.
+FROM ubuntu:jammy-20240227
 
-FROM ubuntu:20.04
+ENV FULL_REFRESH_DATE 20240318
 
-# For apt to be noninteractive
 ENV DEBIAN_FRONTEND noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN true
 
-# These arguments are just for reuse and not really meant to be customized.
-ARG APT_INSTALL="apt-get install --no-install-recommends -y"
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    gfortran \
+    git \
+    subversion \
+    gnupg \
+    libcurl4-openssl-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libfribidi-dev \
+    libgit2-dev \
+    libharfbuzz-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libpng-dev \
+    libpython3-dev \
+    libssl-dev \
+    libtiff5-dev \
+    libxml2-dev \
+    nodejs \
+    npm \
+    openjdk-17-jdk-headless \
+    pandoc \
+    pkg-config \
+    python3.10 \
+    python3-psutil \
+    texlive-latex-base \
+    texlive \
+    texlive-fonts-extra \
+    texinfo \
+    texlive-latex-extra \
+    qpdf \
+    r-base \
+    ruby \
+    ruby-dev \
+    software-properties-common \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
 
-ARG PIP_PKGS="sphinx==4.5.0 mkdocs==1.1.2 numpy==1.20.3 
pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 
jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 
sphinx-copybutton==0.5.2 pandas==2.0.3 pyarrow==10.0.1 plotly==5.4.0 
markupsafe==2.0.1 docutils<0.17 grpcio==1.62.0 protobuf==4.21.6 
grpcio-status==1.62.0 googleapis-common-protos==1.56.4"
-ARG GEM_PKGS="bundler:2.4.22"
 
-# Install extra needed repos and refresh.
-# - CRAN repo
-# - Ruby repo (for doc generation)
-#
-# This is all in a single "RUN" command so that if anything changes, "apt 
update" is run to fetch
-# the most current package versions (instead of potentially using old versions 
cached by docker).
-RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \
-  echo 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' >> 
/etc/apt/sources.list && \
-  gpg --keyserver hkps://keyserver.ubuntu.com --recv-key 
E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
-  gpg -a --export E084DAB9 | apt-key add - && \
-  apt-get clean && \
-  rm -rf /var/lib/apt/lists/* && \
-  apt-get clean && \
-  apt-get update && \
-  $APT_INSTALL software-properties-common && \
-  apt-get update && \
-  # Install openjdk 17.
-  $APT_INSTALL openjdk-17-jdk && \
-  update-alternatives --set java $(ls /usr/lib/jvm/java-17-openjdk-*/bin/java) 
&& \
-  # Install build / source control tools
-  $APT_INSTALL curl wget git maven ivy subversion make gcc lsof libffi-dev \
-    pandoc pandoc-citeproc libssl-dev libcurl4-openssl-dev libxml2-dev && \
-  curl -sL https://deb.nodesource.com/setup_12.x | bash && \
-  $APT_INSTALL nodejs && \
-  # Install needed python packages. Use pip for installing packages (for 
consistency).
-  $APT_INSTALL python-is-python3 python3-pip python3-setuptools && \
-  # qpdf is required for CRAN checks to pass.
-  $APT_INSTALL qpdf jq && \
-  pip3 install $PIP_PKGS && \
-  # Install R packages and dependencies used when building.
-  # R depends on pandoc*, libssl (which are installed above).
-  # Note that PySpark doc generation also needs pandoc due to nbsphinx
-  $APT_INSTALL r-base r-base-dev && \
-  $APT_INSTALL libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev && \
-  $APT_INSTALL texlive-latex-base texlive texlive-fonts-extra texinfo qpdf 
texlive-latex-extra && \
-  $APT_INSTALL libfontconfig1-dev libharfbuzz-dev libfribidi-dev 
libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev && \
-  Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 
'testthat', 'knitr', 'rmarkdown', 'markdown', 'roxygen2', 'e1071', 'survival'), 
repos='https://cloud.r-project.org/')" && \
-  Rscript -e "devtools::install_github('jimhester/lintr')" && \
-  Rscript -e "devtools::install_version('pkgdown', version='2.0.1', 
repos='https://cloud.r-project.org')" && \
-  Rscript -e "devtools::install_version('preferably', version='0.4', 
repos='https://cloud.r-project.org')" && \
-  # Install tools needed to build the documentation.
-  $APT_INSTALL ruby2.7 ruby2.7-dev && \
-  gem install --no-document $GEM_PKGS
+RUN echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> 
/etc/apt/sources.list
+RUN gpg --keyserver hkps://keyserver.ubuntu.com --recv-key 
E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN gpg -a --export E084DAB9 | apt-key add -
+RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu 
jammy-cran40/'
+
+# See more in SPARK-39959, roxygen2 < 7.2.1
+RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown',  \
+    'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow',  \
+    'ggplot2', 'mvtnorm', 'statmod', 'xml2'), 
repos='https://cloud.r-project.org/')" && \
+    Rscript -e "devtools::install_version('roxygen2', version='7.2.0', 
repos='https://cloud.r-project.org')" && \
+    Rscript -e "devtools::install_version('lintr', version='2.0.1', 
repos='https://cloud.r-project.org')" && \
+    Rscript -e "devtools::install_version('pkgdown', version='2.0.1', 
repos='https://cloud.r-project.org')" && \
+    Rscript -e "devtools::install_version('preferably', version='0.4', 
repos='https://cloud.r-project.org')"
+
+# See more in SPARK-39735
+ENV R_LIBS_SITE 
"/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"
+
+
+RUN add-apt-repository ppa:pypy/ppa
+RUN mkdir -p /usr/local/pypy/pypy3.9 && \
+    curl -sqL 
https://downloads.python.org/pypy/pypy3.9-v7.3.16-linux64.tar.bz2 | tar xjf - 
-C /usr/local/pypy/pypy3.9 --strip-components=1 && \
+    ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.8 && \
+    ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
+RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.2' scipy coverage 
matplotlib lxml
+
+
+ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.2 scipy 
plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 
scikit-learn>=1.3.2"
+# Python deps for Spark Connect
+ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 
googleapis-common-protos==1.56.4"
+
+# Install Python 3.10 packages
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+RUN python3.10 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs 
this
+RUN python3.10 -m pip install --ignore-installed 'six==1.16.0'  # Avoid 
`python3-six` installation
+RUN python3.10 -m pip install $BASIC_PIP_PKGS unittest-xml-reporting 
$CONNECT_PIP_PKGS && \
+    python3.10 -m pip install torch torchvision --index-url 
https://download.pytorch.org/whl/cpu && \
+    python3.10 -m pip install deepspeed torcheval && \
+    python3.10 -m pip cache purge
+
+# Install Python 3.9
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.9 python3.9-distutils \
+    && rm -rf /var/lib/apt/lists/*
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
+RUN python3.9 -m pip install --ignore-installed blinker>=1.6.2 # mlflow needs 
this
+RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting 
$CONNECT_PIP_PKGS && \
+    python3.9 -m pip install torch torchvision --index-url 
https://download.pytorch.org/whl/cpu && \
+    python3.9 -m pip install torcheval && \
+    python3.9 -m pip cache purge
+
+# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
+# See 'ipython_genutils' in SPARK-38517
+# See 'docutils<0.18.0' in SPARK-39421
+RUN python3.9 -m pip install 'sphinx==4.5.0' mkdocs 
'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 
markupsafe 'pyzmq<24.0.0' \
+ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow 
pandas 'plotly>=4.8' 'docutils<0.18.0' \
+'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 
'black==23.9.1' \
+'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 
'googleapis-common-protos-stubs==2.2.0' \
+'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 
'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 
'sphinxcontrib-serializinghtml==1.1.5'
+RUN python3.9 -m pip list
+
+RUN gem install --no-document "bundler:2.4.22"
+RUN ln -s "$(which python3.9)" "/usr/local/bin/python"
 
 WORKDIR /opt/spark-rm/output
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to