This is an automated email from the ASF dual-hosted git repository.

yikun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark-docker.git
The following commit(s) were added to refs/heads/master by this push:
     new 7f83637  [SPARK-43365] Refactor Dockerfile and workflow based on base image
7f83637 is described below

commit 7f836378d8bfe453b7e1dba304b54cb1cfacda49
Author: Yikun Jiang <yikunk...@gmail.com>
AuthorDate: Sat May 6 09:15:41 2023 +0800

    [SPARK-43365] Refactor Dockerfile and workflow based on base image

    ### What changes were proposed in this pull request?
    This PR refactors the Dockerfiles and the workflow to build on a shared base image, saving space by letting the derived images share layers with the base image.

    After this PR:
    - The Spark / PySpark / SparkR related files are extracted into the base image.
    - The PySpark / SparkR dependencies are installed in the PySpark / SparkR images.
    - A base image build step is added to the workflow.
    - The templates are updated so that `./add-dockerfiles.sh 3.4.0` produces the new layout.
    - To keep the PR easy to review, the 3.3.x Dockerfiles are not changed here; they will be updated in a separate PR once all comments for 3.4.0 are addressed.

    [1] https://github.com/docker-library/official-images/pull/13089?notification_referrer_id=NT_kwDOABp-orI0MzIwMzMwNzY5OjE3MzYzNTQ#issuecomment-1533540388

    ### Why are the changes needed?
    To address the DOI (Docker Official Images) review comments, and to save space by sharing layers between images.

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    CI passed.

    Closes #36 from Yikun/official.

    Authored-by: Yikun Jiang <yikunk...@gmail.com>
    Signed-off-by: Yikun Jiang <yikunk...@gmail.com>
---
 .github/workflows/main.yml | 20 ++++ 3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile | 63 +----------- .../entrypoint.sh | 114 --------------------- 3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile | 63 +----------- .../scala2.12-java11-python3-ubuntu/entrypoint.sh | 114 --------------------- 3.4.0/scala2.12-java11-r-ubuntu/Dockerfile | 60 +---------- 3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh | 107 ------------------- 3.4.0/scala2.12-java11-ubuntu/Dockerfile | 4 + 3.4.0/scala2.12-java11-ubuntu/entrypoint.sh | 7 ++ Dockerfile.template | 15 --- add-dockerfiles.sh | 9 +- entrypoint.sh.template | 2 - add-dockerfiles.sh => r-python.template | 54 +++------- tools/template.py | 16 +++ 14 files changed, 77 insertions(+), 571 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fd37990..c1d0c56 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -91,10 +91,12 @@ jobs: scala) SUFFIX=ubuntu ;; esac + BASE_IMGAE_TAG=${{ inputs.spark }}-scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu TAG=scala${{ inputs.scala }}-java${{ inputs.java }}-$SUFFIX IMAGE_NAME=spark IMAGE_PATH=${{ inputs.spark }}/$TAG + BASE_IMAGE_PATH=${{ inputs.spark }}/scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu if [ "${{ inputs.build }}" == "true" ]; then # Use the local registry to build and test REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') @@ -105,6 +107,7 @@ jobs: TEST_REPO=${{ inputs.repository }} UNIQUE_IMAGE_TAG=${{ inputs.image-tag }} fi + BASE_IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$BASE_IMGAE_TAG IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$UNIQUE_IMAGE_TAG PUBLISH_REPO=${{ inputs.repository }} @@ -116,8 +119,12 @@ jobs: echo "TEST_REPO=${TEST_REPO}" >> $GITHUB_ENV # Image name: spark echo "IMAGE_NAME=${IMAGE_NAME}" >> $GITHUB_ENV + # Base Image Dockerfile: 3.3.0/scala2.12-java11-ubuntu + echo "BASE_IMAGE_PATH=${BASE_IMAGE_PATH}" >> $GITHUB_ENV # Image dockerfile path: 3.3.0/scala2.12-java11-python3-ubuntu echo "IMAGE_PATH=${IMAGE_PATH}" >> $GITHUB_ENV + # Base Image URL:
spark:3.3.0-scala2.12-java11-ubuntu + echo "BASE_IMAGE_URL=${BASE_IMAGE_URL}" >> $GITHUB_ENV # Image URL: ghcr.io/apache/spark-docker/spark:3.3.0-scala2.12-java11-python3-ubuntu echo "IMAGE_URL=${IMAGE_URL}" >> $GITHUB_ENV @@ -132,6 +139,9 @@ jobs: echo "IMAGE_PATH: "${IMAGE_PATH} echo "IMAGE_URL: "${IMAGE_URL} + echo "BASE_IMAGE_PATH: "${BASE_IMAGE_PATH} + echo "BASE_IMAGE_URL: "${BASE_IMAGE_URL} + echo "PUBLISH_REPO:"${PUBLISH_REPO} echo "PUBLISH_IMAGE_URL:"${PUBLISH_IMAGE_URL} @@ -146,10 +156,20 @@ jobs: # This required by local registry driver-opts: network=host + - name: Build - Build the base image + if: ${{ inputs.build }} + uses: docker/build-push-action@v3 + with: + context: ${{ env.BASE_IMAGE_PATH }} + tags: ${{ env.BASE_IMAGE_URL }} + platforms: linux/amd64,linux/arm64 + push: true + - name: Build - Build and push test image if: ${{ inputs.build }} uses: docker/build-push-action@v3 with: + build-args: BASE_IMAGE=${{ env.BASE_IMAGE_URL }} context: ${{ env.IMAGE_PATH }} tags: ${{ env.IMAGE_URL }} platforms: linux/amd64,linux/arm64 diff --git a/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile index 4f62e8d..86337c5 100644 --- a/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile @@ -14,73 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM eclipse-temurin:11-jre-focal - -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark +ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu +FROM $BASE_IMAGE RUN set -ex && \ apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ apt install -y python3 python3-pip && \ apt install -y r-base r-base-dev && \ - mkdir -p /opt/spark && \ - mkdir /opt/spark/python && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \ - GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1 - -RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv python/pyspark /opt/spark/python/pyspark/; \ - mv python/lib /opt/spark/python/lib/; \ - mv R /opt/spark/; \ - cd ..; \ 
- rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV SPARK_HOME /opt/spark ENV R_HOME /usr/lib/R - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh deleted file mode 100644 index 4bb1557..0000000 --- a/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -if ! [ -z ${PYSPARK_PYTHON+x} ]; then - export PYSPARK_PYTHON -fi -if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then - export PYSPARK_DRIVER_PYTHON -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! 
[ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile index 2be0cb4..540805f 100644 --- a/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile @@ -14,70 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM eclipse-temurin:11-jre-focal - -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark +ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu +FROM $BASE_IMAGE RUN set -ex && \ apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ apt install -y python3 python3-pip && \ - mkdir -p /opt/spark && \ - mkdir /opt/spark/python && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* - -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \ - GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1 - -RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv python/pyspark 
/opt/spark/python/pyspark/; \ - mv python/lib /opt/spark/python/lib/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV SPARK_HOME /opt/spark - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh deleted file mode 100644 index 4bb1557..0000000 --- a/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -if ! [ -z ${PYSPARK_PYTHON+x} ]; then - export PYSPARK_PYTHON -fi -if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then - export PYSPARK_DRIVER_PYTHON -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! 
[ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile index 22fe82b..c65c2ce 100644 --- a/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile @@ -14,69 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # -FROM eclipse-temurin:11-jre-focal - -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark +ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu +FROM $BASE_IMAGE RUN set -ex && \ apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ apt install -y r-base r-base-dev && \ - mkdir -p /opt/spark && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \ - GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1 - -RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv R /opt/spark/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV 
SPARK_HOME /opt/spark ENV R_HOME /usr/lib/R - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh deleted file mode 100644 index 159d539..0000000 --- a/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! 
[ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.4.0/scala2.12-java11-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-ubuntu/Dockerfile index 4e3df64..997b8d3 100644 --- a/3.4.0/scala2.12-java11-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-ubuntu/Dockerfile @@ -26,6 +26,7 @@ RUN set -ex && \ ln -s /lib /lib64 && \ apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ mkdir -p /opt/spark && \ + mkdir /opt/spark/python && \ mkdir -p /opt/spark/examples && \ mkdir -p /opt/spark/work-dir && \ touch /opt/spark/RELEASE && \ @@ -64,6 +65,9 @@ RUN set -ex; \ mv examples /opt/spark/; \ mv kubernetes/tests /opt/spark/; \ mv data /opt/spark/; \ + mv python/pyspark /opt/spark/python/pyspark/; \ + mv python/lib /opt/spark/python/lib/; \ + mv R /opt/spark/; \ cd ..; \ rm -rf "$SPARK_TMP"; diff --git a/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh index 159d539..4bb1557 100644 --- a/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh +++ b/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh @@ -45,6 +45,13 @@ if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" fi +if ! [ -z ${PYSPARK_PYTHON+x} ]; then + export PYSPARK_PYTHON +fi +if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then + export PYSPARK_DRIVER_PYTHON +fi + # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. 
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then diff --git a/Dockerfile.template b/Dockerfile.template index 4819cb2..5fe4f25 100644 --- a/Dockerfile.template +++ b/Dockerfile.template @@ -25,16 +25,8 @@ RUN set -ex && \ apt-get update && \ ln -s /lib /lib64 && \ apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ - {%- if HAVE_PY %} - apt install -y python3 python3-pip && \ - {%- endif %} - {%- if HAVE_R %} - apt install -y r-base r-base-dev && \ - {%- endif %} mkdir -p /opt/spark && \ - {%- if HAVE_PY %} mkdir /opt/spark/python && \ - {%- endif %} mkdir -p /opt/spark/examples && \ mkdir -p /opt/spark/work-dir && \ touch /opt/spark/RELEASE && \ @@ -73,22 +65,15 @@ RUN set -ex; \ mv examples /opt/spark/; \ mv kubernetes/tests /opt/spark/; \ mv data /opt/spark/; \ - {%- if HAVE_PY %} mv python/pyspark /opt/spark/python/pyspark/; \ mv python/lib /opt/spark/python/lib/; \ - {%- endif %} - {%- if HAVE_R %} mv R /opt/spark/; \ - {%- endif %} cd ..; \ rm -rf "$SPARK_TMP"; COPY entrypoint.sh /opt/ ENV SPARK_HOME /opt/spark -{%- if HAVE_R %} -ENV R_HOME /usr/lib/R -{%- endif %} WORKDIR /opt/spark/work-dir RUN chmod g+w /opt/spark/work-dir diff --git a/add-dockerfiles.sh b/add-dockerfiles.sh index 1683f33..7dcd7b0 100755 --- a/add-dockerfiles.sh +++ b/add-dockerfiles.sh @@ -48,6 +48,11 @@ for TAG in $TAGS; do OPTS+=" --spark-version $VERSION" mkdir -p $VERSION/$TAG - python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh - python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile + + if [ "$TAG" == "scala2.12-java11-ubuntu" ]; then + python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile + python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh + else + python3 tools/template.py $OPTS -f r-python.template > $VERSION/$TAG/Dockerfile + fi done diff --git a/entrypoint.sh.template b/entrypoint.sh.template index dd56d84..4bb1557 100644 --- a/entrypoint.sh.template +++ b/entrypoint.sh.template @@ -44,7 +44,6 @@ readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" fi -{%- if HAVE_PY %} if ! [ -z ${PYSPARK_PYTHON+x} ]; then export PYSPARK_PYTHON @@ -52,7 +51,6 @@ fi if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then export PYSPARK_DRIVER_PYTHON fi -{%- endif %} # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. diff --git a/add-dockerfiles.sh b/r-python.template old mode 100755 new mode 100644 similarity index 50% copy from add-dockerfiles.sh copy to r-python.template index 1683f33..fec4e70 --- a/add-dockerfiles.sh +++ b/r-python.template @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -16,38 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -# Usage: $0 [version] -# Generate dockerfiles for specified spark version. 
-# -# Examples: -# - Add 3.3.0 dockerfiles: -# $ ./add-dockerfiles.sh -# - Add 3.3.1 dockerfiles: -# $ ./add-dockerfiles.sh 3.3.1 - -VERSION=${1:-"3.3.0"} - -TAGS=" -scala2.12-java11-python3-r-ubuntu -scala2.12-java11-python3-ubuntu -scala2.12-java11-r-ubuntu -scala2.12-java11-ubuntu -" - -for TAG in $TAGS; do - OPTS="" - if echo $TAG | grep -q "python"; then - OPTS+=" --pyspark" - fi - - if echo $TAG | grep -q "r-"; then - OPTS+=" --sparkr" - fi - - OPTS+=" --spark-version $VERSION" - - mkdir -p $VERSION/$TAG - python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh - python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile -done +ARG BASE_IMAGE=spark:{{ SPARK_VERSION }}-scala{{ SCALA_VERSION }}-java{{ JAVA_VERSION }}-ubuntu +FROM $BASE_IMAGE + +RUN set -ex && \ + apt-get update && \ + {%- if HAVE_PY %} + apt install -y python3 python3-pip && \ + {%- endif %} + {%- if HAVE_R %} + apt install -y r-base r-base-dev && \ + {%- endif %} + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* +{%- if HAVE_R %} + +ENV R_HOME /usr/lib/R +{%- endif %} diff --git a/tools/template.py b/tools/template.py index 693182b..cb74cc3 100755 --- a/tools/template.py +++ b/tools/template.py @@ -50,6 +50,20 @@ def parse_opts(): default="3.3.0", ) + parser.add_argument( + "-j", + "--java-version", + help="The Spark version of Dockerfile.", + default="11", + ) + + parser.add_argument( + "-s", + "--scala-version", + help="The Spark version of Dockerfile.", + default="2.12", + ) + parser.add_argument( "-i", "--image", @@ -88,6 +102,8 @@ def main(): HAVE_R=opts.sparkr, SPARK_VERSION=opts.spark_version, SPARK_GPG_KEY=GPG_KEY_DICT.get(opts.spark_version), + JAVA_VERSION=opts.java_version, + SCALA_VERSION=opts.scala_version, ) ) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
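For reference, a minimal local sketch of the two-step build flow this commit introduces: the base image is built first, then a language-specific image is built on top of it via the `BASE_IMAGE` build-arg consumed by `ARG BASE_IMAGE=...` / `FROM $BASE_IMAGE`. Tags and context paths follow the 3.4.0 examples above and are illustrative only; the CI workflow itself uses docker/build-push-action with a local registry and multi-arch platforms rather than plain `docker build`.

```bash
# Build the shared base image (Spark distribution only, no Python/R dependencies).
docker build \
  -t spark:3.4.0-scala2.12-java11-ubuntu \
  3.4.0/scala2.12-java11-ubuntu

# Build the PySpark image on top of the base image; BASE_IMAGE overrides the
# default declared in the generated Dockerfile.
docker build \
  --build-arg BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu \
  -t spark:3.4.0-scala2.12-java11-python3-ubuntu \
  3.4.0/scala2.12-java11-python3-ubuntu
```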