This is an automated email from the ASF dual-hosted git repository.
kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new ca93b319 improve dockerfile for better caching (#2930)
ca93b319 is described below
commit ca93b3192be5390b26dbd5e626e158da7ce68414
Author: Kevin Liu <[email protected]>
AuthorDate: Tue Jan 20 10:23:12 2026 -0500
improve dockerfile for better caching (#2930)
<!--
Thanks for opening a pull request!
-->
<!-- In the case this PR will resolve an issue, please replace
${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->
# Rationale for this change
Reorder Dockerfile commands for better caching
All files have the same functionality, just reordered.
## Are these changes tested?
## Are there any user-facing changes?
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
dev/docker-compose-integration.yml | 22 +++++++++---------
dev/hive/Dockerfile | 25 +++++++++++++++------
dev/spark/Dockerfile | 46 +++++++++++++++++---------------------
3 files changed, 51 insertions(+), 42 deletions(-)
diff --git a/dev/docker-compose-integration.yml
b/dev/docker-compose-integration.yml
index 482468a9..03f5684c 100644
--- a/dev/docker-compose-integration.yml
+++ b/dev/docker-compose-integration.yml
@@ -17,21 +17,22 @@
services:
spark-iceberg:
- container_name: pyiceberg-spark
+ image: pyiceberg-spark:latest
build: spark/
+ container_name: pyiceberg-spark
networks:
iceberg_net:
depends_on:
- rest
- hive
- minio
+ ports:
+ - 15002:15002 # Spark Connect
+ - 4040:4040 # Spark UI
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
- ports:
- - 15002:15002 # Spark Connect
- - 4040:4040 # Spark UI
links:
- rest:rest
- hive:hive
@@ -60,10 +61,6 @@ services:
minio:
image: minio/minio
container_name: pyiceberg-minio
- environment:
- - MINIO_ROOT_USER=admin
- - MINIO_ROOT_PASSWORD=password
- - MINIO_DOMAIN=minio
networks:
iceberg_net:
aliases:
@@ -71,14 +68,18 @@ services:
ports:
- 9001:9001
- 9000:9000
+ environment:
+ - MINIO_ROOT_USER=admin
+ - MINIO_ROOT_PASSWORD=password
+ - MINIO_DOMAIN=minio
command: ["server", "/data", "--console-address", ":9001"]
mc:
- depends_on:
- - minio
image: minio/mc
container_name: pyiceberg-mc
networks:
iceberg_net:
+ depends_on:
+ - minio
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
@@ -91,6 +92,7 @@ services:
tail -f /dev/null
"
hive:
+ image: pyiceberg-hive:latest
build: hive/
container_name: pyiceberg-hive
hostname: hive
diff --git a/dev/hive/Dockerfile b/dev/hive/Dockerfile
index 2c87b69e..e46a0357 100644
--- a/dev/hive/Dockerfile
+++ b/dev/hive/Dockerfile
@@ -15,17 +15,28 @@
FROM apache/hive:4.0.0
-ENV HADOOP_VERSION=3.3.6
-ENV AWS_SDK_BUNDLE=1.12.753
+# Dependency versions - changing these invalidates the JAR download layer
+ARG HADOOP_VERSION=3.3.6
+ARG AWS_SDK_BUNDLE=1.12.753
+ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
USER root
-# Install curl, download JARs, and cleanup in a single layer
-RUN apt-get update -qq && apt-get -qq -y install curl && \
- curl
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar
-Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
- curl
https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar
-Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
+# Install curl (separate layer - rarely changes)
+RUN apt-get update -qq && \
+ apt-get -qq -y install --no-install-recommends curl && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+# Download JARs with retry logic (slow layer - only changes when versions
change)
+RUN curl -fsSL --retry 3 --retry-delay 5 \
+ -o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
+
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar"
&& \
+ curl -fsSL --retry 3 --retry-delay 5 \
+ -o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
+
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar"
+
+# Copy configuration last (changes more frequently than JARs)
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
USER hive
diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile
index 4b486c90..0e1f29d1 100644
--- a/dev/spark/Dockerfile
+++ b/dev/spark/Dockerfile
@@ -18,50 +18,46 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
# Dependency versions - keep these compatible
+# Changing these will invalidate the JAR download cache layer
ARG ICEBERG_VERSION=1.10.1
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
ARG HADOOP_VERSION=3.4.1
-ARG SCALA_VERSION=2.13
ARG AWS_SDK_VERSION=2.24.6
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
USER root
WORKDIR ${SPARK_HOME}
-# Install curl for JAR downloads
-RUN apt-get update && \
- apt-get install -y --no-install-recommends curl && \
- rm -rf /var/lib/apt/lists/*
-
-# Copy configuration (early for better caching)
-COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
-
-# Create event log directory
-RUN mkdir -p /home/iceberg/spark-events && \
+# Install curl and create directories
+RUN apt-get update -qq && \
+ apt-get install -qq -y --no-install-recommends curl && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* && \
+ mkdir -p /home/iceberg/spark-events && \
chown -R spark:spark /home/iceberg
-# Required JAR dependencies
-ENV JARS_TO_DOWNLOAD="\
-
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
\
-
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
\
-
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar
\
-
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
-
-# Download JARs with retry logic
+# Download JARs with retry logic (most cacheable - only changes when versions
change)
+# This is the slowest step, so we do it before copying config files
RUN set -e && \
cd "${SPARK_HOME}/jars" && \
- for jar_path in ${JARS_TO_DOWNLOAD}; do \
+ for jar_path in \
+
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar"
\
+
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar"
\
+
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar"
\
+
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar";
\
+ do \
jar_name=$(basename "${jar_path}") && \
- echo "Downloading ${jar_name}..." && \
curl -fsSL --retry 3 --retry-delay 5 \
-o "${jar_name}" \
"${MAVEN_MIRROR}/${jar_path}" && \
- echo "✓ Downloaded ${jar_name}"; \
- done && \
- chown -R spark:spark "${SPARK_HOME}/jars"
+ chown spark:spark "${jar_name}"; \
+ done
+
+# Copy configuration last (changes more frequently than JARs)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
USER spark
WORKDIR ${SPARK_HOME}
# Start Spark Connect server
-CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true
${SPARK_HOME}/sbin/start-connect-server.sh"]
+CMD ["bash", "-c", "SPARK_NO_DAEMONIZE=true
${SPARK_HOME}/sbin/start-connect-server.sh"]