This is an automated email from the ASF dual-hosted git repository.
kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new ca93b319 improve dockerfile for better caching (#2930)
ca93b319 is described below
commit ca93b3192be5390b26dbd5e626e158da7ce68414
Author: Kevin Liu <[email protected]>
AuthorDate: Tue Jan 20 10:23:12 2026 -0500
improve dockerfile for better caching (#2930)
<!--
Thanks for opening a pull request!
-->
<!-- In the case this PR will resolve an issue, please replace
${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->
# Rationale for this change
Reorder Dockerfile commands for better caching
All files have the same functionality, just reordered.
## Are these changes tested?
## Are there any user-facing changes?
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
dev/docker-compose-integration.yml | 22 +++++++++---------
dev/hive/Dockerfile | 25 +++++++++++++++------
dev/spark/Dockerfile | 46 +++++++++++++++++---------------------
3 files changed, 51 insertions(+), 42 deletions(-)
diff --git a/dev/docker-compose-integration.yml
b/dev/docker-compose-integration.yml
index 482468a9..03f5684c 100644
--- a/dev/docker-compose-integration.yml
+++ b/dev/docker-compose-integration.yml
@@ -17,21 +17,22 @@
services:
spark-iceberg:
- container_name: pyiceberg-spark
+ image: pyiceberg-spark:latest
build: spark/
+ container_name: pyiceberg-spark
networks:
iceberg_net:
depends_on:
- rest
- hive
- minio
+ ports:
+ - 15002:15002 # Spark Connect
+ - 4040:4040 # Spark UI
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- AWS_REGION=us-east-1
- ports:
- - 15002:15002 # Spark Connect
- - 4040:4040 # Spark UI
links:
- rest:rest
- hive:hive
@@ -60,10 +61,6 @@ services:
minio:
image: minio/minio
container_name: pyiceberg-minio
- environment:
- - MINIO_ROOT_USER=admin
- - MINIO_ROOT_PASSWORD=password
- - MINIO_DOMAIN=minio
networks:
iceberg_net:
aliases:
@@ -71,14 +68,18 @@ services:
ports:
- 9001:9001
- 9000:9000
+ environment:
+ - MINIO_ROOT_USER=admin
+ - MINIO_ROOT_PASSWORD=password
+ - MINIO_DOMAIN=minio
command: ["server", "/data", "--console-address", ":9001"]
mc:
- depends_on:
- - minio
image: minio/mc
container_name: pyiceberg-mc
networks:
iceberg_net:
+ depends_on:
+ - minio
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
@@ -91,6 +92,7 @@ services:
tail -f /dev/null
"
hive:
+ image: pyiceberg-hive:latest
build: hive/
container_name: pyiceberg-hive
hostname: hive
diff --git a/dev/hive/Dockerfile b/dev/hive/Dockerfile
index 2c87b69e..e46a0357 100644
--- a/dev/hive/Dockerfile
+++ b/dev/hive/Dockerfile
@@ -15,17 +15,28 @@
FROM apache/hive:4.0.0
-ENV HADOOP_VERSION=3.3.6
-ENV AWS_SDK_BUNDLE=1.12.753
+# Dependency versions - changing these invalidates the JAR download layer
+ARG HADOOP_VERSION=3.3.6
+ARG AWS_SDK_BUNDLE=1.12.753
+ARG MAVEN_MIRROR=https://repo1.maven.org/maven2
USER root
-# Install curl, download JARs, and cleanup in a single layer
-RUN apt-get update -qq && apt-get -qq -y install curl && \
- curl
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar
-Lo /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar && \
- curl
https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar
-Lo /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
+# Install curl (separate layer - rarely changes)
+RUN apt-get update -qq && \
+ apt-get -qq -y install --no-install-recommends curl && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+# Download JARs with retry logic (slow layer - only changes when versions
change)
+RUN curl -fsSL --retry 3 --retry-delay 5 \
+ -o /opt/hive/lib/hadoop-aws-${HADOOP_VERSION}.jar \
+
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar"
&& \
+ curl -fsSL --retry 3 --retry-delay 5 \
+ -o /opt/hive/lib/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar \
+
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_BUNDLE}/aws-java-sdk-bundle-${AWS_SDK_BUNDLE}.jar"
+
+# Copy configuration last (changes more frequently than JARs)
COPY core-site.xml /opt/hadoop/etc/hadoop/core-site.xml
USER hive
diff --git a/dev/spark/Dockerfile b/dev/spark/Dockerfile
index 4b486c90..0e1f29d1 100644
--- a/dev/spark/Dockerfile
+++ b/dev/spark/Dockerfile
@@ -18,50 +18,46 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
# Dependency versions - keep these compatible
+# Changing these will invalidate the JAR download cache layer
ARG ICEBERG_VERSION=1.10.1
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
ARG HADOOP_VERSION=3.4.1
-ARG SCALA_VERSION=2.13
ARG AWS_SDK_VERSION=2.24.6
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
USER root
WORKDIR ${SPARK_HOME}
-# Install curl for JAR downloads
-RUN apt-get update && \
- apt-get install -y --no-install-recommends curl && \
- rm -rf /var/lib/apt/lists/*
-
-# Copy configuration (early for better caching)
-COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
-
-# Create event log directory
-RUN mkdir -p /home/iceberg/spark-events && \
+# Install curl and create directories
+RUN apt-get update -qq && \
+ apt-get install -qq -y --no-install-recommends curl && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* && \
+ mkdir -p /home/iceberg/spark-events && \
chown -R spark:spark /home/iceberg
-# Required JAR dependencies
-ENV JARS_TO_DOWNLOAD="\
-
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
\
-
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
\
-
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar
\
-
software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
-
-# Download JARs with retry logic
+# Download JARs with retry logic (most cacheable - only changes when versions
change)
+# This is the slowest step, so we do it before copying config files
RUN set -e && \
cd "${SPARK_HOME}/jars" && \
- for jar_path in ${JARS_TO_DOWNLOAD}; do \
+ for jar_path in \
+
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar"
\
+
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar"
\
+
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar"
\
+
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar";
\
+ do \
jar_name=$(basename "${jar_path}") && \
- echo "Downloading ${jar_name}..." && \
curl -fsSL --retry 3 --retry-delay 5 \
-o "${jar_name}" \
"${MAVEN_MIRROR}/${jar_path}" && \
- echo "✓ Downloaded ${jar_name}"; \
- done && \
- chown -R spark:spark "${SPARK_HOME}/jars"
+ chown spark:spark "${jar_name}"; \
+ done
+
+# Copy configuration last (changes more frequently than JARs)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
USER spark
WORKDIR ${SPARK_HOME}
# Start Spark Connect server
-CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true
${SPARK_HOME}/sbin/start-connect-server.sh"]
+CMD ["bash", "-c", "SPARK_NO_DAEMONIZE=true
${SPARK_HOME}/sbin/start-connect-server.sh"]