This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 5ee5eeac CI: Use Spark base image for Docker (#2540)
5ee5eeac is described below
commit 5ee5eeac076b7d3e12ced7f59d4331c2e2f2ceec
Author: Kevin Liu <[email protected]>
AuthorDate: Thu Oct 2 12:51:11 2025 -0700
CI: Use Spark base image for Docker (#2540)
# Rationale for this change

Closes #1527

This PR modifies `dev/Dockerfile` to use the official Spark image as the base image, instead of downloading a Spark binary distribution during the build. This should be more reliable and likely faster on GitHub runners.

This PR also:
- modifies `dev/provision.py` to connect through Spark Connect (see the sketch below)
- adds a healthcheck to the Spark Docker container
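For reference, here is a minimal sketch of how a client now reaches the container, assuming the Spark Connect port 15002 is published on localhost (as configured in `dev/docker-compose-integration.yml`) and that `pyspark` with the Connect client is installed. This is illustration only, not part of the diff below:

```python
# Minimal sketch: connect to the Spark Connect server started by the
# spark-iceberg container, after `make test-integration-setup` has run.
from pyspark.sql import SparkSession

# dev/provision.py builds its session the same way (sc://localhost:15002).
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Illustrative query; the `rest` catalog name is the one provision.py targets.
spark.sql("SHOW NAMESPACES IN rest").show()

spark.stop()
```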
## Are these changes tested?
## Are there any user-facing changes?
---
.github/workflows/python-ci.yml | 3 +
Makefile | 6 +-
dev/Dockerfile | 123 ++++++++++++++-----------------------
dev/docker-compose-integration.yml | 9 ++-
dev/entrypoint.sh | 23 -------
dev/provision.py | 56 +++++++----------
mkdocs/docs/how-to-release.md | 4 --
ruff.toml | 1 -
tests/integration/test_reads.py | 8 +++
9 files changed, 88 insertions(+), 145 deletions(-)
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
index 9f8ebf72..6a2b144f 100644
--- a/.github/workflows/python-ci.yml
+++ b/.github/workflows/python-ci.yml
@@ -75,6 +75,9 @@ jobs:
steps:
- uses: actions/checkout@v5
+ - uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python }}
- name: Install system dependencies
run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
- name: Install
diff --git a/Makefile b/Makefile
index cbcc26dd..d142c5ad 100644
--- a/Makefile
+++ b/Makefile
@@ -100,10 +100,8 @@ test-integration: test-integration-setup test-integration-exec test-integration-
test-integration-setup: ## Start Docker services for integration tests
docker compose -f dev/docker-compose-integration.yml kill
docker compose -f dev/docker-compose-integration.yml rm -f
- docker compose -f dev/docker-compose-integration.yml up -d
- sleep 10
- docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py
- docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+ docker compose -f dev/docker-compose-integration.yml up -d --wait
+ $(POETRY) run python dev/provision.py
test-integration-exec: ## Run integration tests (excluding provision)
$(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS)
diff --git a/dev/Dockerfile b/dev/Dockerfile
index 77ba1548..d0fc6a4f 100644
--- a/dev/Dockerfile
+++ b/dev/Dockerfile
@@ -13,86 +13,57 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-FROM python:3.12-bullseye
+ARG BASE_IMAGE_SPARK_VERSION=3.5.6
-RUN apt-get -qq update && \
- apt-get -qq install -y --no-install-recommends \
- sudo \
- curl \
- vim \
- unzip \
- openjdk-11-jdk \
- build-essential \
- software-properties-common \
- ssh && \
- apt-get -qq clean && \
- rm -rf /var/lib/apt/lists/*
+FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
-# Optional env variables
-ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
-ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
-ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
+# Dependency versions - keep these compatible
+ARG ICEBERG_VERSION=1.10.0
+ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
+ARG SPARK_VERSION=3.5.6
+ARG SCALA_VERSION=2.12
+ARG HADOOP_VERSION=3.3.4
+ARG AWS_SDK_VERSION=1.12.753
+ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
-RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
+USER root
WORKDIR ${SPARK_HOME}
-ENV SPARK_VERSION=3.5.6
-ENV SCALA_VERSION=2.12
-ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION}
-ENV ICEBERG_VERSION=1.10.0
-ENV PYICEBERG_VERSION=0.10.0
-ENV HADOOP_VERSION=3.3.4
-ENV AWS_SDK_VERSION=1.12.753
-
-# Try the primary Apache mirror (downloads.apache.org) first, then fall back to the archive
-RUN set -eux; \
- FILE=spark-${SPARK_VERSION}-bin-hadoop3.tgz; \
- URLS="https://downloads.apache.org/spark/spark-${SPARK_VERSION}/${FILE} https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${FILE}"; \
- for url in $URLS; do \
- echo "Attempting download: $url"; \
- if curl --retry 3 --retry-delay 5 -f -s -C - "$url" -o "$FILE"; then \
- echo "Downloaded from: $url"; \
- break; \
- else \
- echo "Failed to download from: $url"; \
- fi; \
- done; \
- if [ ! -f "$FILE" ]; then echo "Failed to download Spark from all mirrors" >&2; exit 1; fi; \
- tar xzf "$FILE" --directory /opt/spark --strip-components 1; \
- rm -rf "$FILE"
-
-# Download Spark Connect server JAR
-RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
- -Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar
-
-# Download iceberg spark runtime
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
- -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
-
-# Download AWS bundle
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
- -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
-
-# Download hadoop-aws (required for S3 support)
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
- -Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar
-
-# Download AWS SDK bundle
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \
- -Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar
-
-COPY spark-defaults.conf /opt/spark/conf
-ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
-
-RUN chmod u+x /opt/spark/sbin/* && \
- chmod u+x /opt/spark/bin/*
-
-RUN pip3 install -q ipython
-
-RUN pip3 install "pyiceberg[s3fs,hive,pyarrow]==${PYICEBERG_VERSION}"
+# Install curl for JAR downloads
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends curl && \
+ rm -rf /var/lib/apt/lists/*
-COPY entrypoint.sh .
-COPY provision.py .
+# Copy configuration (early for better caching)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
+
+# Create event log directory
+RUN mkdir -p /home/iceberg/spark-events && \
+ chown -R spark:spark /home/iceberg
+
+# Required JAR dependencies
+ENV JARS_TO_DOWNLOAD="\
+    org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
+    org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
+    org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
+    org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
+    com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar"
+
+# Download JARs with retry logic
+RUN set -e && \
+ cd "${SPARK_HOME}/jars" && \
+ for jar_path in ${JARS_TO_DOWNLOAD}; do \
+ jar_name=$(basename "${jar_path}") && \
+ echo "Downloading ${jar_name}..." && \
+ curl -fsSL --retry 3 --retry-delay 5 \
+ -o "${jar_name}" \
+ "${MAVEN_MIRROR}/${jar_path}" && \
+ echo "✓ Downloaded ${jar_name}"; \
+ done && \
+ chown -R spark:spark "${SPARK_HOME}/jars"
+
+USER spark
+WORKDIR ${SPARK_HOME}
-ENTRYPOINT ["./entrypoint.sh"]
-CMD ["notebook"]
+# Start Spark Connect server
+CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true
${SPARK_HOME}/sbin/start-connect-server.sh"]
diff --git a/dev/docker-compose-integration.yml b/dev/docker-compose-integration.yml
index ec4245bc..9e983356 100644
--- a/dev/docker-compose-integration.yml
+++ b/dev/docker-compose-integration.yml
@@ -17,7 +17,6 @@
services:
spark-iceberg:
- image: python-integration
container_name: pyiceberg-spark
build: .
networks:
@@ -37,6 +36,12 @@ services:
- rest:rest
- hive:hive
- minio:minio
+ healthcheck:
+ test: ["CMD", "sh", "-c", "netstat -an | grep 15002 | grep LISTEN"]
+ interval: 30s
+ timeout: 10s
+ retries: 5
+ start_period: 90s
rest:
image: apache/iceberg-rest-fixture
container_name: pyiceberg-rest
@@ -87,7 +92,7 @@ services:
"
hive:
build: hive/
- container_name: hive
+ container_name: pyiceberg-hive
hostname: hive
networks:
iceberg_net:
diff --git a/dev/entrypoint.sh b/dev/entrypoint.sh
deleted file mode 100755
index 3912eb4b..00000000
--- a/dev/entrypoint.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-start-connect-server.sh
-
-tail -f /dev/null
diff --git a/dev/provision.py b/dev/provision.py
index 71bbbd73..695ef9b1 100644
--- a/dev/provision.py
+++ b/dev/provision.py
@@ -14,7 +14,6 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-import math
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_date, date_add, expr
@@ -23,35 +22,26 @@ from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import FixedType, NestedField, UUIDType
-# The configuration is important, otherwise we get many small
-# parquet files with a single row. When a positional delete
-# hits the Parquet file with one row, the parquet file gets
-# dropped instead of having a merge-on-read delete file.
-spark = (
- SparkSession
- .builder
- .config("spark.sql.shuffle.partitions", "1")
- .config("spark.default.parallelism", "1")
- .getOrCreate()
-)
+# Create SparkSession against the remote Spark Connect server
+spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
catalogs = {
- 'rest': load_catalog(
+ "rest": load_catalog(
"rest",
**{
"type": "rest",
- "uri": "http://rest:8181",
- "s3.endpoint": "http://minio:9000",
+ "uri": "http://localhost:8181",
+ "s3.endpoint": "http://localhost:9000",
"s3.access-key-id": "admin",
"s3.secret-access-key": "password",
},
),
- 'hive': load_catalog(
+ "hive": load_catalog(
"hive",
**{
"type": "hive",
- "uri": "thrift://hive:9083",
- "s3.endpoint": "http://minio:9000",
+ "uri": "thrift://localhost:9083",
+ "s3.endpoint": "http://localhost:9000",
"s3.access-key-id": "admin",
"s3.secret-access-key": "password",
},
@@ -119,7 +109,7 @@ for catalog_name, catalog in catalogs.items():
# v3: Using deletion vectors
for format_version in [2, 3]:
- identifier = f'{catalog_name}.default.test_positional_mor_deletes_v{format_version}'
+ identifier = f"{catalog_name}.default.test_positional_mor_deletes_v{format_version}"
spark.sql(
f"""
CREATE OR REPLACE TABLE {identifier} (
@@ -137,10 +127,8 @@ for catalog_name, catalog in catalogs.items():
"""
)
- spark.sql(
- f"""
- INSERT INTO {identifier}
- VALUES
+ spark.sql("""
+ SELECT * FROM VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
@@ -152,9 +140,9 @@ for catalog_name, catalog in catalogs.items():
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
- (CAST('2023-03-12' AS date), 12, 'l');
- """
- )
+ (CAST('2023-03-12' AS date), 12, 'l')
+ AS t(dt, number, letter)
+ """).coalesce(1).writeTo(identifier).append()
spark.sql(f"ALTER TABLE {identifier} CREATE TAG tag_12")
@@ -164,7 +152,7 @@ for catalog_name, catalog in catalogs.items():
spark.sql(f"DELETE FROM {identifier} WHERE number = 9")
- identifier = f'{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}'
+ identifier = f"{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}"
spark.sql(
f"""
@@ -178,15 +166,13 @@ for catalog_name, catalog in catalogs.items():
'write.delete.mode'='merge-on-read',
'write.update.mode'='merge-on-read',
'write.merge.mode'='merge-on-read',
- 'format-version'='2'
+ 'format-version'='{format_version}'
);
"""
)
- spark.sql(
- f"""
- INSERT INTO {identifier}
- VALUES
+ spark.sql("""
+ SELECT * FROM VALUES
(CAST('2023-03-01' AS date), 1, 'a'),
(CAST('2023-03-02' AS date), 2, 'b'),
(CAST('2023-03-03' AS date), 3, 'c'),
@@ -198,9 +184,9 @@ for catalog_name, catalog in catalogs.items():
(CAST('2023-03-09' AS date), 9, 'i'),
(CAST('2023-03-10' AS date), 10, 'j'),
(CAST('2023-03-11' AS date), 11, 'k'),
- (CAST('2023-03-12' AS date), 12, 'l');
- """
- )
+ (CAST('2023-03-12' AS date), 12, 'l')
+ AS t(dt, number, letter)
+ """).coalesce(1).writeTo(identifier).append()
# Perform two deletes, should produce:
# v2: two positional delete files in v2
diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md
index a2fa3f70..cefc982d 100644
--- a/mkdocs/docs/how-to-release.md
+++ b/mkdocs/docs/how-to-release.md
@@ -389,10 +389,6 @@ Run the [`Release Docs` Github Action](https://github.com/apache/iceberg-python/
Make sure to create a PR to update the [GitHub issues template](https://github.com/apache/iceberg-python/blob/main/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml) with the latest version.
-### Update the integration tests
-
-Ensure to update the `PYICEBERG_VERSION` in the [Dockerfile](https://github.com/apache/iceberg-python/blob/main/dev/Dockerfile).
-
## Misc
### Set up GPG key and Upload to Apache Iceberg KEYS file
diff --git a/ruff.toml b/ruff.toml
index 11fd2a95..b7bc461c 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -16,7 +16,6 @@
# under the License.
src = ['pyiceberg','tests']
-extend-exclude = ["dev/provision.py"]
# Exclude a variety of commonly ignored directories.
exclude = [
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 375eb35b..99116ad1 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -432,6 +432,8 @@ def test_pyarrow_deletes(catalog: Catalog, format_version: int) -> None:
# (11, 'k'),
# (12, 'l')
test_positional_mor_deletes = catalog.load_table(f"default.test_positional_mor_deletes_v{format_version}")
+ if format_version == 2:
+ assert len(test_positional_mor_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
arrow_table = test_positional_mor_deletes.scan().to_arrow()
assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10,
11, 12]
@@ -470,6 +472,8 @@ def test_pyarrow_deletes_double(catalog: Catalog, format_version: int) -> None:
# (11, 'k'),
# (12, 'l')
test_positional_mor_double_deletes = catalog.load_table(f"default.test_positional_mor_double_deletes_v{format_version}")
+ if format_version == 2:
+ assert len(test_positional_mor_double_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
arrow_table = test_positional_mor_double_deletes.scan().to_arrow()
assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11,
12]
@@ -508,6 +512,8 @@ def test_pyarrow_batches_deletes(catalog: Catalog, format_version: int) -> None:
# (11, 'k'),
# (12, 'l')
test_positional_mor_deletes = catalog.load_table(f"default.test_positional_mor_deletes_v{format_version}")
+ if format_version == 2:
+ assert len(test_positional_mor_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
arrow_table = test_positional_mor_deletes.scan().to_arrow_batch_reader().read_all()
assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]
@@ -550,6 +556,8 @@ def test_pyarrow_batches_deletes_double(catalog: Catalog, format_version: int) -
# (11, 'k'),
# (12, 'l')
test_positional_mor_double_deletes = catalog.load_table(f"default.test_positional_mor_double_deletes_v{format_version}")
+ if format_version == 2:
+ assert len(test_positional_mor_double_deletes.inspect.delete_files()) > 0, "Table should produce position delete files"
arrow_table = test_positional_mor_double_deletes.scan().to_arrow_batch_reader().read_all()
assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11,
12]