(iceberg-python) branch main updated: Add papermill-based tests for PyIceberg examples (#3330)

fokko Fri, 15 May 2026 01:14:12 -0700

This is an automated email from the ASF dual-hosted git repository.

Fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new 95c45d44 Add papermill-based tests for PyIceberg examples (#3330)
95c45d44 is described below

commit 95c45d44094cce592452dee9b81492673f1fe548
Author: Federico <[email protected]>
AuthorDate: Fri May 15 10:13:54 2026 +0200

    Add papermill-based tests for PyIceberg examples (#3330)
    
    Closes #3328
    
    # Rationale for this change
    `pyiceberg_example.ipynb` and `spark_integration_example.ipynb` had no
    automated test coverage. Breaking changes to notebook cells could go
    undetected in CI.
    
    This PR adds papermill-based tests that execute the
    real notebooks as-is, so any change to a cell is automatically reflected
    in the tests.
    
    
    ## Are these changes tested?
    Yes. The tests themselves are the change. Run them with:
    
    ```bash
    make test-notebook
    ```
    
    ## Are there any user-facing changes?
    No.
---
 Makefile                                          |   5 +-
 pyproject.toml                                    |   4 +
 tests/notebooks/test_pyiceberg_example.py         | 101 +++++++++++++
 tests/notebooks/test_spark_integration_example.py | 170 ++++++++++++++++++++++
 uv.lock                                           |  35 +++++
 5 files changed, 314 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index d262de45..4fe76119 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 .PHONY: help install install-uv check-license lint \
         test test-integration test-integration-setup test-integration-exec 
test-integration-cleanup test-integration-rebuild \
-        test-s3 test-adls test-gcs test-coverage coverage-report \
+        test-s3 test-adls test-gcs test-coverage coverage-report test 
test-notebook\
         docs-serve docs-build notebook notebook-infra \
         clean
 
@@ -150,6 +150,9 @@ coverage-report: ## Combine and report coverage
        uv run $(PYTHON_ARG) coverage html
        uv run $(PYTHON_ARG) coverage xml
 
+test-notebook: ## Run notebook tests (pyiceberg_example and 
spark_integration_example) via papermill
+       $(TEST_RUNNER) pytest tests/notebooks/test_pyiceberg_example.py 
tests/notebooks/test_spark_integration_example.py -m notebook $(PYTEST_ARGS)
+
 # ================
 # Documentation
 # ================
diff --git a/pyproject.toml b/pyproject.toml
index ac1177db..96118f84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,9 @@ dev = [
     "google-cloud-bigquery>=3.33.0,<4",
     "pyarrow-stubs>=20.0.0.20251107", # Remove when pyarrow >= 23.0.0 
https://github.com/apache/arrow/pull/47609
     "sqlalchemy>=2.0.18,<3",
+    "papermill>=2.6.0",
+    "nbformat>=5.10.0",
+    "ipykernel>=6.29.0",
 ]
 # for mkdocs
 docs = [
@@ -161,6 +164,7 @@ markers = [
   "integration: marks integration tests against Apache Spark",
   "gcs: marks a test as requiring access to gcs compliant storage (use with 
--gs.token, --gs.project, and --gs.endpoint)",
   "benchmark: collection of tests to validate read/write performance before 
and after a change",
+  "notebook: marks tests that execute Jupyter notebooks via papermill",
 ]
 
 # Turns a warning into an error
diff --git a/tests/notebooks/test_pyiceberg_example.py 
b/tests/notebooks/test_pyiceberg_example.py
new file mode 100644
index 00000000..eea5b499
--- /dev/null
+++ b/tests/notebooks/test_pyiceberg_example.py
@@ -0,0 +1,101 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import pytest
+
+pytestmark = pytest.mark.notebook
+
+NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / 
"pyiceberg_example.ipynb"
+
+
+def get_all_stdout(nb: nbformat.NotebookNode) -> str:
+    """Concatenate all stdout streams from every executed cell."""
+    return "".join(
+        out.get("text", "")
+        for cell in nb.cells
+        for out in cell.get("outputs", [])
+        if out.get("output_type") == "stream" and out.get("name") == "stdout"
+    )
+
+
+@pytest.fixture(scope="session")
+def pyiceberg_nb(tmp_path_factory: pytest.TempPathFactory) -> 
nbformat.NotebookNode:
+    out = tmp_path_factory.mktemp("nb_out") / "pyiceberg_example_out.ipynb"
+    return pm.execute_notebook(str(NOTEBOOK_PATH), str(out), 
kernel_name="python3")
+
+
+class TestSmoke:
+    def test_notebook_completes_without_error(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        """papermill raises PapermillExecutionError if any cell fails."""
+        assert pyiceberg_nb is not None
+
+    def test_all_code_cells_executed(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        for cell in pyiceberg_nb.cells:
+            if cell.cell_type == "code":
+                assert cell.get("execution_count") is not None, f"Cell not 
executed:\n{cell.source[:80]}"
+
+
+class TestCellOutputs:
+    def test_pyiceberg_version_printed(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        assert "PyIceberg version:" in get_all_stdout(pyiceberg_nb)
+
+    def test_warehouse_location_printed(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert "Warehouse location:" in stdout
+        assert "iceberg_warehouse_" in stdout
+
+    def test_catalog_loaded_successfully(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        assert "Catalog loaded successfully!" in get_all_stdout(pyiceberg_nb)
+
+    def test_namespace_default_created(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        assert "default" in get_all_stdout(pyiceberg_nb)
+
+    def test_rows_written_is_five(self, pyiceberg_nb: nbformat.NotebookNode) 
-> None:
+        assert "Rows written: 5" in get_all_stdout(pyiceberg_nb)
+
+    def test_schema_evolved_message(self, pyiceberg_nb: nbformat.NotebookNode) 
-> None:
+        assert "Schema evolved!" in get_all_stdout(pyiceberg_nb)
+
+    def test_tip_per_mile_column_present_after_evolution(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        assert "tip_per_mile" in get_all_stdout(pyiceberg_nb)
+
+    def test_filter_result_is_positive(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        """The notebook prints 'Rows with tip_per_mile > 1.0: N' — N must be > 
0."""
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert "Rows with tip_per_mile > 1.0:" in stdout
+        for line in stdout.splitlines():
+            if "Rows with tip_per_mile > 1.0:" in line:
+                count = int(line.split(":")[-1].strip())
+                assert count > 0
+                break
+
+    def test_snapshot_id_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> 
None:
+        assert "Current snapshot ID:" in get_all_stdout(pyiceberg_nb)
+
+    def test_table_history_has_entries(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert "Table history:" in stdout
+        assert "Snapshot:" in stdout
+
+    def test_warehouse_contains_parquet_and_metadata_files(self, pyiceberg_nb: 
nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert ".parquet" in stdout
+        assert ".metadata.json" in stdout
diff --git a/tests/notebooks/test_spark_integration_example.py 
b/tests/notebooks/test_spark_integration_example.py
new file mode 100644
index 00000000..e242e431
--- /dev/null
+++ b/tests/notebooks/test_spark_integration_example.py
@@ -0,0 +1,170 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+import textwrap
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import pytest
+
+pytestmark = pytest.mark.notebook
+
+NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / 
"spark_integration_example.ipynb"
+
+# ---------------------------------------------------------------------------
+# Mock pyspark
+# Replaces pyspark.sql.SparkSession with a fake one
+# ---------------------------------------------------------------------------
+_MOCK_PYSPARK = textwrap.dedent("""\
+    import sys
+    import types
+    from unittest.mock import MagicMock
+
+    def _make_fake_pyspark():
+        pyspark_mod = types.ModuleType("pyspark")
+        sql_mod     = types.ModuleType("pyspark.sql")
+        pyspark_mod.sql = sql_mod
+        sys.modules.setdefault("pyspark",     pyspark_mod)
+        sys.modules.setdefault("pyspark.sql", sql_mod)
+        return pyspark_mod, sql_mod
+
+    _pyspark, _sql = _make_fake_pyspark()
+
+    _SHOW_CATALOGS = (
+        "+-------------+\\n"
+        "|catalogName  |\\n"
+        "+-------------+\\n"
+        "|spark_catalog|\\n"
+        "|local        |\\n"
+        "+-------------+\\n"
+    )
+    _SHOW_NAMESPACES = (
+        "+---------+\\n"
+        "|namespace|\\n"
+        "+---------+\\n"
+        "|default  |\\n"
+        "+---------+\\n"
+    )
+    _SHOW_TABLES = (
+        "+---------+-----------+-----------+\\n"
+        "|namespace|tableName  |isTemporary|\\n"
+        "+---------+-----------+-----------+\\n"
+        "|default  |test_all   |false      |\\n"
+        "+---------+-----------+-----------+\\n"
+    )
+    _DESCRIBE_TABLE = (
+        "+--------------------+---------+-------+\\n"
+        "|col_name            |data_type|comment|\\n"
+        "+--------------------+---------+-------+\\n"
+        "|boolean_col         |boolean  |null   |\\n"
+        "|integer_col         |integer  |null   |\\n"
+        "+--------------------+---------+-------+\\n"
+    )
+    _SQL_RESPONSES = {
+        "SHOW CATALOGS":                        _SHOW_CATALOGS,
+        "SHOW NAMESPACES":                       _SHOW_NAMESPACES,
+        "SHOW TABLES FROM default":              _SHOW_TABLES,
+        "DESCRIBE TABLE default.test_all_types": _DESCRIBE_TABLE,
+    }
+
+    def _make_df(output):
+        df = MagicMock()
+        df.show.side_effect = lambda *a, **kw: print(output, end="")
+        return df
+
+    class _FakeBuilder:
+        def remote(self, url): return self
+        def getOrCreate(self): return _FakeSession()
+
+    class _FakeSession:
+        builder = _FakeBuilder()
+        def sql(self, query):
+            key = query.strip().rstrip(";")
+            output = _SQL_RESPONSES.get(key, "+------+\\n| col  
|\\n+------+\\n| val  |\\n+------+\\n")
+            return _make_df(output)
+
+    _FakeSparkSession = MagicMock(spec=object)
+    _FakeSparkSession.builder = _FakeBuilder()
+    _sql.SparkSession = _FakeSparkSession
+""")
+
+
+def get_all_stdout(nb: nbformat.NotebookNode) -> str:
+    """Concatenate all stdout streams from every executed cell."""
+    return "".join(
+        out.get("text", "")
+        for cell in nb.cells
+        for out in cell.get("outputs", [])
+        if out.get("output_type") == "stream" and out.get("name") == "stdout"
+    )
+
+
+def _inject_mock_and_execute(notebook_path: Path, output_path: Path) -> 
nbformat.NotebookNode:
+    """
+    Load the real notebook, prepend the mock-pyspark setup cell, write to a
+    temporary copy and execute it with papermill.
+    """
+    nb = nbformat.read(str(notebook_path), as_version=4)
+
+    mock_cell = nbformat.v4.new_code_cell(_MOCK_PYSPARK)
+    mock_cell.metadata["tags"] = ["injected-mock"]
+    nb.cells.insert(0, mock_cell)
+
+    patched_path = output_path.parent / "spark_patched.ipynb"
+    nbformat.write(nb, str(patched_path))
+
+    return pm.execute_notebook(str(patched_path), str(output_path), 
kernel_name="python3")
+
+
+@pytest.fixture(scope="session")
+def spark_nb(tmp_path_factory: pytest.TempPathFactory) -> 
nbformat.NotebookNode:
+    out = tmp_path_factory.mktemp("nb_out") / 
"spark_integration_example_out.ipynb"
+    return _inject_mock_and_execute(NOTEBOOK_PATH, out)
+
+
+class TestSmoke:
+    def test_notebook_completes_without_error(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        assert spark_nb is not None
+
+    def test_all_code_cells_executed(self, spark_nb: nbformat.NotebookNode) -> 
None:
+        for cell in spark_nb.cells:
+            if cell.cell_type == "code":
+                assert cell.get("execution_count") is not None, f"Cell not 
executed:\n{cell.source[:80]}"
+
+
+class TestCellOutputs:
+    def test_show_catalogs_lists_spark_catalog_and_local(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(spark_nb)
+        assert "spark_catalog" in stdout
+        assert "local" in stdout
+
+    def test_show_namespaces_contains_default(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        assert "default" in get_all_stdout(spark_nb)
+
+    def test_show_tables_produces_tabular_output(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        assert "+---------+-----------+-----------+" in 
get_all_stdout(spark_nb)
+
+    def test_describe_table_lists_column_names(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        assert "col_name" in get_all_stdout(spark_nb)
+
+    def test_describe_table_lists_data_types(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(spark_nb)
+        assert "boolean" in stdout or "integer" in stdout
+
+    def test_show_tables_includes_test_table_row(self, spark_nb: 
nbformat.NotebookNode) -> None:
+        assert "test_all" in get_all_stdout(spark_nb)
diff --git a/uv.lock b/uv.lock
index 27a669a6..2f67510b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1372,6 +1372,15 @@ wheels = [
     { url = 
"https://files.pythonhosted.org/packages/7a/e3/9d34173ec068631faea3ea6e73050700729363e7e33306a9a3218e5cdc61/duckdb-1.5.2-cp314-cp314-win_arm64.whl",
 hash = 
"sha256:c9f3e0b71b8a50fccfb42794899285d9d318ce2503782b9dd54868e5ecd0ad31", size 
= 14402513, upload-time = "2026-04-13T11:30:06.609Z" },
 ]
 
+[[package]]
+name = "entrypoints"
+version = "0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = 
"https://files.pythonhosted.org/packages/ea/8d/a7121ffe5f402dc015277d2d31eb82d2187334503a011c18f2e78ecbb9b2/entrypoints-0.4.tar.gz",
 hash = 
"sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4", size 
= 13974, upload-time = "2022-02-02T21:30:28.172Z" }
+wheels = [
+    { url = 
"https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl",
 hash = 
"sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f", size 
= 5294, upload-time = "2022-02-02T21:30:26.024Z" },
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.3.0"
@@ -3896,6 +3905,26 @@ wheels = [
     { url = 
"https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl",
 hash = 
"sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size 
= 8663, upload-time = "2024-01-18T20:08:11.28Z" },
 ]
 
+[[package]]
+name = "papermill"
+version = "2.7.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohttp", marker = "python_full_version == '3.12.*'" },
+    { name = "click" },
+    { name = "entrypoints" },
+    { name = "nbclient" },
+    { name = "nbformat" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tenacity" },
+    { name = "tqdm" },
+]
+sdist = { url = 
"https://files.pythonhosted.org/packages/f8/b6/92d770c5ced66ed0134256f8de781e98c824d3a0662af1643a91fcc36663/papermill-2.7.0.tar.gz",
 hash = 
"sha256:ec10b37594a060662f57269e1ebd108c209d204450f00fdfeb70a1c7cfb7fbc8", size 
= 77961, upload-time = "2026-02-27T19:07:30.548Z" }
+wheels = [
+    { url = 
"https://files.pythonhosted.org/packages/95/9f/f9fd57a727dcc89c54e84455d8317bff7db05ef21bb6d05b03705111f7c0/papermill-2.7.0-py3-none-any.whl",
 hash = 
"sha256:e1855e6670100a02bb4f8a6870484a5c10b84a8d2e49c49921c90209940c7514", size 
= 88858, upload-time = "2026-02-27T19:07:28.862Z" },
+]
+
 [[package]]
 name = "parso"
 version = "0.8.5"
@@ -4681,9 +4710,12 @@ dev = [
     { name = "docutils" },
     { name = "fastavro" },
     { name = "google-cloud-bigquery" },
+    { name = "ipykernel" },
     { name = "moto", extra = ["server"] },
     { name = "mypy-boto3-dynamodb" },
     { name = "mypy-boto3-glue" },
+    { name = "nbformat" },
+    { name = "papermill" },
     { name = "prek" },
     { name = "protobuf" },
     { name = "pyarrow-stubs" },
@@ -4771,9 +4803,12 @@ dev = [
     { name = "docutils", specifier = "!=0.21.post1" },
     { name = "fastavro", specifier = "==1.12.2" },
     { name = "google-cloud-bigquery", specifier = ">=3.33.0,<4" },
+    { name = "ipykernel", specifier = ">=6.29.0" },
     { name = "moto", extras = ["server"], specifier = ">=5.0.2,<6" },
     { name = "mypy-boto3-dynamodb", specifier = ">=1.28.18" },
     { name = "mypy-boto3-glue", specifier = ">=1.28.18" },
+    { name = "nbformat", specifier = ">=5.10.0" },
+    { name = "papermill", specifier = ">=2.6.0" },
     { name = "prek", specifier = ">=0.2.1,<0.4" },
     { name = "protobuf", specifier = "==6.33.5" },
     { name = "pyarrow-stubs", specifier = ">=20.0.0.20251107" },

(iceberg-python) branch main updated: Add papermill-based tests for PyIceberg examples (#3330)

Reply via email to