This is an automated email from the ASF dual-hosted git repository.
Fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 95c45d44 Add papermill-based tests for PyIceberg examples (#3330)
95c45d44 is described below
commit 95c45d44094cce592452dee9b81492673f1fe548
Author: Federico <[email protected]>
AuthorDate: Fri May 15 10:13:54 2026 +0200
Add papermill-based tests for PyIceberg examples (#3330)
Closes #3328
# Rationale for this change
`pyiceberg_example.ipynb` and `spark_integration_example.ipynb` had no
automated test coverage. Breaking changes to notebook cells could go
undetected in CI.
This PR adds papermill-based tests that execute the
real notebooks as-is, so any change to a cell is automatically reflected
in the tests.
## Are these changes tested?
Yes. The tests themselves are the change. Run them with:
```bash
make test-notebook
```
## Are there any user-facing changes?
No.
---
Makefile | 5 +-
pyproject.toml | 4 +
tests/notebooks/test_pyiceberg_example.py | 101 +++++++++++++
tests/notebooks/test_spark_integration_example.py | 170 ++++++++++++++++++++++
uv.lock | 35 +++++
5 files changed, 314 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index d262de45..4fe76119 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@
# under the License.
.PHONY: help install install-uv check-license lint \
test test-integration test-integration-setup test-integration-exec
test-integration-cleanup test-integration-rebuild \
- test-s3 test-adls test-gcs test-coverage coverage-report \
+	test-s3 test-adls test-gcs test-coverage coverage-report test-notebook \
docs-serve docs-build notebook notebook-infra \
clean
@@ -150,6 +150,9 @@ coverage-report: ## Combine and report coverage
uv run $(PYTHON_ARG) coverage html
uv run $(PYTHON_ARG) coverage xml
+test-notebook: ## Run notebook tests (pyiceberg_example and spark_integration_example) via papermill
+	$(TEST_RUNNER) pytest tests/notebooks/test_pyiceberg_example.py tests/notebooks/test_spark_integration_example.py -m notebook $(PYTEST_ARGS)
+
# ================
# Documentation
# ================
diff --git a/pyproject.toml b/pyproject.toml
index ac1177db..96118f84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,9 @@ dev = [
"google-cloud-bigquery>=3.33.0,<4",
"pyarrow-stubs>=20.0.0.20251107", # Remove when pyarrow >= 23.0.0
https://github.com/apache/arrow/pull/47609
"sqlalchemy>=2.0.18,<3",
+ "papermill>=2.6.0",
+ "nbformat>=5.10.0",
+ "ipykernel>=6.29.0",
]
# for mkdocs
docs = [
@@ -161,6 +164,7 @@ markers = [
"integration: marks integration tests against Apache Spark",
"gcs: marks a test as requiring access to gcs compliant storage (use with
--gs.token, --gs.project, and --gs.endpoint)",
"benchmark: collection of tests to validate read/write performance before
and after a change",
+ "notebook: marks tests that execute Jupyter notebooks via papermill",
]
# Turns a warning into an error
diff --git a/tests/notebooks/test_pyiceberg_example.py
b/tests/notebooks/test_pyiceberg_example.py
new file mode 100644
index 00000000..eea5b499
--- /dev/null
+++ b/tests/notebooks/test_pyiceberg_example.py
@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import pytest
+
+pytestmark = pytest.mark.notebook
+
+NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "pyiceberg_example.ipynb"  # two levels above this file's directory
+
+
+def get_all_stdout(nb: nbformat.NotebookNode) -> str:
+    """Join the text of every ``stdout`` stream output, in cell order."""
+    outputs = (output for cell in nb.cells for output in cell.get("outputs", []))
+    return "".join(
+        output.get("text", "")
+        for output in outputs
+        if (output.get("output_type"), output.get("name")) == ("stream", "stdout")
+    )
+
+
+@pytest.fixture(scope="session")  # session scope: the notebook is executed once and shared by all tests
+def pyiceberg_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode:
+    out = tmp_path_factory.mktemp("nb_out") / "pyiceberg_example_out.ipynb"  # executed copy goes to a temp dir
+    return pm.execute_notebook(str(NOTEBOOK_PATH), str(out), kernel_name="python3")
+
+
+class TestSmoke:  # coarse checks: the notebook ran and every code cell was executed
+    def test_notebook_completes_without_error(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        """papermill raises PapermillExecutionError if any cell fails."""
+        assert pyiceberg_nb is not None
+
+    def test_all_code_cells_executed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        for cell in pyiceberg_nb.cells:
+            if cell.cell_type == "code":
+                assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}"
+
+
+class TestCellOutputs:  # each test looks for an expected marker in the notebook's combined stdout
+    def test_pyiceberg_version_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "PyIceberg version:" in get_all_stdout(pyiceberg_nb)
+
+    def test_warehouse_location_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert "Warehouse location:" in stdout
+        assert "iceberg_warehouse_" in stdout
+
+    def test_catalog_loaded_successfully(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "Catalog loaded successfully!" in get_all_stdout(pyiceberg_nb)
+
+    def test_namespace_default_created(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "default" in get_all_stdout(pyiceberg_nb)
+
+    def test_rows_written_is_five(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "Rows written: 5" in get_all_stdout(pyiceberg_nb)
+
+    def test_schema_evolved_message(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "Schema evolved!" in get_all_stdout(pyiceberg_nb)
+
+    def test_tip_per_mile_column_present_after_evolution(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "tip_per_mile" in get_all_stdout(pyiceberg_nb)
+
+    def test_filter_result_is_positive(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        """The notebook prints 'Rows with tip_per_mile > 1.0: N' — N must be > 0."""
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert "Rows with tip_per_mile > 1.0:" in stdout
+        for line in stdout.splitlines():
+            if "Rows with tip_per_mile > 1.0:" in line:
+                count = int(line.split(":")[-1].strip())
+                assert count > 0
+                break
+
+    def test_snapshot_id_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        assert "Current snapshot ID:" in get_all_stdout(pyiceberg_nb)
+
+    def test_table_history_has_entries(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert "Table history:" in stdout
+        assert "Snapshot:" in stdout
+
+    def test_warehouse_contains_parquet_and_metadata_files(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(pyiceberg_nb)
+        assert ".parquet" in stdout
+        assert ".metadata.json" in stdout
diff --git a/tests/notebooks/test_spark_integration_example.py
b/tests/notebooks/test_spark_integration_example.py
new file mode 100644
index 00000000..e242e431
--- /dev/null
+++ b/tests/notebooks/test_spark_integration_example.py
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import textwrap
+from pathlib import Path
+
+import nbformat
+import papermill as pm
+import pytest
+
+pytestmark = pytest.mark.notebook
+
+NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "spark_integration_example.ipynb"  # two levels above this file's directory
+
+# ---------------------------------------------------------------------------
+# Mock pyspark
+# Cell source that registers fake pyspark modules exposing a canned-output SparkSession
+# ---------------------------------------------------------------------------
+_MOCK_PYSPARK = textwrap.dedent("""\
+    import sys
+    import types
+    from unittest.mock import MagicMock
+
+    def _make_fake_pyspark():
+        pyspark_mod = types.ModuleType("pyspark")
+        sql_mod = types.ModuleType("pyspark.sql")
+        pyspark_mod.sql = sql_mod
+        sys.modules.setdefault("pyspark", pyspark_mod)
+        sys.modules.setdefault("pyspark.sql", sql_mod)
+        return pyspark_mod, sql_mod
+
+    _pyspark, _sql = _make_fake_pyspark()
+
+    _SHOW_CATALOGS = (
+        "+-------------+\\n"
+        "|catalogName |\\n"
+        "+-------------+\\n"
+        "|spark_catalog|\\n"
+        "|local |\\n"
+        "+-------------+\\n"
+    )
+    _SHOW_NAMESPACES = (
+        "+---------+\\n"
+        "|namespace|\\n"
+        "+---------+\\n"
+        "|default |\\n"
+        "+---------+\\n"
+    )
+    _SHOW_TABLES = (
+        "+---------+-----------+-----------+\\n"
+        "|namespace|tableName |isTemporary|\\n"
+        "+---------+-----------+-----------+\\n"
+        "|default |test_all |false |\\n"
+        "+---------+-----------+-----------+\\n"
+    )
+    _DESCRIBE_TABLE = (
+        "+--------------------+---------+-------+\\n"
+        "|col_name |data_type|comment|\\n"
+        "+--------------------+---------+-------+\\n"
+        "|boolean_col |boolean |null |\\n"
+        "|integer_col |integer |null |\\n"
+        "+--------------------+---------+-------+\\n"
+    )
+    _SQL_RESPONSES = {
+        "SHOW CATALOGS": _SHOW_CATALOGS,
+        "SHOW NAMESPACES": _SHOW_NAMESPACES,
+        "SHOW TABLES FROM default": _SHOW_TABLES,
+        "DESCRIBE TABLE default.test_all_types": _DESCRIBE_TABLE,
+    }
+
+    def _make_df(output):
+        df = MagicMock()
+        df.show.side_effect = lambda *a, **kw: print(output, end="")
+        return df
+
+    class _FakeBuilder:
+        def remote(self, url): return self
+        def getOrCreate(self): return _FakeSession()
+
+    class _FakeSession:
+        builder = _FakeBuilder()
+        def sql(self, query):
+            key = query.strip().rstrip(";")
+            output = _SQL_RESPONSES.get(key, "+------+\\n| col |\\n+------+\\n| val |\\n+------+\\n")
+            return _make_df(output)
+
+    _FakeSparkSession = MagicMock(spec=object)
+    _FakeSparkSession.builder = _FakeBuilder()
+    _sql.SparkSession = _FakeSparkSession
+""")
+
+
+def get_all_stdout(nb: nbformat.NotebookNode) -> str:
+    """Join the text of every ``stdout`` stream output, in cell order."""
+    outputs = (output for cell in nb.cells for output in cell.get("outputs", []))
+    return "".join(
+        output.get("text", "")
+        for output in outputs
+        if (output.get("output_type"), output.get("name")) == ("stream", "stdout")
+    )
+
+
+def _inject_mock_and_execute(notebook_path: Path, output_path: Path) -> nbformat.NotebookNode:
+    """
+    Read the notebook at ``notebook_path``, insert the mock-pyspark cell first, save the
+    patched copy next to ``output_path`` and return the notebook papermill executed.
+    """
+    nb = nbformat.read(str(notebook_path), as_version=4)
+
+    mock_cell = nbformat.v4.new_code_cell(_MOCK_PYSPARK)
+    mock_cell.metadata["tags"] = ["injected-mock"]
+    nb.cells.insert(0, mock_cell)
+
+    patched_path = output_path.parent / "spark_patched.ipynb"
+    nbformat.write(nb, str(patched_path))
+
+    return pm.execute_notebook(str(patched_path), str(output_path), kernel_name="python3")
+
+
+@pytest.fixture(scope="session")  # session scope: the patched notebook is executed once and shared
+def spark_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode:
+    out = tmp_path_factory.mktemp("nb_out") / "spark_integration_example_out.ipynb"
+    return _inject_mock_and_execute(NOTEBOOK_PATH, out)
+
+
+class TestSmoke:  # coarse checks: the notebook ran and every code cell was executed
+    def test_notebook_completes_without_error(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert spark_nb is not None
+
+    def test_all_code_cells_executed(self, spark_nb: nbformat.NotebookNode) -> None:
+        for cell in spark_nb.cells:
+            if cell.cell_type == "code":
+                assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}"
+
+
+class TestCellOutputs:  # each test looks for an expected marker in the notebook's combined stdout
+    def test_show_catalogs_lists_spark_catalog_and_local(self, spark_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(spark_nb)
+        assert "spark_catalog" in stdout
+        assert "local" in stdout
+
+    def test_show_namespaces_contains_default(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "default" in get_all_stdout(spark_nb)
+
+    def test_show_tables_produces_tabular_output(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "+---------+-----------+-----------+" in get_all_stdout(spark_nb)
+
+    def test_describe_table_lists_column_names(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "col_name" in get_all_stdout(spark_nb)
+
+    def test_describe_table_lists_data_types(self, spark_nb: nbformat.NotebookNode) -> None:
+        stdout = get_all_stdout(spark_nb)
+        assert "boolean" in stdout or "integer" in stdout
+
+    def test_show_tables_includes_test_table_row(self, spark_nb: nbformat.NotebookNode) -> None:
+        assert "test_all" in get_all_stdout(spark_nb)
diff --git a/uv.lock b/uv.lock
index 27a669a6..2f67510b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1372,6 +1372,15 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/7a/e3/9d34173ec068631faea3ea6e73050700729363e7e33306a9a3218e5cdc61/duckdb-1.5.2-cp314-cp314-win_arm64.whl",
hash =
"sha256:c9f3e0b71b8a50fccfb42794899285d9d318ce2503782b9dd54868e5ecd0ad31", size
= 14402513, upload-time = "2026-04-13T11:30:06.609Z" },
]
+[[package]]
+name = "entrypoints"
+version = "0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url =
"https://files.pythonhosted.org/packages/ea/8d/a7121ffe5f402dc015277d2d31eb82d2187334503a011c18f2e78ecbb9b2/entrypoints-0.4.tar.gz",
hash =
"sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4", size
= 13974, upload-time = "2022-02-02T21:30:28.172Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl",
hash =
"sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f", size
= 5294, upload-time = "2022-02-02T21:30:26.024Z" },
+]
+
[[package]]
name = "exceptiongroup"
version = "1.3.0"
@@ -3896,6 +3905,26 @@ wheels = [
{ url =
"https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl",
hash =
"sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size
= 8663, upload-time = "2024-01-18T20:08:11.28Z" },
]
+[[package]]
+name = "papermill"
+version = "2.7.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "aiohttp", marker = "python_full_version == '3.12.*'" },
+ { name = "click" },
+ { name = "entrypoints" },
+ { name = "nbclient" },
+ { name = "nbformat" },
+ { name = "pyyaml" },
+ { name = "requests" },
+ { name = "tenacity" },
+ { name = "tqdm" },
+]
+sdist = { url =
"https://files.pythonhosted.org/packages/f8/b6/92d770c5ced66ed0134256f8de781e98c824d3a0662af1643a91fcc36663/papermill-2.7.0.tar.gz",
hash =
"sha256:ec10b37594a060662f57269e1ebd108c209d204450f00fdfeb70a1c7cfb7fbc8", size
= 77961, upload-time = "2026-02-27T19:07:30.548Z" }
+wheels = [
+ { url =
"https://files.pythonhosted.org/packages/95/9f/f9fd57a727dcc89c54e84455d8317bff7db05ef21bb6d05b03705111f7c0/papermill-2.7.0-py3-none-any.whl",
hash =
"sha256:e1855e6670100a02bb4f8a6870484a5c10b84a8d2e49c49921c90209940c7514", size
= 88858, upload-time = "2026-02-27T19:07:28.862Z" },
+]
+
[[package]]
name = "parso"
version = "0.8.5"
@@ -4681,9 +4710,12 @@ dev = [
{ name = "docutils" },
{ name = "fastavro" },
{ name = "google-cloud-bigquery" },
+ { name = "ipykernel" },
{ name = "moto", extra = ["server"] },
{ name = "mypy-boto3-dynamodb" },
{ name = "mypy-boto3-glue" },
+ { name = "nbformat" },
+ { name = "papermill" },
{ name = "prek" },
{ name = "protobuf" },
{ name = "pyarrow-stubs" },
@@ -4771,9 +4803,12 @@ dev = [
{ name = "docutils", specifier = "!=0.21.post1" },
{ name = "fastavro", specifier = "==1.12.2" },
{ name = "google-cloud-bigquery", specifier = ">=3.33.0,<4" },
+ { name = "ipykernel", specifier = ">=6.29.0" },
{ name = "moto", extras = ["server"], specifier = ">=5.0.2,<6" },
{ name = "mypy-boto3-dynamodb", specifier = ">=1.28.18" },
{ name = "mypy-boto3-glue", specifier = ">=1.28.18" },
+ { name = "nbformat", specifier = ">=5.10.0" },
+ { name = "papermill", specifier = ">=2.6.0" },
{ name = "prek", specifier = ">=0.2.1,<0.4" },
{ name = "protobuf", specifier = "==6.33.5" },
{ name = "pyarrow-stubs", specifier = ">=20.0.0.20251107" },