This is an automated email from the ASF dual-hosted git repository.
kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 1b69a253 fix: Validate SetStatisticsUpdate correctly (#2866)
1b69a253 is described below
commit 1b69a2536640b3f8178fa6ac28d05123458a3dd8
Author: Ragnar Dahlén <[email protected]>
AuthorDate: Tue Dec 30 20:43:40 2025 +0100
fix: Validate SetStatisticsUpdate correctly (#2866)
Closes #2865
Previously the pydantic `@model_validator` on `SetStatisticsUpdate`
would fail because it assumed statistics was a model instance. In a
"before"" validator that is not necessarily case.
Check type and handle both models and dicts instead.
### Before
```
>>> import pyiceberg.table.update
>>>
pyiceberg.table.update.SetStatisticsUpdate.model_validate({'statistics':
{'snapshot-id': 1234, 'file-size-in-bytes': 0, 'statistics-path': '',
'file-footer-size-in-bytes': 0, 'blob-metadata': []}})
Traceback (most recent call last):
File "<python-input-1>", line 1, in <module>
pyiceberg.table.update.SetStatisticsUpdate.model_validate({'statistics':
{'snapshot-id': 1234, 'file-size-in-bytes': 0, 'statistics-path': '',
'file-footer-size-in-bytes': 0, 'blob-metadata': []}})
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/ragge/projects/github.com/ragnard/iceberg-python/.venv/lib/python3.14/site-packages/pydantic/main.py",
line 716, in model_validate
return cls.__pydantic_validator__.validate_python(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
obj,
^^^^
...<5 lines>...
by_name=by_name,
^^^^^^^^^^^^^^^^
)
^
File
"/home/ragge/projects/github.com/ragnard/iceberg-python/pyiceberg/table/update/__init__.py",
line 191, in validate_snapshot_id
data["snapshot_id"] = stats.snapshot_id
^^^^^^^^^^^^^^^^^
AttributeError: 'dict' object has no attribute 'snapshot_id'
```
### After
```
>>> import pyiceberg.table.update
>>>
pyiceberg.table.update.SetStatisticsUpdate.model_validate({'statistics':
{'snapshot-id': 1234, 'file-size-in-bytes': 0, 'statistics-path': '',
'file-footer-size-in-bytes': 0, 'blob-metadata': []}})
SetStatisticsUpdate(action='set-statistics',
statistics=StatisticsFile(snapshot_id=1234, statistics_path='',
file_size_in_bytes=0, file_footer_size_in_bytes=0, key_metadata=None,
blob_metadata=[]), snapshot_id=1234)
```
# Rationale for this change
## Are these changes tested?
Yes, but only using the two-liners above.
## Are there any user-facing changes?
No.
<!-- In the case of user-facing changes, please add the changelog label. -->
---
pyiceberg/table/update/__init__.py | 12 +++++--
tests/table/test_init.py | 66 +++++++++++++++++++++++++++++++++++++-
2 files changed, 74 insertions(+), 4 deletions(-)
diff --git a/pyiceberg/table/update/__init__.py
b/pyiceberg/table/update/__init__.py
index a79e2cb4..68e3d9f7 100644
--- a/pyiceberg/table/update/__init__.py
+++ b/pyiceberg/table/update/__init__.py
@@ -181,9 +181,15 @@ class SetStatisticsUpdate(IcebergBaseModel):
@model_validator(mode="before")
def validate_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]:
- stats = cast(StatisticsFile, data["statistics"])
-
- data["snapshot_id"] = stats.snapshot_id
+ stats = data["statistics"]
+ if isinstance(stats, StatisticsFile):
+ snapshot_id = stats.snapshot_id
+ elif isinstance(stats, dict):
+ snapshot_id = cast(int, stats.get("snapshot-id"))
+ else:
+ snapshot_id = None
+
+ data["snapshot_id"] = snapshot_id
return data
diff --git a/tests/table/test_init.py b/tests/table/test_init.py
index 37d7f46e..e40513fe 100644
--- a/tests/table/test_init.py
+++ b/tests/table/test_init.py
@@ -21,7 +21,7 @@ from copy import copy
from typing import Any
import pytest
-from pydantic import ValidationError
+from pydantic import BaseModel, ValidationError
from sortedcontainers import SortedList
from pyiceberg.catalog.noop import NoopCatalog
@@ -1391,6 +1391,8 @@ def test_set_statistics_update(table_v2_with_statistics:
Table) -> None:
statistics=statistics_file,
)
+ assert model_roundtrips(update)
+
new_metadata = update_table_metadata(
table_v2_with_statistics.metadata,
(update,),
@@ -1425,6 +1427,57 @@ def test_set_statistics_update(table_v2_with_statistics:
Table) -> None:
assert json.loads(updated_statistics[0].model_dump_json()) ==
json.loads(expected)
+def
test_set_statistics_update_handles_deprecated_snapshot_id(table_v2_with_statistics:
Table) -> None:
+ snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id
+
+ blob_metadata = BlobMetadata(
+ type="apache-datasketches-theta-v1",
+ snapshot_id=snapshot_id,
+ sequence_number=2,
+ fields=[1],
+ properties={"prop-key": "prop-value"},
+ )
+
+ statistics_file = StatisticsFile(
+ snapshot_id=snapshot_id,
+ statistics_path="s3://bucket/warehouse/stats.puffin",
+ file_size_in_bytes=124,
+ file_footer_size_in_bytes=27,
+ blob_metadata=[blob_metadata],
+ )
+ update_with_model = SetStatisticsUpdate(statistics=statistics_file)
+ assert model_roundtrips(update_with_model)
+ assert update_with_model.snapshot_id == snapshot_id
+
+ update_with_dict = SetStatisticsUpdate.model_validate({"statistics":
statistics_file.model_dump()})
+ assert model_roundtrips(update_with_dict)
+ assert update_with_dict.snapshot_id == snapshot_id
+
+ update_json = """
+ {
+ "statistics":
+ {
+ "snapshot-id": 3055729675574597004,
+ "statistics-path": "s3://a/b/stats.puffin",
+ "file-size-in-bytes": 413,
+ "file-footer-size-in-bytes": 42,
+ "blob-metadata": [
+ {
+ "type": "apache-datasketches-theta-v1",
+ "snapshot-id": 3055729675574597004,
+ "sequence-number": 1,
+ "fields": [1]
+ }
+ ]
+ }
+ }
+ """
+
+ update_with_json = SetStatisticsUpdate.model_validate_json(update_json)
+ assert model_roundtrips(update_with_json)
+ assert update_with_json.snapshot_id == snapshot_id
+
+
def test_remove_statistics_update(table_v2_with_statistics: Table) -> None:
update = RemoveStatisticsUpdate(
snapshot_id=3055729675574597004,
@@ -1575,3 +1628,14 @@ def
test_add_snapshot_update_updates_next_row_id(table_v3: Table) -> None:
new_metadata = update_table_metadata(table_v3.metadata,
(AddSnapshotUpdate(snapshot=new_snapshot),))
assert new_metadata.next_row_id == 11
+
+
+def model_roundtrips(model: BaseModel) -> bool:
+ """Helper assertion that tests if a pydantic model roundtrips
+ successfully.
+ """
+ __tracebackhide__ = True
+ model_data = model.model_dump()
+ if model != type(model).model_validate(model_data):
+ pytest.fail(f"model {type(model)} did not roundtrip successfully")
+ return True