(iceberg-python) branch main updated: fix: Validate SetStatisticsUpdate correctly (#2866)

kevinjqliu Tue, 30 Dec 2025 11:43:52 -0800

This is an automated email from the ASF dual-hosted git repository.

kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new 1b69a253 fix: Validate SetStatisticsUpdate correctly (#2866)
1b69a253 is described below

commit 1b69a2536640b3f8178fa6ac28d05123458a3dd8
Author: Ragnar Dahlén <[email protected]>
AuthorDate: Tue Dec 30 20:43:40 2025 +0100

    fix: Validate SetStatisticsUpdate correctly (#2866)
    
    Closes #2865
    
    Previously the pydantic `@model_validator` on `SetStatisticsUpdate`
    would fail because it assumed statistics was a model instance. In a
    "before"" validator that is not necessarily case.
    
    Check type and handle both models and dicts instead.
    
    ### Before
    
    ```
    >>> import pyiceberg.table.update
    >>> 
pyiceberg.table.update.SetStatisticsUpdate.model_validate({'statistics': 
{'snapshot-id': 1234, 'file-size-in-bytes': 0, 'statistics-path': '', 
'file-footer-size-in-bytes': 0, 'blob-metadata': []}})
    Traceback (most recent call last):
      File "<python-input-1>", line 1, in <module>
        
pyiceberg.table.update.SetStatisticsUpdate.model_validate({'statistics': 
{'snapshot-id': 1234, 'file-size-in-bytes': 0, 'statistics-path': '', 
'file-footer-size-in-bytes': 0, 'blob-metadata': []}})
        
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      File 
"/home/ragge/projects/github.com/ragnard/iceberg-python/.venv/lib/python3.14/site-packages/pydantic/main.py",
 line 716, in model_validate
        return cls.__pydantic_validator__.validate_python(
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
            obj,
            ^^^^
        ...<5 lines>...
            by_name=by_name,
            ^^^^^^^^^^^^^^^^
        )
        ^
      File 
"/home/ragge/projects/github.com/ragnard/iceberg-python/pyiceberg/table/update/__init__.py",
 line 191, in validate_snapshot_id
        data["snapshot_id"] = stats.snapshot_id
                              ^^^^^^^^^^^^^^^^^
    AttributeError: 'dict' object has no attribute 'snapshot_id'
     ```
    
    ### After
    
    ```
    >>> import pyiceberg.table.update
    >>>
    pyiceberg.table.update.SetStatisticsUpdate.model_validate({'statistics':
    {'snapshot-id': 1234, 'file-size-in-bytes': 0, 'statistics-path': '',
    'file-footer-size-in-bytes': 0, 'blob-metadata': []}})
    SetStatisticsUpdate(action='set-statistics',
    statistics=StatisticsFile(snapshot_id=1234, statistics_path='',
    file_size_in_bytes=0, file_footer_size_in_bytes=0, key_metadata=None,
    blob_metadata=[]), snapshot_id=1234)
    ```
    
    
    # Rationale for this change
    
    ## Are these changes tested?
    
    Yes, but only using the two-liners above.
    
    ## Are there any user-facing changes?
    
    No.
    
    <!-- In the case of user-facing changes, please add the changelog label. -->
---
 pyiceberg/table/update/__init__.py | 12 +++++--
 tests/table/test_init.py           | 66 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/table/update/__init__.py 
b/pyiceberg/table/update/__init__.py
index a79e2cb4..68e3d9f7 100644
--- a/pyiceberg/table/update/__init__.py
+++ b/pyiceberg/table/update/__init__.py
@@ -181,9 +181,15 @@ class SetStatisticsUpdate(IcebergBaseModel):
 
     @model_validator(mode="before")
     def validate_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]:
-        stats = cast(StatisticsFile, data["statistics"])
-
-        data["snapshot_id"] = stats.snapshot_id
+        stats = data["statistics"]
+        if isinstance(stats, StatisticsFile):
+            snapshot_id = stats.snapshot_id
+        elif isinstance(stats, dict):
+            snapshot_id = cast(int, stats.get("snapshot-id"))
+        else:
+            snapshot_id = None
+
+        data["snapshot_id"] = snapshot_id
 
         return data
 
diff --git a/tests/table/test_init.py b/tests/table/test_init.py
index 37d7f46e..e40513fe 100644
--- a/tests/table/test_init.py
+++ b/tests/table/test_init.py
@@ -21,7 +21,7 @@ from copy import copy
 from typing import Any
 
 import pytest
-from pydantic import ValidationError
+from pydantic import BaseModel, ValidationError
 from sortedcontainers import SortedList
 
 from pyiceberg.catalog.noop import NoopCatalog
@@ -1391,6 +1391,8 @@ def test_set_statistics_update(table_v2_with_statistics: 
Table) -> None:
         statistics=statistics_file,
     )
 
+    assert model_roundtrips(update)
+
     new_metadata = update_table_metadata(
         table_v2_with_statistics.metadata,
         (update,),
@@ -1425,6 +1427,57 @@ def test_set_statistics_update(table_v2_with_statistics: 
Table) -> None:
     assert json.loads(updated_statistics[0].model_dump_json()) == 
json.loads(expected)
 
 
+def 
test_set_statistics_update_handles_deprecated_snapshot_id(table_v2_with_statistics:
 Table) -> None:
+    snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id
+
+    blob_metadata = BlobMetadata(
+        type="apache-datasketches-theta-v1",
+        snapshot_id=snapshot_id,
+        sequence_number=2,
+        fields=[1],
+        properties={"prop-key": "prop-value"},
+    )
+
+    statistics_file = StatisticsFile(
+        snapshot_id=snapshot_id,
+        statistics_path="s3://bucket/warehouse/stats.puffin",
+        file_size_in_bytes=124,
+        file_footer_size_in_bytes=27,
+        blob_metadata=[blob_metadata],
+    )
+    update_with_model = SetStatisticsUpdate(statistics=statistics_file)
+    assert model_roundtrips(update_with_model)
+    assert update_with_model.snapshot_id == snapshot_id
+
+    update_with_dict = SetStatisticsUpdate.model_validate({"statistics": 
statistics_file.model_dump()})
+    assert model_roundtrips(update_with_dict)
+    assert update_with_dict.snapshot_id == snapshot_id
+
+    update_json = """
+        {
+            "statistics":
+                 {
+                     "snapshot-id": 3055729675574597004,
+                     "statistics-path": "s3://a/b/stats.puffin",
+                     "file-size-in-bytes": 413,
+                     "file-footer-size-in-bytes": 42,
+                     "blob-metadata": [
+                         {
+                             "type": "apache-datasketches-theta-v1",
+                             "snapshot-id": 3055729675574597004,
+                             "sequence-number": 1,
+                             "fields": [1]
+                         }
+                     ]
+                 }
+        }
+    """
+
+    update_with_json = SetStatisticsUpdate.model_validate_json(update_json)
+    assert model_roundtrips(update_with_json)
+    assert update_with_json.snapshot_id == snapshot_id
+
+
 def test_remove_statistics_update(table_v2_with_statistics: Table) -> None:
     update = RemoveStatisticsUpdate(
         snapshot_id=3055729675574597004,
@@ -1575,3 +1628,14 @@ def 
test_add_snapshot_update_updates_next_row_id(table_v3: Table) -> None:
 
     new_metadata = update_table_metadata(table_v3.metadata, 
(AddSnapshotUpdate(snapshot=new_snapshot),))
     assert new_metadata.next_row_id == 11
+
+
+def model_roundtrips(model: BaseModel) -> bool:
+    """Helper assertion that tests if a pydantic model roundtrips
+    successfully.
+    """
+    __tracebackhide__ = True
+    model_data = model.model_dump()
+    if model != type(model).model_validate(model_data):
+        pytest.fail(f"model {type(model)} did not roundtrip successfully")
+    return True

(iceberg-python) branch main updated: fix: Validate SetStatisticsUpdate correctly (#2866)

Reply via email to