This is an automated email from the ASF dual-hosted git repository.

Fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new b7ca7be60 Reject empty `source-ids` in `PartitionField` / `SortField` 
(#3411)
b7ca7be60 is described below

commit b7ca7be605043b26e2078da357ad4f94e22e771a
Author: Kevin Liu <[email protected]>
AuthorDate: Mon May 25 16:01:52 2026 -0400

    Reject empty `source-ids` in `PartitionField` / `SortField` (#3411)
    
    Inspired by the walrus issue in #3353
    
    ## Problem
    
    `PartitionField` and `SortField` accept the v3 `source-ids` list (added
    in [#1554](https://github.com/apache/iceberg-python/pull/1554)) and map
    it to the legacy singular `source-id`. Both validators try to reject an
    empty list:
    
    ```python
    if "source-id" not in data and (source_ids := data["source-ids"]):
        if isinstance(source_ids, list):
            if len(source_ids) == 0:
                raise ValueError("Empty source-ids is not allowed")
            ...
            data["source-id"] = source_ids[0]
    ```
    
    The walrus uses truthiness, and `[]` is falsy — so the `len(source_ids)
    == 0` branch is unreachable. Passing `{"source-ids": []}` silently skips
    the mapping, and Pydantic then reports a generic "field required" error
    instead of the intended message. A missing `source-ids` key also raises
    `KeyError` instead of being handled cleanly.
    
    ## Fix
    
    Replace the walrus with an explicit key check in both validators:
    
    ```python
    if "source-id" not in data and "source-ids" in data:
        source_ids = data["source-ids"]
        ...
    ```
    
    This makes the empty-list validation reachable and avoids the
    `KeyError`.
    
    ## Tests
    
    Added regression tests that deserialize `{"source-ids": []}` and assert
    `ValueError("Empty source-ids is not allowed")` is raised, for both
    `PartitionField` and `SortField`.
---
 pyiceberg/partitioning.py        | 3 ++-
 pyiceberg/table/sorting.py       | 3 ++-
 tests/table/test_partitioning.py | 6 ++++++
 tests/table/test_sorting.py      | 6 ++++++
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index 3de185d88..3074c30ea 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -109,7 +109,8 @@ class PartitionField(IcebergBaseModel):
     @classmethod
     def map_source_ids_onto_source_id(cls, data: Any) -> Any:
         if isinstance(data, dict):
-            if "source-id" not in data and (source_ids := data["source-ids"]):
+            if "source-id" not in data and "source-ids" in data:
+                source_ids = data["source-ids"]
                 if isinstance(source_ids, list):
                     if len(source_ids) == 0:
                         raise ValueError("Empty source-ids is not allowed")
diff --git a/pyiceberg/table/sorting.py b/pyiceberg/table/sorting.py
index 4a8d54861..61f34c478 100644
--- a/pyiceberg/table/sorting.py
+++ b/pyiceberg/table/sorting.py
@@ -101,7 +101,8 @@ class SortField(IcebergBaseModel):
     @classmethod
     def map_source_ids_onto_source_id(cls, data: Any) -> Any:
         if isinstance(data, dict):
-            if "source-id" not in data and (source_ids := data["source-ids"]):
+            if "source-id" not in data and "source-ids" in data:
+                source_ids = data["source-ids"]
                 if isinstance(source_ids, list):
                     if len(source_ids) == 0:
                         raise ValueError("Empty source-ids is not allowed")
diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py
index a27046ef3..b150fc2f6 100644
--- a/tests/table/test_partitioning.py
+++ b/tests/table/test_partitioning.py
@@ -267,6 +267,12 @@ def test_deserialize_partition_field_v3() -> None:
     assert field == PartitionField(source_id=1, field_id=1000, 
transform=TruncateTransform(width=19), name="str_truncate")
 
 
+def test_deserialize_partition_field_empty_source_ids_rejected() -> None:
+    json_partition_spec = """{"source-ids": [], "field-id": 1000, "transform": 
"identity", "name": "x"}"""
+    with pytest.raises(Exception, match="Empty source-ids is not allowed"):
+        PartitionField.model_validate_json(json_partition_spec)
+
+
 def test_incompatible_source_column_not_found() -> None:
     schema = Schema(NestedField(1, "foo", IntegerType()), NestedField(2, 
"bar", IntegerType()))
 
diff --git a/tests/table/test_sorting.py b/tests/table/test_sorting.py
index 91c7a25b0..5f7f5d016 100644
--- a/tests/table/test_sorting.py
+++ b/tests/table/test_sorting.py
@@ -138,6 +138,12 @@ def test_serialize_sort_field_v3() -> None:
     assert SortField.model_validate_json(payload) == expected
 
 
+def test_deserialize_sort_field_empty_source_ids_rejected() -> None:
+    payload = 
'{"source-ids":[],"transform":"identity","direction":"asc","null-order":"nulls-first"}'
+    with pytest.raises(Exception, match="Empty source-ids is not allowed"):
+        SortField.model_validate_json(payload)
+
+
 def test_incompatible_source_column_not_found(sort_order: SortOrder) -> None:
     schema = Schema(NestedField(1, "foo", IntegerType()), NestedField(2, 
"bar", IntegerType()))
 

Reply via email to