(iceberg-python) branch main updated: Fix walrus truthiness on metrics bounds and identity-partition projection (#3412)

fokko Mon, 25 May 2026 12:57:50 -0700

This is an automated email from the ASF dual-hosted git repository.

Fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new 720708a00 Fix walrus truthiness on metrics bounds and identity-partition projection (#3412)
720708a00 is described below

commit 720708a00f8a259d2f2ad86c1c55d5b88ad6c373
Author: Kevin Liu <[email protected]>
AuthorDate: Mon May 25 15:57:38 2026 -0400

    Fix walrus truthiness on metrics bounds and identity-partition projection (#3412)
    
    Inspired by the walrus issue in #3353
    
    ## Summary
    
    Several `if x := dict.get(k):` checks treated legitimate falsy values as
    missing:
    
    - `lower_bounds.get(field_id)` / `upper_bounds.get(field_id)` return
    `bytes`. `b""` is a valid serialization of an empty string and was being
    skipped, causing metrics-based row filtering to return
    `ROWS_MIGHT_MATCH` when it should have been `ROWS_CANNOT_MATCH` (and
    vice versa for strict eval).
    - `accessors[...].get(file.partition)` can return `0` or `""` for an
    `IdentityTransform` partition. The walrus dropped it, so projected
    partition columns were filled with `null` instead of the actual value.
    - `inspect.py` `lower_bound` / `upper_bound` rendering had the same
    `b""` issue, showing `None` instead of `""` in `readable_metrics`.
    
    All conditions are switched to explicit `is not None` checks. A small
    `_readable_bound` helper deduplicates the inspect rendering.
    
    ## Changes
    
    - `pyiceberg/expressions/visitors.py` — `_InclusiveMetricsEvaluator` and
    `_StrictMetricsEvaluator` bound lookups (21 sites).
    - `pyiceberg/io/pyarrow.py` — `_get_column_projection_values` and
    `ArrowProjectionVisitor` missing-field handling.
    - `pyiceberg/table/inspect.py` — extract `_readable_bound`, use it in
    both the `entries` and `_get_files_from_manifest` rendering paths.
    
    ## Tests
    
    - `tests/expressions/test_evaluator.py` — inclusive and strict
    evaluators with empty-string bounds, covering lower-only, upper-only,
    and both-bounds branches.
    - `tests/io/test_pyarrow.py` — parametrized identity-transform
    projection with falsy partition values (`0`, `""`, `None`).
    - `tests/table/test_inspect.py` — `_readable_bound` helper plus
    integration tests via `tbl.inspect.entries()` and `tbl.inspect.files()`
    for empty-string and null bounds.
---
 pyiceberg/expressions/visitors.py   | 66 +++++++++++++++++++++++-----------
 pyiceberg/io/pyarrow.py             |  6 ++--
 pyiceberg/table/inspect.py          | 20 +++++------
 tests/expressions/test_evaluator.py | 56 +++++++++++++++++++++++++++++
 tests/io/test_pyarrow.py            | 69 ++++++++++++++++++++++++++++++++++++
 tests/table/test_inspect.py         | 70 +++++++++++++++++++++++++++++++++++++
 6 files changed, 252 insertions(+), 35 deletions(-)

diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py
index 0beb0f3df..f3cfabf1a 100644
--- a/pyiceberg/expressions/visitors.py
+++ b/pyiceberg/expressions/visitors.py
@@ -1241,7 +1241,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         if not isinstance(field.field_type, PrimitiveType):
             raise ValueError(f"Expected PrimitiveType: {field.field_type}")
 
-        if lower_bound_bytes := self.lower_bounds.get(field_id):
+        lower_bound_bytes = self.lower_bounds.get(field_id)
+        if lower_bound_bytes is not None:
             lower_bound = from_bytes(field.field_type, lower_bound_bytes)
 
             if self._is_nan(lower_bound):
@@ -1263,7 +1264,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         if not isinstance(field.field_type, PrimitiveType):
             raise ValueError(f"Expected PrimitiveType: {field.field_type}")
 
-        if lower_bound_bytes := self.lower_bounds.get(field_id):
+        lower_bound_bytes = self.lower_bounds.get(field_id)
+        if lower_bound_bytes is not None:
             lower_bound = from_bytes(field.field_type, lower_bound_bytes)
             if self._is_nan(lower_bound):
                 # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
@@ -1284,7 +1286,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         if not isinstance(field.field_type, PrimitiveType):
             raise ValueError(f"Expected PrimitiveType: {field.field_type}")
 
-        if upper_bound_bytes := self.upper_bounds.get(field_id):
+        upper_bound_bytes = self.upper_bounds.get(field_id)
+        if upper_bound_bytes is not None:
             upper_bound = from_bytes(field.field_type, upper_bound_bytes)
             if upper_bound <= literal.value:
                 if self._is_nan(upper_bound):
@@ -1305,7 +1308,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         if not isinstance(field.field_type, PrimitiveType):
             raise ValueError(f"Expected PrimitiveType: {field.field_type}")
 
-        if upper_bound_bytes := self.upper_bounds.get(field_id):
+        upper_bound_bytes = self.upper_bounds.get(field_id)
+        if upper_bound_bytes is not None:
             upper_bound = from_bytes(field.field_type, upper_bound_bytes)
             if upper_bound < literal.value:
                 if self._is_nan(upper_bound):
@@ -1326,7 +1330,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         if not isinstance(field.field_type, PrimitiveType):
             raise ValueError(f"Expected PrimitiveType: {field.field_type}")
 
-        if lower_bound_bytes := self.lower_bounds.get(field_id):
+        lower_bound_bytes = self.lower_bounds.get(field_id)
+        if lower_bound_bytes is not None:
             lower_bound = from_bytes(field.field_type, lower_bound_bytes)
             if self._is_nan(lower_bound):
                 # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
@@ -1335,7 +1340,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
             if lower_bound > literal.value:
                 return ROWS_CANNOT_MATCH
 
-        if upper_bound_bytes := self.upper_bounds.get(field_id):
+        upper_bound_bytes = self.upper_bounds.get(field_id)
+        if upper_bound_bytes is not None:
             upper_bound = from_bytes(field.field_type, upper_bound_bytes)
             if self._is_nan(upper_bound):
                 # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
@@ -1363,7 +1369,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         if not isinstance(field.field_type, PrimitiveType):
             raise ValueError(f"Expected PrimitiveType: {field.field_type}")
 
-        if lower_bound_bytes := self.lower_bounds.get(field_id):
+        lower_bound_bytes = self.lower_bounds.get(field_id)
+        if lower_bound_bytes is not None:
             lower_bound = from_bytes(field.field_type, lower_bound_bytes)
             if self._is_nan(lower_bound):
                 # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
@@ -1373,7 +1380,8 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
             if len(literals) == 0:
                 return ROWS_CANNOT_MATCH
 
-        if upper_bound_bytes := self.upper_bounds.get(field_id):
+        upper_bound_bytes = self.upper_bounds.get(field_id)
+        if upper_bound_bytes is not None:
             upper_bound = from_bytes(field.field_type, upper_bound_bytes)
             # this is different from Java, here NaN is always larger
             if self._is_nan(upper_bound):
@@ -1403,14 +1411,16 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
         prefix = str(literal.value)
         len_prefix = len(prefix)
 
-        if lower_bound_bytes := self.lower_bounds.get(field_id):
+        lower_bound_bytes = self.lower_bounds.get(field_id)
+        if lower_bound_bytes is not None:
             lower_bound = str(from_bytes(field.field_type, lower_bound_bytes))
 
             # truncate lower bound so that its length is not greater than the 
length of prefix
             if lower_bound and lower_bound[:len_prefix] > prefix:
                 return ROWS_CANNOT_MATCH
 
-        if upper_bound_bytes := self.upper_bounds.get(field_id):
+        upper_bound_bytes = self.upper_bounds.get(field_id)
+        if upper_bound_bytes is not None:
             upper_bound = str(from_bytes(field.field_type, upper_bound_bytes))
 
             # truncate upper bound so that its length is not greater than the 
length of prefix
@@ -1434,7 +1444,9 @@ class _InclusiveMetricsEvaluator(_MetricsEvaluator):
 
         # not_starts_with will match unless all values must start with the 
prefix. This happens when
         # the lower and upper bounds both start with the prefix.
-        if (lower_bound_bytes := self.lower_bounds.get(field_id)) and (upper_bound_bytes := self.upper_bounds.get(field_id)):
+        lower_bound_bytes = self.lower_bounds.get(field_id)
+        upper_bound_bytes = self.upper_bounds.get(field_id)
+        if lower_bound_bytes is not None and upper_bound_bytes is not None:
             lower_bound = str(from_bytes(field.field_type, lower_bound_bytes))
             upper_bound = str(from_bytes(field.field_type, upper_bound_bytes))
 
@@ -1558,7 +1570,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
         if self._can_contain_nulls(field_id) or 
self._can_contain_nans(field_id):
             return ROWS_MIGHT_NOT_MATCH
 
-        if upper_bytes := self.upper_bounds.get(field_id):
+        upper_bytes = self.upper_bounds.get(field_id)
+        if upper_bytes is not None:
             field = self._get_field(field_id)
             upper = _from_byte_buffer(field.field_type, upper_bytes)
 
@@ -1575,7 +1588,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
         if self._can_contain_nulls(field_id) or 
self._can_contain_nans(field_id):
             return ROWS_MIGHT_NOT_MATCH
 
-        if upper_bytes := self.upper_bounds.get(field_id):
+        upper_bytes = self.upper_bounds.get(field_id)
+        if upper_bytes is not None:
             field = self._get_field(field_id)
             upper = _from_byte_buffer(field.field_type, upper_bytes)
 
@@ -1592,7 +1606,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
         if self._can_contain_nulls(field_id) or 
self._can_contain_nans(field_id):
             return ROWS_MIGHT_NOT_MATCH
 
-        if lower_bytes := self.lower_bounds.get(field_id):
+        lower_bytes = self.lower_bounds.get(field_id)
+        if lower_bytes is not None:
             field = self._get_field(field_id)
             lower = _from_byte_buffer(field.field_type, lower_bytes)
 
@@ -1613,7 +1628,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
         if self._can_contain_nulls(field_id) or 
self._can_contain_nans(field_id):
             return ROWS_MIGHT_NOT_MATCH
 
-        if lower_bytes := self.lower_bounds.get(field_id):
+        lower_bytes = self.lower_bounds.get(field_id)
+        if lower_bytes is not None:
             field = self._get_field(field_id)
             lower = _from_byte_buffer(field.field_type, lower_bytes)
 
@@ -1634,7 +1650,9 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
         if self._can_contain_nulls(field_id) or 
self._can_contain_nans(field_id):
             return ROWS_MIGHT_NOT_MATCH
 
-        if (lower_bytes := self.lower_bounds.get(field_id)) and (upper_bytes := self.upper_bounds.get(field_id)):
+        lower_bytes = self.lower_bounds.get(field_id)
+        upper_bytes = self.upper_bounds.get(field_id)
+        if lower_bytes is not None and upper_bytes is not None:
             field = self._get_field(field_id)
             lower = _from_byte_buffer(field.field_type, lower_bytes)
             upper = _from_byte_buffer(field.field_type, upper_bytes)
@@ -1655,7 +1673,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
 
         field = self._get_field(field_id)
 
-        if lower_bytes := self.lower_bounds.get(field_id):
+        lower_bytes = self.lower_bounds.get(field_id)
+        if lower_bytes is not None:
             lower = _from_byte_buffer(field.field_type, lower_bytes)
 
             if self._is_nan(lower):
@@ -1666,7 +1685,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
             if lower > literal.value:
                 return ROWS_MUST_MATCH
 
-        if upper_bytes := self.upper_bounds.get(field_id):
+        upper_bytes = self.upper_bounds.get(field_id)
+        if upper_bytes is not None:
             upper = _from_byte_buffer(field.field_type, upper_bytes)
 
             if upper < literal.value:
@@ -1682,7 +1702,9 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
 
         field = self._get_field(field_id)
 
-        if (lower_bytes := self.lower_bounds.get(field_id)) and (upper_bytes := self.upper_bounds.get(field_id)):
+        lower_bytes = self.lower_bounds.get(field_id)
+        upper_bytes = self.upper_bounds.get(field_id)
+        if lower_bytes is not None and upper_bytes is not None:
             # similar to the implementation in eq, first check if the lower bound is in the set
             lower = _from_byte_buffer(field.field_type, lower_bytes)
             if lower not in literals:
@@ -1711,7 +1733,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
 
         field = self._get_field(field_id)
 
-        if lower_bytes := self.lower_bounds.get(field_id):
+        lower_bytes = self.lower_bounds.get(field_id)
+        if lower_bytes is not None:
             lower = _from_byte_buffer(field.field_type, lower_bytes)
 
             if self._is_nan(lower):
@@ -1723,7 +1746,8 @@ class _StrictMetricsEvaluator(_MetricsEvaluator):
             if len(literals) == 0:
                 return ROWS_MUST_MATCH
 
-        if upper_bytes := self.upper_bounds.get(field_id):
+        upper_bytes = self.upper_bounds.get(field_id)
+        if upper_bytes is not None:
             upper = _from_byte_buffer(field.field_type, upper_bytes)
 
             literals = {val for val in literals if upper >= val}
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index db9a035bd..4ec7a73af 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1605,7 +1605,8 @@ def _get_column_projection_values(
     for field_id in project_schema_diff:
         for partition_field in partition_spec.fields_by_source_id(field_id):
             if isinstance(partition_field.transform, IdentityTransform):
-                if partition_value := accessors[partition_field.field_id].get(file.partition):
+                partition_value = accessors[partition_field.field_id].get(file.partition)
+                if partition_value is not None:
                     projected_missing_fields[field_id] = partition_value
 
     return projected_missing_fields
@@ -2010,7 +2011,8 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None]
             elif field.optional or field.initial_default is not None:
                 # When an optional field is added, or when a required field with a non-null initial default is added
                 arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)
-                if projected_value := self._projected_missing_fields.get(field.field_id):
+                projected_value = self._projected_missing_fields.get(field.field_id)
+                if projected_value is not None:
                     field_arrays.append(pa.repeat(pa.scalar(projected_value, type=arrow_type), len(struct_array)))
                 elif field.initial_default is None:
                     field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py
index 5da343ccb..54504d70d 100644
--- a/pyiceberg/table/inspect.py
+++ b/pyiceberg/table/inspect.py
@@ -38,6 +38,10 @@ if TYPE_CHECKING:
 ALWAYS_TRUE = AlwaysTrue()
 
 
+def _readable_bound(field_type: PrimitiveType, bound: bytes | None) -> Any | None:
+    return from_bytes(field_type, bound) if bound is not None else None
+
+
 class InspectTable:
     tbl: Table
 
@@ -180,12 +184,8 @@ class InspectTable:
                         "null_value_count": null_value_counts.get(field.field_id),
                         "nan_value_count": nan_value_counts.get(field.field_id),
                         # Makes them readable
-                        "lower_bound": from_bytes(field.field_type, lower_bound)
-                        if (lower_bound := lower_bounds.get(field.field_id))
-                        else None,
-                        "upper_bound": from_bytes(field.field_type, upper_bound)
-                        if (upper_bound := upper_bounds.get(field.field_id))
-                        else None,
+                        "lower_bound": _readable_bound(field.field_type, lower_bounds.get(field.field_id)),
+                        "upper_bound": _readable_bound(field.field_type, upper_bounds.get(field.field_id)),
                     }
                     for field in self.tbl.metadata.schema().fields
                 }
@@ -570,12 +570,8 @@ class InspectTable:
                     "value_count": value_counts.get(field.field_id),
                     "null_value_count": null_value_counts.get(field.field_id),
                     "nan_value_count": nan_value_counts.get(field.field_id),
-                    "lower_bound": from_bytes(field.field_type, lower_bound)
-                    if (lower_bound := lower_bounds.get(field.field_id))
-                    else None,
-                    "upper_bound": from_bytes(field.field_type, upper_bound)
-                    if (upper_bound := upper_bounds.get(field.field_id))
-                    else None,
+                    "lower_bound": _readable_bound(field.field_type, lower_bounds.get(field.field_id)),
+                    "upper_bound": _readable_bound(field.field_type, upper_bounds.get(field.field_id)),
                 }
                 for field in self.tbl.metadata.schema().fields
             }
diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py
index b8e4d8704..ea1bef0a7 100644
--- a/tests/expressions/test_evaluator.py
+++ b/tests/expressions/test_evaluator.py
@@ -900,6 +900,62 @@ def test_string_starts_with(
     # assert not should_read, "Should not read: range doesn't match"
 
 
+def test_inclusive_metrics_evaluator_uses_empty_byte_lower_bound() -> None:
+    schema = Schema(NestedField(1, "empty_string", StringType(), 
required=True))
+    data_file = DataFile.from_args(
+        file_path="file.parquet",
+        file_format=FileFormat.PARQUET,
+        partition={},
+        record_count=10,
+        file_size_in_bytes=1,
+        value_counts={1: 10},
+        null_value_counts={1: 0},
+        nan_value_counts=None,
+        lower_bounds={1: to_bytes(StringType(), "")},
+        upper_bounds={1: to_bytes(StringType(), "")},
+    )
+
+    # Lower-bound branch: LessThan reads lower_bound only.
+    should_read = _InclusiveMetricsEvaluator(schema, LessThan("empty_string", 
"")).eval(data_file)
+    assert not should_read, "Should not read: lower bound is present and equal 
to the literal"
+
+    # Upper-bound branch: GreaterThan reads upper_bound only.
+    should_read = _InclusiveMetricsEvaluator(schema, 
GreaterThan("empty_string", "abc")).eval(data_file)
+    assert not should_read, "Should not read: upper bound '' is not greater 
than 'abc'"
+
+    # Both-bounds branch: EqualTo reads lower_bound and upper_bound.
+    should_read = _InclusiveMetricsEvaluator(schema, EqualTo("empty_string", 
"abc")).eval(data_file)
+    assert not should_read, "Should not read: 'abc' falls outside ['', '']"
+
+
+def test_strict_metrics_evaluator_uses_empty_byte_bounds() -> None:
+    schema = Schema(NestedField(1, "empty_string", StringType(), 
required=True))
+    data_file = DataFile.from_args(
+        file_path="file.parquet",
+        file_format=FileFormat.PARQUET,
+        partition={},
+        record_count=10,
+        file_size_in_bytes=1,
+        value_counts={1: 10},
+        null_value_counts={1: 0},
+        nan_value_counts=None,
+        lower_bounds={1: to_bytes(StringType(), "")},
+        upper_bounds={1: to_bytes(StringType(), "")},
+    )
+
+    # Both-bounds branch: EqualTo reads lower_bound and upper_bound.
+    should_read = _StrictMetricsEvaluator(schema, EqualTo("empty_string", 
"")).eval(data_file)
+    assert should_read, "Should match: lower and upper bounds are present and 
equal to the literal"
+
+    # Upper-bound branch: LessThan reads upper_bound only.
+    should_read = _StrictMetricsEvaluator(schema, LessThan("empty_string", 
"a")).eval(data_file)
+    assert should_read, "Should match: upper bound '' is strictly less than 
'a'"
+
+    # Both-bounds branch: NotEqualTo reads lower_bound and upper_bound.
+    should_read = _StrictMetricsEvaluator(schema, NotEqualTo("empty_string", 
"abc")).eval(data_file)
+    assert should_read, "Should match: 'abc' falls outside ['', '']"
+
+
 def test_string_not_starts_with(
     schema_data_file: Schema, data_file: DataFile, data_file_2: DataFile, 
data_file_3: DataFile, data_file_4: DataFile
 ) -> None:
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index a05b295fc..2f36661a1 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -1371,6 +1371,75 @@ def test_identity_transform_column_projection(tmp_path: 
str, catalog: InMemoryCa
     assert len(table.scan(row_filter="partition_id = -1").to_arrow()) == 0
 
 
+@pytest.mark.parametrize(
+    "partition_field_type, arrow_partition_type, partition_value",
+    [
+        (IntegerType(), pa.int32(), 0),
+        (StringType(), pa.large_string(), ""),
+        (IntegerType(), pa.int32(), None),
+    ],
+)
+def test_identity_transform_column_projection_with_falsy_value(
+    tmp_path: str,
+    catalog: InMemoryCatalog,
+    partition_field_type: PrimitiveType,
+    arrow_partition_type: pa.DataType,
+    partition_value: Any,
+) -> None:
+    """Partition value projection must preserve falsy values (0, "") and still 
render None as null."""
+    schema = Schema(
+        NestedField(1, "other_field", StringType(), required=False),
+        NestedField(2, "partition_col", partition_field_type, required=False),
+    )
+    partition_spec = PartitionSpec(
+        PartitionField(2, 1000, IdentityTransform(), "partition_col"),
+    )
+
+    catalog.create_namespace("default")
+    table = catalog.create_table(
+        f"default.test_projection_partition_{partition_value!r}",
+        schema=schema,
+        partition_spec=partition_spec,
+        properties={TableProperties.DEFAULT_NAME_MAPPING: 
create_mapping_from_schema(schema).model_dump_json()},
+    )
+
+    file_data = pa.array(["foo", "bar"], type=pa.string())
+    file_loc = f"{tmp_path}/test.parquet"
+    pq.write_table(pa.table([file_data], names=["other_field"]), file_loc)
+
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=pq.read_metadata(file_loc),
+        stats_columns=compute_statistics_plan(table.schema(), 
table.metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
+    )
+
+    unpartitioned_file = DataFile.from_args(
+        content=DataFileContent.DATA,
+        file_path=file_loc,
+        file_format=FileFormat.PARQUET,
+        partition=Record(partition_value),
+        file_size_in_bytes=os.path.getsize(file_loc),
+        sort_order_id=None,
+        spec_id=table.metadata.default_spec_id,
+        equality_ids=None,
+        key_metadata=None,
+        **statistics.to_serialized_dict(),
+    )
+
+    with table.transaction() as transaction:
+        with transaction.update_snapshot().overwrite() as update:
+            update.append_data_file(unpartitioned_file)
+
+    expected_schema = pa.schema([("other_field", pa.string()), 
("partition_col", arrow_partition_type)])
+    assert table.scan().to_arrow() == pa.table(
+        {
+            "other_field": ["foo", "bar"],
+            "partition_col": [partition_value, partition_value],
+        },
+        schema=expected_schema,
+    )
+
+
 def test_identity_transform_columns_projection(tmp_path: str, catalog: 
InMemoryCatalog) -> None:
     # Test by adding a non-partitioned data file to a multi-partitioned table, 
verifying partition value
     # projection from manifest metadata.
diff --git a/tests/table/test_inspect.py b/tests/table/test_inspect.py
new file mode 100644
index 000000000..c325af203
--- /dev/null
+++ b/tests/table/test_inspect.py
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pathlib import PosixPath
+
+import pyarrow as pa
+import pytest
+
+from pyiceberg.conversions import to_bytes
+from pyiceberg.schema import Schema
+from pyiceberg.table.inspect import _readable_bound
+from pyiceberg.types import NestedField, StringType
+from tests.catalog.test_base import InMemoryCatalog
+
+
+def test_readable_bound_with_empty_bytes() -> None:
+    assert _readable_bound(StringType(), to_bytes(StringType(), "")) == ""
+
+
+def test_readable_bound_without_bound() -> None:
+    assert _readable_bound(StringType(), None) is None
+
+
+@pytest.fixture
+def catalog(tmp_path: PosixPath) -> InMemoryCatalog:
+    cat = InMemoryCatalog("test.in_memory.catalog", 
warehouse=tmp_path.absolute().as_posix())
+    cat.create_namespace("default")
+    return cat
+
+
+def test_inspect_entries_and_files_render_empty_string_bound(catalog: 
InMemoryCatalog) -> None:
+    schema = Schema(NestedField(1, "s", StringType(), required=False))
+    tbl = catalog.create_table("default.empty_string_bound", schema)
+    tbl.append(pa.table({"s": [""]}, schema=pa.schema([pa.field("s", 
pa.large_string(), nullable=True)])))
+
+    entries_metrics = 
tbl.inspect.entries().to_pydict()["readable_metrics"][0]["s"]
+    assert entries_metrics["lower_bound"] == ""
+    assert entries_metrics["upper_bound"] == ""
+
+    files_metrics = tbl.inspect.files().to_pydict()["readable_metrics"][0]["s"]
+    assert files_metrics["lower_bound"] == ""
+    assert files_metrics["upper_bound"] == ""
+
+
+def test_inspect_entries_and_files_render_null_bound(catalog: InMemoryCatalog) 
-> None:
+    schema = Schema(NestedField(1, "s", StringType(), required=False))
+    tbl = catalog.create_table("default.null_bound", schema)
+    tbl.append(pa.table({"s": [None]}, schema=pa.schema([pa.field("s", 
pa.large_string(), nullable=True)])))
+
+    entries_metrics = 
tbl.inspect.entries().to_pydict()["readable_metrics"][0]["s"]
+    assert entries_metrics["lower_bound"] is None
+    assert entries_metrics["upper_bound"] is None
+
+    files_metrics = tbl.inspect.files().to_pydict()["readable_metrics"][0]["s"]
+    assert files_metrics["lower_bound"] is None
+    assert files_metrics["upper_bound"] is None

(iceberg-python) branch main updated: Fix walrus truthiness on metrics bounds and identity-partition projection (#3412)

Reply via email to