(iceberg-python) branch main updated: Fix: Support nested struct field filtering (#2628)

fokko Tue, 11 Nov 2025 00:43:00 -0800

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new 33f6ab76 Fix: Support nested struct field filtering (#2628)
33f6ab76 is described below

commit 33f6ab762a1ced71869dc400e932c799a0af2605
Author: Yftach Zur <[email protected]>
AuthorDate: Tue Nov 11 09:42:46 2025 +0100

    Fix: Support nested struct field filtering (#2628)
    
    Fixes #953
    
    # Rationale for this change
    Fixes filtering on nested struct fields when using PyArrow for scan
    operations.
    
    ## Are these changes tested?
    Yes, the full test suite + new tests
    
    ## Are there any user-facing changes?
    Now, filtering a scan using a nested field will work.
    
    ## Problem
    
    When filtering on nested struct fields (e.g., `parentField.childField ==
    'value'`), PyArrow would fail with:
    ```
    ArrowInvalid: No match for FieldRef.Name(childField) in ...
    ```
    
    The issue occurred because PyArrow requires nested field references as
    tuples (e.g., `("parent", "child")`) rather than dotted strings (e.g.,
    `"parent.child"`).
    
    ## Solution
    
    1. Modified `_ConvertToArrowExpression` to accept an optional `Schema`
    parameter
    2. Added `_get_field_name()` method that converts dotted field paths to
    tuples for nested struct fields
    3. Updated `expression_to_pyarrow()` to accept and pass the schema
    parameter
    4. Updated all call sites to pass the schema when available
    
    ## Changes
    
    - `pyiceberg/io/pyarrow.py`:
      - Modified `_ConvertToArrowExpression` class to handle nested field
        paths
      - Updated `expression_to_pyarrow()` signature to accept schema
      - Updated `_expression_to_complementary_pyarrow()` signature
    - `pyiceberg/table/__init__.py`:
      - Updated call to `_expression_to_complementary_pyarrow()` to pass
        schema
    - Tests:
      - Added `test_ref_binding_nested_struct_field()` for comprehensive
        nested field testing
      - Enhanced `test_nested_fields()` with issue #953 scenarios
    
    ## Example
    
    ```python
    # Now works correctly:
    table.scan(row_filter="parent.child == 'abc123'").to_polars()
    ```
    
    The fix converts the field reference from:
    - ❌ `FieldRef.Name(run_id)` (fails - field not found)
    - ✅ `FieldRef.Nested(FieldRef.Name(mazeMetadata) FieldRef.Name(run_id))`
    (works!)
    
    ---------
    
    Co-authored-by: Yftach Zur <[email protected]>
    Co-authored-by: Claude <[email protected]>
---
 pyiceberg/io/pyarrow.py               | 80 ++++++++++++++++++++++++++---------
 pyiceberg/table/__init__.py           |  2 +-
 tests/expressions/test_expressions.py | 52 +++++++++++++++++++++++
 tests/expressions/test_parser.py      |  3 ++
 4 files changed, 117 insertions(+), 20 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 46d7fe6b..5be4c5d2 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -808,51 +808,83 @@ def _convert_scalar(value: Any, iceberg_type: 
IcebergType) -> pa.scalar:
 
 
 class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]):
+    """Convert Iceberg bound expressions to PyArrow expressions.
+
+    Args:
+        schema: Optional Iceberg schema to resolve full field paths for nested 
fields.
+                If not provided, only the field name will be used (not dotted 
path).
+    """
+
+    _schema: Schema | None
+
+    def __init__(self, schema: Schema | None = None):
+        self._schema = schema
+
+    def _get_field_name(self, term: BoundTerm[Any]) -> str | Tuple[str, ...]:
+        """Get the field name or nested field path for a bound term.
+
+        For nested struct fields, returns a tuple of field names (e.g., 
("mazeMetadata", "run_id")).
+        For top-level fields, returns just the field name as a string.
+
+        PyArrow requires nested field references as tuples, not dotted strings.
+        """
+        if self._schema is not None:
+            # Use the schema to get the full dotted path for nested fields
+            full_name = 
self._schema.find_column_name(term.ref().field.field_id)
+            if full_name is not None:
+                # If the field name contains dots, it's a nested field
+                # Convert "parent.child" to ("parent", "child") for PyArrow
+                if "." in full_name:
+                    return tuple(full_name.split("."))
+                return full_name
+        # Fallback to just the field name if schema is not available
+        return term.ref().field.name
+
     def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> 
pc.Expression:
         pyarrow_literals = pa.array(literals, 
type=schema_to_pyarrow(term.ref().field.field_type))
-        return pc.field(term.ref().field.name).isin(pyarrow_literals)
+        return pc.field(self._get_field_name(term)).isin(pyarrow_literals)
 
     def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> 
pc.Expression:
         pyarrow_literals = pa.array(literals, 
type=schema_to_pyarrow(term.ref().field.field_type))
-        return ~pc.field(term.ref().field.name).isin(pyarrow_literals)
+        return ~pc.field(self._get_field_name(term)).isin(pyarrow_literals)
 
     def visit_is_nan(self, term: BoundTerm[Any]) -> pc.Expression:
-        ref = pc.field(term.ref().field.name)
+        ref = pc.field(self._get_field_name(term))
         return pc.is_nan(ref)
 
     def visit_not_nan(self, term: BoundTerm[Any]) -> pc.Expression:
-        ref = pc.field(term.ref().field.name)
+        ref = pc.field(self._get_field_name(term))
         return ~pc.is_nan(ref)
 
     def visit_is_null(self, term: BoundTerm[Any]) -> pc.Expression:
-        return pc.field(term.ref().field.name).is_null(nan_is_null=False)
+        return pc.field(self._get_field_name(term)).is_null(nan_is_null=False)
 
     def visit_not_null(self, term: BoundTerm[Any]) -> pc.Expression:
-        return pc.field(term.ref().field.name).is_valid()
+        return pc.field(self._get_field_name(term)).is_valid()
 
     def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> 
pc.Expression:
-        return pc.field(term.ref().field.name) == 
_convert_scalar(literal.value, term.ref().field.field_type)
+        return pc.field(self._get_field_name(term)) == 
_convert_scalar(literal.value, term.ref().field.field_type)
 
     def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> 
pc.Expression:
-        return pc.field(term.ref().field.name) != 
_convert_scalar(literal.value, term.ref().field.field_type)
+        return pc.field(self._get_field_name(term)) != 
_convert_scalar(literal.value, term.ref().field.field_type)
 
     def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: 
Literal[Any]) -> pc.Expression:
-        return pc.field(term.ref().field.name) >= 
_convert_scalar(literal.value, term.ref().field.field_type)
+        return pc.field(self._get_field_name(term)) >= 
_convert_scalar(literal.value, term.ref().field.field_type)
 
     def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) 
-> pc.Expression:
-        return pc.field(term.ref().field.name) > 
_convert_scalar(literal.value, term.ref().field.field_type)
+        return pc.field(self._get_field_name(term)) > 
_convert_scalar(literal.value, term.ref().field.field_type)
 
     def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> 
pc.Expression:
-        return pc.field(term.ref().field.name) < 
_convert_scalar(literal.value, term.ref().field.field_type)
+        return pc.field(self._get_field_name(term)) < 
_convert_scalar(literal.value, term.ref().field.field_type)
 
     def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: 
Literal[Any]) -> pc.Expression:
-        return pc.field(term.ref().field.name) <= 
_convert_scalar(literal.value, term.ref().field.field_type)
+        return pc.field(self._get_field_name(term)) <= 
_convert_scalar(literal.value, term.ref().field.field_type)
 
     def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) 
-> pc.Expression:
-        return pc.starts_with(pc.field(term.ref().field.name), literal.value)
+        return pc.starts_with(pc.field(self._get_field_name(term)), 
literal.value)
 
     def visit_not_starts_with(self, term: BoundTerm[Any], literal: 
Literal[Any]) -> pc.Expression:
-        return ~pc.starts_with(pc.field(term.ref().field.name), literal.value)
+        return ~pc.starts_with(pc.field(self._get_field_name(term)), 
literal.value)
 
     def visit_true(self) -> pc.Expression:
         return pc.scalar(True)
@@ -988,11 +1020,21 @@ class 
_NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[None]):
         boolean_expression_visit(expr, self)
 
 
-def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression:
-    return boolean_expression_visit(expr, _ConvertToArrowExpression())
+def expression_to_pyarrow(expr: BooleanExpression, schema: Schema | None = 
None) -> pc.Expression:
+    """Convert an Iceberg boolean expression to a PyArrow expression.
+
+    Args:
+        expr: The Iceberg boolean expression to convert.
+        schema: Optional Iceberg schema to resolve full field paths for nested 
fields.
+                If provided, nested struct fields will use dotted paths (e.g., 
"parent.child").
+
+    Returns:
+        A PyArrow compute expression.
+    """
+    return boolean_expression_visit(expr, _ConvertToArrowExpression(schema))
 
 
-def _expression_to_complementary_pyarrow(expr: BooleanExpression) -> 
pc.Expression:
+def _expression_to_complementary_pyarrow(expr: BooleanExpression, schema: 
Schema | None = None) -> pc.Expression:
     """Complementary filter conversion function of expression_to_pyarrow.
 
     Could not use expression_to_pyarrow(Not(expr)) to achieve this 
complementary effect because ~ in pyarrow.compute.Expression does not handle 
null.
@@ -1013,7 +1055,7 @@ def _expression_to_complementary_pyarrow(expr: 
BooleanExpression) -> pc.Expressi
         preserve_expr = Or(preserve_expr, BoundIsNull(term=term))
     for term in nan_unmentioned_bound_terms:
         preserve_expr = Or(preserve_expr, BoundIsNaN(term=term))
-    return expression_to_pyarrow(preserve_expr)
+    return expression_to_pyarrow(preserve_expr, schema)
 
 
 @lru_cache
@@ -1553,7 +1595,7 @@ def _task_to_record_batches(
                 bound_row_filter, file_schema, case_sensitive=case_sensitive, 
projected_field_values=projected_missing_fields
             )
             bound_file_filter = bind(file_schema, translated_row_filter, 
case_sensitive=case_sensitive)
-            pyarrow_filter = expression_to_pyarrow(bound_file_filter)
+            pyarrow_filter = expression_to_pyarrow(bound_file_filter, 
file_schema)
 
         file_project_schema = prune_columns(file_schema, projected_field_ids, 
select_full_types=False)
 
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 5fd83a81..cc7c1c6a 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -674,7 +674,7 @@ class Transaction:
         # Check if there are any files that require an actual rewrite of a 
data file
         if delete_snapshot.rewrites_needed is True:
             bound_delete_filter = bind(self.table_metadata.schema(), 
delete_filter, case_sensitive)
-            preserve_row_filter = 
_expression_to_complementary_pyarrow(bound_delete_filter)
+            preserve_row_filter = 
_expression_to_complementary_pyarrow(bound_delete_filter, 
self.table_metadata.schema())
 
             file_scan = self._scan(row_filter=delete_filter, 
case_sensitive=case_sensitive)
             if branch is not None:
diff --git a/tests/expressions/test_expressions.py 
b/tests/expressions/test_expressions.py
index 57a39e08..dbee2ca0 100644
--- a/tests/expressions/test_expressions.py
+++ b/tests/expressions/test_expressions.py
@@ -228,6 +228,58 @@ def 
test_ref_binding_case_insensitive_failure(table_schema_simple: Schema) -> No
         ref.bind(table_schema_simple, case_sensitive=False)
 
 
+def test_ref_binding_nested_struct_field() -> None:
+    """Test binding references to nested struct fields (issue #953)."""
+    schema = Schema(
+        NestedField(field_id=1, name="age", field_type=IntegerType(), 
required=True),
+        NestedField(
+            field_id=2,
+            name="employment",
+            field_type=StructType(
+                NestedField(field_id=3, name="status", 
field_type=StringType(), required=False),
+                NestedField(field_id=4, name="company", 
field_type=StringType(), required=False),
+            ),
+            required=False,
+        ),
+        NestedField(
+            field_id=5,
+            name="contact",
+            field_type=StructType(
+                NestedField(field_id=6, name="email", field_type=StringType(), 
required=False),
+            ),
+            required=False,
+        ),
+        schema_id=1,
+    )
+
+    # Test that nested field names are in the index
+    assert "employment.status" in schema._name_to_id
+    assert "employment.company" in schema._name_to_id
+    assert "contact.email" in schema._name_to_id
+
+    # Test binding a reference to nested fields
+    ref = Reference("employment.status")
+    bound = ref.bind(schema, case_sensitive=True)
+    assert bound.field.field_id == 3
+    assert bound.field.name == "status"
+
+    # Test with different nested field
+    ref2 = Reference("contact.email")
+    bound2 = ref2.bind(schema, case_sensitive=True)
+    assert bound2.field.field_id == 6
+    assert bound2.field.name == "email"
+
+    # Test case-insensitive binding
+    ref3 = Reference("EMPLOYMENT.STATUS")
+    bound3 = ref3.bind(schema, case_sensitive=False)
+    assert bound3.field.field_id == 3
+
+    # Test that binding fails for non-existent nested field
+    ref4 = Reference("employment.department")
+    with pytest.raises(ValueError):
+        ref4.bind(schema, case_sensitive=True)
+
+
 def test_in_to_eq() -> None:
     assert In("x", (34.56,)) == EqualTo("x", 34.56)
 
diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py
index 28d7cf11..2f0c444d 100644
--- a/tests/expressions/test_parser.py
+++ b/tests/expressions/test_parser.py
@@ -225,6 +225,9 @@ def test_with_function() -> None:
 def test_nested_fields() -> None:
     assert EqualTo("foo.bar", "data") == parser.parse("foo.bar = 'data'")
     assert LessThan("location.x", DecimalLiteral(Decimal(52.00))) == 
parser.parse("location.x < 52.00")
+    # Test issue #953 scenario - nested struct field filtering
+    assert EqualTo("employment.status", "Employed") == 
parser.parse("employment.status = 'Employed'")
+    assert EqualTo("contact.email", "[email protected]") == 
parser.parse("contact.email = '[email protected]'")
 
 
 def test_quoted_column_with_dots() -> None:

(iceberg-python) branch main updated: Fix: Support nested struct field filtering (#2628)

Reply via email to