HonahX commented on code in PR #955:
URL: https://github.com/apache/iceberg-python/pull/955#discussion_r1690176614
##########
pyiceberg/io/pyarrow.py:
##########
@@ -638,10 +634,152 @@ def visit_or(self, left_result: pc.Expression,
right_result: pc.Expression) -> p
return left_result | right_result
+class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[Any]):
+ # BoundTerms which have either is_null or is_not_null appearing at least
once in the boolean expr.
+ is_null_or_not_bound_terms: set[BoundTerm[Any]]
+ # The remaining BoundTerms appearing in the boolean expr.
+ null_unmentioned_bound_terms: set[BoundTerm[Any]]
+ # BoundTerms which have either is_nan or is_not_nan appearing at least
once in the boolean expr.
+ is_nan_or_not_bound_terms: set[BoundTerm[Any]]
+ # The remaining BoundTerms appearing in the boolean expr.
+ nan_unmentioned_bound_terms: set[BoundTerm[Any]]
+
+ def __init__(self) -> None:
+ self.is_null_or_not_bound_terms = set()
+ self.null_unmentioned_bound_terms = set()
+ self.is_nan_or_not_bound_terms = set()
+ self.nan_unmentioned_bound_terms = set()
+ super().__init__()
Review Comment:
[nit] I prefer to move `super().__init__()` to the beginning of `__init__`
if it is not required to initialize child class fields before parent class's
##########
pyiceberg/io/pyarrow.py:
##########
@@ -638,10 +634,152 @@ def visit_or(self, left_result: pc.Expression,
right_result: pc.Expression) -> p
return left_result | right_result
+class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[Any]):
Review Comment:
```suggestion
class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[None]):
```
How about specifying `None` here? The return type for this visitor is None
##########
tests/integration/test_deletes.py:
##########
@@ -417,3 +452,107 @@ def test_delete_truncate(session_catalog: RestCatalog) ->
None:
assert len(entries) == 1
assert entries[0].status == ManifestEntryStatus.DELETED
+
+
[email protected]
+def test_delete_overwrite_table_with_null(session_catalog: RestCatalog) ->
None:
+ arrow_schema = pa.schema([pa.field("ints", pa.int32())])
+ arrow_tbl = pa.Table.from_pylist(
+ [{"ints": 1}, {"ints": 2}, {"ints": None}],
+ schema=arrow_schema,
+ )
+
+ iceberg_schema = Schema(NestedField(1, "ints", IntegerType()))
+
+ tbl_identifier = "default.test_delete_overwrite_with_null"
+
+ try:
+ session_catalog.drop_table(tbl_identifier)
+ except NoSuchTableError:
+ pass
+
+ tbl = session_catalog.create_table(tbl_identifier, iceberg_schema)
+ tbl.append(arrow_tbl)
Review Comment:
[Optional] I like the `_create_table` util we have in other integration
tests:
https://github.com/apache/iceberg-python/blob/1ed3abdd1aec480911eeec4f0f46a04efe53dc06/tests/integration/test_add_files.py#L121-L138
It will be great if we can have the same for `test_deletes`. (totally ok to
be done in a follow-up PR after the release)
In the future, we may also want to consider unifying the behavior of
`_create_table` across different tests and moving it to `conftest.py`. We can
do this in a follow-up PR.
##########
tests/io/test_pyarrow.py:
##########
@@ -15,7 +15,6 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=protected-access,unused-argument,redefined-outer-name
-
Review Comment:
[Nit] Could you please revert this?
##########
pyiceberg/io/pyarrow.py:
##########
@@ -638,10 +634,152 @@ def visit_or(self, left_result: pc.Expression,
right_result: pc.Expression) -> p
return left_result | right_result
+class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[Any]):
+ # BoundTerms which have either is_null or is_not_null appearing at least
once in the boolean expr.
+ is_null_or_not_bound_terms: set[BoundTerm[Any]]
+ # The remaining BoundTerms appearing in the boolean expr.
+ null_unmentioned_bound_terms: set[BoundTerm[Any]]
+ # BoundTerms which have either is_nan or is_not_nan appearing at least
once in the boolean expr.
+ is_nan_or_not_bound_terms: set[BoundTerm[Any]]
+ # The remaining BoundTerms appearing in the boolean expr.
+ nan_unmentioned_bound_terms: set[BoundTerm[Any]]
+
+ def __init__(self) -> None:
+ self.is_null_or_not_bound_terms = set()
+ self.null_unmentioned_bound_terms = set()
+ self.is_nan_or_not_bound_terms = set()
+ self.nan_unmentioned_bound_terms = set()
+ super().__init__()
+
+ def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None:
+ """Handle the predicate case where either is_null or is_not_null is
included."""
+ if term in self.null_unmentioned_bound_terms:
+ self.null_unmentioned_bound_terms.remove(term)
+ self.is_null_or_not_bound_terms.add(term)
+
+ def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None:
+ """Handle the predicate case where neither is_null or is_not_null is
included."""
+ if term not in self.is_null_or_not_bound_terms:
+ self.null_unmentioned_bound_terms.add(term)
+
+ def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None:
+ """Handle the predicate case where either is_nan or is_not_nan is
included."""
+ if term in self.nan_unmentioned_bound_terms:
+ self.nan_unmentioned_bound_terms.remove(term)
+ self.is_nan_or_not_bound_terms.add(term)
+
+ def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None:
+ """Handle the predicate case where neither is_nan or is_not_nan is
included."""
+ if term not in self.is_nan_or_not_bound_terms:
+ self.nan_unmentioned_bound_terms.add(term)
+
+ def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) ->
None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any])
-> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_is_nan(self, term: BoundTerm[Any]) -> None:
+ self._handle_null_unmentioned(term)
+ self._handle_explicit_is_nan_or_not(term)
+
+ def visit_not_nan(self, term: BoundTerm[Any]) -> None:
+ self._handle_null_unmentioned(term)
+ self._handle_explicit_is_nan_or_not(term)
+
+ def visit_is_null(self, term: BoundTerm[Any]) -> None:
+ self._handle_explicit_is_null_or_not(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_not_null(self, term: BoundTerm[Any]) -> None:
+ self._handle_explicit_is_null_or_not(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) ->
None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal:
Literal[Any]) -> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any])
-> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) ->
None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_less_than_or_equal(self, term: BoundTerm[Any], literal:
Literal[Any]) -> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any])
-> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_not_starts_with(self, term: BoundTerm[Any], literal:
Literal[Any]) -> None:
+ self._handle_null_unmentioned(term)
+ self._handle_nan_unmentioned(term)
+
+ def visit_true(self) -> None:
+ return
+
+ def visit_false(self) -> None:
+ return
+
+ def visit_not(self, child_result: pc.Expression) -> None:
+ return
+
+ def visit_and(self, left_result: pc.Expression, right_result:
pc.Expression) -> None:
+ return
+
+ def visit_or(self, left_result: pc.Expression, right_result:
pc.Expression) -> None:
Review Comment:
The type of `_result` fields seems off here. In this case, I think we always
expect to receive `None` from child/left/right
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]