Fokko commented on code in PR #6714:
URL: https://github.com/apache/iceberg/pull/6714#discussion_r1110428814
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -986,3 +989,245 @@ def expression_to_plain_format(
# In the form of expr1 ∨ expr2 ∨ ... ∨ exprN
visitor = ExpressionToPlainFormat(cast_int_to_datetime)
return [visit(expression, visitor) for expression in expressions]
+
+
+class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]):
+ struct: StructType
+ expr: BooleanExpression
+
+ value_counts: Dict[int, int]
+ null_counts: Dict[int, int]
+ nan_counts: Dict[int, int]
+ lower_bounds: Dict[int, bytes]
+ upper_bounds: Dict[int, bytes]
+
+ def __init__(self, schema: Schema, expr: BooleanExpression,
case_sensitive: bool = True) -> None:
+ self.struct = schema.as_struct()
+ self.expr = bind(schema, rewrite_not(expr), case_sensitive)
+
+ def eval(self, file: DataFile) -> bool:
+ """Test whether the file may contain records that match the
expression."""
+
+ if file.record_count == 0:
+ return ROWS_CANNOT_MATCH
+
+ if file.record_count < 0:
+ # @TODO we haven't implemented parsing record count from avro file
and thus set record count -1
+ # when importing avro tables to iceberg tables. This should be
updated once we implemented
+ # and set correct record count.
+ return ROWS_MIGHT_MATCH
+
+ self.value_counts = file.value_counts or EMPTY_DICT
+ self.null_counts = file.null_value_counts or EMPTY_DICT
+ self.nan_counts = file.nan_value_counts or EMPTY_DICT
+ self.lower_bounds = file.lower_bounds or EMPTY_DICT
+ self.upper_bounds = file.upper_bounds or EMPTY_DICT
+
+ return visit(self.expr, self)
+
+ def _contains_nulls_only(self, idx: int) -> bool:
+ return idx in self.value_counts and idx in self.null_counts and
self.value_counts[idx] - self.null_counts[idx] == 0
+
+ def _contains_nans_only(self, idx: int) -> bool:
+ return idx in self.nan_counts and idx in self.value_counts and
self.nan_counts[idx] == self.value_counts[idx]
+
+ def _is_nan(self, val: Any) -> bool:
+ return math.isnan(val)
+
+ def visit_true(self) -> bool:
+ # all rows match
+ return ROWS_MIGHT_MATCH
+
+ def visit_false(self) -> bool:
+ # all rows fail
+ return ROWS_CANNOT_MATCH
+
+ def visit_not(self, child_result: bool) -> bool:
+ raise ValueError(f"NOT should be rewritten: {child_result}")
+
+ def visit_and(self, left_result: bool, right_result: bool) -> bool:
+ return left_result and right_result
+
+ def visit_or(self, left_result: bool, right_result: bool) -> bool:
+ return left_result or right_result
+
+ def visit_is_null(self, term: BoundTerm[L]) -> bool:
+ idx = term.ref().field.field_id
+
+ if self.null_counts is not None and idx in self.null_counts and
self.null_counts[idx] == 0:
Review Comment:
I think we need to check if it is non-null until we get default values,
updated this in the other places
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -986,3 +989,245 @@ def expression_to_plain_format(
# In the form of expr1 ∨ expr2 ∨ ... ∨ exprN
visitor = ExpressionToPlainFormat(cast_int_to_datetime)
return [visit(expression, visitor) for expression in expressions]
+
+
+class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]):
+ struct: StructType
+ expr: BooleanExpression
+
+ value_counts: Dict[int, int]
+ null_counts: Dict[int, int]
+ nan_counts: Dict[int, int]
+ lower_bounds: Dict[int, bytes]
+ upper_bounds: Dict[int, bytes]
+
+ def __init__(self, schema: Schema, expr: BooleanExpression,
case_sensitive: bool = True) -> None:
+ self.struct = schema.as_struct()
+ self.expr = bind(schema, rewrite_not(expr), case_sensitive)
+
+ def eval(self, file: DataFile) -> bool:
+ """Test whether the file may contain records that match the
expression."""
+
+ if file.record_count == 0:
+ return ROWS_CANNOT_MATCH
+
+ if file.record_count < 0:
+ # @TODO we haven't implemented parsing record count from avro file
and thus set record count -1
+ # when importing avro tables to iceberg tables. This should be
updated once we implemented
+ # and set correct record count.
+ return ROWS_MIGHT_MATCH
+
+ self.value_counts = file.value_counts or EMPTY_DICT
+ self.null_counts = file.null_value_counts or EMPTY_DICT
+ self.nan_counts = file.nan_value_counts or EMPTY_DICT
+ self.lower_bounds = file.lower_bounds or EMPTY_DICT
+ self.upper_bounds = file.upper_bounds or EMPTY_DICT
+
+ return visit(self.expr, self)
+
+ def _contains_nulls_only(self, idx: int) -> bool:
Review Comment:
`id` is a reserved keyword in Python:
```
python3
Python 3.10.10 (main, Feb 8 2023, 05:34:50) [Clang 14.0.0
(clang-1400.0.29.202)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> id("vo")
4369414704
```
I've updated this to `field_id`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]