rdblue commented on code in PR #6714:
URL: https://github.com/apache/iceberg/pull/6714#discussion_r1110131026
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -986,3 +989,245 @@ def expression_to_plain_format(
# In the form of expr1 ∨ expr2 ∨ ... ∨ exprN
visitor = ExpressionToPlainFormat(cast_int_to_datetime)
return [visit(expression, visitor) for expression in expressions]
+
+
+class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]):
+ struct: StructType
+ expr: BooleanExpression
+
+ value_counts: Dict[int, int]
+ null_counts: Dict[int, int]
+ nan_counts: Dict[int, int]
+ lower_bounds: Dict[int, bytes]
+ upper_bounds: Dict[int, bytes]
+
+ def __init__(self, schema: Schema, expr: BooleanExpression,
case_sensitive: bool = True) -> None:
+ self.struct = schema.as_struct()
+ self.expr = bind(schema, rewrite_not(expr), case_sensitive)
+
+ def eval(self, file: DataFile) -> bool:
+ """Test whether the file may contain records that match the
expression."""
+
+ if file.record_count == 0:
+ return ROWS_CANNOT_MATCH
+
+ if file.record_count < 0:
+ # @TODO we haven't implemented parsing record count from avro file
and thus set record count -1
+ # when importing avro tables to iceberg tables. This should be
updated once we implemented
+ # and set correct record count.
Review Comment:
I think that we have implemented correct record counts for Avro tables, so we
don't need a TODO. Instead, the comment should say that some older versions may
have set the record count to -1, and that we don't trust the stats in that case,
so we must return that rows might match.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]