This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 680241ba5e Python: Support optionally returning empty files in scans (#8204)
680241ba5e is described below
commit 680241ba5ec1a17d169c13f4012fd8cfcb324e5b
Author: Rusty Conover <[email protected]>
AuthorDate: Sun Aug 6 15:58:41 2023 -0400
Python: Support optionally returning empty files in scans (#8204)
---
python/pyiceberg/expressions/visitors.py | 7 +++++--
python/pyiceberg/table/__init__.py | 4 +++-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py
index b701ce7e3c..76d0e6579d 100644
--- a/python/pyiceberg/expressions/visitors.py
+++ b/python/pyiceberg/expressions/visitors.py
@@ -1095,13 +1095,16 @@ class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]):
lower_bounds: Dict[int, bytes]
upper_bounds: Dict[int, bytes]
- def __init__(self, schema: Schema, expr: BooleanExpression,
case_sensitive: bool = True) -> None:
+ def __init__(
+ self, schema: Schema, expr: BooleanExpression, case_sensitive: bool =
True, include_empty_files: bool = False
+ ) -> None:
self.struct = schema.as_struct()
+ self.include_empty_files = include_empty_files
self.expr = bind(schema, rewrite_not(expr), case_sensitive)
def eval(self, file: DataFile) -> bool:
"""Test whether the file may contain records that match the
expression."""
- if file.record_count == 0:
+ if not self.include_empty_files and file.record_count == 0:
return ROWS_CANNOT_MATCH
if file.record_count < 0:
diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py
index 997729c239..3851cf8c54 100644
--- a/python/pyiceberg/table/__init__.py
+++ b/python/pyiceberg/table/__init__.py
@@ -766,7 +766,9 @@ class DataScan(TableScan):
# this filter depends on the partition spec used to write the manifest file
partition_evaluators: Dict[int, Callable[[DataFile], bool]] =
KeyDefaultDict(self._build_partition_evaluator)
- metrics_evaluator = _InclusiveMetricsEvaluator(self.table.schema(),
self.row_filter, self.case_sensitive).eval
+ metrics_evaluator = _InclusiveMetricsEvaluator(
+ self.table.schema(), self.row_filter, self.case_sensitive,
self.options.get("include_empty_files") == "true"
+ ).eval
min_data_sequence_number = _min_data_file_sequence_number(manifests)