This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 680241ba5e Python: Support optionally returning empty files in scans (#8204)
680241ba5e is described below

commit 680241ba5ec1a17d169c13f4012fd8cfcb324e5b
Author: Rusty Conover <[email protected]>
AuthorDate: Sun Aug 6 15:58:41 2023 -0400

    Python: Support optionally returning empty files in scans (#8204)
---
 python/pyiceberg/expressions/visitors.py | 7 +++++--
 python/pyiceberg/table/__init__.py       | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/pyiceberg/expressions/visitors.py b/python/pyiceberg/expressions/visitors.py
index b701ce7e3c..76d0e6579d 100644
--- a/python/pyiceberg/expressions/visitors.py
+++ b/python/pyiceberg/expressions/visitors.py
@@ -1095,13 +1095,16 @@ class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]):
     lower_bounds: Dict[int, bytes]
     upper_bounds: Dict[int, bytes]
 
-    def __init__(self, schema: Schema, expr: BooleanExpression, case_sensitive: bool = True) -> None:
+    def __init__(
+        self, schema: Schema, expr: BooleanExpression, case_sensitive: bool = True, include_empty_files: bool = False
+    ) -> None:
         self.struct = schema.as_struct()
+        self.include_empty_files = include_empty_files
         self.expr = bind(schema, rewrite_not(expr), case_sensitive)
 
     def eval(self, file: DataFile) -> bool:
         """Test whether the file may contain records that match the expression."""
-        if file.record_count == 0:
+        if not self.include_empty_files and file.record_count == 0:
             return ROWS_CANNOT_MATCH
 
         if file.record_count < 0:
diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py
index 997729c239..3851cf8c54 100644
--- a/python/pyiceberg/table/__init__.py
+++ b/python/pyiceberg/table/__init__.py
@@ -766,7 +766,9 @@ class DataScan(TableScan):
         # this filter depends on the partition spec used to write the manifest file
 
         partition_evaluators: Dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator)
-        metrics_evaluator = _InclusiveMetricsEvaluator(self.table.schema(), self.row_filter, self.case_sensitive).eval
+        metrics_evaluator = _InclusiveMetricsEvaluator(
+            self.table.schema(), self.row_filter, self.case_sensitive, self.options.get("include_empty_files") == "true"
+        ).eval
 
         min_data_sequence_number = _min_data_file_sequence_number(manifests)
 

Reply via email to