rdblue commented on code in PR #6233:
URL: https://github.com/apache/iceberg/pull/6233#discussion_r1027370034
##########
python/pyiceberg/table/__init__.py:
##########
@@ -199,16 +223,114 @@ def use_ref(self, name: str):
raise ValueError(f"Cannot scan unknown ref={name}")
- def select(self, *field_names: str) -> TableScan:
+ def select(self, *field_names: str) -> S:
if "*" in self.selected_fields:
return self.update(selected_fields=field_names)
return
self.update(selected_fields=tuple(set(self.selected_fields).intersection(set(field_names))))
- def filter_rows(self, new_row_filter: BooleanExpression) -> TableScan:
+ def filter_rows(self, new_row_filter: BooleanExpression) -> S:
return self.update(row_filter=And(self.row_filter, new_row_filter))
- def filter_partitions(self, new_partition_filter: BooleanExpression) ->
TableScan:
+ def filter_partitions(self, new_partition_filter: BooleanExpression) -> S:
return self.update(partition_filter=And(self.partition_filter,
new_partition_filter))
- def with_case_sensitive(self, case_sensitive: bool = True) -> TableScan:
+ def with_case_sensitive(self, case_sensitive: bool = True) -> S:
return self.update(case_sensitive=case_sensitive)
+
+
+class ScanTask(ABC):
+ pass
+
+
+@dataclass(init=False)
+class FileScanTask(ScanTask):
+ data_file: DataFile
+ start: int
+ length: int
+
+ def __init__(self, data_file: DataFile, start: Optional[int] = None,
length: Optional[int] = None):
+ self.data_file = data_file
+ self.start = start or 0
+ self.length = length or data_file.file_size_in_bytes
+
+
+class _DictAsStruct(StructProtocol):
Review Comment:
For my small test table in S3, it took 81 seconds to plan a scan, reading 12
manifests in a single thread. Of that, 973 ms were user time. I suspect that
the `DataFile` read path is what is slowing the scan down: it converts generic
positional records to `DataFile` by copying them to a recursive dict, mapping
to field names, and then using pydantic to interpret that dict. Then this
class does the opposite.
I think we should probably, fairly soon, make a `DataFile` class that
implements `StructLike` for reading, to speed up job planning.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]