kevinjqliu commented on issue #1229:
URL: https://github.com/apache/iceberg-python/issues/1229#issuecomment-2414939042
> Most of the time is spent processing the manifests record-by-record and converting each record to a dict
Here's a snippet that uses threads to parallelize both reading the manifest files and converting the entries to records.
It was generated by Claude, so please double-check.
```
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Dict

import pyarrow as pa
import pyiceberg.table
from pyiceberg.conversions import from_bytes
from pyiceberg.io import FileIO
from pyiceberg.manifest import (
    FileFormat,
    ManifestEntry,
    ManifestEntryStatus,
    ManifestFile,
)
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField

# _FILES_SCHEMA (the pyarrow schema of the resulting table) is assumed to be defined elsewhere.


def get_message_table_files(
    table: pyiceberg.table.Table,
) -> pa.Table:
    schema = table.schema()
    snapshot = table.current_snapshot()
    if not snapshot:
        return pa.Table.from_pylist([], schema=_FILES_SCHEMA)
    # Read every manifest file of the current snapshot in parallel,
    # producing one RecordBatch per manifest.
    with ThreadPoolExecutor() as pool:
        return pa.Table.from_batches(
            pool.map(
                partial(_process_manifest, schema, table.io),
                snapshot.manifests(table.io),
            ),
            schema=_FILES_SCHEMA,
        )


def _process_manifest(
    table_schema: Schema,
    io: FileIO,
    manifest: ManifestFile,
) -> pa.RecordBatch:
    ts_field = table_schema.find_field('ts')
    entries = list(manifest.fetch_manifest_entry(io))
    valid_entries = [
        entry for entry in entries
        if entry.data_file.file_format == FileFormat.PARQUET
        and entry.status != ManifestEntryStatus.DELETED
    ]
    # Convert the surviving entries to dicts in parallel.
    with ThreadPoolExecutor() as pool:
        rows = list(pool.map(
            partial(_process_entry, ts_field),
            valid_entries,
        ))
    return pa.RecordBatch.from_pylist(rows, schema=_FILES_SCHEMA)


def _process_entry(ts_field: NestedField, entry: ManifestEntry) -> Dict:
    return {
        'path': entry.data_file.file_path,
        'event_id': entry.data_file.partition.event_id,
        'ts_min': from_bytes(
            ts_field.field_type,
            entry.data_file.lower_bounds.get(ts_field.field_id),
        ),
        'ts_max': from_bytes(
            ts_field.field_type,
            entry.data_file.upper_bounds.get(ts_field.field_id),
        ),
    }
```
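`_FILES_SCHEMA` and the table itself aren't shown above; here's a minimal sketch of how they might look, with placeholder field types and a placeholder catalog/table name, just to make the snippet self-contained:
```
import pyarrow as pa
from pyiceberg.catalog import load_catalog

# Hypothetical output schema: field names mirror the dict keys built in
# _process_entry; the types here are placeholders for the real column types.
_FILES_SCHEMA = pa.schema([
    pa.field('path', pa.string()),
    pa.field('event_id', pa.int64()),
    pa.field('ts_min', pa.int64()),  # raw decoded lower bound of 'ts'
    pa.field('ts_max', pa.int64()),  # raw decoded upper bound of 'ts'
])

# Placeholder catalog and table identifiers.
catalog = load_catalog('default')
table = catalog.load_table('db.messages')
files = get_message_table_files(table)
```
The outer pool overlaps the I/O of reading each manifest file, while the inner pool fans out the per-entry dict conversion within a single manifest.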