This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 1f9c46b4 perf: Avoid copying metadata for each data file in summary
(#2674)
1f9c46b4 is described below
commit 1f9c46b4b567add9821583a4717641d70cebee1d
Author: Anton-Tarazi <[email protected]>
AuthorDate: Sun Nov 2 11:34:52 2025 -0500
perf: Avoid copying metadata for each data file in summary (#2674)
<!--
Thanks for opening a pull request!
-->
<!-- In the case this PR will resolve an issue, please replace
${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->
Resolves #2673
# Rationale for this change
`_SnapshotProducer._summary()` copies the metadata for _every_ added /
deleted DataFile. This is pretty expensive. Instead we just copy it once
at the beginning of the function and use the same value for each DataFile.
On my data, which overwrites a few million rows at a time, I saw the
time for `table.overwrite` go from ~20 seconds to ~6 seconds.
## Are these changes tested?
Yes, existing unit / integration tests
## Are there any user-facing changes?
Just faster writes :)
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
pyiceberg/table/update/snapshot.py | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/pyiceberg/table/update/snapshot.py
b/pyiceberg/table/update/snapshot.py
index aed7ec04..a73961b5 100644
--- a/pyiceberg/table/update/snapshot.py
+++ b/pyiceberg/table/update/snapshot.py
@@ -240,8 +240,11 @@ class _SnapshotProducer(UpdateTableMetadata[U],
Generic[U]):
def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) ->
Summary:
from pyiceberg.table import TableProperties
+ # avoid copying metadata for each data file
+ table_metadata = self._transaction.table_metadata
+
partition_summary_limit = int(
- self._transaction.table_metadata.properties.get(
+ table_metadata.properties.get(
TableProperties.WRITE_PARTITION_SUMMARY_LIMIT,
TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT
)
)
@@ -250,23 +253,21 @@ class _SnapshotProducer(UpdateTableMetadata[U],
Generic[U]):
for data_file in self._added_data_files:
ssc.add_file(
data_file=data_file,
- partition_spec=self._transaction.table_metadata.spec(),
- schema=self._transaction.table_metadata.schema(),
+ partition_spec=table_metadata.spec(),
+ schema=table_metadata.schema(),
)
if len(self._deleted_data_files) > 0:
- specs = self._transaction.table_metadata.specs()
+ specs = table_metadata.specs()
for data_file in self._deleted_data_files:
ssc.remove_file(
data_file=data_file,
partition_spec=specs[data_file.spec_id],
- schema=self._transaction.table_metadata.schema(),
+ schema=table_metadata.schema(),
)
previous_snapshot = (
-
self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
- if self._parent_snapshot_id is not None
- else None
+ table_metadata.snapshot_by_id(self._parent_snapshot_id) if
self._parent_snapshot_id is not None else None
)
return update_snapshot_summaries(