This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 1f9c46b4 perf: Avoid copying metadata for each data file in summary (#2674)
1f9c46b4 is described below

commit 1f9c46b4b567add9821583a4717641d70cebee1d
Author: Anton-Tarazi <[email protected]>
AuthorDate: Sun Nov 2 11:34:52 2025 -0500

    perf: Avoid copying metadata for each data file in summary (#2674)
    
    <!--
    Thanks for opening a pull request!
    -->
    
    <!-- In the case this PR will resolve an issue, please replace
    ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
    <!-- Closes #${GITHUB_ISSUE_ID} -->
    
    Resolves #2673
    
    # Rationale for this change
    
    `_SnapshotProducer._summary()` copies the metadata for _every_ added /
    deleted DataFile. This is pretty expensive. Instead we just copy it once
    at the beginning of the function and use the same value for each DataFile.
    
    On my data, which overwrites a few million rows at a time, I saw the
    time for `table.overwrite` go from ~20 seconds to ~6 seconds.
    
    ## Are these changes tested?
    
    Yes, existing unit / integration tests
    
    ## Are there any user-facing changes?
    
    Just faster writes :)
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 pyiceberg/table/update/snapshot.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py
index aed7ec04..a73961b5 100644
--- a/pyiceberg/table/update/snapshot.py
+++ b/pyiceberg/table/update/snapshot.py
@@ -240,8 +240,11 @@ class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]):
     def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
         from pyiceberg.table import TableProperties
 
+        # avoid copying metadata for each data file
+        table_metadata = self._transaction.table_metadata
+
         partition_summary_limit = int(
-            self._transaction.table_metadata.properties.get(
+            table_metadata.properties.get(
                TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT
             )
         )
@@ -250,23 +253,21 @@ class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]):
         for data_file in self._added_data_files:
             ssc.add_file(
                 data_file=data_file,
-                partition_spec=self._transaction.table_metadata.spec(),
-                schema=self._transaction.table_metadata.schema(),
+                partition_spec=table_metadata.spec(),
+                schema=table_metadata.schema(),
             )
 
         if len(self._deleted_data_files) > 0:
-            specs = self._transaction.table_metadata.specs()
+            specs = table_metadata.specs()
             for data_file in self._deleted_data_files:
                 ssc.remove_file(
                     data_file=data_file,
                     partition_spec=specs[data_file.spec_id],
-                    schema=self._transaction.table_metadata.schema(),
+                    schema=table_metadata.schema(),
                 )
 
         previous_snapshot = (
-            self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
-            if self._parent_snapshot_id is not None
-            else None
+            table_metadata.snapshot_by_id(self._parent_snapshot_id) if self._parent_snapshot_id is not None else None
         )
 
         return update_snapshot_summaries(

Reply via email to