This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 6658187a Upsert: Don't produce empty snapshots (#1810)
6658187a is described below

commit 6658187ad7ba2392f44e22851b6cb931b167ae43
Author: Fokko Driesprong <[email protected]>
AuthorDate: Wed Mar 19 12:37:19 2025 +0100

    Upsert: Don't produce empty snapshots (#1810)
    
    <!--
    Thanks for opening a pull request!
    -->
    
    <!-- In the case this PR will resolve an issue, please replace
    ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
    <!-- Closes #${GITHUB_ISSUE_ID} -->
    
    # Rationale for this change
    
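    This makes sure that an upsert only produces a snapshot when there are
    actually rows to update or append; previously the overwrite and append
    were committed even when there was nothing to change.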
    
    # Are these changes tested?
    
    Yes, by checking the snapshots that are being produced.
    
    # Are there any user-facing changes?
    
    An upsert with nothing to append or update no longer commits a snapshot,
    which results in smaller metadata and faster commits :)
    
    <!-- In the case of user-facing changes, please add the changelog label.
    -->
---
 pyiceberg/table/__init__.py | 10 ++++++----
 tests/table/test_upsert.py  | 13 +++++++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index a189b07c..cab5d73d 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -1198,10 +1198,11 @@ class Table:
 
                 update_row_cnt = len(rows_to_update)
 
-                # build the match predicate filter
-                overwrite_mask_predicate = 
upsert_util.create_match_filter(rows_to_update, join_cols)
+                if len(rows_to_update) > 0:
+                    # build the match predicate filter
+                    overwrite_mask_predicate = 
upsert_util.create_match_filter(rows_to_update, join_cols)
 
-                tx.overwrite(rows_to_update, 
overwrite_filter=overwrite_mask_predicate)
+                    tx.overwrite(rows_to_update, 
overwrite_filter=overwrite_mask_predicate)
 
             if when_not_matched_insert_all:
                 expr_match = 
upsert_util.create_match_filter(matched_iceberg_table, join_cols)
@@ -1211,7 +1212,8 @@ class Table:
 
                 insert_row_cnt = len(rows_to_insert)
 
-                tx.append(rows_to_insert)
+                if insert_row_cnt > 0:
+                    tx.append(rows_to_insert)
 
         return UpsertResult(rows_updated=update_row_cnt, 
rows_inserted=insert_row_cnt)
 
diff --git a/tests/table/test_upsert.py b/tests/table/test_upsert.py
index 7f9e13b5..19bfbc01 100644
--- a/tests/table/test_upsert.py
+++ b/tests/table/test_upsert.py
@@ -28,6 +28,7 @@ from pyiceberg.expressions.literals import LongLiteral
 from pyiceberg.io.pyarrow import schema_to_pyarrow
 from pyiceberg.schema import Schema
 from pyiceberg.table import UpsertResult
+from pyiceberg.table.snapshots import Operation
 from pyiceberg.table.upsert_util import create_match_filter
 from pyiceberg.types import IntegerType, NestedField, StringType
 from tests.catalog.test_base import InMemoryCatalog, Table
@@ -368,9 +369,21 @@ def test_upsert_with_identifier_fields(catalog: Catalog) 
-> None:
     )
     upd = tbl.upsert(df)
 
+    expected_operations = [Operation.APPEND, Operation.OVERWRITE, 
Operation.APPEND, Operation.APPEND]
+
     assert upd.rows_updated == 1
     assert upd.rows_inserted == 1
 
+    assert [snap.summary.operation for snap in tbl.snapshots() if snap.summary 
is not None] == expected_operations
+
+    # This should be a no-op
+    upd = tbl.upsert(df)
+
+    assert upd.rows_updated == 0
+    assert upd.rows_inserted == 0
+
+    assert [snap.summary.operation for snap in tbl.snapshots() if snap.summary 
is not None] == expected_operations
+
 
 def test_upsert_into_empty_table(catalog: Catalog) -> None:
     identifier = "default.test_upsert_into_empty_table"

Reply via email to