This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 6658187a Upsert: Don't produce empty snapshots (#1810)
6658187a is described below
commit 6658187ad7ba2392f44e22851b6cb931b167ae43
Author: Fokko Driesprong <[email protected]>
AuthorDate: Wed Mar 19 12:37:19 2025 +0100
Upsert: Don't produce empty snapshots (#1810)
<!--
Thanks for opening a pull request!
-->
<!-- If this PR resolves an issue, please replace
${GITHUB_ISSUE_ID} below with the actual GitHub issue ID. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->
# Rationale for this change
Yikes! This makes sure that a snapshot is only produced when there is
something to update or append.
# Are these changes tested?
Yes, by checking the snapshots that are being produced.
# Are there any user-facing changes?
Smaller metadata and faster commits when there is nothing to
append/update :)
<!-- In the case of user-facing changes, please add the changelog label.
-->
---
pyiceberg/table/__init__.py | 10 ++++++----
tests/table/test_upsert.py | 13 +++++++++++++
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index a189b07c..cab5d73d 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -1198,10 +1198,11 @@ class Table:
update_row_cnt = len(rows_to_update)
- # build the match predicate filter
- overwrite_mask_predicate =
upsert_util.create_match_filter(rows_to_update, join_cols)
+ if len(rows_to_update) > 0:
+ # build the match predicate filter
+ overwrite_mask_predicate =
upsert_util.create_match_filter(rows_to_update, join_cols)
- tx.overwrite(rows_to_update,
overwrite_filter=overwrite_mask_predicate)
+ tx.overwrite(rows_to_update,
overwrite_filter=overwrite_mask_predicate)
if when_not_matched_insert_all:
expr_match =
upsert_util.create_match_filter(matched_iceberg_table, join_cols)
@@ -1211,7 +1212,8 @@ class Table:
insert_row_cnt = len(rows_to_insert)
- tx.append(rows_to_insert)
+ if insert_row_cnt > 0:
+ tx.append(rows_to_insert)
return UpsertResult(rows_updated=update_row_cnt,
rows_inserted=insert_row_cnt)
diff --git a/tests/table/test_upsert.py b/tests/table/test_upsert.py
index 7f9e13b5..19bfbc01 100644
--- a/tests/table/test_upsert.py
+++ b/tests/table/test_upsert.py
@@ -28,6 +28,7 @@ from pyiceberg.expressions.literals import LongLiteral
from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.table import UpsertResult
+from pyiceberg.table.snapshots import Operation
from pyiceberg.table.upsert_util import create_match_filter
from pyiceberg.types import IntegerType, NestedField, StringType
from tests.catalog.test_base import InMemoryCatalog, Table
@@ -368,9 +369,21 @@ def test_upsert_with_identifier_fields(catalog: Catalog)
-> None:
)
upd = tbl.upsert(df)
+ expected_operations = [Operation.APPEND, Operation.OVERWRITE,
Operation.APPEND, Operation.APPEND]
+
assert upd.rows_updated == 1
assert upd.rows_inserted == 1
+ assert [snap.summary.operation for snap in tbl.snapshots() if snap.summary
is not None] == expected_operations
+
+ # This should be a no-op
+ upd = tbl.upsert(df)
+
+ assert upd.rows_updated == 0
+ assert upd.rows_inserted == 0
+
+ assert [snap.summary.operation for snap in tbl.snapshots() if snap.summary
is not None] == expected_operations
+
def test_upsert_into_empty_table(catalog: Catalog) -> None:
identifier = "default.test_upsert_into_empty_table"