kevinjqliu commented on code in PR #1607:
URL: https://github.com/apache/iceberg-python/pull/1607#discussion_r1948155351
##########
pyiceberg/table/__init__.py:
##########
@@ -1179,6 +1182,11 @@ def refs(self) -> Dict[str, SnapshotRef]:
def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements:
Tuple[TableRequirement, ...]) -> None:
response = self.catalog.commit_table(self, requirements, updates)
+
+ #
https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527
+ # delete old metadata if METADATA_DELETE_AFTER_COMMIT_ENABLED is set
to true
+ self.catalog._delete_old_metadata(self.io, self.metadata,
response.metadata)
Review Comment:
similar to
https://github.com/apache/iceberg/blob/f6faa58dac57e03be6e02a43937ac7c15c770225/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L539-L544
can we add a comment here explaining how `METADATA_PREVIOUS_VERSIONS_MAX` is
taken into account?
https://github.com/apache/iceberg-python/blob/826a00628216993de50f7f0c2111b6c824b02939/pyiceberg/table/update/__init__.py#L576
##########
pyiceberg/table/__init__.py:
##########
@@ -1179,6 +1182,11 @@ def refs(self) -> Dict[str, SnapshotRef]:
def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements:
Tuple[TableRequirement, ...]) -> None:
response = self.catalog.commit_table(self, requirements, updates)
+
+ #
https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527
+ # delete old metadata if METADATA_DELETE_AFTER_COMMIT_ENABLED is set
to true
+ self.catalog._delete_old_metadata(self.io, self.metadata,
response.metadata)
Review Comment:
also maybe we want to wrap this in try/catch and throw a warning as to not
block the commit process
##########
tests/catalog/test_sql.py:
##########
@@ -1613,3 +1614,56 @@ def test_merge_manifests_local_file_system(catalog:
SqlCatalog, arrow_table_with
tbl.append(arrow_table_with_null)
assert len(tbl.scan().to_arrow()) == 5 * len(arrow_table_with_null)
+
+
[email protected](
+ "catalog",
+ [
+ lazy_fixture("catalog_memory"),
+ lazy_fixture("catalog_sqlite"),
+ lazy_fixture("catalog_sqlite_without_rowcount"),
+ ],
+)
[email protected](
+ "table_identifier",
+ [
+ lazy_fixture("random_table_identifier"),
+ lazy_fixture("random_hierarchical_identifier"),
+ ],
+)
+def test_delete_metadata_multiple(catalog: SqlCatalog, table_schema_nested:
Schema, table_identifier: Identifier) -> None:
+ namespace = Catalog.namespace_from(table_identifier)
+ catalog.create_namespace(namespace)
+ table = catalog.create_table(table_identifier, table_schema_nested)
+
+ original_metadata_location = table.metadata_location
+
+ for i in range(5):
+ with table.transaction() as transaction:
+ with transaction.update_schema() as update:
+ update.add_column(path=f"new_column_{i}",
field_type=IntegerType())
+
+ assert len(table.metadata.metadata_log) == 5
+ assert os.path.exists(original_metadata_location[len("file://") :])
+
+ # Set the max versions property to 2, and delete after commit
+ new_property = {
+ TableProperties.METADATA_PREVIOUS_VERSIONS_MAX: "2",
+ TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED: "true",
+ }
+
+ with table.transaction() as transaction:
+ transaction.set_properties(properties=new_property)
+
+ # Verify that only the most recent metadata files are kept
+ assert len(table.metadata.metadata_log) == 2
+ updated_metadata_1, updated_metadata_2 = table.metadata.metadata_log
+
+ with table.transaction() as transaction:
Review Comment:
nit: add a comment to mention that a new metadata log was added so the first
one is removed
##########
tests/catalog/test_sql.py:
##########
@@ -1613,3 +1614,56 @@ def test_merge_manifests_local_file_system(catalog:
SqlCatalog, arrow_table_with
tbl.append(arrow_table_with_null)
assert len(tbl.scan().to_arrow()) == 5 * len(arrow_table_with_null)
+
+
[email protected](
+ "catalog",
+ [
+ lazy_fixture("catalog_memory"),
+ lazy_fixture("catalog_sqlite"),
+ lazy_fixture("catalog_sqlite_without_rowcount"),
+ ],
+)
[email protected](
+ "table_identifier",
+ [
+ lazy_fixture("random_table_identifier"),
+ lazy_fixture("random_hierarchical_identifier"),
+ ],
+)
Review Comment:
nit: we dont really need this part for testing metadata log deletion
##########
pyiceberg/catalog/__init__.py:
##########
@@ -858,6 +875,7 @@ def _update_and_stage_table(
enforce_validation=current_table is None,
metadata_location=current_table.metadata_location if current_table
else None,
)
+ io = self._load_file_io(properties=updated_metadata.properties,
location=updated_metadata.location)
Review Comment:
should we revert this change since its unrelated to this PR? is
`updated_metadata.location` the same as `new_metadata_location`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]