This is an automated email from the ASF dual-hosted git repository.

kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 26ecfe77 fix: preserve HMS table properties during commits (#2927)
26ecfe77 is described below

commit 26ecfe77ffc9525e43a180c52b4ab49e76e66cb4
Author: Bharath Krishna <[email protected]>
AuthorDate: Thu Jan 22 09:24:40 2026 -0800

    fix: preserve HMS table properties during commits (#2927)
    
    <!--
    Thanks for opening a pull request!
    -->
    
    <!-- In the case this PR resolves an issue, please replace
    ${GITHUB_ISSUE_ID} below with the actual GitHub issue ID. -->
    <!-- Closes #${GITHUB_ISSUE_ID} -->
    
    # Rationale for this change
    HMS-specific table properties (e.g., `table_category` from data
    contracts) were lost during PyIceberg commits because `commit_table()`
    replaced all HMS parameters instead of merging them.
    
    Fixes: #2926
    
    ## Are these changes tested?
    
    Yes. Added `test_hive_preserves_hms_specific_properties()` that:
    1. Sets HMS properties via Hive client (simulating external systems)
    2. Performs PyIceberg commit
    3. Verifies HMS properties are preserved
    
    ## Are there any user-facing changes?
    
    Bug fix only - HMS properties set outside PyIceberg are now preserved
    during commits. No API changes.
    
    ---------
    
    Co-authored-by: Fokko Driesprong <[email protected]>
---
 pyiceberg/catalog/hive.py       |  17 ++++-
 tests/integration/test_reads.py | 151 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py
index e0964704..3046e256 100644
--- a/pyiceberg/catalog/hive.py
+++ b/pyiceberg/catalog/hive.py
@@ -551,16 +551,29 @@ class HiveCatalog(MetastoreCatalog):
 
                 if hive_table and current_table:
                     # Table exists, update it.
-                    hive_table.parameters = _construct_parameters(
+                    new_parameters = _construct_parameters(
                         
metadata_location=updated_staged_table.metadata_location,
                         
previous_metadata_location=current_table.metadata_location,
                         metadata_properties=updated_staged_table.properties,
                     )
+
+                    # Detect properties that were removed from Iceberg metadata
+                    removed_keys = current_table.properties.keys() - 
updated_staged_table.properties.keys()
+
+                    # Sync HMS parameters: Iceberg metadata is the source of 
truth, HMS parameters are
+                    # a projection of Iceberg state plus any HMS-only 
properties.
+                    # Start with existing HMS params, remove deleted Iceberg 
properties, then apply Iceberg values.
+                    merged_params = dict(hive_table.parameters or {})
+                    for key in removed_keys:
+                        merged_params.pop(key, None)
+                    merged_params.update(new_parameters)
+                    hive_table.parameters = merged_params
+
                     # Update hive's schema and properties
                     hive_table.sd = _construct_hive_storage_descriptor(
                         updated_staged_table.schema(),
                         updated_staged_table.location(),
-                        property_as_bool(updated_staged_table.properties, 
HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT),
+                        property_as_bool(self.properties, HIVE2_COMPATIBLE, 
HIVE2_COMPATIBLE_DEFAULT),
                     )
                     open_client.alter_table_with_environment_context(
                         dbname=database_name,
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 0d52365d..8de1925e 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -135,6 +135,157 @@ def test_hive_properties(catalog: Catalog) -> None:
         assert hive_table.parameters.get("abc") is None
 
 
[email protected]
[email protected]("catalog", 
[pytest.lazy_fixture("session_catalog_hive")])
+def test_hive_preserves_hms_specific_properties(catalog: Catalog) -> None:
+    """Test that HMS-specific table properties are preserved during table 
commits.
+
+    This verifies that HMS-specific properties that are not managed by Iceberg
+    are preserved during commits, rather than being lost.
+
+    Regression test for: https://github.com/apache/iceberg-python/issues/2926
+    """
+    table = create_table(catalog)
+    hive_client: _HiveClient = _HiveClient(catalog.properties["uri"])
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        # Add HMS-specific properties that aren't managed by Iceberg
+        hive_table.parameters["table_category"] = "production"
+        hive_table.parameters["data_owner"] = "data_team"
+        open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table)
+
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("table_category") == "production"
+        assert hive_table.parameters.get("data_owner") == "data_team"
+
+    table.transaction().set_properties({"iceberg_property": 
"new_value"}).commit_transaction()
+
+    # Verify that HMS-specific properties are STILL present after commit
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        # HMS-specific properties should be preserved
+        assert hive_table.parameters.get("table_category") == "production", (
+            "HMS property 'table_category' was lost during commit!"
+        )
+        assert hive_table.parameters.get("data_owner") == "data_team", "HMS 
property 'data_owner' was lost during commit!"
+        # Iceberg properties should also be present
+        assert hive_table.parameters.get("iceberg_property") == "new_value"
+
+
[email protected]
+def 
test_iceberg_property_deletion_not_restored_from_old_hms_state(session_catalog_hive:
 Catalog) -> None:
+    """Test that deleted Iceberg properties are truly removed and not restored 
from old HMS state.
+
+    When a property is removed through Iceberg, it should be deleted from HMS 
and not
+    come back from the old HMS state during merge operations.
+    """
+    table = create_table(session_catalog_hive)
+    hive_client: _HiveClient = 
_HiveClient(session_catalog_hive.properties["uri"])
+
+    # Set multiple Iceberg properties
+    table.transaction().set_properties({"prop_to_keep": "keep_value", 
"prop_to_delete": "delete_me"}).commit_transaction()
+
+    # Verify both properties exist
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("prop_to_keep") == "keep_value"
+        assert hive_table.parameters.get("prop_to_delete") == "delete_me"
+
+    # Delete one property through Iceberg
+    
table.transaction().remove_properties("prop_to_delete").commit_transaction()
+
+    # Verify property is deleted from HMS
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("prop_to_keep") == "keep_value"
+        assert hive_table.parameters.get("prop_to_delete") is None, "Deleted 
property should not exist in HMS!"
+
+    # Perform another Iceberg commit
+    table.transaction().set_properties({"new_prop": 
"new_value"}).commit_transaction()
+
+    # Ensure deleted property doesn't come back from old state
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("prop_to_keep") == "keep_value"
+        assert hive_table.parameters.get("new_prop") == "new_value"
+        assert hive_table.parameters.get("prop_to_delete") is None, "Deleted 
property should NOT be restored from old HMS state!"
+
+
[email protected]
[email protected]("catalog", 
[pytest.lazy_fixture("session_catalog_hive")])
+def test_iceberg_metadata_is_source_of_truth(catalog: Catalog) -> None:
+    """Test that Iceberg metadata is the source of truth for all 
Iceberg-managed properties.
+
+    If an external tool sets an HMS property with the same name as an 
Iceberg-managed
+    property, Iceberg's value should win during commits.
+    """
+    table = create_table(catalog)
+    hive_client: _HiveClient = _HiveClient(catalog.properties["uri"])
+
+    # Set an Iceberg property
+    table.transaction().set_properties({"my_prop": 
"iceberg_value"}).commit_transaction()
+
+    # External tool modifies the same property in HMS
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        hive_table.parameters["my_prop"] = "hms_value"  # Conflicting value
+        open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table)
+
+    # Verify HMS has the external value
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("my_prop") == "hms_value"
+
+    # Perform another Iceberg commit
+    table.transaction().set_properties({"another_prop": 
"test"}).commit_transaction()
+
+    # Iceberg's value should take precedence
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("my_prop") == "iceberg_value", (
+            "Iceberg property value should take precedence over conflicting 
HMS value!"
+        )
+        assert hive_table.parameters.get("another_prop") == "test"
+
+
[email protected]
[email protected]("catalog", 
[pytest.lazy_fixture("session_catalog_hive")])
+def test_hive_critical_properties_always_from_iceberg(catalog: Catalog) -> 
None:
+    """Test that critical properties (EXTERNAL, table_type, metadata_location) 
always come from Iceberg.
+
+    These properties should never be carried over from old HMS state.
+    """
+    table = create_table(catalog)
+    hive_client: _HiveClient = _HiveClient(catalog.properties["uri"])
+
+    # Get original metadata_location
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        original_metadata_location = 
hive_table.parameters.get("metadata_location")
+        assert original_metadata_location is not None
+        assert hive_table.parameters.get("EXTERNAL") == "TRUE"
+        assert hive_table.parameters.get("table_type") == "ICEBERG"
+
+    # Try to tamper with critical properties via HMS
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        hive_table.parameters["EXTERNAL"] = "FALSE"  # Try to change
+        open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table)
+
+    # Perform Iceberg commit
+    table.transaction().set_properties({"test_prop": 
"value"}).commit_transaction()
+
+    # Critical properties should be restored by Iceberg
+    with hive_client as open_client:
+        hive_table = open_client.get_table(*TABLE_NAME)
+        assert hive_table.parameters.get("EXTERNAL") == "TRUE", "EXTERNAL 
should always be TRUE from Iceberg!"
+        assert hive_table.parameters.get("table_type") == "ICEBERG", 
"table_type should always be ICEBERG!"
+        # metadata_location should be updated (new metadata file)
+        new_metadata_location = hive_table.parameters.get("metadata_location")
+        assert new_metadata_location != original_metadata_location, 
"metadata_location should be updated!"
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("catalog", 
[pytest.lazy_fixture("session_catalog_hive"), 
pytest.lazy_fixture("session_catalog")])
 def test_table_properties_dict(catalog: Catalog) -> None:

Reply via email to