This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push: new 47e2cc3bb IMPALA-11710: Table properties are not updated in Iceberg metadata files 47e2cc3bb is described below commit 47e2cc3bb9b982e6b9f30535fa7374f2da1b8dd3 Author: Zoltan Borok-Nagy <borokna...@cloudera.com> AuthorDate: Tue Jan 16 18:03:24 2024 +0100 IMPALA-11710: Table properties are not updated in Iceberg metadata files Impala has some troubles with Iceberg tables that don't specify 'external.table.purge'='true'. Schema changes like ADD COLUMN is not working, also table properties that have been set by an ALTER TABLE SET TBLPROPERTIES statement can be reset by subsequent INSERT statements. There was a bug in CatalogOpExecutor.alterIcebergTables(). Its return value determines whether we need to also update the table definition in HMS, or it was already done by the Iceberg library. In fact, the HMS table definition is always updated by the Iceberg library if the table is handled by the HiveCatalog. In every other case we need to update the HMS table definition ourselves (unless the change won't affect HMS). The issue was that CatalogOpExecutor.alterIcebergTables() returned true (which means we need to update HMS) in case of Iceberg tables that didn't specify 'external.table.purge'='true'. This was a problem, because Iceberg already modified the HMS table and set 'metadata_location' to a new metadata file. But then Impala came and modified some properties of the HMS table definition, but also reset 'metadata_location' to the original value. Therefore subsequent operations/refreshes used the earlier state of the Iceberg table and they reset the modifications. 
Testing: * added e2e tests for Iceberg tables in different catalogs with 'external.table.purge'='false' Change-Id: I2a82d022534e1e212d542fbd7916ae033c381c20 Reviewed-on: http://gerrit.cloudera.org:8080/20907 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- .../apache/impala/service/CatalogOpExecutor.java | 5 +- .../queries/QueryTest/iceberg-external.test | 71 ++++++++++++++++++++++ tests/query_test/test_iceberg.py | 3 + 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java index edd68215c..3e620aa51 100644 --- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java +++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java @@ -1405,13 +1405,16 @@ public class CatalogOpExecutor { /** * Executes the ALTER TABLE command for an Iceberg table and reloads its metadata. + * Returns true if we also need to update the table definition in HMS. Returns false + * if the HMS table is already updated by Iceberg, or there is nothing to update in + * HMS (the change is internal to Iceberg). 
*/ private boolean alterIcebergTable(TAlterTableParams params, TDdlExecResponse response, IcebergTable tbl, long newCatalogVersion, boolean wantMinimalResult, String debugAction) throws ImpalaException { Preconditions.checkState(tbl.isWriteLockedByCurrentThread()); - boolean needsToUpdateHms = !isIcebergHmsIntegrationEnabled(tbl.getMetaStoreTable()); + boolean needsToUpdateHms = !IcebergUtil.isHiveCatalog(tbl.getMetaStoreTable()); try { org.apache.iceberg.Transaction iceTxn = IcebergUtil.getIcebergTransaction(tbl); switch (params.getAlter_type()) { diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-external.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-external.test new file mode 100644 index 000000000..e57dc2a19 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-external.test @@ -0,0 +1,71 @@ +==== +---- QUERY +create table ice_hive_ext (i int) +stored by iceberg +tblproperties ('external.table.purge'='false'); +insert into ice_hive_ext values (4); +---- DML_RESULTS: ice_hive_ext +4 +---- TYPES +INT +==== +---- QUERY +alter table ice_hive_ext add column j int; +select * from ice_hive_ext; +---- RESULTS +4,NULL +---- TYPES +INT,INT +==== +---- QUERY +insert into ice_hive_ext values (5,5); +---- DML_RESULTS: ice_hive_ext +4,NULL +5,5 +---- TYPES +INT,INT +==== +---- QUERY +alter table ice_hive_ext set tblproperties ('external.table.purge'='true'); +insert into ice_hive_ext values (6,6); +describe formatted ice_hive_ext +---- RESULTS: VERIFY_IS_SUBSET +'','external.table.purge','true ' +---- TYPES +STRING,STRING,STRING +==== +---- QUERY +create table ice_hadoop_tables_ext (i int) +stored by iceberg +tblproperties ('iceberg.catalog'='hadoop.tables', 'external.table.purge'='false'); +insert into ice_hadoop_tables_ext values (4); +---- DML_RESULTS: ice_hadoop_tables_ext +4 +---- TYPES +INT +==== +---- QUERY +alter table ice_hadoop_tables_ext add column j int; +select * from 
ice_hadoop_tables_ext; +---- RESULTS +4,NULL +---- TYPES +INT,INT +==== +---- QUERY +insert into ice_hadoop_tables_ext values (5,5); +---- DML_RESULTS: ice_hadoop_tables_ext +4,NULL +5,5 +---- TYPES +INT,INT +==== +---- QUERY +alter table ice_hadoop_tables_ext set tblproperties ('external.table.purge'='true'); +insert into ice_hadoop_tables_ext values (6,6); +describe formatted ice_hadoop_tables_ext +---- RESULTS: VERIFY_IS_SUBSET +'','external.table.purge','true ' +---- TYPES +STRING,STRING,STRING +==== diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py index dd87a006b..d1737df78 100644 --- a/tests/query_test/test_iceberg.py +++ b/tests/query_test/test_iceberg.py @@ -72,6 +72,9 @@ class TestIcebergTable(IcebergTestSuite): def test_alter_iceberg_tables(self, vector, unique_database): self.run_test_case('QueryTest/iceberg-alter', vector, use_db=unique_database) + def test_external_iceberg_tables(self, vector, unique_database): + self.run_test_case('QueryTest/iceberg-external', vector, unique_database) + def test_expire_snapshots(self, unique_database): tbl_name = unique_database + ".expire_snapshots" iceberg_catalogs = IcebergCatalogs(unique_database)