This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 564f2ced7330e6ccb807b39c47a42ea691a1f733 Author: wzhou-code <wz...@cloudera.com> AuthorDate: Tue Feb 27 11:29:55 2024 -0800 IMPALA-12848: Fixed flaky test test_catalogd_ha_failover TestExtDataSources::test_catalogd_ha_failover failed to delete the data source object after the catalog service failed over to the standby catalogd. Log messages showed that the coordinator tried to submit the DDL request to the original active catalogd because it had not yet received the failover notification from statestored. To fix the flaky test, wait until the coordinator receives the failover notification from statestored before executing the DDL request to drop the data source. Testing: - Ran the test in a loop more than a hundred times without failure. Change-Id: Ia6225271357740c055c25fdd349f1dc9162c2f53 Reviewed-on: http://gerrit.cloudera.org:8080/21078 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- tests/custom_cluster/test_ext_data_sources.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/custom_cluster/test_ext_data_sources.py b/tests/custom_cluster/test_ext_data_sources.py index 765f1bb26..2c7d405ca 100644 --- a/tests/custom_cluster/test_ext_data_sources.py +++ b/tests/custom_cluster/test_ext_data_sources.py @@ -24,6 +24,7 @@ import subprocess from tests.common.custom_cluster_test_suite import CustomClusterTestSuite from tests.common.environ import build_flavor_timeout from tests.common.skip import SkipIfApacheHive +from time import sleep class TestExtDataSources(CustomClusterTestSuite): @@ -104,6 +105,7 @@ class TestExtDataSources(CustomClusterTestSuite): @CustomClusterTestSuite.with_args( statestored_args="--use_subscriber_id_as_catalogd_priority=true " "--statestore_heartbeat_frequency_ms=1000", + catalogd_args="--catalogd_ha_reset_metadata_on_failover=false", start_args="--enable_catalogd_ha") def test_catalogd_ha_failover(self): """The test case for cluster started with 
catalogd HA enabled.""" @@ -136,6 +138,23 @@ class TestExtDataSources(CustomClusterTestSuite): "catalog-server.active-status", expected_value=True, timeout=30) assert(catalogd_service_2.get_metric_value("catalog-server.active-status")) + # Wait until coordinator receive failover notification. + coordinator_service = self.cluster.impalads[0].service + expected_catalog_service_port = catalogd_service_2.get_catalog_service_port() + received_failover_notification = False + retry_count = 30 + while (retry_count > 0): + active_catalogd_address = \ + coordinator_service.get_metric_value("catalog.active-catalogd-address") + _, catalog_service_port = active_catalogd_address.split(":") + if (int(catalog_service_port) == expected_catalog_service_port): + received_failover_notification = True + break + retry_count -= 1 + sleep(1) + assert received_failover_notification, \ + "Coordinator did not receive notification of Catalog service failover." + # Verify that the data source object is available in the catalogd of HA pair. result = self.execute_query(SHOW_DATA_SOURCE_QUERY) assert result.success, str(result)