This is an automated email from the ASF dual-hosted git repository. laiyingchun pushed a commit to branch branch-1.17.x in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 4c46ece626c8c2eca3b73072feeb2ac71358153a Author: Alexey Serbin <ale...@apache.org> AuthorDate: Fri Apr 21 20:16:01 2023 -0700 [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid The TestNoMoreRetryWithWongServerUuid scenario of the DeleteTabletITest was a bit flaky. The system catalog was sometimes sending in DeleteTablet RPCs when the original tablet server was still starting up, so even if DeleteTablet requests went through and were accounted for, they were responded with an error status because the tablet manager of the original tablet server wasn't ready to receive such requests yet, and the catalog manager retried its requests after receiving such responses. This patch addresses the flakiness and fixes a typo in the name of the scenario. The scenario's dist-test stats for a DEBUG build are below. Before: 6 out of 1024 failed: http://dist-test.cloudera.org/job?job_id=aserbin.1682129573.9602 After: 0 out of 1024 failed: http://dist-test.cloudera.org/job?job_id=aserbin.1682133036.42291 Change-Id: I3d3dee89b32d1e33d1f0f41e8b83835b02eae336 Reviewed-on: http://gerrit.cloudera.org:8080/19785 Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com> Tested-by: Alexey Serbin <ale...@apache.org> Reviewed-on: http://gerrit.cloudera.org:8080/19947 Reviewed-by: Yingchun Lai <laiyingc...@apache.org> Tested-by: Kudu Jenkins Reviewed-by: Yuqi Du <shenxingwuy...@gmail.com> Reviewed-by: Yifan Zhang <chinazhangyi...@163.com> --- src/kudu/integration-tests/delete_tablet-itest.cc | 44 +++++++++++++++++------ 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc b/src/kudu/integration-tests/delete_tablet-itest.cc index 9b58c4e4b..be8c4b9e7 100644 --- a/src/kudu/integration-tests/delete_tablet-itest.cc +++ b/src/kudu/integration-tests/delete_tablet-itest.cc @@ -238,14 +238,15 @@ TEST_F(DeleteTabletITest, TestNoOpDeleteTabletRPC) { ASSERT_EQ(0, flush_count_after - flush_count_before); } -// Regression test for KUDU-3341: Ensure that master would not retry to send +// Regression test for KUDU-3341: ensure that master would not retry sending // DeleteTablet() RPC to a "wrong server". -TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) { +TEST_F(DeleteTabletITest, NoMoreRetryWithWrongServerUuid) { SKIP_IF_SLOW_NOT_ALLOWED(); + constexpr int kNumTablets = 3; + constexpr int kNumTabletServers = 4; + FLAGS_raft_heartbeat_interval_ms = 100; FLAGS_follower_unavailable_considered_failed_sec = 2; - const int kNumTablets = 3; - const int kNumTabletServers = 4; // Start a cluster and wait all tablets running and leader elected. NO_FATALS(StartCluster(kNumTabletServers)); @@ -253,8 +254,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) { workload.set_num_tablets(kNumTablets); workload.Setup(); ASSERT_EVENTUALLY([&] { - ClusterVerifier v(cluster_.get()); - ASSERT_OK(v.RunKsck()); + ASSERT_OK(ClusterVerifier(cluster_.get()).RunKsck()); }); // Get number of replicas on ts-0. @@ -267,7 +267,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) { } // Stop ts-0 and wait for replacement of replicas finished. - Sockaddr addr = ts->bound_rpc_addr(); + const Sockaddr addr = ts->bound_rpc_addr(); ts->Shutdown(); SleepFor(MonoDelta::FromSeconds(2 * FLAGS_follower_unavailable_considered_failed_sec)); @@ -275,24 +275,46 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) { ASSERT_OK(cluster_->AddTabletServer(HostPort(addr))); auto* new_ts = cluster_->mini_tablet_server(kNumTabletServers); + int64_t num_delete_tablet_rpcs = 0; ASSERT_EVENTUALLY([&] { ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs)); ASSERT_EQ(num_replicas, num_delete_tablet_rpcs); }); - // Sleep enough time to verify no additional DeleteTablet RPCs are sent by master. + // Sleep for some time and verify that no additional DeleteTablet RPCs + // are sent by the system catalog. SleepFor(MonoDelta::FromSeconds(5)); + ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs)); ASSERT_EQ(num_replicas, num_delete_tablet_rpcs); - // Stop the new tablet server and restart ts-0, finally outdated tablets on ts-0 would be deleted. + // Stop the new tablet server and start ts-0 back. The outdated tablet + // replicas on ts-0 should be deleted. new_ts->Shutdown(); + + // Shutdown the system catalog before starting up ts-0. That's to avoid + // receiving DeleteTablet calls when tablet server isn't yet able to process + // them, but it still counts them in the metric that GetNumDeleteTabletRPCs() + // retrieves the stats from. + for (auto i = 0; i < cluster_->num_masters(); ++i) { + cluster_->mini_master(i)->Shutdown(); + } + ASSERT_OK(ts->Start()); + + for (auto i = 0; i < cluster_->num_masters(); ++i) { + ASSERT_OK(cluster_->mini_master(i)->Restart()); + } + ASSERT_EVENTUALLY([&] { ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs)); ASSERT_EQ(num_replicas, num_delete_tablet_rpcs); - int num_live_tablets = ts->server()->tablet_manager()->GetNumLiveTablets(); - ASSERT_EQ(0, num_live_tablets); + ASSERT_EQ(0, ts->server()->tablet_manager()->GetNumLiveTablets()); }); + // Sleep for some time and verify that no additional DeleteTablet RPCs + // are sent by the system catalog. + SleepFor(MonoDelta::FromSeconds(5)); + ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs)); + ASSERT_EQ(num_replicas, num_delete_tablet_rpcs); } } // namespace kudu