This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to branch branch-1.17.x
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 4c46ece626c8c2eca3b73072feeb2ac71358153a
Author: Alexey Serbin <ale...@apache.org>
AuthorDate: Fri Apr 21 20:16:01 2023 -0700

    [tests] fix flakiness in TestNoMoreRetryWithWongServerUuid
    
    The TestNoMoreRetryWithWongServerUuid scenario of the DeleteTabletITest
    was a bit flaky.  The system catalog was sometimes sending DeleteTablet
    RPCs while the original tablet server was still starting up, so even if
    the DeleteTablet requests went through and were accounted for, they were
    answered with an error status because the tablet manager of the original
    tablet server wasn't ready to process such requests yet, and the catalog
    manager kept retrying after receiving those error responses.
    
    This patch addresses the flakiness and fixes a typo in the name of the
    scenario.  The scenario's dist-test stats for a DEBUG build are below.
    
    Before:
      6 out of 1024 failed:
        http://dist-test.cloudera.org/job?job_id=aserbin.1682129573.9602
    
    After:
      0 out of 1024 failed:
        http://dist-test.cloudera.org/job?job_id=aserbin.1682133036.42291
    
    Change-Id: I3d3dee89b32d1e33d1f0f41e8b83835b02eae336
    Reviewed-on: http://gerrit.cloudera.org:8080/19785
    Reviewed-by: Abhishek Chennaka <achenn...@cloudera.com>
    Tested-by: Alexey Serbin <ale...@apache.org>
    Reviewed-on: http://gerrit.cloudera.org:8080/19947
    Reviewed-by: Yingchun Lai <laiyingc...@apache.org>
    Tested-by: Kudu Jenkins
    Reviewed-by: Yuqi Du <shenxingwuy...@gmail.com>
    Reviewed-by: Yifan Zhang <chinazhangyi...@163.com>
---
 src/kudu/integration-tests/delete_tablet-itest.cc | 44 +++++++++++++++++------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/kudu/integration-tests/delete_tablet-itest.cc b/src/kudu/integration-tests/delete_tablet-itest.cc
index 9b58c4e4b..be8c4b9e7 100644
--- a/src/kudu/integration-tests/delete_tablet-itest.cc
+++ b/src/kudu/integration-tests/delete_tablet-itest.cc
@@ -238,14 +238,15 @@ TEST_F(DeleteTabletITest, TestNoOpDeleteTabletRPC) {
   ASSERT_EQ(0, flush_count_after - flush_count_before);
 }
 
-// Regression test for KUDU-3341: Ensure that master would not retry to send
+// Regression test for KUDU-3341: ensure that master would not retry sending
 // DeleteTablet() RPC to a "wrong server".
-TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
+TEST_F(DeleteTabletITest, NoMoreRetryWithWrongServerUuid) {
   SKIP_IF_SLOW_NOT_ALLOWED();
+  constexpr int kNumTablets = 3;
+  constexpr int kNumTabletServers = 4;
+
   FLAGS_raft_heartbeat_interval_ms = 100;
   FLAGS_follower_unavailable_considered_failed_sec = 2;
-  const int kNumTablets = 3;
-  const int kNumTabletServers = 4;
 
   // Start a cluster and wait all tablets running and leader elected.
   NO_FATALS(StartCluster(kNumTabletServers));
@@ -253,8 +254,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
   workload.set_num_tablets(kNumTablets);
   workload.Setup();
   ASSERT_EVENTUALLY([&] {
-    ClusterVerifier v(cluster_.get());
-    ASSERT_OK(v.RunKsck());
+    ASSERT_OK(ClusterVerifier(cluster_.get()).RunKsck());
   });
 
   // Get number of replicas on ts-0.
@@ -267,7 +267,7 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
   }
 
   // Stop ts-0 and wait for replacement of replicas finished.
-  Sockaddr addr = ts->bound_rpc_addr();
+  const Sockaddr addr = ts->bound_rpc_addr();
   ts->Shutdown();
   SleepFor(MonoDelta::FromSeconds(2 * FLAGS_follower_unavailable_considered_failed_sec));
 
@@ -275,24 +275,46 @@ TEST_F(DeleteTabletITest, TestNoMoreRetryWithWongServerUuid) {
   ASSERT_OK(cluster_->AddTabletServer(HostPort(addr)));
 
   auto* new_ts = cluster_->mini_tablet_server(kNumTabletServers);
+
   int64_t num_delete_tablet_rpcs = 0;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs));
     ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
   });
-  // Sleep enough time to verify no additional DeleteTablet RPCs are sent by master.
+  // Sleep for some time and verify that no additional DeleteTablet RPCs
+  // are sent by the system catalog.
   SleepFor(MonoDelta::FromSeconds(5));
+  ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(new_ts->bound_http_addr()), &num_delete_tablet_rpcs));
   ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
 
-  // Stop the new tablet server and restart ts-0, finally outdated tablets on ts-0 would be deleted.
+  // Stop the new tablet server and start ts-0 back. The outdated tablet
+  // replicas on ts-0 should be deleted.
   new_ts->Shutdown();
+
+  // Shutdown the system catalog before starting up ts-0. That's to avoid
+  // receiving DeleteTablet calls when tablet server isn't yet able to process
+  // them, but it still counts them in the metric that GetNumDeleteTabletRPCs()
+  // retrieves the stats from.
+  for (auto i = 0; i < cluster_->num_masters(); ++i) {
+    cluster_->mini_master(i)->Shutdown();
+  }
+
   ASSERT_OK(ts->Start());
+
+  for (auto i = 0; i < cluster_->num_masters(); ++i) {
+    ASSERT_OK(cluster_->mini_master(i)->Restart());
+  }
+
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs));
     ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
-    int num_live_tablets = ts->server()->tablet_manager()->GetNumLiveTablets();
-    ASSERT_EQ(0, num_live_tablets);
+    ASSERT_EQ(0, ts->server()->tablet_manager()->GetNumLiveTablets());
   });
+  // Sleep for some time and verify that no additional DeleteTablet RPCs
+  // are sent by the system catalog.
+  SleepFor(MonoDelta::FromSeconds(5));
+  ASSERT_OK(GetNumDeleteTabletRPCs(HostPort(ts->bound_http_addr()), &num_delete_tablet_rpcs));
+  ASSERT_EQ(num_replicas, num_delete_tablet_rpcs);
 }
 
 } // namespace kudu
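
The fix above boils down to a three-step ordering change. The condensed C++
sketch below restates that ordering outside the diff format, reusing only the
mini-cluster calls that already appear in the patch; it summarizes the
commit's approach rather than adding anything new:

    // 1. Stop every master so the system catalog cannot issue DeleteTablet
    //    RPCs while ts-0 is down or still starting up.
    for (auto i = 0; i < cluster_->num_masters(); ++i) {
      cluster_->mini_master(i)->Shutdown();
    }

    // 2. Bring the original tablet server back. With no masters running,
    //    its tablet manager finishes initializing without rejecting
    //    DeleteTablet requests that would still bump the RPC metric.
    ASSERT_OK(ts->Start());

    // 3. Restart the masters. The catalog manager now sends DeleteTablet
    //    RPCs to a fully initialized tablet server, so each request
    //    succeeds on the first attempt and no retries are triggered.
    for (auto i = 0; i < cluster_->num_masters(); ++i) {
      ASSERT_OK(cluster_->mini_master(i)->Restart());
    }

The design choice is to make the DeleteTablet RPC count deterministic rather
than to wait for ts-0 to become ready: as the in-code comment notes, the
metric read by GetNumDeleteTabletRPCs() counts incoming DeleteTablet requests
even when the tablet manager rejects them, so any request arriving too early
would inflate the count and cause catalog-manager retries.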
