This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to branch branch-1.17.x
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit b3e64b07f73ed1ee756c6a338c2fa4e76e1e7265
Author: duyuqi <shenxingwuy...@gmail.com>
AuthorDate: Fri Mar 10 19:44:08 2023 +0800

    [master] Exclude tservers in MAINTENANCE_MODE when leader rebalancing
    
    Currently, auto leader rebalancing may transfer new leaders to tservers
    which have entered MAINTENANCE_MODE. Because a tserver in MAINTENANCE_MODE
    is being maintained by administrators, maybe it is decommissioning.
    Since it might be decommissioned/decommissioning, it doesn't make sense
    for the leader rebalancer to transfer a leader to a tserver in 
MAINTENANCE_MODE
    as leaders serve many users' requests. For this reason, we should exclude
    such tservers.
    
    This patch fixes the problem and adds an unit test for this.
    
    Change-Id: I2f85a675e69fd02a62e2625881dad2ca5e27acd9
    Reviewed-on: http://gerrit.cloudera.org:8080/19608
    Tested-by: Kudu Jenkins
    Reviewed-by: Yingchun Lai <laiyingc...@apache.org>
    Reviewed-by: Yuqi Du <shenxingwuy...@gmail.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/19923
    Tested-by: Yingchun Lai <laiyingc...@apache.org>
---
 src/kudu/master/auto_leader_rebalancer-test.cc | 75 +++++++++++++++++++++++++-
 src/kudu/master/auto_leader_rebalancer.cc      | 22 +++++++-
 src/kudu/master/auto_leader_rebalancer.h       |  2 +
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/src/kudu/master/auto_leader_rebalancer-test.cc 
b/src/kudu/master/auto_leader_rebalancer-test.cc
index 8ff1d0d0e..5357bdd15 100644
--- a/src/kudu/master/auto_leader_rebalancer-test.cc
+++ b/src/kudu/master/auto_leader_rebalancer-test.cc
@@ -16,6 +16,7 @@
 // under the License.
 #include "kudu/master/auto_leader_rebalancer.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <memory>
 #include <ostream>
@@ -27,15 +28,20 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "kudu/consensus/metadata.pb.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/integration-tests/test_workload.h"
 #include "kudu/master/catalog_manager.h"
 #include "kudu/master/master.h"
+#include "kudu/master/master.pb.h"
 #include "kudu/master/mini_master.h"
 #include "kudu/master/ts_descriptor.h"
 #include "kudu/master/ts_manager.h"
 #include "kudu/mini-cluster/internal_mini_cluster.h"
+#include "kudu/rpc/rpc_controller.h"
 #include "kudu/tserver/mini_tablet_server.h"
+#include "kudu/tserver/tserver.pb.h"
+#include "kudu/tserver/tserver_service.proxy.h" // IWYU pragma: keep
 #include "kudu/util/monotime.h"
 #include "kudu/util/status.h"
 #include "kudu/util/test_macros.h"
@@ -50,8 +56,10 @@ class AutoRebalancerTask;
 
 using kudu::cluster::InternalMiniCluster;
 using kudu::cluster::InternalMiniClusterOptions;
+using kudu::tserver::ListTabletsResponsePB;
 using std::string;
 using std::unique_ptr;
+using std::vector;
 
 DECLARE_bool(auto_leader_rebalancing_enabled);
 DECLARE_bool(auto_rebalancing_enabled);
@@ -118,7 +126,7 @@ class LeaderRebalancerTest : public KuduTest {
     }
 
     return 
catalog_manager->auto_leader_rebalancer()->RunLeaderRebalanceForTable(
-        table_info, tserver_uuids, 
AutoLeaderRebalancerTask::ExecuteMode::TEST);
+        table_info, tserver_uuids, {}, 
AutoLeaderRebalancerTask::ExecuteMode::TEST);
   }
 
  protected:
@@ -203,5 +211,70 @@ TEST_F(LeaderRebalancerTest, RestartTserver) {
   ASSERT_OK(CheckLeaderBalance());
 }
 
+TEST_F(LeaderRebalancerTest, TestMaintenanceMode) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  constexpr const int kNumTServers = 3;
+  constexpr const int kNumTablets = 10;
+  cluster_opts_.num_tablet_servers = kNumTServers;
+  FLAGS_leader_rebalancing_max_moves_per_round = 5;
+  ASSERT_OK(CreateAndStartCluster());
+
+  CreateWorkloadTable(kNumTablets, /*num_replicas*/ 3);
+
+  // Leader master
+  master::Master* master = cluster_->mini_master()->master();
+
+  master::AutoLeaderRebalancerTask* leader_rebalancer =
+      master->catalog_manager()->auto_leader_rebalancer();
+  ASSERT_NE(leader_rebalancer, nullptr);
+
+  constexpr const int kCurrentTserverIndex = 0;
+  tserver::MiniTabletServer* mini_tserver = 
cluster_->mini_tablet_server(kCurrentTserverIndex);
+  // Sets the tserver state for a tserver to 'MAINTENANCE_MODE'.
+  ASSERT_OK(
+      master->ts_manager()->SetTServerState(mini_tserver->uuid(),
+                                            TServerStatePB::MAINTENANCE_MODE,
+                                            
ChangeTServerStateRequestPB::ALLOW_MISSING_TSERVER,
+                                            
master->catalog_manager()->sys_catalog()));
+  ASSERT_EQ(TServerStatePB::MAINTENANCE_MODE,
+            master->ts_manager()->GetTServerState(mini_tserver->uuid()));
+  // Restart the tserver to force transferring all leaders on it to make 
leaders not balanced.
+  mini_tserver->Shutdown();
+  SleepFor(MonoDelta::FromMilliseconds(5 * FLAGS_heartbeat_interval_ms));
+  ASSERT_OK(mini_tserver->Start());
+
+  // Try to run some 'leader rebalance' iterations. If mini_tserver is not in 
MAINTENANCE_MODE,
+  // it's enough to reach leader balanced, more tries is not necessary and 
less tries
+  // may not reach leader rebalanced.
+  constexpr const int32_t retries = std::max(kNumTablets / 2, 3);
+  for (int i = 0; i < retries; i++) {
+    ASSERT_OK(leader_rebalancer->RunLeaderRebalancer());
+    if (CheckLeaderBalance().ok()) {
+      break;
+    }
+    SleepFor(MonoDelta::FromMilliseconds(FLAGS_heartbeat_interval_ms));
+  }
+  // This cluster cannot reach the state of rebalanced leadership distribution 
since 1 of the 3
+  // tservers is in maintenance mode.
+  Status status = CheckLeaderBalance();
+  ASSERT_TRUE(status.IsIllegalState()) << status.ToString();
+
+  {
+    // The tserver in maintenance mode should have no leaders.
+    std::shared_ptr<tserver::TabletServerServiceProxy> proxy =
+        cluster_->tserver_proxy(kCurrentTserverIndex);
+    tserver::ListTabletsRequestPB req;
+    ListTabletsResponsePB resp;
+    rpc::RpcController rpc;
+    rpc.set_timeout(MonoDelta::FromMilliseconds(1000));
+    ASSERT_OK(proxy->ListTablets(req, &resp, &rpc));
+    ASSERT_FALSE(resp.has_error());
+    ASSERT_FALSE(resp.status_and_schema().empty());
+    for (const auto& replica : resp.status_and_schema()) {
+      ASSERT_NE(consensus::RaftPeerPB::LEADER, replica.role());
+    }
+  }
+}
+
 }  // namespace master
 }  // namespace kudu
diff --git a/src/kudu/master/auto_leader_rebalancer.cc 
b/src/kudu/master/auto_leader_rebalancer.cc
index 1a5adf143..3f9030c9c 100644
--- a/src/kudu/master/auto_leader_rebalancer.cc
+++ b/src/kudu/master/auto_leader_rebalancer.cc
@@ -25,6 +25,8 @@
 #include <ostream>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -37,6 +39,7 @@
 #include "kudu/consensus/metadata.pb.h"
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/map-util.h"
+#include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/catalog_manager.h"
 #include "kudu/master/master.pb.h"
@@ -62,6 +65,7 @@ using kudu::rpc::RpcController;
 using std::map;
 using std::nullopt;
 using std::string;
+using std::unordered_set;
 using std::vector;
 using strings::Substitute;
 
@@ -116,6 +120,7 @@ void AutoLeaderRebalancerTask::Shutdown() {
 Status AutoLeaderRebalancerTask::RunLeaderRebalanceForTable(
     const scoped_refptr<TableInfo>& table_info,
     const vector<string>& tserver_uuids,
+    const unordered_set<string>& exclude_dest_uuids,
     AutoLeaderRebalancerTask::ExecuteMode mode) {
   LOG(INFO) << Substitute("leader rebalance for table $0", 
table_info->table_name());
   TableMetadataLock table_l(table_info.get(), LockMode::READ);
@@ -243,6 +248,9 @@ Status AutoLeaderRebalancerTask::RunLeaderRebalanceForTable(
       }
       double min_score = 1;
       for (int j = 0; j < uuid_followers.size(); j++) {
+        if (ContainsKey(exclude_dest_uuids, uuid_followers[j])) {
+          continue;
+        }
         std::pair<int32_t, int32_t>& replica_and_leader_count =
             replica_and_leader_count_by_ts_uuid[uuid_followers[j]];
         int32_t replica_count = replica_and_leader_count.first;
@@ -328,6 +336,10 @@ Status 
AutoLeaderRebalancerTask::RunLeaderRebalanceForTable(
     if (!ts_manager_->LookupTSByUUID(leader_uuid, &leader_desc)) {
       continue;
     }
+    if (PREDICT_FALSE(TServerStatePB::MAINTENANCE_MODE ==
+                      ts_manager_->GetTServerState(task.second.second))) {
+      continue;
+    }
 
     vector<Sockaddr> resolved;
     RETURN_NOT_OK(host_port->ResolveAddresses(&resolved));
@@ -380,6 +392,14 @@ Status AutoLeaderRebalancerTask::RunLeaderRebalancer() {
     tserver_uuids.emplace_back(e->permanent_uuid());
   }
 
+  // Avoid transferring leaders to tservers that are in MAINTENANCE_MODE.
+  auto tserver_state_by_uuid = ts_manager_->GetTServerStates();
+  unordered_set<string> exclude_dest_uuids;
+  for (const auto& uuid_state : tserver_state_by_uuid) {
+    if (uuid_state.second.first == TServerStatePB::MAINTENANCE_MODE) {
+      exclude_dest_uuids.insert(uuid_state.first);
+    }
+  }
   vector<scoped_refptr<TableInfo>> table_infos;
   {
     CatalogManager::ScopedLeaderSharedLock leader_lock(catalog_manager_);
@@ -387,7 +407,7 @@ Status AutoLeaderRebalancerTask::RunLeaderRebalancer() {
     catalog_manager_->GetAllTables(&table_infos);
   }
   for (const auto& table_info : table_infos) {
-    RunLeaderRebalanceForTable(table_info, tserver_uuids);
+    RunLeaderRebalanceForTable(table_info, tserver_uuids, exclude_dest_uuids);
   }
   // @TODO(duyuqi)
   // Enrich the log and add metrics for leader rebalancer.
diff --git a/src/kudu/master/auto_leader_rebalancer.h 
b/src/kudu/master/auto_leader_rebalancer.h
index 004e19b1d..4dd19ab64 100644
--- a/src/kudu/master/auto_leader_rebalancer.h
+++ b/src/kudu/master/auto_leader_rebalancer.h
@@ -21,6 +21,7 @@
 #include <memory>
 #include <random>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "kudu/gutil/ref_counted.h"
@@ -75,6 +76,7 @@ class AutoLeaderRebalancerTask {
   Status RunLeaderRebalanceForTable(
       const scoped_refptr<TableInfo>& table_info,
       const std::vector<std::string>& tserver_uuids,
+      const std::unordered_set<std::string>& exclude_dest_uuids,
       AutoLeaderRebalancerTask::ExecuteMode mode = 
AutoLeaderRebalancerTask::ExecuteMode::NORMAL);
 
   // Only one task can be scheduled at a time.

Reply via email to