This ticket revisits the wait for peer info and
fixes the problem of out-of-order peer_up and peer info
messages in commit d1593b03b3c9bec292b14dde65264c261760bf46
---
 src/rde/rded/rde_main.cc |  1 +
 src/rde/rded/role.cc     | 63 +++++++++++++++++++++++++++++++++++++++-
 src/rde/rded/role.h      |  7 +++++
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 8ed6b046e..33dd645e2 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -125,6 +125,7 @@ static void handle_mbx_event() {
     }
     case RDE_MSG_PEER_DOWN:
       LOG_NO("Peer down on node 0x%x", msg->fr_node_id);
+      role->RemovePeer(msg->fr_node_id);
       break;
     case RDE_MSG_NEW_ACTIVE_CALLBACK: {
       const std::string my_node = base::Conf::NodeName();
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index 3732be449..344702e63 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -196,9 +196,13 @@ Role::Role(NODE_ID own_node_id)
       discover_peer_timeout_{base::GetEnv("RDE_DISCOVER_PEER_TIMEOUT",
                                           kDefaultDiscoverPeerTimeout)},
       pre_active_script_timeout_{base::GetEnv(
-          "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)} {}
+          "RDE_PRE_ACTIVE_SCRIPT_TIMEOUT", kDefaultPreActiveScriptTimeout)},
+      received_peer_info_{true},
+      peer_info_wait_time_{},
+      peer_info_wait_timeout_ {kDefaultWaitPeerInfoTimeout} {}
 
 timespec* Role::Poll(timespec* ts) {
+  TRACE_ENTER();
   timespec* timeout = nullptr;
   if (role_ == PCS_RDA_UNDEFINED) {
     timespec now = base::ReadMonotonicClock();
@@ -238,6 +242,25 @@ timespec* Role::Poll(timespec* ts) {
         cb->state_refresh_thread_started = true;
         std::thread(&Role::RefreshConsensusState, this, cb).detach();
       }
+      if (consensus_service.IsEnabled() == false) {
+        // We are already ACTIVE, and has just discovered a new node
+        // which makes the election_end_time_ reset
+        if (received_peer_info_ == false) {
+          timespec now = base::ReadMonotonicClock();
+          if (peer_info_wait_time_ >= now) {
+            *ts = peer_info_wait_time_ - now;
+            timeout = ts;
+          } else {
+            // Timeout but haven't received peer info
+            // The peer RDE could be in ACTIVE
+            // thus self-fence to avoid split-brain risk
+            LOG_ER("Discovery peer up without peer info. Risk in split-brain,"
+                "rebooting this node");
+            opensaf_quick_reboot("Probable split-brain due to "
+                "unknown RDE peer info");
+          }
+        }
+      }
     }
   }
   return timeout;
@@ -251,12 +274,25 @@ void Role::ExecutePreActiveScript() {
 }
 
 void Role::AddPeer(NODE_ID node_id) {
+  TRACE_ENTER();
   auto result = known_nodes_.insert(node_id);
   if (result.second) {
     ResetElectionTimer();
+    if (role_ == PCS_RDA_ACTIVE) {
+      ResetPeerInfoWaitTimer();
+      received_peer_info_ = false;
+    }
   }
 }
 
+void Role::RemovePeer(NODE_ID node_id) {
+  TRACE_ENTER();
+  // The peer-info wait only runs while ACTIVE (armed in AddPeer); a peer that
+  // went down cannot reply, so stop waiting instead of self-fencing in Poll.
+  if (!received_peer_info_ && role_ == PCS_RDA_ACTIVE) StopPeerInfoWaitTimer();
+  known_nodes_.erase(node_id);
+}
+
 // call from main thread only
 bool Role::IsCandidate() {
   TRACE_ENTER();
@@ -330,10 +366,24 @@ uint32_t Role::SetRole(PCS_RDA_ROLE new_role) {
 }
 
 void Role::ResetElectionTimer() {
+  TRACE_ENTER();
   election_end_time_ = base::ReadMonotonicClock() +
                        base::MillisToTimespec(discover_peer_timeout_);
 }
 
+void Role::ResetPeerInfoWaitTimer() {
+  TRACE_ENTER();
+  LOG_NO("Start/restart waiting peer info timer");
+  peer_info_wait_time_ = base::ReadMonotonicClock() +
+                       base::MillisToTimespec(peer_info_wait_timeout_);
+}
+
+void Role::StopPeerInfoWaitTimer() {
+  TRACE_ENTER();
+  // Flagging peer info as received makes Poll() skip the wait-timer check
+  received_peer_info_ = true;
+}
+
 uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role,
                                      PCS_RDA_ROLE old_role) {
   uint32_t rc = NCSCC_RC_SUCCESS;
@@ -357,6 +407,7 @@ uint32_t Role::UpdateMdsRegistration(PCS_RDA_ROLE new_role,
 
 void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id,
                         uint64_t peer_promote_pending) {
+  TRACE_ENTER();
   if (role() == PCS_RDA_UNDEFINED) {
     bool give_up = false;
     RDE_CONTROL_BLOCK *cb = rde_get_control_block();
@@ -372,6 +423,14 @@ void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id,
     }
     if (node_role == PCS_RDA_ACTIVE || node_role == PCS_RDA_STANDBY ||
         give_up) {
+      // broadcast QUIESCED role to all peers to stop their waiting peer
+      // info timer
+      rde_msg peer_info_req;
+      peer_info_req.type = RDE_MSG_PEER_INFO_RESP;
+      peer_info_req.info.peer_info.ha_role = PCS_RDA_QUIESCED;
+      peer_info_req.info.peer_info.promote_pending = 0;
+      rde_mds_broadcast(&peer_info_req);
+
       SetRole(PCS_RDA_QUIESCED);
       LOG_NO("Giving up election against 0x%" PRIx32
              " with role %s. "
@@ -379,6 +438,8 @@ void Role::SetPeerState(PCS_RDA_ROLE node_role, NODE_ID node_id,
              node_id, to_string(node_role), to_string(role()));
     }
   }
+  known_nodes_.insert(node_id);
+  StopPeerInfoWaitTimer();
 }
 
 void Role::PromoteNodeLate() {
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index 2d24361c5..218897892 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -50,12 +50,16 @@ class Role {
   void NodePromoted();
   void PromoteNodeLate();
   void RefreshConsensusState(RDE_CONTROL_BLOCK* cb);
+  void RemovePeer(NODE_ID node_id);
 
  private:
   static const uint64_t kDefaultDiscoverPeerTimeout = 2000;
+  static const uint64_t kDefaultWaitPeerInfoTimeout = 2000;
   static const uint64_t kDefaultPreActiveScriptTimeout = 5000;
   void ExecutePreActiveScript();
   void ResetElectionTimer();
+  void ResetPeerInfoWaitTimer();
+  void StopPeerInfoWaitTimer();
   uint32_t UpdateMdsRegistration(PCS_RDA_ROLE new_role, PCS_RDA_ROLE old_role);
   void PromoteNode(const uint64_t cluster_size, const bool relaxed_mode);
 
@@ -68,6 +72,9 @@ class Role {
   uint64_t pre_active_script_timeout_;
   static const char* const role_names_[];
   static const char* const pre_active_script_;
+  bool received_peer_info_;
+  timespec peer_info_wait_time_;
+  uint64_t peer_info_wait_timeout_;
 
   DELETE_COPY_AND_MOVE_OPERATORS(Role);
 };
-- 
2.20.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to