When connectivity to the consensus service is lost, this is recorded
in a state variable. If all RDE peers are then lost while the consensus
service is still recorded as disconnected, the active node will now
self-fence immediately.
---
 src/rde/rded/rde_cb.h    |  5 +++++
 src/rde/rded/rde_main.cc | 18 ++++++++++++++++--
 src/rde/rded/role.cc     | 24 ++++++++++++++++++++++++
 src/rde/rded/role.h      |  3 +++
 4 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index 9a0919c..e35fdab 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -18,6 +18,7 @@
 #ifndef RDE_RDED_RDE_CB_H_
 #define RDE_RDED_RDE_CB_H_
 
+#include <atomic>
 #include <cstdint>
 #include <set>
 #include "base/osaf_utility.h"
@@ -37,6 +38,8 @@
 enum class State {kNotActive = 0, kNotActiveSeenPeer, kActiveElected,
                   kActiveElectedSeenPeer, kActiveFailover};
 
+enum class ConsensusState {kUnknown = 0, kConnected, kDisconnected};
+
 struct RDE_CONTROL_BLOCK {
   SYSF_MBX mbx;
   NCSCONTEXT task_handle;
@@ -49,6 +52,8 @@ struct RDE_CONTROL_BLOCK {
   // used for discovering peer controllers, regardless of their role
   std::set<NODE_ID> peer_controllers{};
   State state{State::kNotActive};
+  std::atomic<ConsensusState> consensus_service_state{ConsensusState::kUnknown};
+  std::atomic<bool> state_refresh_thread_started{false}; // consensus service
 };
 
 enum RDE_MSG_TYPE {
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 456d2ce..1a7e587 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -178,6 +178,19 @@ static void handle_mbx_event() {
     case RDE_MSG_CONTROLLER_DOWN:
       rde_cb->peer_controllers.erase(msg->fr_node_id);
       TRACE("peer_controllers: size %zu", rde_cb->peer_controllers.size());
+      if (role->role() == PCS_RDA_ACTIVE) {
+        Consensus consensus_service;
+        if (consensus_service.IsEnabled() == true &&
+            rde_cb->consensus_service_state == ConsensusState::kDisconnected &&
+            consensus_service.IsRelaxedNodePromotionEnabled() == true &&
+            role->IsPeerPresent() == false) {
+            LOG_NO("Lost connectivity to consensus service. No peer present");
+            if (consensus_service.IsRemoteFencingEnabled() == false) {
+                opensaf_quick_reboot("Lost connectivity to consensus service. "
+                                     "Rebooting this node");
+            }
+        }
+      }
       break;
     case RDE_MSG_TAKEOVER_REQUEST_CALLBACK: {
       rde_cb->monitor_takeover_req_thread_running = false;
@@ -214,7 +227,7 @@ static void handle_mbx_event() {
           if (consensus_service.IsRelaxedNodePromotionEnabled() == true) {
               if (rde_cb->state == State::kActiveElected) {
                 TRACE("Relaxed mode is enabled");
-                TRACE(" No peer SC yet seen, ignore consensus service failure");
+                TRACE("No peer SC yet seen, ignore consensus service failure");
                 // if relaxed node promotion is enabled, and we have yet to see
                 // a peer SC after being promoted, tolerate consensus service
                 // not working
@@ -227,13 +240,14 @@ static void handle_mbx_event() {
                 // we have seen the peer, and peer is still connected, tolerate
                 // consensus service not working
                 fencing_required = false;
+                rde_cb->consensus_service_state = ConsensusState::kDisconnected;
               }
           }
           if (fencing_required == true) {
             LOG_NO("Lost connectivity to consensus service");
             if (consensus_service.IsRemoteFencingEnabled() == false) {
                 opensaf_quick_reboot("Lost connectivity to consensus service. "
-                               "Rebooting this node");
+                                     "Rebooting this node");
             }
           }
         }
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index 3effc25..b8c8157 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -215,6 +215,18 @@ timespec* Role::Poll(timespec* ts) {
                     is_candidate).detach();
       }
     }
+  } else if (role_ == PCS_RDA_ACTIVE) {
+    RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+    if (cb->consensus_service_state == ConsensusState::kUnknown ||
+        cb->consensus_service_state == ConsensusState::kDisconnected) {
+      // consensus service was previously disconnected, refresh state
+      Consensus consensus_service;
+      if (consensus_service.IsEnabled() == true &&
+        cb->state_refresh_thread_started == false) {
+        cb->state_refresh_thread_started = true;
+        std::thread(&Role::RefreshConsensusState, this, cb).detach();
+      }
+    }
   }
   return timeout;
 }
@@ -351,3 +363,15 @@ void Role::PromoteNodeLate() {
               this, cb->cluster_members.size(),
               true).detach();
 }
+
+void Role::RefreshConsensusState(RDE_CONTROL_BLOCK* cb) {
+  TRACE_ENTER();
+
+  Consensus consensus_service;
+  if (consensus_service.IsWritable() == true) {
+    LOG_NO("Connectivity to consensus service established");
+    cb->consensus_service_state = ConsensusState::kConnected;
+  }
+
+  cb->state_refresh_thread_started = false;
+}
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index ecb67cf..9c63cbe 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -30,6 +30,8 @@ namespace base {
 class Process;
 }
 
+struct RDE_CONTROL_BLOCK;
+
 class Role {
  public:
   explicit Role(NODE_ID own_node_id);
@@ -45,6 +47,7 @@ class Role {
                               const std::string& new_value, SYSF_MBX mbx);
   void NodePromoted();
   void PromoteNodeLate();
+  void RefreshConsensusState(RDE_CONTROL_BLOCK* cb);
 
  private:
   static const uint64_t kDefaultDiscoverPeerTimeout = 2000;
-- 
2.7.4



_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to