When connectivity to the consensus service is lost, this is recorded
in a state variable. If all RDE peers are then lost while the service
is still disconnected, the active node now self-fences immediately.
---
src/rde/rded/rde_cb.h | 5 +++++
src/rde/rded/rde_main.cc | 18 ++++++++++++++++--
src/rde/rded/role.cc | 24 ++++++++++++++++++++++++
src/rde/rded/role.h | 3 +++
4 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index 9a0919c..e35fdab 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -18,6 +18,7 @@
#ifndef RDE_RDED_RDE_CB_H_
#define RDE_RDED_RDE_CB_H_
+#include <atomic>
#include <cstdint>
#include <set>
#include "base/osaf_utility.h"
@@ -37,6 +38,8 @@
enum class State {kNotActive = 0, kNotActiveSeenPeer, kActiveElected,
kActiveElectedSeenPeer, kActiveFailover};
+enum class ConsensusState {kUnknown = 0, kConnected, kDisconnected};
+
struct RDE_CONTROL_BLOCK {
SYSF_MBX mbx;
NCSCONTEXT task_handle;
@@ -49,6 +52,8 @@ struct RDE_CONTROL_BLOCK {
// used for discovering peer controllers, regardless of their role
std::set<NODE_ID> peer_controllers{};
State state{State::kNotActive};
+ std::atomic<ConsensusState> consensus_service_state{ConsensusState::kUnknown};
+ std::atomic<bool> state_refresh_thread_started{false}; // consensus service
};
enum RDE_MSG_TYPE {
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index 456d2ce..1a7e587 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -178,6 +178,19 @@ static void handle_mbx_event() {
case RDE_MSG_CONTROLLER_DOWN:
rde_cb->peer_controllers.erase(msg->fr_node_id);
TRACE("peer_controllers: size %zu", rde_cb->peer_controllers.size());
+ if (role->role() == PCS_RDA_ACTIVE) {
+ Consensus consensus_service;
+ if (consensus_service.IsEnabled() == true &&
+ rde_cb->consensus_service_state == ConsensusState::kDisconnected &&
+ consensus_service.IsRelaxedNodePromotionEnabled() == true &&
+ role->IsPeerPresent() == false) {
+ LOG_NO("Lost connectivity to consensus service. No peer present");
+ if (consensus_service.IsRemoteFencingEnabled() == false) {
+ opensaf_quick_reboot("Lost connectivity to consensus service. "
+ "Rebooting this node");
+ }
+ }
+ }
break;
case RDE_MSG_TAKEOVER_REQUEST_CALLBACK: {
rde_cb->monitor_takeover_req_thread_running = false;
@@ -214,7 +227,7 @@ static void handle_mbx_event() {
if (consensus_service.IsRelaxedNodePromotionEnabled() == true) {
if (rde_cb->state == State::kActiveElected) {
TRACE("Relaxed mode is enabled");
- TRACE(" No peer SC yet seen, ignore consensus service failure");
+ TRACE("No peer SC yet seen, ignore consensus service failure");
// if relaxed node promotion is enabled, and we have yet to see
// a peer SC after being promoted, tolerate consensus service
// not working
@@ -227,13 +240,14 @@ static void handle_mbx_event() {
// we have seen the peer, and peer is still connected, tolerate
// consensus service not working
fencing_required = false;
+ rde_cb->consensus_service_state = ConsensusState::kDisconnected;
}
}
if (fencing_required == true) {
LOG_NO("Lost connectivity to consensus service");
if (consensus_service.IsRemoteFencingEnabled() == false) {
opensaf_quick_reboot("Lost connectivity to consensus service. "
- "Rebooting this node");
+ "Rebooting this node");
}
}
}
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index 3effc25..b8c8157 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -215,6 +215,18 @@ timespec* Role::Poll(timespec* ts) {
is_candidate).detach();
}
}
+ } else if (role_ == PCS_RDA_ACTIVE) {
+ RDE_CONTROL_BLOCK* cb = rde_get_control_block();
+ if (cb->consensus_service_state == ConsensusState::kUnknown ||
+ cb->consensus_service_state == ConsensusState::kDisconnected) {
+ // consensus service was previously disconnected, refresh state
+ Consensus consensus_service;
+ if (consensus_service.IsEnabled() == true &&
+ cb->state_refresh_thread_started == false) {
+ cb->state_refresh_thread_started = true;
+ std::thread(&Role::RefreshConsensusState, this, cb).detach();
+ }
+ }
}
return timeout;
}
@@ -351,3 +363,15 @@ void Role::PromoteNodeLate() {
this, cb->cluster_members.size(),
true).detach();
}
+
+void Role::RefreshConsensusState(RDE_CONTROL_BLOCK* cb) {  // detached thread
+ TRACE_ENTER();
+
+ Consensus consensus_service;
+ if (consensus_service.IsWritable() == true) {  // else state is left as-is
+ LOG_NO("Connectivity to consensus service established");
+ cb->consensus_service_state = ConsensusState::kConnected;
+ }
+
+ cb->state_refresh_thread_started = false;  // re-arm for Role::Poll()
+}
diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
index ecb67cf..9c63cbe 100644
--- a/src/rde/rded/role.h
+++ b/src/rde/rded/role.h
@@ -30,6 +30,8 @@ namespace base {
class Process;
}
+struct RDE_CONTROL_BLOCK;
+
class Role {
public:
explicit Role(NODE_ID own_node_id);
@@ -45,6 +47,7 @@ class Role {
const std::string& new_value, SYSF_MBX mbx);
void NodePromoted();
void PromoteNodeLate();
+ void RefreshConsensusState(RDE_CONTROL_BLOCK* cb);
private:
static const uint64_t kDefaultDiscoverPeerTimeout = 2000;
--
2.7.4
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel