Hi Gary,
Ack (review only). Thanks, HansN
On 5/27/19 2:09 AM, Gary Lee wrote:
> When connectivity to consensus service is lost, it is recorded
> in a state variable. When all RDE peers are lost, the node will
> now self-fence immediately.
> ---
> src/rde/rded/rde_cb.h | 5 +++++
> src/rde/rded/rde_main.cc | 18 ++++++++++++++++--
> src/rde/rded/role.cc | 24 ++++++++++++++++++++++++
> src/rde/rded/role.h | 3 +++
> 4 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
> index 9a0919c..e35fdab 100644
> --- a/src/rde/rded/rde_cb.h
> +++ b/src/rde/rded/rde_cb.h
> @@ -18,6 +18,7 @@
> #ifndef RDE_RDED_RDE_CB_H_
> #define RDE_RDED_RDE_CB_H_
>
> +#include <atomic>
> #include <cstdint>
> #include <set>
> #include "base/osaf_utility.h"
> @@ -37,6 +38,8 @@
> enum class State {kNotActive = 0, kNotActiveSeenPeer, kActiveElected,
> kActiveElectedSeenPeer, kActiveFailover};
>
> +enum class ConsensusState {kUnknown = 0, kConnected, kDisconnected};
> +
> struct RDE_CONTROL_BLOCK {
> SYSF_MBX mbx;
> NCSCONTEXT task_handle;
> @@ -49,6 +52,8 @@ struct RDE_CONTROL_BLOCK {
> // used for discovering peer controllers, regardless of their role
> std::set<NODE_ID> peer_controllers{};
> State state{State::kNotActive};
> + std::atomic<ConsensusState>
> consensus_service_state{ConsensusState::kUnknown};
> + std::atomic<bool> state_refresh_thread_started{false}; // consensus service
> };
>
> enum RDE_MSG_TYPE {
> diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
> index 456d2ce..1a7e587 100644
> --- a/src/rde/rded/rde_main.cc
> +++ b/src/rde/rded/rde_main.cc
> @@ -178,6 +178,19 @@ static void handle_mbx_event() {
> case RDE_MSG_CONTROLLER_DOWN:
> rde_cb->peer_controllers.erase(msg->fr_node_id);
> TRACE("peer_controllers: size %zu", rde_cb->peer_controllers.size());
> + if (role->role() == PCS_RDA_ACTIVE) {
> + Consensus consensus_service;
> + if (consensus_service.IsEnabled() == true &&
> + rde_cb->consensus_service_state == ConsensusState::kDisconnected
> &&
> + consensus_service.IsRelaxedNodePromotionEnabled() == true &&
> + role->IsPeerPresent() == false) {
> + LOG_NO("Lost connectivity to consensus service. No peer
> present");
> + if (consensus_service.IsRemoteFencingEnabled() == false) {
> + opensaf_quick_reboot("Lost connectivity to consensus
> service. "
> + "Rebooting this node");
> + }
> + }
> + }
> break;
> case RDE_MSG_TAKEOVER_REQUEST_CALLBACK: {
> rde_cb->monitor_takeover_req_thread_running = false;
> @@ -214,7 +227,7 @@ static void handle_mbx_event() {
> if (consensus_service.IsRelaxedNodePromotionEnabled() == true) {
> if (rde_cb->state == State::kActiveElected) {
> TRACE("Relaxed mode is enabled");
> - TRACE(" No peer SC yet seen, ignore consensus service
> failure");
> + TRACE("No peer SC yet seen, ignore consensus service
> failure");
> // if relaxed node promotion is enabled, and we have yet to
> see
> // a peer SC after being promoted, tolerate consensus
> service
> // not working
> @@ -227,13 +240,14 @@ static void handle_mbx_event() {
> // we have seen the peer, and peer is still connected,
> tolerate
> // consensus service not working
> fencing_required = false;
> + rde_cb->consensus_service_state =
> ConsensusState::kDisconnected;
> }
> }
> if (fencing_required == true) {
> LOG_NO("Lost connectivity to consensus service");
> if (consensus_service.IsRemoteFencingEnabled() == false) {
> opensaf_quick_reboot("Lost connectivity to consensus
> service. "
> - "Rebooting this node");
> + "Rebooting this node");
> }
> }
> }
> diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
> index 3effc25..b8c8157 100644
> --- a/src/rde/rded/role.cc
> +++ b/src/rde/rded/role.cc
> @@ -215,6 +215,18 @@ timespec* Role::Poll(timespec* ts) {
> is_candidate).detach();
> }
> }
> + } else if (role_ == PCS_RDA_ACTIVE) {
> + RDE_CONTROL_BLOCK* cb = rde_get_control_block();
> + if (cb->consensus_service_state == ConsensusState::kUnknown ||
> + cb->consensus_service_state == ConsensusState::kDisconnected) {
> + // consensus service was previously disconnected, refresh state
> + Consensus consensus_service;
> + if (consensus_service.IsEnabled() == true &&
> + cb->state_refresh_thread_started == false) {
> + cb->state_refresh_thread_started = true;
> + std::thread(&Role::RefreshConsensusState, this, cb).detach();
> + }
> + }
> }
> return timeout;
> }
> @@ -351,3 +363,15 @@ void Role::PromoteNodeLate() {
> this, cb->cluster_members.size(),
> true).detach();
> }
> +
> +void Role::RefreshConsensusState(RDE_CONTROL_BLOCK* cb) {
> + TRACE_ENTER();
> +
> + Consensus consensus_service;
> + if (consensus_service.IsWritable() == true) {
> + LOG_NO("Connectivity to consensus service established");
> + cb->consensus_service_state = ConsensusState::kConnected;
> + }
> +
> + cb->state_refresh_thread_started = false;
> +}
> diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
> index ecb67cf..9c63cbe 100644
> --- a/src/rde/rded/role.h
> +++ b/src/rde/rded/role.h
> @@ -30,6 +30,8 @@ namespace base {
> class Process;
> }
>
> +struct RDE_CONTROL_BLOCK;
> +
> class Role {
> public:
> explicit Role(NODE_ID own_node_id);
> @@ -45,6 +47,7 @@ class Role {
> const std::string& new_value, SYSF_MBX mbx);
> void NodePromoted();
> void PromoteNodeLate();
> + void RefreshConsensusState(RDE_CONTROL_BLOCK* cb);
>
> private:
> static const uint64_t kDefaultDiscoverPeerTimeout = 2000;
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel