Hi Gary,

Ack (review only). Thanks, HansN

On 5/27/19 2:09 AM, Gary Lee wrote:
> When connectivity to consensus service is lost, it is recorded
> in a state variable. When all RDE peers are lost, the node will
> now self-fence immediately.
> ---
>   src/rde/rded/rde_cb.h    |  5 +++++
>   src/rde/rded/rde_main.cc | 18 ++++++++++++++++--
>   src/rde/rded/role.cc     | 24 ++++++++++++++++++++++++
>   src/rde/rded/role.h      |  3 +++
>   4 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
> index 9a0919c..e35fdab 100644
> --- a/src/rde/rded/rde_cb.h
> +++ b/src/rde/rded/rde_cb.h
> @@ -18,6 +18,7 @@
>   #ifndef RDE_RDED_RDE_CB_H_
>   #define RDE_RDED_RDE_CB_H_
>   
> +#include <atomic>
>   #include <cstdint>
>   #include <set>
>   #include "base/osaf_utility.h"
> @@ -37,6 +38,8 @@
>   enum class State {kNotActive = 0, kNotActiveSeenPeer, kActiveElected,
>                     kActiveElectedSeenPeer, kActiveFailover};
>   
> +enum class ConsensusState {kUnknown = 0, kConnected, kDisconnected};
> +
>   struct RDE_CONTROL_BLOCK {
>     SYSF_MBX mbx;
>     NCSCONTEXT task_handle;
> @@ -49,6 +52,8 @@ struct RDE_CONTROL_BLOCK {
>     // used for discovering peer controllers, regardless of their role
>     std::set<NODE_ID> peer_controllers{};
>     State state{State::kNotActive};
> +  std::atomic<ConsensusState> 
> consensus_service_state{ConsensusState::kUnknown};
> +  std::atomic<bool> state_refresh_thread_started{false}; // consensus service
>   };
>   
>   enum RDE_MSG_TYPE {
> diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
> index 456d2ce..1a7e587 100644
> --- a/src/rde/rded/rde_main.cc
> +++ b/src/rde/rded/rde_main.cc
> @@ -178,6 +178,19 @@ static void handle_mbx_event() {
>       case RDE_MSG_CONTROLLER_DOWN:
>         rde_cb->peer_controllers.erase(msg->fr_node_id);
>         TRACE("peer_controllers: size %zu", rde_cb->peer_controllers.size());
> +      if (role->role() == PCS_RDA_ACTIVE) {
> +        Consensus consensus_service;
> +        if (consensus_service.IsEnabled() == true &&
> +            rde_cb->consensus_service_state == ConsensusState::kDisconnected 
> &&
> +            consensus_service.IsRelaxedNodePromotionEnabled() == true &&
> +            role->IsPeerPresent() == false) {
> +            LOG_NO("Lost connectivity to consensus service. No peer 
> present");
> +            if (consensus_service.IsRemoteFencingEnabled() == false) {
> +                opensaf_quick_reboot("Lost connectivity to consensus 
> service. "
> +                                     "Rebooting this node");
> +            }
> +        }
> +      }
>         break;
>       case RDE_MSG_TAKEOVER_REQUEST_CALLBACK: {
>         rde_cb->monitor_takeover_req_thread_running = false;
> @@ -214,7 +227,7 @@ static void handle_mbx_event() {
>             if (consensus_service.IsRelaxedNodePromotionEnabled() == true) {
>                 if (rde_cb->state == State::kActiveElected) {
>                   TRACE("Relaxed mode is enabled");
> -                TRACE(" No peer SC yet seen, ignore consensus service 
> failure");
> +                TRACE("No peer SC yet seen, ignore consensus service 
> failure");
>                   // if relaxed node promotion is enabled, and we have yet to 
> see
>                   // a peer SC after being promoted, tolerate consensus 
> service
>                   // not working
> @@ -227,13 +240,14 @@ static void handle_mbx_event() {
>                   // we have seen the peer, and peer is still connected, 
> tolerate
>                   // consensus service not working
>                   fencing_required = false;
> +                rde_cb->consensus_service_state = 
> ConsensusState::kDisconnected;
>                 }
>             }
>             if (fencing_required == true) {
>               LOG_NO("Lost connectivity to consensus service");
>               if (consensus_service.IsRemoteFencingEnabled() == false) {
>                   opensaf_quick_reboot("Lost connectivity to consensus 
> service. "
> -                               "Rebooting this node");
> +                                     "Rebooting this node");
>               }
>             }
>           }
> diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
> index 3effc25..b8c8157 100644
> --- a/src/rde/rded/role.cc
> +++ b/src/rde/rded/role.cc
> @@ -215,6 +215,18 @@ timespec* Role::Poll(timespec* ts) {
>                       is_candidate).detach();
>         }
>       }
> +  } else if (role_ == PCS_RDA_ACTIVE) {
> +    RDE_CONTROL_BLOCK* cb = rde_get_control_block();
> +    if (cb->consensus_service_state == ConsensusState::kUnknown ||
> +        cb->consensus_service_state == ConsensusState::kDisconnected) {
> +      // consensus service was previously disconnected, refresh state
> +      Consensus consensus_service;
> +      if (consensus_service.IsEnabled() == true &&
> +        cb->state_refresh_thread_started == false) {
> +        cb->state_refresh_thread_started = true;
> +        std::thread(&Role::RefreshConsensusState, this, cb).detach();
> +      }
> +    }
>     }
>     return timeout;
>   }
> @@ -351,3 +363,15 @@ void Role::PromoteNodeLate() {
>                 this, cb->cluster_members.size(),
>                 true).detach();
>   }
> +
> +void Role::RefreshConsensusState(RDE_CONTROL_BLOCK* cb) {
> +  TRACE_ENTER();
> +
> +  Consensus consensus_service;
> +  if (consensus_service.IsWritable() == true) {
> +    LOG_NO("Connectivity to consensus service established");
> +    cb->consensus_service_state = ConsensusState::kConnected;
> +  }
> +
> +  cb->state_refresh_thread_started = false;
> +}
> diff --git a/src/rde/rded/role.h b/src/rde/rded/role.h
> index ecb67cf..9c63cbe 100644
> --- a/src/rde/rded/role.h
> +++ b/src/rde/rded/role.h
> @@ -30,6 +30,8 @@ namespace base {
>   class Process;
>   }
>   
> +struct RDE_CONTROL_BLOCK;
> +
>   class Role {
>    public:
>     explicit Role(NODE_ID own_node_id);
> @@ -45,6 +47,7 @@ class Role {
>                                 const std::string& new_value, SYSF_MBX mbx);
>     void NodePromoted();
>     void PromoteNodeLate();
> +  void RefreshConsensusState(RDE_CONTROL_BLOCK* cb);
>   
>    private:
>     static const uint64_t kDefaultDiscoverPeerTimeout = 2000;

_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to