When a node goes down and split-brain prevention is enabled, check that we still have write access to the consensus service. If we do not, and remote fencing is disabled, reboot this node to prevent split-brain. --- src/amf/amfd/ndproc.cc | 12 +++++++++++- src/amf/amfd/osaf-amfd.in | 4 ++++ src/amf/amfd/role.cc | 30 +++++++++++++++++++++++++----- 3 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index 0c6316627..df68b3dbf 100644 --- a/src/amf/amfd/ndproc.cc +++ b/src/amf/amfd/ndproc.cc @@ -32,8 +32,8 @@ */ #include "osaf/immutil/immutil.h" +#include "osaf/consensus/service.h" #include "base/logtrace.h" - #include "amf/amfd/amfd.h" #include "amf/amfd/imm.h" #include "amf/amfd/cluster.h" @@ -1221,5 +1221,15 @@ void avd_node_failover(AVD_AVND *node) { avd_pg_node_csi_del_all(avd_cb, node); avd_node_down_mw_susi_failover(avd_cb, node); avd_node_down_appl_susi_failover(avd_cb, node); + + Consensus consensus_service; + if (consensus_service.IsRemoteFencingEnabled() == false && + consensus_service.IsWritable() == false) { + // remote fencing is disabled and we have lost write access + // reboot this node to prevent split brain + opensaf_reboot(0, nullptr, + "Quorum lost. Rebooting this node to prevent split-brain"); + } + TRACE_LEAVE(); } diff --git a/src/amf/amfd/osaf-amfd.in b/src/amf/amfd/osaf-amfd.in index 45c5ab9e4..26a77ef52 100644 --- a/src/amf/amfd/osaf-amfd.in +++ b/src/amf/amfd/osaf-amfd.in @@ -28,6 +28,10 @@ else . $pkgsysconfdir/amfd.conf fi +if [ -f "$pkgsysconfdir/fmd.conf" ]; then + . 
"$pkgsysconfdir/fmd.conf" +fi + binary=$pkglibdir/$osafprog pidfile=$pkgpiddir/$osafprog.pid lockfile=$lockdir/$initscript diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc index 865d89d94..862ac3653 100644 --- a/src/amf/amfd/role.cc +++ b/src/amf/amfd/role.cc @@ -38,6 +38,7 @@ #include "osaf/immutil/immutil.h" #include "base/logtrace.h" #include "rde/agent/rda_papi.h" +#include "osaf/consensus/service.h" #include "amf/amfd/amfd.h" #include "amf/amfd/imm.h" @@ -1085,6 +1086,12 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) { avd_d2n_msg_dequeue(cb); } + Consensus consensus_service; + rc = consensus_service.DemoteThisNode(); + if (rc != SA_AIS_OK) { + LOG_ER("Failed to demote this node from consensus service"); + } + TRACE_LEAVE(); return NCSCC_RC_SUCCESS; } @@ -1209,13 +1216,21 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) { cb->avail_state_avd = SA_AMF_HA_ACTIVE; osaf_mutex_unlock_ordie(&imm_reinit_mutex); + Consensus consensus_service; + rc = consensus_service.PromoteThisNode(); + if (rc != SA_AIS_OK) { + LOG_ER("Unable to set active controller in consensus service"); + osafassert(false); + } + /* Declare this standby as Active. 
Set Vdest role role */ if (NCSCC_RC_SUCCESS != (status = avd_mds_set_vdest_role(cb, SA_AMF_HA_ACTIVE))) { LOG_ER("Switch Standby --> Active FAILED, MDS role set failed"); cb->swap_switch = false; avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE); - return NCSCC_RC_FAILURE; + status = NCSCC_RC_FAILURE; + goto done; } /* Time to send fail-over messages to all the AVND's */ @@ -1240,7 +1255,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) { } else { cb->swap_switch = false; avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE); - return NCSCC_RC_FAILURE; + status = NCSCC_RC_FAILURE; + goto done; } } @@ -1259,7 +1275,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) { in avd_imm_reinit_bg_thread.*/ } else { avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE); - return NCSCC_RC_FAILURE; + status = NCSCC_RC_FAILURE; + goto done; } } else osaf_mutex_unlock_ordie(&imm_reinit_mutex); @@ -1274,7 +1291,8 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) { LOG_ER("Switch Standby --> Active, clm track start failed"); Fifo::queue(new ClmTrackStart()); avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE); - return NCSCC_RC_FAILURE; + status = NCSCC_RC_FAILURE; + goto done; } /* Send the message to other avd for role change rsp as success */ @@ -1291,8 +1309,10 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) { } } + status = NCSCC_RC_SUCCESS; +done: TRACE_LEAVE(); - return NCSCC_RC_SUCCESS; + return status; } /****************************************************************************\ -- 2.14.1 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel