osaf/services/infrastructure/fm/fms/fm_cb.h | 10 +++++ osaf/services/infrastructure/fm/fms/fm_main.c | 16 +++++++- osaf/services/infrastructure/fm/fms/fm_mds.c | 51 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 2 deletions(-)
In situations where remote fencing is not possible, this patch adds support for self-fencing. diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h b/osaf/services/infrastructure/fm/fms/fm_cb.h --- a/osaf/services/infrastructure/fm/fms/fm_cb.h +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h @@ -27,6 +27,10 @@ #include "rda_papi.h" #include "fm_amf.h" +#include <stdbool.h> +#include <stdint.h> +#include <time.h> + uint32_t gl_fm_hdl; typedef enum { @@ -92,6 +96,12 @@ typedef struct fm_cb { bool amfnd_down; bool amfd_down; bool fm_down; + + bool peer_sc_up; + bool well_connected; + uint64_t cluster_size; + struct timespec last_well_connected; + struct timespec node_isolation_timeout; } FM_CB; extern char *role_string[]; diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c b/osaf/services/infrastructure/fm/fms/fm_main.c --- a/osaf/services/infrastructure/fm/fms/fm_main.c +++ b/osaf/services/infrastructure/fm/fms/fm_main.c @@ -30,7 +30,7 @@ This file contains the main() routine fo #include <nid_api.h> #include "fm.h" - +#include "osaf_time.h" enum { FD_TERM = 0, @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE; fm_cb->activation_supervision_tmr.type = FM_TMR_ACTIVATION_SUPERVISION; - TRACE_LEAVE(); + char* node_isolation_timeout = getenv("FMS_NODE_ISOLATION_TIMEOUT"); + if (node_isolation_timeout != NULL) { + osaf_millis_to_timespec(atoi(node_isolation_timeout), + &fm_cb->node_isolation_timeout); + } else { + fm_cb->node_isolation_timeout.tv_sec = 10; + fm_cb->node_isolation_timeout.tv_nsec = 0; + } + TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld", + (int64_t) fm_cb->node_isolation_timeout.tv_sec, + fm_cb->node_isolation_timeout.tv_nsec); + + TRACE_LEAVE(); return NCSCC_RC_SUCCESS; } diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c b/osaf/services/infrastructure/fm/fms/fm_mds.c --- a/osaf/services/infrastructure/fm/fms/fm_mds.c +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c 
@@ -16,6 +16,8 @@ */ #include "fm.h" +#include "osaf_time.h" +#include "ncssysf_def.h" const MDS_CLIENT_MSG_FORMAT_VER fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = { FM_FM_MSG_FMT_VER_1 }; @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info); static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO *enc_info); static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO *dec_info); +static void check_for_node_isolation(FM_CB *cb); +static bool has_been_well_connected_recently(FM_CB *cb); static uint32_t fm_mds_node_evt(FM_CB *cb, MDS_CALLBACK_NODE_EVENT_INFO * node_evt); static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code); @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C return; } +static void check_for_node_isolation(FM_CB *cb) +{ + bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3; + if (cb->well_connected && !well_connected) { + osaf_clock_gettime(CLOCK_MONOTONIC, &cb->last_well_connected); + } + cb->well_connected = well_connected; +} + +static bool has_been_well_connected_recently(FM_CB *cb) +{ + if (cb->well_connected) return true; + struct timespec current; + struct timespec difference; + osaf_clock_gettime(CLOCK_MONOTONIC, &current); + if (osaf_timespec_compare(&current, &cb->last_well_connected) < 0) return false; + osaf_timespec_subtract(&current, &cb->last_well_connected, &difference); + if (osaf_timespec_compare(&difference, &cb->node_isolation_timeout) < 0) return true; + return false; +} + /**************************************************************************** * Name : fm_mds_node_evt * @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c switch (node_evt->node_chg) { case NCSMDS_NODE_DOWN: + if (cb->cluster_size != 0) { + --cb->cluster_size; + TRACE("Node down event for node id %x, cluster size is now: %llu", + node_evt->node_id, (unsigned long long) cb->cluster_size); + check_for_node_isolation(cb); + if 
(cb->cluster_size == 1 && has_been_well_connected_recently(cb)) { + opensaf_reboot(0, NULL, + "Self-fencing due to sudden loss of contact with the rest of the cluster"); + } + } else { + TRACE("Node down event for node id %x ignored", node_evt->node_id); + LOG_ER("Received unexpected node down event for node id %x", node_evt->node_id); + } + if (node_evt->node_id == cb->peer_node_id && cb->control_tipc) { /* Process NODE_DOWN only if OpenSAF is controling TIPC */ LOG_NO("Node Down event for node id %x:", node_evt->node_id); @@ -326,6 +365,10 @@ static uint32_t fm_mds_node_evt(FM_CB *c break; case NCSMDS_NODE_UP: + ++cb->cluster_size; + TRACE("Node up event for node id %x, cluster size is now: %llu", + node_evt->node_id, (unsigned long long) cb->cluster_size); + check_for_node_isolation(cb); break; default: @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb /* Depend on service downs if OpenSAF is not controling TIPC */ case NCSMDS_SVC_ID_GFM: if (svc_evt->i_node_id == cb->peer_node_id) { + TRACE("Peer fm status change: %d -> %d, peer node id is: %x, cluster size is %llu", + (int) cb->peer_sc_up, 0, svc_evt->i_node_id, (unsigned long long) cb->cluster_size); + cb->peer_sc_up = false; + check_for_node_isolation(cb); cb->peer_adest = 0; if (!cb->control_tipc) { fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, svc_evt->i_svc_id); @@ -415,6 +462,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb switch (svc_evt->i_svc_id) { case NCSMDS_SVC_ID_GFM: if ((svc_evt->i_node_id != cb->node_id) && (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) { + TRACE("Peer fm status change: %d -> %d, peer node id is: %x, cluster size is %llu", + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, (unsigned long long) cb->cluster_size); + cb->peer_sc_up = true; + check_for_node_isolation(cb); fm_evt = m_MMGR_ALLOC_FM_EVT; if (NULL == fm_evt) { ------------------------------------------------------------------------------ Attend Shape: An AT&T Tech Expo July 15-16. 
Meet us at AT&T Park in San Francisco, CA to explore cutting-edge tech and listen to tech luminaries present their vision of the future. This family event has something for everyone, including kids. Get more information and register today. http://sdm.link/attshape _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel