Hi, a gentle reminder. /Thanks HansN -----Original Message----- From: Mathivanan Naickan Palanivelu [mailto:mathi.naic...@oracle.com] Sent: den 14 juli 2016 11:37 To: Anders Widell <anders.wid...@ericsson.com>; Hans Nordebäck <hans.nordeb...@ericsson.com>; Praveen Malviya <praveen.malv...@oracle.com>; Ramesh Babu Betham <ramesh.bet...@oracle.com> Cc: opensaf-devel@lists.sourceforge.net Subject: RE: [PATCH 1 of 2] fm: Add support for self-fencing [#1859]
I am testing this, shall revert by tomorrow. Thanks, Mathi. > -----Original Message----- > From: Anders Widell [mailto:anders.wid...@ericsson.com] > Sent: Tuesday, July 12, 2016 7:08 PM > To: Hans Nordeback; Praveen Malviya; Mathivanan Naickan Palanivelu; > Ramesh Babu Betham > Cc: opensaf-devel@lists.sourceforge.net > Subject: Re: [PATCH 1 of 2] fm: Add support for self-fencing [#1859] > > One comment: in the prototype patch the feature was on by default, but > it ought to be off by default when we introduce this feature officially. > > / Anders Widell > > On 06/30/2016 10:32 AM, Anders Widell wrote: > > Hi! > > > > This patch is actually identical to the prototype code that I wrote > > and attached to the ticket, so I am not sure if I am supposed to > > also review it... anyways it is ack from me for the first > > patch. :-) > > > > regards, > > Anders Widell > > > > On 06/23/2016 07:31 AM, Hans Nordeback wrote: > >> osaf/services/infrastructure/fm/fms/fm_cb.h | 10 +++++ > >> osaf/services/infrastructure/fm/fms/fm_main.c | 16 +++++++- > >> osaf/services/infrastructure/fm/fms/fm_mds.c | 51 > >> +++++++++++++++++++++++++++ > >> 3 files changed, 75 insertions(+), 2 deletions(-) > >> > >> > >> In situations where remote fencing is not possible, this patch adds > >> support for self-fencing. 
> >> > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h > >> b/osaf/services/infrastructure/fm/fms/fm_cb.h > >> --- a/osaf/services/infrastructure/fm/fms/fm_cb.h > >> +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h > >> @@ -27,6 +27,10 @@ > >> #include "rda_papi.h" > >> #include "fm_amf.h" > >> +#include <stdbool.h> > >> +#include <stdint.h> > >> +#include <time.h> > >> + > >> uint32_t gl_fm_hdl; > >> typedef enum { > >> @@ -92,6 +96,12 @@ typedef struct fm_cb { > >> bool amfnd_down; > >> bool amfd_down; > >> bool fm_down; > >> + > >> + bool peer_sc_up; > >> + bool well_connected; > >> + uint64_t cluster_size; > >> + struct timespec last_well_connected; > >> + struct timespec node_isolation_timeout; > >> } FM_CB; > >> extern char *role_string[]; > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c > >> b/osaf/services/infrastructure/fm/fms/fm_main.c > >> --- a/osaf/services/infrastructure/fm/fms/fm_main.c > >> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c > >> @@ -30,7 +30,7 @@ This file contains the main() routine fo > >> #include <nid_api.h> > >> #include "fm.h" > >> - > >> +#include "osaf_time.h" > >> enum { > >> FD_TERM = 0, > >> @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb > >> fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE; > >> fm_cb->activation_supervision_tmr.type = > >> FM_TMR_ACTIVATION_SUPERVISION; > >> - TRACE_LEAVE(); > >> + char* node_isolation_timeout = > >> getenv("FMS_NODE_ISOLATION_TIMEOUT"); > >> + if (node_isolation_timeout != NULL) { > >> + osaf_millis_to_timespec(atoi(node_isolation_timeout), > >> + &fm_cb->node_isolation_timeout); > >> + } else { > >> + fm_cb->node_isolation_timeout.tv_sec = 10; > >> + fm_cb->node_isolation_timeout.tv_nsec = 0; > >> + } > >> + TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld", > >> + (int64_t) fm_cb->node_isolation_timeout.tv_sec, > >> + fm_cb->node_isolation_timeout.tv_nsec); > >> + > >> + TRACE_LEAVE(); > >> return NCSCC_RC_SUCCESS; > >> } > >> diff 
--git a/osaf/services/infrastructure/fm/fms/fm_mds.c > >> b/osaf/services/infrastructure/fm/fms/fm_mds.c > >> --- a/osaf/services/infrastructure/fm/fms/fm_mds.c > >> +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c > >> @@ -16,6 +16,8 @@ > >> */ > >> #include "fm.h" > >> +#include "osaf_time.h" > >> +#include "ncssysf_def.h" > >> const MDS_CLIENT_MSG_FORMAT_VER > >> fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = { > FM_FM_MSG_FMT_VER_1 }; > >> @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E > >> static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info); > >> static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO *enc_info); > >> static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO *dec_info); > >> +static void check_for_node_isolation(FM_CB *cb); static bool > >> +has_been_well_connected_recently(FM_CB *cb); > >> static uint32_t fm_mds_node_evt(FM_CB *cb, > >> MDS_CALLBACK_NODE_EVENT_INFO * node_evt); > >> static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT > >> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code); > >> @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C > >> return; > >> } > >> +static void check_for_node_isolation(FM_CB *cb) > >> +{ > >> + bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3; > >> + if (cb->well_connected && !well_connected) { > >> + osaf_clock_gettime(CLOCK_MONOTONIC, &cb- > >last_well_connected); > >> + } > >> + cb->well_connected = well_connected; } > >> + > >> +static bool has_been_well_connected_recently(FM_CB *cb) { > >> + if (cb->well_connected) return true; > >> + struct timespec current; > >> + struct timespec difference; > >> + osaf_clock_gettime(CLOCK_MONOTONIC, &current); > >> + if (osaf_timespec_compare(&current, &cb->last_well_connected) > >> +< > >> 0) return false; > >> + osaf_timespec_subtract(&current, &cb->last_well_connected, > >> &difference); > >> + if (osaf_timespec_compare(&difference, > >> &cb->node_isolation_timeout) < 0) return true; > >> + return false; > >> +} > >> + > >> > 
/********************************************************** > ****************** > >> * Name : fm_mds_node_evt > >> * > >> @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c > >> switch (node_evt->node_chg) { > >> case NCSMDS_NODE_DOWN: > >> + if (cb->cluster_size != 0) { > >> + --cb->cluster_size; > >> + TRACE("Node down event for node id %x, cluster size is > >> now: %llu", > >> + node_evt->node_id, (unsigned long long) > >> cb->cluster_size); > >> + check_for_node_isolation(cb); > >> + if (cb->cluster_size == 1 && > >> has_been_well_connected_recently(cb)) { > >> + opensaf_reboot(0, NULL, > >> + "Self-fencing due to sudden loss of > >> + contact > >> with the rest of the cluster"); > >> + } > >> + } else { > >> + TRACE("Node down event for node id %x ignored", > >> node_evt->node_id); > >> + LOG_ER("Received unexpected node down event for node > >> + id > >> %x", node_evt->node_id); > >> + } > >> + > >> if (node_evt->node_id == cb->peer_node_id && > >> cb->control_tipc) { > >> /* Process NODE_DOWN only if OpenSAF is controling TIPC */ > >> LOG_NO("Node Down event for node id %x:", > >> node_evt->node_id); @@ -326,6 +365,10 @@ static uint32_t > >> fm_mds_node_evt(FM_CB *c > >> break; > >> case NCSMDS_NODE_UP: > >> + ++cb->cluster_size; > >> + TRACE("Node up event for node id %x, cluster size is now: > >> %llu", > >> + node_evt->node_id, (unsigned long long) > >> cb->cluster_size); > >> + check_for_node_isolation(cb); > >> break; > >> default: > >> @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb > >> /* Depend on service downs if OpenSAF is not > >> controling TIPC */ > >> case NCSMDS_SVC_ID_GFM: > >> if (svc_evt->i_node_id == cb->peer_node_id) { > >> + TRACE("Peer fm status change: %d -> %d, peer > >> node id is: %x, cluster size is %llu", > >> + (int) cb->peer_sc_up, 0, > >> svc_evt->i_node_id, (unsigned long long) cb->cluster_size); > >> + cb->peer_sc_up = false; > >> + check_for_node_isolation(cb); > >> cb->peer_adest = 0; > >> if 
(!cb->control_tipc) { > >> fm_send_svc_down_to_mbx(cb, > >> svc_evt->i_node_id, svc_evt->i_svc_id); @@ -415,6 +462,10 @@ static > >> uint32_t fm_mds_svc_evt(FM_CB *cb > >> switch (svc_evt->i_svc_id) { > >> case NCSMDS_SVC_ID_GFM: > >> if ((svc_evt->i_node_id != cb->node_id) && > >> (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) { > >> + TRACE("Peer fm status change: %d -> %d, peer node > >> + id > >> is: %x, cluster size is %llu", > >> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, > >> (unsigned long long) cb->cluster_size); > >> + cb->peer_sc_up = true; > >> + check_for_node_isolation(cb); > >> fm_evt = m_MMGR_ALLOC_FM_EVT; > >> if (NULL == fm_evt) { > > > ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel