It's an ack from me after fixing Anders' comments. We can take up further comments in a follow-up.
Mathi. > -----Original Message----- > From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com] > Sent: Wednesday, August 03, 2016 1:18 PM > To: Mathivanan Naickan Palanivelu; Praveen Malviya; Ramesh Babu Betham > Cc: opensaf-devel@lists.sourceforge.net; Anders Widell > Subject: RE: [PATCH 1 of 2] fm: Add support for self-fencing [#1859] > > Hi, a gentle reminder. /Thanks HansN > > -----Original Message----- > From: Mathivanan Naickan Palanivelu [mailto:mathi.naic...@oracle.com] > Sent: den 14 juli 2016 11:37 > To: Anders Widell <anders.wid...@ericsson.com>; Hans Nordebäck > <hans.nordeb...@ericsson.com>; Praveen Malviya > <praveen.malv...@oracle.com>; Ramesh Babu Betham > <ramesh.bet...@oracle.com> > Cc: opensaf-devel@lists.sourceforge.net > Subject: RE: [PATCH 1 of 2] fm: Add support for self-fencing [#1859] > > I am testing this, shall revert by tomorrow. > Thanks, > Mathi. > > > -----Original Message----- > > From: Anders Widell [mailto:anders.wid...@ericsson.com] > > Sent: Tuesday, July 12, 2016 7:08 PM > > To: Hans Nordeback; Praveen Malviya; Mathivanan Naickan Palanivelu; > > Ramesh Babu Betham > > Cc: opensaf-devel@lists.sourceforge.net > > Subject: Re: [PATCH 1 of 2] fm: Add support for self-fencing [#1859] > > > > One comment: in the prototype patch the feature was on by default, but > > it ought to be off by default when we introduce this feature officially. > > > > / Anders Widell > > > > On 06/30/2016 10:32 AM, Anders Widell wrote: > > > Hi! > > > > > > This patch is actually identical to the prototype code that I wrote > > > and attached to the ticket, so I am not sure if I am supposed to > > > also review it... anyways it is ack from me for the first > > > patch. 
:-) > > > > > > regards, > > > Anders Widell > > > > > > On 06/23/2016 07:31 AM, Hans Nordeback wrote: > > >> osaf/services/infrastructure/fm/fms/fm_cb.h | 10 +++++ > > >> osaf/services/infrastructure/fm/fms/fm_main.c | 16 +++++++- > > >> osaf/services/infrastructure/fm/fms/fm_mds.c | 51 > > >> +++++++++++++++++++++++++++ > > >> 3 files changed, 75 insertions(+), 2 deletions(-) > > >> > > >> > > >> In situations where remote fencing is not possible, this patch adds > > >> support for self-fencing. > > >> > > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h > > >> b/osaf/services/infrastructure/fm/fms/fm_cb.h > > >> --- a/osaf/services/infrastructure/fm/fms/fm_cb.h > > >> +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h > > >> @@ -27,6 +27,10 @@ > > >> #include "rda_papi.h" > > >> #include "fm_amf.h" > > >> +#include <stdbool.h> > > >> +#include <stdint.h> > > >> +#include <time.h> > > >> + > > >> uint32_t gl_fm_hdl; > > >> typedef enum { > > >> @@ -92,6 +96,12 @@ typedef struct fm_cb { > > >> bool amfnd_down; > > >> bool amfd_down; > > >> bool fm_down; > > >> + > > >> + bool peer_sc_up; > > >> + bool well_connected; > > >> + uint64_t cluster_size; > > >> + struct timespec last_well_connected; > > >> + struct timespec node_isolation_timeout; > > >> } FM_CB; > > >> extern char *role_string[]; > > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c > > >> b/osaf/services/infrastructure/fm/fms/fm_main.c > > >> --- a/osaf/services/infrastructure/fm/fms/fm_main.c > > >> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c > > >> @@ -30,7 +30,7 @@ This file contains the main() routine fo > > >> #include <nid_api.h> > > >> #include "fm.h" > > >> - > > >> +#include "osaf_time.h" > > >> enum { > > >> FD_TERM = 0, > > >> @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb > > >> fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE; > > >> fm_cb->activation_supervision_tmr.type = > > >> FM_TMR_ACTIVATION_SUPERVISION; > > >> - TRACE_LEAVE(); > > 
>> + char* node_isolation_timeout = > > >> getenv("FMS_NODE_ISOLATION_TIMEOUT"); > > >> + if (node_isolation_timeout != NULL) { > > >> + osaf_millis_to_timespec(atoi(node_isolation_timeout), > > >> + &fm_cb->node_isolation_timeout); > > >> + } else { > > >> + fm_cb->node_isolation_timeout.tv_sec = 10; > > >> + fm_cb->node_isolation_timeout.tv_nsec = 0; > > >> + } > > >> + TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld", > > >> + (int64_t) fm_cb->node_isolation_timeout.tv_sec, > > >> + fm_cb->node_isolation_timeout.tv_nsec); > > >> + > > >> + TRACE_LEAVE(); > > >> return NCSCC_RC_SUCCESS; > > >> } > > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c > > >> b/osaf/services/infrastructure/fm/fms/fm_mds.c > > >> --- a/osaf/services/infrastructure/fm/fms/fm_mds.c > > >> +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c > > >> @@ -16,6 +16,8 @@ > > >> */ > > >> #include "fm.h" > > >> +#include "osaf_time.h" > > >> +#include "ncssysf_def.h" > > >> const MDS_CLIENT_MSG_FORMAT_VER > > >> fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = { > > FM_FM_MSG_FMT_VER_1 }; > > >> @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E > > >> static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info); > > >> static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO > *enc_info); > > >> static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO > *dec_info); > > >> +static void check_for_node_isolation(FM_CB *cb); static bool > > >> +has_been_well_connected_recently(FM_CB *cb); > > >> static uint32_t fm_mds_node_evt(FM_CB *cb, > > >> MDS_CALLBACK_NODE_EVENT_INFO * node_evt); > > >> static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT > > >> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code); > > >> @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C > > >> return; > > >> } > > >> +static void check_for_node_isolation(FM_CB *cb) > > >> +{ > > >> + bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3; > > >> + if (cb->well_connected && !well_connected) { 
> > >> + osaf_clock_gettime(CLOCK_MONOTONIC, &cb- > > >last_well_connected); > > >> + } > > >> + cb->well_connected = well_connected; } > > >> + > > >> +static bool has_been_well_connected_recently(FM_CB *cb) { > > >> + if (cb->well_connected) return true; > > >> + struct timespec current; > > >> + struct timespec difference; > > >> + osaf_clock_gettime(CLOCK_MONOTONIC, &current); > > >> + if (osaf_timespec_compare(&current, &cb->last_well_connected) > > >> +< > > >> 0) return false; > > >> + osaf_timespec_subtract(&current, &cb->last_well_connected, > > >> &difference); > > >> + if (osaf_timespec_compare(&difference, > > >> &cb->node_isolation_timeout) < 0) return true; > > >> + return false; > > >> +} > > >> + > > >> > > > /********************************************************** > > ****************** > > >> * Name : fm_mds_node_evt > > >> * > > >> @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c > > >> switch (node_evt->node_chg) { > > >> case NCSMDS_NODE_DOWN: > > >> + if (cb->cluster_size != 0) { > > >> + --cb->cluster_size; > > >> + TRACE("Node down event for node id %x, cluster size is > > >> now: %llu", > > >> + node_evt->node_id, (unsigned long long) > > >> cb->cluster_size); > > >> + check_for_node_isolation(cb); > > >> + if (cb->cluster_size == 1 && > > >> has_been_well_connected_recently(cb)) { > > >> + opensaf_reboot(0, NULL, > > >> + "Self-fencing due to sudden loss of > > >> + contact > > >> with the rest of the cluster"); > > >> + } > > >> + } else { > > >> + TRACE("Node down event for node id %x ignored", > > >> node_evt->node_id); > > >> + LOG_ER("Received unexpected node down event for node > > >> + id > > >> %x", node_evt->node_id); > > >> + } > > >> + > > >> if (node_evt->node_id == cb->peer_node_id && > > >> cb->control_tipc) { > > >> /* Process NODE_DOWN only if OpenSAF is controling TIPC */ > > >> LOG_NO("Node Down event for node id %x:", > > >> node_evt->node_id); @@ -326,6 +365,10 @@ static uint32_t > > >> fm_mds_node_evt(FM_CB *c > > >> 
break; > > >> case NCSMDS_NODE_UP: > > >> + ++cb->cluster_size; > > >> + TRACE("Node up event for node id %x, cluster size is now: > > >> %llu", > > >> + node_evt->node_id, (unsigned long long) > > >> cb->cluster_size); > > >> + check_for_node_isolation(cb); > > >> break; > > >> default: > > >> @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb > > >> /* Depend on service downs if OpenSAF is not > > >> controling TIPC */ > > >> case NCSMDS_SVC_ID_GFM: > > >> if (svc_evt->i_node_id == cb->peer_node_id) { > > >> + TRACE("Peer fm status change: %d -> %d, peer > > >> node id is: %x, cluster size is %llu", > > >> + (int) cb->peer_sc_up, 0, > > >> svc_evt->i_node_id, (unsigned long long) cb->cluster_size); > > >> + cb->peer_sc_up = false; > > >> + check_for_node_isolation(cb); > > >> cb->peer_adest = 0; > > >> if (!cb->control_tipc) { > > >> fm_send_svc_down_to_mbx(cb, > > >> svc_evt->i_node_id, svc_evt->i_svc_id); @@ -415,6 +462,10 @@ static > > >> uint32_t fm_mds_svc_evt(FM_CB *cb > > >> switch (svc_evt->i_svc_id) { > > >> case NCSMDS_SVC_ID_GFM: > > >> if ((svc_evt->i_node_id != cb->node_id) && > > >> (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) { > > >> + TRACE("Peer fm status change: %d -> %d, peer node > > >> + id > > >> is: %x, cluster size is %llu", > > >> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, > > >> (unsigned long long) cb->cluster_size); > > >> + cb->peer_sc_up = true; > > >> + check_for_node_isolation(cb); > > >> fm_evt = m_MMGR_ALLOC_FM_EVT; > > >> if (NULL == fm_evt) { > > > > > ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel