It's an ack from me after fixing Anders's comments.
We can take up further comments as a next step.

Mathi.


> -----Original Message-----
> From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com]
> Sent: Wednesday, August 03, 2016 1:18 PM
> To: Mathivanan Naickan Palanivelu; Praveen Malviya; Ramesh Babu Betham
> Cc: opensaf-devel@lists.sourceforge.net; Anders Widell
> Subject: RE: [PATCH 1 of 2] fm: Add support for self-fencing [#1859]
> 
> Hi, a gentle reminder. /Thanks HansN
> 
> -----Original Message-----
> From: Mathivanan Naickan Palanivelu [mailto:mathi.naic...@oracle.com]
> Sent: den 14 juli 2016 11:37
> To: Anders Widell <anders.wid...@ericsson.com>; Hans Nordebäck
> <hans.nordeb...@ericsson.com>; Praveen Malviya
> <praveen.malv...@oracle.com>; Ramesh Babu Betham
> <ramesh.bet...@oracle.com>
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: RE: [PATCH 1 of 2] fm: Add support for self-fencing [#1859]
> 
> I am testing this and shall revert by tomorrow.
> Thanks,
> Mathi.
> 
> > -----Original Message-----
> > From: Anders Widell [mailto:anders.wid...@ericsson.com]
> > Sent: Tuesday, July 12, 2016 7:08 PM
> > To: Hans Nordeback; Praveen Malviya; Mathivanan Naickan Palanivelu;
> > Ramesh Babu Betham
> > Cc: opensaf-devel@lists.sourceforge.net
> > Subject: Re: [PATCH 1 of 2] fm: Add support for self-fencing [#1859]
> >
> > One comment: in the prototype patch the feature was on by default, but
> > it ought to be off by default when we introduce this feature officially.
> >
> > / Anders Widell
> >
> > On 06/30/2016 10:32 AM, Anders Widell wrote:
> > > Hi!
> > >
> > > This patch is actually identical to the prototype code that I wrote
> > > and attached to the ticket, so I am not sure if I am supposed to
> > > > also review it... anyway, it is an ack from me for the first
> > > patch. :-)
> > >
> > > regards,
> > > Anders Widell
> > >
> > > On 06/23/2016 07:31 AM, Hans Nordeback wrote:
> > >> osaf/services/infrastructure/fm/fms/fm_cb.h   |  10 +++++
> > >>   osaf/services/infrastructure/fm/fms/fm_main.c |  16 +++++++-
> > >>   osaf/services/infrastructure/fm/fms/fm_mds.c  |  51
> > >> +++++++++++++++++++++++++++
> > >>   3 files changed, 75 insertions(+), 2 deletions(-)
> > >>
> > >>
> > >> In situations where remote fencing is not possible, this patch adds
> > >> support for self-fencing.
> > >>
> > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h
> > >> b/osaf/services/infrastructure/fm/fms/fm_cb.h
> > >> --- a/osaf/services/infrastructure/fm/fms/fm_cb.h
> > >> +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
> > >> @@ -27,6 +27,10 @@
> > >>   #include "rda_papi.h"
> > >>   #include "fm_amf.h"
> > >>   +#include <stdbool.h>
> > >> +#include <stdint.h>
> > >> +#include <time.h>
> > >> +
> > >>   uint32_t gl_fm_hdl;
> > >>     typedef enum {
> > >> @@ -92,6 +96,12 @@ typedef struct fm_cb {
> > >>       bool amfnd_down;
> > >>       bool amfd_down;
> > >>       bool fm_down;
> > >> +
> > >> +    bool peer_sc_up;
> > >> +    bool well_connected;
> > >> +    uint64_t cluster_size;
> > >> +    struct timespec last_well_connected;
> > >> +    struct timespec node_isolation_timeout;
> > >>   } FM_CB;
> > >>     extern char *role_string[];
> > >> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c
> > >> b/osaf/services/infrastructure/fm/fms/fm_main.c
> > >> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
> > >> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
> > >> @@ -30,7 +30,7 @@ This file contains the main() routine fo
> > >>     #include <nid_api.h>
> > >>   #include "fm.h"
> > >> -
> > >> +#include "osaf_time.h"
> > >>     enum {
> > >>       FD_TERM = 0,
> > >> @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb
> > >>       fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE;
> > >>       fm_cb->activation_supervision_tmr.type =
> > >> FM_TMR_ACTIVATION_SUPERVISION;
> > >>   -      TRACE_LEAVE();
> > >> +    char* node_isolation_timeout =
> > >> getenv("FMS_NODE_ISOLATION_TIMEOUT");
> > >> +    if (node_isolation_timeout != NULL) {
> > >> +        osaf_millis_to_timespec(atoi(node_isolation_timeout),
> > >> +                    &fm_cb->node_isolation_timeout);
> > >> +    } else {
> > >> +        fm_cb->node_isolation_timeout.tv_sec = 10;
> > >> +        fm_cb->node_isolation_timeout.tv_nsec = 0;
> > >> +    }
> > >> +    TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld",
> > >> +          (int64_t) fm_cb->node_isolation_timeout.tv_sec,
> > >> +          fm_cb->node_isolation_timeout.tv_nsec);
> > >> +
> > >> +    TRACE_LEAVE();
> > >>       return NCSCC_RC_SUCCESS;
> > >>   }
> > >>   diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c
> > >> b/osaf/services/infrastructure/fm/fms/fm_mds.c
> > >> --- a/osaf/services/infrastructure/fm/fms/fm_mds.c
> > >> +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c
> > >> @@ -16,6 +16,8 @@
> > >>   */
> > >>     #include "fm.h"
> > >> +#include "osaf_time.h"
> > >> +#include "ncssysf_def.h"
> > >>     const MDS_CLIENT_MSG_FORMAT_VER
> > >> fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = {
> > FM_FM_MSG_FMT_VER_1 };
> > >>   @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E
> > >>   static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info);
> > >>   static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO
> *enc_info);
> > >>   static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO
> *dec_info);
> > >> +static void check_for_node_isolation(FM_CB *cb); static bool
> > >> +has_been_well_connected_recently(FM_CB *cb);
> > >>   static uint32_t fm_mds_node_evt(FM_CB *cb,
> > >> MDS_CALLBACK_NODE_EVENT_INFO * node_evt);
> > >>   static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT
> > >> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code);
> > >>   @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C
> > >>       return;
> > >>   }
> > >>   +static void check_for_node_isolation(FM_CB *cb)
> > >> +{
> > >> +    bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3;
> > >> +    if (cb->well_connected && !well_connected) {
> > >> +        osaf_clock_gettime(CLOCK_MONOTONIC, &cb-
> > >last_well_connected);
> > >> +    }
> > >> +    cb->well_connected = well_connected; }
> > >> +
> > >> +static bool has_been_well_connected_recently(FM_CB *cb) {
> > >> +    if (cb->well_connected) return true;
> > >> +    struct timespec current;
> > >> +    struct timespec difference;
> > >> +    osaf_clock_gettime(CLOCK_MONOTONIC, &current);
> > >> +    if (osaf_timespec_compare(&current, &cb->last_well_connected)
> > >> +<
> > >> 0) return false;
> > >> +    osaf_timespec_subtract(&current, &cb->last_well_connected,
> > >> &difference);
> > >> +    if (osaf_timespec_compare(&difference,
> > >> &cb->node_isolation_timeout) < 0) return true;
> > >> +    return false;
> > >> +}
> > >> +
> > >>
> >
> /**********************************************************
> > ******************
> > >>   * Name          : fm_mds_node_evt
> > >>   *
> > >> @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c
> > >>         switch (node_evt->node_chg) {
> > >>       case NCSMDS_NODE_DOWN:
> > >> +        if (cb->cluster_size != 0) {
> > >> +            --cb->cluster_size;
> > >> +            TRACE("Node down event for node id %x, cluster size is
> > >> now: %llu",
> > >> +                  node_evt->node_id, (unsigned long long)
> > >> cb->cluster_size);
> > >> +            check_for_node_isolation(cb);
> > >> +            if (cb->cluster_size == 1 &&
> > >> has_been_well_connected_recently(cb)) {
> > >> +                opensaf_reboot(0, NULL,
> > >> +                        "Self-fencing due to sudden loss of
> > >> + contact
> > >> with the rest of the cluster");
> > >> +            }
> > >> +        } else {
> > >> +            TRACE("Node down event for node id %x ignored",
> > >> node_evt->node_id);
> > >> +            LOG_ER("Received unexpected node down event for node
> > >> + id
> > >> %x", node_evt->node_id);
> > >> +        }
> > >> +
> > >>           if (node_evt->node_id == cb->peer_node_id &&
> > >> cb->control_tipc) {
> > >>               /* Process NODE_DOWN only if OpenSAF is controling TIPC */
> > >>               LOG_NO("Node Down event for node id %x:",
> > >> node_evt->node_id); @@ -326,6 +365,10 @@ static uint32_t
> > >> fm_mds_node_evt(FM_CB *c
> > >>           break;
> > >>         case NCSMDS_NODE_UP:
> > >> +        ++cb->cluster_size;
> > >> +        TRACE("Node up event for node id %x, cluster size is now:
> > >> %llu",
> > >> +              node_evt->node_id, (unsigned long long)
> > >> cb->cluster_size);
> > >> +        check_for_node_isolation(cb);
> > >>           break;
> > >>         default:
> > >> @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
> > >>               /* Depend on service downs if OpenSAF is not
> > >> controling TIPC */
> > >>               case NCSMDS_SVC_ID_GFM:
> > >>                   if (svc_evt->i_node_id == cb->peer_node_id) {
> > >> +                    TRACE("Peer fm status change: %d -> %d, peer
> > >> node id is: %x, cluster size is %llu",
> > >> +                          (int) cb->peer_sc_up, 0,
> > >> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> > >> +                    cb->peer_sc_up = false;
> > >> +                    check_for_node_isolation(cb);
> > >>                       cb->peer_adest = 0;
> > >>                       if (!cb->control_tipc) {
> > >>                           fm_send_svc_down_to_mbx(cb,
> > >> svc_evt->i_node_id, svc_evt->i_svc_id); @@ -415,6 +462,10 @@ static
> > >> uint32_t fm_mds_svc_evt(FM_CB *cb
> > >>           switch (svc_evt->i_svc_id) {
> > >>           case NCSMDS_SVC_ID_GFM:
> > >>               if ((svc_evt->i_node_id != cb->node_id) &&
> > >> (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) {
> > >> +                TRACE("Peer fm status change: %d -> %d, peer node
> > >> + id
> > >> is: %x, cluster size is %llu",
> > >> +                      (int) cb->peer_sc_up, 1, svc_evt->i_node_id,
> > >> (unsigned long long) cb->cluster_size);
> > >> +                cb->peer_sc_up = true;
> > >> +                check_for_node_isolation(cb);
> > >>                     fm_evt = m_MMGR_ALLOC_FM_EVT;
> > >>                   if (NULL == fm_evt) {
> > >
> >

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to