Hi, a gentle reminder. /Thanks HansN

-----Original Message-----
From: Mathivanan Naickan Palanivelu [mailto:mathi.naic...@oracle.com] 
Sent: den 14 juli 2016 11:37
To: Anders Widell <anders.wid...@ericsson.com>; Hans Nordebäck 
<hans.nordeb...@ericsson.com>; Praveen Malviya <praveen.malv...@oracle.com>; 
Ramesh Babu Betham <ramesh.bet...@oracle.com>
Cc: opensaf-devel@lists.sourceforge.net
Subject: RE: [PATCH 1 of 2] fm: Add support for self-fencing [#1859]

I am testing this, shall revert by tomorrow.
Thanks,
Mathi.

> -----Original Message-----
> From: Anders Widell [mailto:anders.wid...@ericsson.com]
> Sent: Tuesday, July 12, 2016 7:08 PM
> To: Hans Nordeback; Praveen Malviya; Mathivanan Naickan Palanivelu; 
> Ramesh Babu Betham
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: Re: [PATCH 1 of 2] fm: Add support for self-fencing [#1859]
> 
> One comment: in the prototype patch the feature was on by default, but 
> it ought to be off by default when we introduce this feature officially.
> 
> / Anders Widell
> 
> On 06/30/2016 10:32 AM, Anders Widell wrote:
> > Hi!
> >
> > This patch is actually identical to the prototype code that I wrote 
> > and attached to the ticket, so I am not sure if I am supposed to 
> > also review it... anyways it is ack from me for the first 
> > patch. :-)
> >
> > regards,
> > Anders Widell
> >
> > On 06/23/2016 07:31 AM, Hans Nordeback wrote:
> >> osaf/services/infrastructure/fm/fms/fm_cb.h   |  10 +++++
> >>   osaf/services/infrastructure/fm/fms/fm_main.c |  16 +++++++-
> >>   osaf/services/infrastructure/fm/fms/fm_mds.c  |  51
> >> +++++++++++++++++++++++++++
> >>   3 files changed, 75 insertions(+), 2 deletions(-)
> >>
> >>
> >> In situations where remote fencing is not possible, this patch adds 
> >> support for self-fencing.
> >>
> >> diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h
> >> b/osaf/services/infrastructure/fm/fms/fm_cb.h
> >> --- a/osaf/services/infrastructure/fm/fms/fm_cb.h
> >> +++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
> >> @@ -27,6 +27,10 @@
> >>   #include "rda_papi.h"
> >>   #include "fm_amf.h"
> >>   +#include <stdbool.h>
> >> +#include <stdint.h>
> >> +#include <time.h>
> >> +
> >>   uint32_t gl_fm_hdl;
> >>     typedef enum {
> >> @@ -92,6 +96,12 @@ typedef struct fm_cb {
> >>       bool amfnd_down;
> >>       bool amfd_down;
> >>       bool fm_down;
> >> +
> >> +    bool peer_sc_up;
> >> +    bool well_connected;
> >> +    uint64_t cluster_size;
> >> +    struct timespec last_well_connected;
> >> +    struct timespec node_isolation_timeout;
> >>   } FM_CB;
> >>     extern char *role_string[];
> >> diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c
> >> b/osaf/services/infrastructure/fm/fms/fm_main.c
> >> --- a/osaf/services/infrastructure/fm/fms/fm_main.c
> >> +++ b/osaf/services/infrastructure/fm/fms/fm_main.c
> >> @@ -30,7 +30,7 @@ This file contains the main() routine fo
> >>     #include <nid_api.h>
> >>   #include "fm.h"
> >> -
> >> +#include "osaf_time.h"
> >>     enum {
> >>       FD_TERM = 0,
> >> @@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb
> >>       fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE;
> >>       fm_cb->activation_supervision_tmr.type = 
> >> FM_TMR_ACTIVATION_SUPERVISION;
> >>   -      TRACE_LEAVE();
> >> +    char* node_isolation_timeout =
> >> getenv("FMS_NODE_ISOLATION_TIMEOUT");
> >> +    if (node_isolation_timeout != NULL) {
> >> +        osaf_millis_to_timespec(atoi(node_isolation_timeout),
> >> +                    &fm_cb->node_isolation_timeout);
> >> +    } else {
> >> +        fm_cb->node_isolation_timeout.tv_sec = 10;
> >> +        fm_cb->node_isolation_timeout.tv_nsec = 0;
> >> +    }
> >> +    TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld",
> >> +          (int64_t) fm_cb->node_isolation_timeout.tv_sec,
> >> +          fm_cb->node_isolation_timeout.tv_nsec);
> >> +
> >> +    TRACE_LEAVE();
> >>       return NCSCC_RC_SUCCESS;
> >>   }
> >>   diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c
> >> b/osaf/services/infrastructure/fm/fms/fm_mds.c
> >> --- a/osaf/services/infrastructure/fm/fms/fm_mds.c
> >> +++ b/osaf/services/infrastructure/fm/fms/fm_mds.c
> >> @@ -16,6 +16,8 @@
> >>   */
> >>     #include "fm.h"
> >> +#include "osaf_time.h"
> >> +#include "ncssysf_def.h"
> >>     const MDS_CLIENT_MSG_FORMAT_VER 
> >> fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = {
> FM_FM_MSG_FMT_VER_1 };
> >>   @@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E
> >>   static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info);
> >>   static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO *enc_info);
> >>   static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO *dec_info);
> >> +static void check_for_node_isolation(FM_CB *cb); static bool 
> >> +has_been_well_connected_recently(FM_CB *cb);
> >>   static uint32_t fm_mds_node_evt(FM_CB *cb, 
> >> MDS_CALLBACK_NODE_EVENT_INFO * node_evt);
> >>   static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT 
> >> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code);
> >>   @@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C
> >>       return;
> >>   }
> >>   +static void check_for_node_isolation(FM_CB *cb)
> >> +{
> >> +    bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3;
> >> +    if (cb->well_connected && !well_connected) {
> >> +        osaf_clock_gettime(CLOCK_MONOTONIC, &cb-
> >last_well_connected);
> >> +    }
> >> +    cb->well_connected = well_connected; }
> >> +
> >> +static bool has_been_well_connected_recently(FM_CB *cb) {
> >> +    if (cb->well_connected) return true;
> >> +    struct timespec current;
> >> +    struct timespec difference;
> >> +    osaf_clock_gettime(CLOCK_MONOTONIC, &current);
> >> +    if (osaf_timespec_compare(&current, &cb->last_well_connected) 
> >> +<
> >> 0) return false;
> >> +    osaf_timespec_subtract(&current, &cb->last_well_connected,
> >> &difference);
> >> +    if (osaf_timespec_compare(&difference,
> >> &cb->node_isolation_timeout) < 0) return true;
> >> +    return false;
> >> +}
> >> +
> >>
> /**********************************************************
> ******************
> >>   * Name          : fm_mds_node_evt
> >>   *
> >> @@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c
> >>         switch (node_evt->node_chg) {
> >>       case NCSMDS_NODE_DOWN:
> >> +        if (cb->cluster_size != 0) {
> >> +            --cb->cluster_size;
> >> +            TRACE("Node down event for node id %x, cluster size is
> >> now: %llu",
> >> +                  node_evt->node_id, (unsigned long long)
> >> cb->cluster_size);
> >> +            check_for_node_isolation(cb);
> >> +            if (cb->cluster_size == 1 &&
> >> has_been_well_connected_recently(cb)) {
> >> +                opensaf_reboot(0, NULL,
> >> +                        "Self-fencing due to sudden loss of 
> >> + contact
> >> with the rest of the cluster");
> >> +            }
> >> +        } else {
> >> +            TRACE("Node down event for node id %x ignored",
> >> node_evt->node_id);
> >> +            LOG_ER("Received unexpected node down event for node 
> >> + id
> >> %x", node_evt->node_id);
> >> +        }
> >> +
> >>           if (node_evt->node_id == cb->peer_node_id &&
> >> cb->control_tipc) {
> >>               /* Process NODE_DOWN only if OpenSAF is controling TIPC */
> >>               LOG_NO("Node Down event for node id %x:", 
> >> node_evt->node_id); @@ -326,6 +365,10 @@ static uint32_t 
> >> fm_mds_node_evt(FM_CB *c
> >>           break;
> >>         case NCSMDS_NODE_UP:
> >> +        ++cb->cluster_size;
> >> +        TRACE("Node up event for node id %x, cluster size is now:
> >> %llu",
> >> +              node_evt->node_id, (unsigned long long)
> >> cb->cluster_size);
> >> +        check_for_node_isolation(cb);
> >>           break;
> >>         default:
> >> @@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
> >>               /* Depend on service downs if OpenSAF is not 
> >> controling TIPC */
> >>               case NCSMDS_SVC_ID_GFM:
> >>                   if (svc_evt->i_node_id == cb->peer_node_id) {
> >> +                    TRACE("Peer fm status change: %d -> %d, peer
> >> node id is: %x, cluster size is %llu",
> >> +                          (int) cb->peer_sc_up, 0,
> >> svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
> >> +                    cb->peer_sc_up = false;
> >> +                    check_for_node_isolation(cb);
> >>                       cb->peer_adest = 0;
> >>                       if (!cb->control_tipc) {
> >>                           fm_send_svc_down_to_mbx(cb, 
> >> svc_evt->i_node_id, svc_evt->i_svc_id); @@ -415,6 +462,10 @@ static 
> >> uint32_t fm_mds_svc_evt(FM_CB *cb
> >>           switch (svc_evt->i_svc_id) {
> >>           case NCSMDS_SVC_ID_GFM:
> >>               if ((svc_evt->i_node_id != cb->node_id) &&
> >> (m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) {
> >> +                TRACE("Peer fm status change: %d -> %d, peer node 
> >> + id
> >> is: %x, cluster size is %llu",
> >> +                      (int) cb->peer_sc_up, 1, svc_evt->i_node_id,
> >> (unsigned long long) cb->cluster_size);
> >> +                cb->peer_sc_up = true;
> >> +                check_for_node_isolation(cb);
> >>                     fm_evt = m_MMGR_ALLOC_FM_EVT;
> >>                   if (NULL == fm_evt) {
> >
> 

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to