osaf/services/infrastructure/fm/fms/fm_cb.h   |  10 +++++
 osaf/services/infrastructure/fm/fms/fm_main.c |  16 +++++++-
 osaf/services/infrastructure/fm/fms/fm_mds.c  |  51 +++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 2 deletions(-)


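This patch adds support for self-fencing, for situations where remote fencing
is not possible: a node reboots itself when it suddenly loses contact with the
rest of the cluster. The isolation timeout is read from the environment
variable FMS_NODE_ISOLATION_TIMEOUT (in milliseconds) and defaults to 10
seconds.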

diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h 
b/osaf/services/infrastructure/fm/fms/fm_cb.h
--- a/osaf/services/infrastructure/fm/fms/fm_cb.h
+++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
@@ -27,6 +27,10 @@
 #include "rda_papi.h"
 #include "fm_amf.h"
 
+#include <stdbool.h>
+#include <stdint.h>
+#include <time.h>
+
 uint32_t gl_fm_hdl;
 
 typedef enum {
@@ -92,6 +96,12 @@ typedef struct fm_cb {
        bool amfnd_down;
        bool amfd_down;
        bool fm_down;
+
+       bool peer_sc_up;
+       bool well_connected;
+       uint64_t cluster_size;
+       struct timespec last_well_connected;
+       struct timespec node_isolation_timeout;
 } FM_CB;
 
 extern char *role_string[];
diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
b/osaf/services/infrastructure/fm/fms/fm_main.c
--- a/osaf/services/infrastructure/fm/fms/fm_main.c
+++ b/osaf/services/infrastructure/fm/fms/fm_main.c
@@ -30,7 +30,7 @@ This file contains the main() routine fo
 
 #include <nid_api.h>
 #include "fm.h"
-
+#include "osaf_time.h"
 
 enum {
        FD_TERM = 0,
@@ -411,7 +411,19 @@ static uint32_t fm_get_args(FM_CB *fm_cb
        fm_cb->promote_active_tmr.type = FM_TMR_PROMOTE_ACTIVE;
        fm_cb->activation_supervision_tmr.type = FM_TMR_ACTIVATION_SUPERVISION;
 
-       TRACE_LEAVE();
+       char* node_isolation_timeout = getenv("FMS_NODE_ISOLATION_TIMEOUT");
+       if (node_isolation_timeout != NULL) {
+               osaf_millis_to_timespec(atoi(node_isolation_timeout),
+                                       &fm_cb->node_isolation_timeout);
+       } else {
+               fm_cb->node_isolation_timeout.tv_sec = 10;
+               fm_cb->node_isolation_timeout.tv_nsec = 0;
+       }
+       TRACE("NODE_ISOLATION_TIMEOUT = %" PRId64 ".%09ld",
+             (int64_t) fm_cb->node_isolation_timeout.tv_sec,
+             fm_cb->node_isolation_timeout.tv_nsec);
+
+       TRACE_LEAVE();
        return NCSCC_RC_SUCCESS;
 }
 
diff --git a/osaf/services/infrastructure/fm/fms/fm_mds.c 
b/osaf/services/infrastructure/fm/fms/fm_mds.c
--- a/osaf/services/infrastructure/fm/fms/fm_mds.c
+++ b/osaf/services/infrastructure/fm/fms/fm_mds.c
@@ -16,6 +16,8 @@
 */
 
 #include "fm.h"
+#include "osaf_time.h"
+#include "ncssysf_def.h"
 
 const MDS_CLIENT_MSG_FORMAT_VER fm_fm_msg_fmt_map_table[FM_SUBPART_VER_MAX] = 
{ FM_FM_MSG_FMT_VER_1 };
 
@@ -28,6 +30,8 @@ static uint32_t fm_encode(MDS_CALLBACK_E
 static uint32_t fm_decode(MDS_CALLBACK_DEC_INFO *dec_info);
 static uint32_t fm_fm_mds_enc(MDS_CALLBACK_ENC_INFO *enc_info);
 static uint32_t fm_fm_mds_dec(MDS_CALLBACK_DEC_INFO *dec_info);
+static void check_for_node_isolation(FM_CB *cb);
+static bool has_been_well_connected_recently(FM_CB *cb);
 static uint32_t fm_mds_node_evt(FM_CB *cb, MDS_CALLBACK_NODE_EVENT_INFO * 
node_evt);
 static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT *fm_evt, NODE_ID 
node_id, FM_FSM_EVT_CODE evt_code);
 
@@ -300,6 +304,27 @@ static void fm_send_svc_down_to_mbx(FM_C
        return;
 }
 
+/*
+ * Re-evaluate the "well connected" state of this node and store it in
+ * cb->well_connected. The node counts as well connected when cb->peer_sc_up
+ * is set and cb->cluster_size is at least three.
+ *
+ * On a transition from well connected to not well connected, the current
+ * CLOCK_MONOTONIC time is saved in cb->last_well_connected.
+ */
+static void check_for_node_isolation(FM_CB *cb)
+{
+       const bool was_connected = cb->well_connected;
+       const bool is_connected = cb->peer_sc_up && cb->cluster_size >= 3;
+
+       if (was_connected && !is_connected) {
+               /* Connectivity was just lost: remember when it happened. */
+               osaf_clock_gettime(CLOCK_MONOTONIC, &cb->last_well_connected);
+       }
+       cb->well_connected = is_connected;
+}
+
+/*
+ * Tell whether this node is, or recently was, well connected.
+ *
+ * Returns true when cb->well_connected is set, or when the time elapsed on
+ * CLOCK_MONOTONIC since cb->last_well_connected is strictly less than
+ * cb->node_isolation_timeout. Returns false otherwise, including when
+ * cb->last_well_connected lies after the current time.
+ */
+static bool has_been_well_connected_recently(FM_CB *cb)
+{
+       struct timespec now;
+       struct timespec elapsed;
+
+       if (cb->well_connected) {
+               return true;
+       }
+
+       osaf_clock_gettime(CLOCK_MONOTONIC, &now);
+       /* Guard the subtraction below: the timestamp must not be ahead of now. */
+       if (osaf_timespec_compare(&now, &cb->last_well_connected) < 0) {
+               return false;
+       }
+
+       osaf_timespec_subtract(&now, &cb->last_well_connected, &elapsed);
+       return osaf_timespec_compare(&elapsed, &cb->node_isolation_timeout) < 0;
+}
+
 /****************************************************************************
 * Name          : fm_mds_node_evt
 *
@@ -318,6 +343,20 @@ static uint32_t fm_mds_node_evt(FM_CB *c
 
        switch (node_evt->node_chg) {
        case NCSMDS_NODE_DOWN:
+               if (cb->cluster_size != 0) {
+                       --cb->cluster_size;
+                       TRACE("Node down event for node id %x, cluster size is 
now: %llu",
+                             node_evt->node_id, (unsigned long long) 
cb->cluster_size);
+                       check_for_node_isolation(cb);
+                       if (cb->cluster_size == 1 && 
has_been_well_connected_recently(cb)) {
+                               opensaf_reboot(0, NULL,
+                                               "Self-fencing due to sudden 
loss of contact with the rest of the cluster");
+                       }
+               } else {
+                       TRACE("Node down event for node id %x ignored", 
node_evt->node_id);
+                       LOG_ER("Received unexpected node down event for node id 
%x", node_evt->node_id);
+               }
+
                if (node_evt->node_id == cb->peer_node_id && cb->control_tipc) {
                        /* Process NODE_DOWN only if OpenSAF is controling TIPC 
*/
                        LOG_NO("Node Down event for node id %x:", 
node_evt->node_id);
@@ -326,6 +365,10 @@ static uint32_t fm_mds_node_evt(FM_CB *c
                break;
 
        case NCSMDS_NODE_UP:
+               ++cb->cluster_size;
+               TRACE("Node up event for node id %x, cluster size is now: %llu",
+                     node_evt->node_id, (unsigned long long) cb->cluster_size);
+               check_for_node_isolation(cb);
                break;
 
        default:
@@ -365,6 +408,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
                        /* Depend on service downs if OpenSAF is not controling 
TIPC */
                        case NCSMDS_SVC_ID_GFM:
                                if (svc_evt->i_node_id == cb->peer_node_id) {
+                                       TRACE("Peer fm status change: %d -> %d, 
peer node id is: %x, cluster size is %llu",
+                                             (int) cb->peer_sc_up, 0, 
svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
+                                       cb->peer_sc_up = false;
+                                       check_for_node_isolation(cb);
                                        cb->peer_adest = 0;
                                        if (!cb->control_tipc) {
                                                fm_send_svc_down_to_mbx(cb, 
svc_evt->i_node_id, svc_evt->i_svc_id);
@@ -415,6 +462,10 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb
                switch (svc_evt->i_svc_id) {
                case NCSMDS_SVC_ID_GFM:
                        if ((svc_evt->i_node_id != cb->node_id) && 
(m_MDS_DEST_IS_AN_ADEST(svc_evt->i_dest) == true)) {
+                               TRACE("Peer fm status change: %d -> %d, peer 
node id is: %x, cluster size is %llu",
+                                     (int) cb->peer_sc_up, 1, 
svc_evt->i_node_id, (unsigned long long) cb->cluster_size);
+                               cb->peer_sc_up = true;
+                               check_for_node_isolation(cb);
 
                                fm_evt = m_MMGR_ALLOC_FM_EVT;
                                if (NULL == fm_evt) {

------------------------------------------------------------------------------
Attend Shape: An AT&T Tech Expo July 15-16. Meet us at AT&T Park in San
Francisco, CA to explore cutting-edge tech and listen to tech luminaries
present their vision of the future. This family event has something for
everyone, including kids. Get more information and register today.
http://sdm.link/attshape
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to