I don't think there is any guarantee that RED_UP always arrives before the normal MDS_UP, or vice versa. Most likely the order depends on the sequence in which the services register with MDS.
Thanks, Ramesh. On 2/24/2017 4:48 PM, praveen malviya wrote: > > > On 24-Feb-17 4:07 PM, ramesh betham wrote: >> Good catch. Hitting the case of fm_peer_down_wait() is very unlikely. >> >> But here fm_peer_down_wait() is called only before fm nid_notifies and >> considering for amfnd-up event too. A rare and race condition can hit >> where fm on upcoming new active receives fm-down event and amfnd is >> still alive. >> > But the if block where cb->amfnd_down is marked false assumes that > cb->peer_node_id is already set in RED_UP events of IMMD or AVD. Is > there any guarantee from MDS that RED_UP event will always come before > normal MDS_UP event? > > Thanks, > Praveen > >> Thanks, >> Ramesh. >> >> On 2/24/2017 2:18 PM, praveen malviya wrote: >>> Hi Ramesh, >>> >>> One minor query: >>> In RED_UP of peer AVD, newly active SC will reboot itself if peer FM >>> on old active SC is not up. If this true then in which situations >>> newly active SC will wait in fm_peer_down_wait(). >>> >>> Thanks, >>> Praveen >>> >>> >>> On 22-Feb-17 5:00 PM, ramesh.bet...@oracle.com wrote: >>>> src/fm/fmd/fm_cb.h | 3 + >>>> src/fm/fmd/fm_evt.h | 2 +- >>>> src/fm/fmd/fm_main.c | 114 +++++++++++++++++--------------- >>>> src/fm/fmd/fm_mds.c | 173 >>>> +++++++++++++++++++++++++++++++++++--------------- >>>> 4 files changed, 186 insertions(+), 106 deletions(-) >>>> >>>> >>>> This patch addresses the specific scenario where the new Active is >>>> coming up and has discovered the afmd process on the peer node (which >>>> is going down) is still alive. Here the peer amfd/amfnd is still in >>>> the process of going down i.e., progressing in termination of >>>> application components having big timeouts etc. >>>> >>>> diff --git a/src/fm/fmd/fm_cb.h b/src/fm/fmd/fm_cb.h >>>> --- a/src/fm/fmd/fm_cb.h >>>> +++ b/src/fm/fmd/fm_cb.h >>>> @@ -1,6 +1,7 @@ >>>> /* -*- OpenSAF -*- >>>> * >>>> * (C) Copyright 2008 The OpenSAF Foundation >>>> +* Copyright (C) 2017, Oracle and/or its affiliates. 
All rights >>>> reserved. >>>> * >>>> * This program is distributed in the hope that it will be useful, but >>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>> MERCHANTABILITY >>>> @@ -107,6 +108,8 @@ typedef struct fm_cb { >>>> bool use_remote_fencing; >>>> SaNameT peer_clm_node_name; >>>> bool peer_node_terminated; >>>> + NCS_SEL_OBJ peer_down_obj; >>>> + int peer_down_await; >>>> } FM_CB; >>>> >>>> extern char *role_string[]; >>>> diff --git a/src/fm/fmd/fm_evt.h b/src/fm/fmd/fm_evt.h >>>> --- a/src/fm/fmd/fm_evt.h >>>> +++ b/src/fm/fmd/fm_evt.h >>>> @@ -1,6 +1,7 @@ >>>> /* -*- OpenSAF -*- >>>> * >>>> * (C) Copyright 2008 The OpenSAF Foundation >>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>> reserved. >>>> * >>>> * This program is distributed in the hope that it will be useful, but >>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>> MERCHANTABILITY >>>> @@ -49,7 +50,6 @@ typedef enum { >>>> FM_EVT_NODE_DOWN, >>>> FM_EVT_PEER_UP, >>>> FM_EVT_RDA_ROLE, >>>> - FM_EVT_SVC_DOWN, >>>> FM_FSM_EVT_MAX >>>> } FM_FSM_EVT_CODE; >>>> >>>> diff --git a/src/fm/fmd/fm_main.c b/src/fm/fmd/fm_main.c >>>> --- a/src/fm/fmd/fm_main.c >>>> +++ b/src/fm/fmd/fm_main.c >>>> @@ -1,6 +1,7 @@ >>>> /* -*- OpenSAF -*- >>>> * >>>> * (C) Copyright 2008 The OpenSAF Foundation >>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>> reserved. 
>>>> * >>>> * This program is distributed in the hope that it will be useful, but >>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>> MERCHANTABILITY >>>> @@ -31,6 +32,7 @@ This file contains the main() routine fo >>>> #include "nid/agent/nid_api.h" >>>> #include "fm.h" >>>> #include "base/osaf_time.h" >>>> +#include "base/osaf_poll.h" >>>> >>>> #define FM_CLM_API_TIMEOUT 10000000000LL >>>> >>>> @@ -71,7 +73,6 @@ void handle_mbx_event(void); >>>> extern uint32_t fm_amf_init(FM_AMF_CB *fm_amf_cb); >>>> uint32_t gl_fm_hdl; >>>> static NCS_SEL_OBJ usr1_sel_obj; >>>> -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt); >>>> >>>> /** >>>> * USR1 signal is used when AMF wants instantiate us as a >>>> @@ -119,6 +120,40 @@ static void rda_cb(uint32_t cb_hdl, PCS_ >>>> TRACE_LEAVE(); >>>> } >>>> >>>> +/* This function primarily handles the weird situation in a cluster >>>> where the controller >>>> + * node which is coming up identifies the peer node is in the midst >>>> of DOWN process (i.e., >>>> + * non-existance of peer FM and amfd/amfnd is still alive). In this >>>> case, the controller >>>> + * node has to wait till the peer gracefully shutdowns. This >>>> function returns FAILURE if >>>> + * peer controller node is not down in a timeout period of >>>> OPENSAF_TERMTIMEOUT (or 60 secs default). 
>>>> + */ >>>> +static uint32_t fm_peer_down_wait(FM_CB *fm_cb) >>>> +{ >>>> + char *envVar = NULL; >>>> + int peer_term_timeout = 60; /*default 60 secs */ >>>> + >>>> + TRACE_ENTER(); >>>> + >>>> + /* Hoping that "OPENSAF_TERMTIMEOUT" on both the controllers >>>> shall be the same */ >>>> + if ((envVar = getenv("OPENSAF_TERMTIMEOUT"))) >>>> + peer_term_timeout = atoi(envVar); >>>> + >>>> + m_NCS_SEL_OBJ_CREATE(&fm_cb->peer_down_obj); >>>> + fm_cb->peer_down_await = 1; >>>> + >>>> + osaf_poll_one_fd(m_GET_FD_FROM_SEL_OBJ(fm_cb->peer_down_obj), >>>> peer_term_timeout*1000); >>>> + >>>> + m_NCS_SEL_OBJ_DESTROY(&fm_cb->peer_down_obj); >>>> + >>>> + /* Return failure if peer node is not yet completely down */ >>>> + if(fm_cb->peer_down_await) { >>>> + LOG_ER("Peer node is not fully DOWN, please check"); >>>> + TRACE_LEAVE(); >>>> + return NCSCC_RC_FAILURE; >>>> + } >>>> + >>>> + TRACE_LEAVE(); >>>> + return NCSCC_RC_SUCCESS; >>>> +} >>>> >>>> >>>> /***************************************************************************** >>>> >>>> >>>> >>>> >>>> @@ -176,6 +211,11 @@ int main(int argc, char *argv[]) >>>> */ >>>> fm_cb->control_tipc = true; /* Default behaviour */ >>>> >>>> + fm_cb->immd_down = true; >>>> + fm_cb->immnd_down = true; >>>> + fm_cb->amfnd_down = true; >>>> + fm_cb->amfd_down = true; >>>> + >>>> /* Create CB handle */ >>>> gl_fm_hdl = ncshm_create_hdl(NCS_HM_POOL_ID_COMMON, >>>> NCS_SERVICE_ID_GFM, (NCSCONTEXT)fm_cb); >>>> >>>> @@ -194,7 +234,7 @@ int main(int argc, char *argv[]) >>>> goto fm_init_failed; >>>> } >>>> >>>> -/* Attach MBX */ >>>> + /* Attach MBX */ >>>> if (m_NCS_IPC_ATTACH(&fm_cb->mbx) != NCSCC_RC_SUCCESS) { >>>> syslog(LOG_ERR, "m_NCS_IPC_ATTACH() failed."); >>>> goto fm_init_failed; >>>> @@ -245,6 +285,16 @@ int main(int argc, char *argv[]) >>>> goto fm_init_failed; >>>> } >>>> >>>> + /* Weird and rare situation. 
If peer fm doesn't exist, but >>>> amfd/amfnd process(es) >>>> + * are still alive then wait till the peer gracefully shutsdown. >>>> + */ >>>> + if((!fm_cb->peer_sc_up) && !(fm_cb->amfnd_down && >>>> fm_cb->amfd_down)) { >>>> + if(fm_peer_down_wait(fm_cb) != NCSCC_RC_SUCCESS) { >>>> + LOG_ER("Exiting.. Peer node is not completely >>>> DOWN, please check"); >>>> + goto fm_init_failed; >>>> + } >>>> + } >>>> + >>>> /* Get mailbox selection object */ >>>> mbx_sel_obj = m_NCS_IPC_GET_SEL_OBJ(&fm_cb->mbx); >>>> >>>> @@ -268,7 +318,7 @@ int main(int argc, char *argv[]) >>>> >>>> /* notify the NID */ >>>> if (nid_started) >>>> - fm_nid_notify(NCSCC_RC_SUCCESS); >>>> + fm_nid_notify((uint32_t) NCSCC_RC_SUCCESS); >>>> >>>> while (1) { >>>> ret = poll(fds, nfds, -1); >>>> @@ -454,52 +504,6 @@ static uint32_t fm_get_args(FM_CB *fm_cb >>>> return NCSCC_RC_SUCCESS; >>>> } >>>> >>>> -void fm_proc_svc_down(FM_CB *cb, FM_EVT *fm_mbx_evt) >>>> -{ >>>> - switch (fm_mbx_evt->svc_id) { >>>> - case NCSMDS_SVC_ID_IMMND: >>>> - cb->immnd_down = true; >>>> - LOG_NO("IMMND down on: %x", cb->peer_node_id); >>>> - break; >>>> - case NCSMDS_SVC_ID_AVND: >>>> - cb->amfnd_down = true; >>>> - LOG_NO("AMFND down on: %x", cb->peer_node_id); >>>> - break; >>>> - case NCSMDS_SVC_ID_IMMD: >>>> - cb->immd_down = true; >>>> - LOG_NO("IMMD down on: %x", cb->peer_node_id); >>>> - break; >>>> - case NCSMDS_SVC_ID_AVD: >>>> - cb->amfd_down = true; >>>> - LOG_NO("AVD down on: %x", cb->peer_node_id); >>>> - break; >>>> - case NCSMDS_SVC_ID_GFM: >>>> - cb->fm_down = true; >>>> - LOG_NO("FM down on: %x", cb->peer_node_id); >>>> - break; >>>> - default: >>>> - break; >>>> - } >>>> - >>>> - /* Processing only for alternate node. >>>> - * Service downs of AMFND, IMMD, IMMND is the same as NODE_DOWN >>>> from 4.4 onwards. >>>> - * This is required to handle the usecase involving >>>> - * '/etc/init.d/opensafd stop' without an OS reboot cycle >>>> - * Process service downs only if OpenSAF is not controlling TIPC. 
>>>> - * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to >>>> trigger failover. >>>> - */ >>>> - if (cb->immd_down && cb->immnd_down && cb->amfnd_down && >>>> cb->amfd_down && cb->fm_down) { >>>> - LOG_NO("Core services went down on node_id: %x", >>>> fm_mbx_evt->node_id); >>>> - fm_send_node_down_to_mbx(cb, fm_mbx_evt->node_id); >>>> - /* Reset peer downs, because we've made MDS RED >>>> subscriptions */ >>>> - cb->immd_down = false; >>>> - cb->immnd_down = false; >>>> - cb->amfnd_down = false; >>>> - cb->amfd_down = false; >>>> - cb->fm_down = false; >>>> - } >>>> -} >>>> - >>>> >>>> /**************************************************************************** >>>> >>>> >>>> >>>> * Name : fm_clm_init >>>> * >>>> @@ -642,11 +646,10 @@ static void fm_mbx_msg_handler(FM_CB *fm >>>> } >>>> } >>>> break; >>>> - case FM_EVT_SVC_DOWN: >>>> - fm_proc_svc_down(fm_cb, fm_mbx_evt); >>>> - break; >>>> + >>>> case FM_EVT_PEER_UP: >>>> -/* Peer fm came up so sending ee_id of this node */ >>>> + >>>> + /* Peer fm came up so sending ee_id of this node */ >>>> if (fm_cb->node_name.length != 0) >>>> fms_fms_exchange_node_info(fm_cb); >>>> >>>> @@ -654,8 +657,9 @@ static void fm_mbx_msg_handler(FM_CB *fm >>>> get_peer_clm_node_name(fm_mbx_evt->node_id); >>>> } >>>> break; >>>> + >>>> case FM_EVT_TMR_EXP: >>>> -/* Timer Expiry event posted */ >>>> + /* Timer Expiry event posted */ >>>> if (fm_mbx_evt->info.fm_tmr->type == FM_TMR_PROMOTE_ACTIVE) { >>>> /* Check whether node(AMF) initialization is done */ >>>> if (fm_cb->csi_assigned == false) { >>>> @@ -684,9 +688,11 @@ static void fm_mbx_msg_handler(FM_CB *fm >>>> "within the time limit"); >>>> } >>>> break; >>>> + >>>> case FM_EVT_RDA_ROLE: >>>> fm_evt_proc_rda_callback(fm_cb, fm_mbx_evt); >>>> break; >>>> + >>>> default: >>>> break; >>>> } >>>> diff --git a/src/fm/fmd/fm_mds.c b/src/fm/fmd/fm_mds.c >>>> --- a/src/fm/fmd/fm_mds.c >>>> +++ b/src/fm/fmd/fm_mds.c >>>> @@ -1,6 +1,7 @@ >>>> /* -*- OpenSAF -*- >>>> * >>>> * 
(C) Copyright 2008 The OpenSAF Foundation >>>> +* Copyright (C) 2017, Oracle and/or its affiliates. All rights >>>> reserved. >>>> * >>>> * This program is distributed in the hope that it will be useful, but >>>> * WITHOUT ANY WARRANTY; without even the implied warranty of >>>> MERCHANTABILITY >>>> @@ -34,6 +35,7 @@ static void check_for_node_isolation(FM_ >>>> static bool has_been_well_connected_recently(FM_CB *cb); >>>> static uint32_t fm_mds_node_evt(FM_CB *cb, >>>> MDS_CALLBACK_NODE_EVENT_INFO * node_evt); >>>> static uint32_t fm_fill_mds_evt_post_fm_mbx(FM_CB *cb, FM_EVT >>>> *fm_evt, NODE_ID node_id, FM_FSM_EVT_CODE evt_code); >>>> +static void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, >>>> NCSMDS_SVC_ID svc_id); >>>> >>>> uint32_t >>>> fm_mds_sync_send(FM_CB *fm_cb, NCSCONTEXT msg, >>>> @@ -62,7 +64,7 @@ uint32_t fm_mds_init(FM_CB *cb) >>>> { >>>> NCSMDS_INFO arg; >>>> MDS_SVC_ID svc_id[] = { NCSMDS_SVC_ID_GFM, NCSMDS_SVC_ID_AVND, >>>> NCSMDS_SVC_ID_IMMND }; >>>> - MDS_SVC_ID immd_id[2] = { NCSMDS_SVC_ID_IMMD, >>>> NCSMDS_SVC_ID_AVD }; >>>> + MDS_SVC_ID svc_red_id[2] = { NCSMDS_SVC_ID_IMMD, >>>> NCSMDS_SVC_ID_AVD }; >>>> >>>> /* Get the MDS handles to be used. 
*/ >>>> if (fm_mds_get_adest_hdls(cb) != NCSCC_RC_SUCCESS) { >>>> @@ -111,7 +113,7 @@ uint32_t fm_mds_init(FM_CB *cb) >>>> arg.i_op = MDS_RED_SUBSCRIBE; >>>> arg.info.svc_subscribe.i_num_svcs = 2; >>>> arg.info.svc_subscribe.i_scope = NCSMDS_SCOPE_NONE; >>>> - arg.info.svc_subscribe.i_svc_ids = immd_id; >>>> + arg.info.svc_subscribe.i_svc_ids = svc_red_id; >>>> if (ncsmds_api(&arg) == NCSCC_RC_FAILURE) { >>>> syslog(LOG_ERR, "MDS_RED_SUBSCRIBE failed"); >>>> arg.i_op = MDS_UNINSTALL; >>>> @@ -285,25 +287,56 @@ uint32_t fm_send_node_down_to_mbx(FM_CB >>>> return rc; >>>> } >>>> >>>> -static void fm_send_svc_down_to_mbx(FM_CB *cb, uint32_t node_id, >>>> NCSMDS_SVC_ID svc_id) >>>> +void fm_proc_svc_down(FM_CB *cb, uint32_t node_id, NCSMDS_SVC_ID >>>> svc_id) >>>> { >>>> - FM_EVT *fm_evt = NULL; >>>> - uint32_t rc = NCSCC_RC_SUCCESS; >>>> - fm_evt = m_MMGR_ALLOC_FM_EVT; >>>> - if (NULL == fm_evt) { >>>> - syslog(LOG_INFO, "fm_mds_rcv_evt: fm_evt allocation >>>> FAILED."); >>>> - return; >>>> + TRACE_ENTER2("SVC ID: %d", (int) svc_id); >>>> + switch (svc_id) { >>>> + case NCSMDS_SVC_ID_IMMND: >>>> + cb->immnd_down = true; >>>> + LOG_NO("IMMND down on: %x", cb->peer_node_id); >>>> + break; >>>> + case NCSMDS_SVC_ID_AVND: >>>> + cb->amfnd_down = true; >>>> + LOG_NO("AMFND down on: %x", cb->peer_node_id); >>>> + break; >>>> + case NCSMDS_SVC_ID_IMMD: >>>> + cb->immd_down = true; >>>> + LOG_NO("IMMD down on: %x", cb->peer_node_id); >>>> + break; >>>> + case NCSMDS_SVC_ID_AVD: >>>> + cb->amfd_down = true; >>>> + LOG_NO("AVD down on: %x", cb->peer_node_id); >>>> + break; >>>> + case NCSMDS_SVC_ID_GFM: >>>> + cb->fm_down = true; >>>> + LOG_NO("FM down on: %x", cb->peer_node_id); >>>> + break; >>>> + default: >>>> + break; >>>> } >>>> - fm_evt->svc_id = svc_id; >>>> - rc = fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, node_id, >>>> FM_EVT_SVC_DOWN); >>>> - if (rc == NCSCC_RC_FAILURE) { >>>> - m_MMGR_FREE_FM_EVT(fm_evt); >>>> - LOG_IN("service down event post to mailbox failed"); >>>> 
- fm_evt = NULL; >>>> + >>>> + /* Processing only for alternate node. >>>> + * Service downs of AMFND, IMMD, IMMND is the same as NODE_DOWN >>>> from 4.4 onwards. >>>> + * This is required to handle the usecase involving >>>> + * '/etc/init.d/opensafd stop' without an OS reboot cycle >>>> + * Process service downs only if OpenSAF is not controlling TIPC. >>>> + * If OpenSAF is controlling TIPC, just wait for NODE_DOWN to >>>> trigger failover. >>>> + */ >>>> + if (cb->immd_down && cb->immnd_down && cb->amfnd_down && >>>> cb->amfd_down && cb->fm_down) { >>>> + LOG_NO("Core services went down on node_id: %x", node_id); >>>> + if (cb->peer_down_await) { >>>> + cb->peer_down_await = 0; >>>> + m_NCS_SEL_OBJ_IND(&cb->peer_down_obj); >>>> + } >>>> + >>>> + if(!cb->control_tipc) >>>> + fm_send_node_down_to_mbx(cb, node_id); >>>> } >>>> - return; >>>> + >>>> + TRACE_LEAVE(); >>>> } >>>> >>>> + >>>> static void check_for_node_isolation(FM_CB *cb) >>>> { >>>> bool well_connected = cb->peer_sc_up && cb->cluster_size >= 3; >>>> @@ -393,8 +426,7 @@ static uint32_t fm_mds_node_evt(FM_CB *c >>>> >>>> *****************************************************************************/ >>>> >>>> >>>> >>>> static uint32_t fm_mds_svc_evt(FM_CB *cb, >>>> MDS_CALLBACK_SVC_EVENT_INFO *svc_evt) >>>> { >>>> - uint32_t return_val = NCSCC_RC_SUCCESS; >>>> - FM_EVT *fm_evt; >>>> + FM_EVT *fm_evt = NULL; >>>> TRACE_ENTER(); >>>> >>>> if (NULL == svc_evt) { >>>> @@ -413,43 +445,29 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb >>>> cb->peer_sc_up = false; >>>> check_for_node_isolation(cb); >>>> cb->peer_adest = 0; >>>> - if (!cb->control_tipc) { >>>> - fm_send_svc_down_to_mbx(cb, >>>> svc_evt->i_node_id, svc_evt->i_svc_id); >>>> - } >>>> + >>>> + fm_proc_svc_down(cb, svc_evt->i_node_id, >>>> svc_evt->i_svc_id); >>>> } >>>> break; >>>> case NCSMDS_SVC_ID_IMMND: >>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>> - && !cb->control_tipc) { >>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>> 
svc_evt->i_svc_id); >>>> - } >>>> - break; >>>> case NCSMDS_SVC_ID_AVND: >>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>> - && !cb->control_tipc) { >>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>> svc_evt->i_svc_id); >>>> + if (svc_evt->i_node_id == cb->peer_node_id) { >>>> + fm_proc_svc_down(cb, svc_evt->i_node_id, >>>> svc_evt->i_svc_id); >>>> } >>>> break; >>>> default: >>>> TRACE("Not interested in service down of other >>>> services"); >>>> break; >>>> } >>>> - >>>> break; >>>> >>>> case NCSMDS_RED_DOWN: >>>> switch (svc_evt->i_svc_id) { >>>> /* Depend on service downs if OpenSAF is not controling >>>> TIPC */ >>>> case NCSMDS_SVC_ID_IMMD: >>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>> - && !cb->control_tipc) { >>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>> svc_evt->i_svc_id); >>>> - } >>>> - break; >>>> case NCSMDS_SVC_ID_AVD: >>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>> - && !cb->control_tipc) { >>>> - fm_send_svc_down_to_mbx(cb, svc_evt->i_node_id, >>>> svc_evt->i_svc_id); >>>> + if (svc_evt->i_node_id == cb->peer_node_id) { >>>> + fm_proc_svc_down(cb, svc_evt->i_node_id, >>>> svc_evt->i_svc_id); >>>> } >>>> break; >>>> default: >>>> @@ -465,43 +483,96 @@ static uint32_t fm_mds_svc_evt(FM_CB *cb >>>> TRACE("Peer fm status change: %d -> %d, peer node id >>>> is: %x, cluster size is %llu", >>>> (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>> (unsigned long long) cb->cluster_size); >>>> cb->peer_sc_up = true; >>>> + cb->fm_down = false; >>>> check_for_node_isolation(cb); >>>> >>>> fm_evt = m_MMGR_ALLOC_FM_EVT; >>>> - if (NULL == fm_evt) { >>>> - syslog(LOG_INFO, "fm_mds_svc_evt: fm_evt >>>> allocation FAILED."); >>>> - return NCSCC_RC_FAILURE; >>>> - } >>>> + if (NULL == fm_evt) { >>>> + syslog(LOG_INFO, "fm_mds_svc_evt: fm_evt >>>> allocation FAILED."); >>>> + return NCSCC_RC_FAILURE; >>>> + } >>>> + >>>> cb->peer_adest = svc_evt->i_dest; >>>> cb->peer_node_id = svc_evt->i_node_id; >>>> 
cb->peer_node_terminated = false; >>>> - return_val = fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, >>>> cb->peer_node_id, FM_EVT_PEER_UP); >>>> >>>> - if (NCSCC_RC_FAILURE == return_val) { >>>> - m_MMGR_FREE_FM_EVT(fm_evt); >>>> - fm_evt = NULL; >>>> - } >>>> + if(fm_fill_mds_evt_post_fm_mbx(cb, fm_evt, >>>> cb->peer_node_id, FM_EVT_PEER_UP) == NCSCC_RC_FAILURE) >>>> + { >>>> + m_MMGR_FREE_FM_EVT(fm_evt); >>>> + fm_evt = NULL; >>>> + } >>>> } >>>> break; >>>> + >>>> case NCSMDS_SVC_ID_IMMND: >>>> - if (svc_evt->i_node_id == cb->peer_node_id >>>> - && !cb->control_tipc) >>>> - cb->immnd_down = false; /* Only IMMND is >>>> restartable */ >>>> + if (svc_evt->i_node_id == cb->peer_node_id){ >>>> + TRACE("Peer immnd status change: %d -> %d, peer node >>>> id is: %x, cluster size is %llu", >>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>> (unsigned long long) cb->cluster_size); >>>> + cb->immnd_down = false; >>>> + } >>>> + break; >>>> + >>>> + case NCSMDS_SVC_ID_AVND: >>>> + if (svc_evt->i_node_id == cb->peer_node_id){ >>>> + TRACE("Peer amfnd status change: %d -> %d, peer node >>>> id is: %x, cluster size is %llu", >>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>> (unsigned long long) cb->cluster_size); >>>> + cb->amfnd_down = false; >>>> + } >>>> break; >>>> default: >>>> break; >>>> } >>>> break; >>>> >>>> + case NCSMDS_RED_UP: >>>> + switch (svc_evt->i_svc_id) { >>>> + /* Depend on service downs if OpenSAF is not controling >>>> TIPC */ >>>> + case NCSMDS_SVC_ID_IMMD: >>>> + if (svc_evt->i_node_id != cb->node_id) { >>>> + TRACE("Peer immd status change: %d -> %d, peer node >>>> id is: %x, cluster size is %llu", >>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>> (unsigned long long) cb->cluster_size); >>>> + cb->peer_node_id = svc_evt->i_node_id; >>>> + cb->immd_down = false; >>>> + >>>> + /* Arrived svc up event of amfd/amfnd/immd/immnd >>>> svc's with out fm svc-up event being arrived. 
>>>> + * It can be due to peer node is going down but not >>>> fully down. hence reboot the node. >>>> + */ >>>> + if (!fm_cb->peer_sc_up) >>>> + opensaf_reboot(0, NULL, "Peer is not completely >>>> DOWN, Received svc up of peer IMMD"); >>>> + } >>>> + break; >>>> + >>>> + case NCSMDS_SVC_ID_AVD: >>>> + if (svc_evt->i_node_id != cb->node_id) { >>>> + TRACE("Peer amfd status change: %d -> %d, peer node >>>> id is: %x, cluster size is %llu", >>>> + (int) cb->peer_sc_up, 1, svc_evt->i_node_id, >>>> (unsigned long long) cb->cluster_size); >>>> + cb->peer_node_id = svc_evt->i_node_id; >>>> + cb->amfd_down = false; >>>> + >>>> + /* Arrived svc up event of amfd/amfnd/immd/immnd >>>> svc's with out fm svc-up event being arrived. >>>> + * It can be due to peer node is going down but not >>>> fully down. hence reboot the node. >>>> + */ >>>> + if (!fm_cb->peer_sc_up) >>>> + opensaf_reboot(0, NULL, "Peer is not completely >>>> DOWN, Received svc up of peer AMFD"); >>>> + } >>>> + break; >>>> + >>>> + default: >>>> + TRACE("Not interested in service down of other >>>> services"); >>>> + break; >>>> + } >>>> + break; >>>> + >>>> default: >>>> syslog(LOG_INFO, "Wrong MDS event"); >>>> break; >>>> } >>>> >>>> TRACE_LEAVE(); >>>> - return return_val; >>>> + return NCSCC_RC_SUCCESS; >>>> } >>>> >>>> + >>>> >>>> /*************************************************************************** >>>> >>>> >>>> >>>> * Name : fm_mds_rcv_evt >>>> * >>>> >> ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel