osaf/services/saf/amf/amfd/sgproc.cc | 153 ++++++++++++++++------------------ osaf/services/saf/amf/amfnd/err.cc | 8 +- osaf/services/saf/amf/amfnd/susm.cc | 58 ++++++++++++- 3 files changed, 131 insertions(+), 88 deletions(-)
AMF performs node-switchover before the cleanup of components. AMFND sends the node-switchover request to AMFD before initiating cleanup of the failed component. Because of this, the standby becomes active before the components have been cleaned up. Fix: send the node-switchover request to AMFD after cleanup of the failed component. diff --git a/osaf/services/saf/amf/amfd/sgproc.cc b/osaf/services/saf/amf/amfd/sgproc.cc --- a/osaf/services/saf/amf/amfd/sgproc.cc +++ b/osaf/services/saf/amf/amfd/sgproc.cc @@ -368,7 +368,73 @@ static uint32_t su_recover_from_fault(AV return rc; } +/** + * @brief Try to repair node by sending reboot message to node + * director of the node. + * + * @param[in] ptr to node. + * + **/ +static void try_node_repair(AVD_AVND *node) +{ + TRACE_ENTER2("'%s'", node->name.value); + + if (node->saAmfNodeAutoRepair) { + LOG_NO("Ordering reboot of '%s' as node fail/switch-over repair action", + node->name.value); + saflog(LOG_NOTICE, amfSvcUsrName, + "Ordering reboot of '%s' as node fail/switch-over repair action", + node->name.value); + avd_d2n_reboot_snd(node); + } else { + LOG_NO("NodeAutorepair disabled for '%s', no reboot ordered", + node->name.value); + saflog(LOG_NOTICE, amfSvcUsrName, + "NodeAutorepair disabled for '%s', NO reboot ordered", + node->name.value); + + } + TRACE_LEAVE(); +} + +/** + * @brief Performs node-switchover recovery. + * + * @param[in] ptr to node. 
+ * + **/ +static void node_nodeswtichover_recovery(AVD_AVND *node) +{ + bool node_reboot = true; + TRACE_ENTER2("'%s'", node->name.value); + + AVD_SU *i_su = node->list_of_su; + for (;i_su != NULL; i_su = i_su->avnd_list_su_next) { + i_su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE); + + if (i_su->list_of_susi == NULL) + continue; + + if (su_recover_from_fault(i_su) == NCSCC_RC_FAILURE) { + LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, i_su->name.value); + goto done; + } + + if (i_su->list_of_susi != NULL) + node_reboot = false; + + if (avd_sg_app_su_inst_func(avd_cb, i_su->sg_of_su) == NCSCC_RC_FAILURE) { + LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, i_su->name.value); + goto done; + } + } + + if (node_reboot == true) + try_node_repair(node); +done: + TRACE_LEAVE(); +} /***************************************************************************** * Function: avd_su_oper_state_func * @@ -532,81 +598,19 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb avd_node_down_appl_susi_failover(avd_cb, node); break; } + break; case SA_AMF_NODE_SWITCHOVER: - i_su = node->list_of_su; - while (i_su != NULL) { - i_su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE); - if (i_su->list_of_susi != AVD_SU_SI_REL_NULL) { - /* Delay Node reboot if: - a)Faulted SU has saAmfSUFailover set but - other healthy SUs are present on node. - b)Only faulted SU exists on the node and its - saAmfSUFailover is false. - */ - node_reboot_req = false; - if (((i_su == su) && (!i_su->saAmfSUFailover)) || - ((i_su != su) && - (i_su->saAmfSUOperState == SA_AMF_OPERATIONAL_ENABLED))) { - - /* Since assignments exists call the SG FSM.*/ - if (su_recover_from_fault(i_su) == NCSCC_RC_FAILURE) { - LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, i_su->name.value); - goto done; - } - } - } - - /* Verify the SG to check if any instantiations need - * to be done for the SG on which this SU exists. 
- */ - if ((!su->saAmfSUFailover)) - if (avd_sg_app_su_inst_func(cb, i_su->sg_of_su) == NCSCC_RC_FAILURE) { - LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, i_su->name.value); - goto done; - } - - i_su = i_su->avnd_list_su_next; - } - break; - case AVSV_ERR_RCVR_SU_FAILOVER: - /* This is a case when node switchover and su failover are happening together. - Reboot node only: - a) After gracefully removing all the assignments (SUs with saAmfSUFailover set). - b) After termination of all components in faulted SU and after performing its failover - as a single entity (saAmfSUFailover set). - */ - if (su_recover_from_fault(su) == NCSCC_RC_FAILURE) { - LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, su->name.value); - goto done; - } - if (avd_sg_app_su_inst_func(cb, su->sg_of_su) == NCSCC_RC_FAILURE) { - LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, su->name.value); - goto done; - } - for (i_su = node->list_of_su; i_su; i_su = i_su->avnd_list_su_next) { - if (i_su->list_of_susi != AVD_SU_SI_REL_NULL) { - node_reboot_req = false; - break; - } - } + node_nodeswtichover_recovery(su->su_on_node); + goto done; break; default : break; } - if (node_reboot_req) { - if (node->saAmfNodeAutoRepair) { - saflog(LOG_NOTICE, amfSvcUsrName, - "Ordering reboot of '%s' as node fail/switch-over repair action", - node->name.value); - avd_d2n_reboot_snd(node); - } else { - saflog(LOG_NOTICE, amfSvcUsrName, - "NodeAutorepair disabled for '%s', NO reboot ordered", - node->name.value); - - } - } + if (node_reboot_req) + try_node_repair(node); + + } else { /* if (n2d_msg->msg_info.n2d_opr_state.node_oper_state == SA_AMF_OPERATIONAL_DISABLED) */ if (su->list_of_susi != AVD_SU_SI_REL_NULL) { @@ -1235,16 +1239,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb, } if (true == all_su_unassigned) { /* All app su got unassigned, Safe to reboot the blade now. 
*/ - if (node->saAmfNodeAutoRepair) { - saflog(LOG_NOTICE, amfSvcUsrName, - "Ordering reboot of '%s' as node fail/switch-over repair action", - node->name.value); - avd_d2n_reboot_snd(node); - } else { - saflog(LOG_NOTICE, amfSvcUsrName, - "NodeAutorepair disabled for '%s', NO reboot ordered", - node->name.value); - } + try_node_repair(node); } } /* Free the messages */ diff --git a/osaf/services/saf/amf/amfnd/err.cc b/osaf/services/saf/amf/amfnd/err.cc --- a/osaf/services/saf/amf/amfnd/err.cc +++ b/osaf/services/saf/amf/amfnd/err.cc @@ -814,7 +814,9 @@ uint32_t avnd_err_rcvr_node_switchover(A m_AVND_SU_FAILED_SET(failed_su); m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, failed_su, AVND_CKPT_SU_FLAG_CHANGE); } + cb->term_state = AVND_TERM_STATE_NODE_SWITCHOVER_STARTED; + cb->failed_su = failed_su; /* transition the su oper state to disabled */ m_AVND_SU_OPER_STATE_SET(failed_su, SA_AMF_OPERATIONAL_DISABLED); @@ -825,11 +827,6 @@ uint32_t avnd_err_rcvr_node_switchover(A if(SA_AMF_OPERATIONAL_DISABLED != cb->oper_state) { /* transition the node oper state to disabled */ cb->oper_state = SA_AMF_OPERATIONAL_DISABLED; - - /* inform avd */ - rc = avnd_di_oper_send(cb, failed_su, SA_AMF_NODE_SWITCHOVER); - if (NCSCC_RC_SUCCESS != rc) - goto done; } /* We are now in the context of failover, forget the restart */ @@ -865,7 +862,6 @@ uint32_t avnd_err_rcvr_node_switchover(A goto done; } } - avnd_su_si_del(cb, &failed_comp->su->name); } else { /* terminate the failed comp */ diff --git a/osaf/services/saf/amf/amfnd/susm.cc b/osaf/services/saf/amf/amfnd/susm.cc --- a/osaf/services/saf/amf/amfnd/susm.cc +++ b/osaf/services/saf/amf/amfnd/susm.cc @@ -1346,6 +1346,46 @@ static bool all_comps_terminated_in_su(c return true; } +static void perform_pending_nodeswitchover() +{ + bool nodeswitchover = true; + + /* Reverify if nodeswitchover is really pending */ + if ((avnd_cb->term_state != AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) || + (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED)) + 
return; + + AVND_COMP *comp; + AVND_SU *su = avnd_cb->failed_su; + for (comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list)); + comp; + comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) { + + if ((comp->pres == SA_AMF_PRESENCE_INSTANTIATING) || + (comp->pres == SA_AMF_PRESENCE_TERMINATING) || + (comp->pres == SA_AMF_PRESENCE_RESTARTING) || + (comp->pres == SA_AMF_PRESENCE_TERMINATION_FAILED)) { + nodeswitchover= false; + break; + } + } + + + if (nodeswitchover == true) { + /* Now send nodeswitchover request to AMFD as cleanup of failed component is + completed in faulted SU. + */ + + LOG_NO("Sending Nodeswitchover request to AMFD as '%s' " + "got failed", su->name.value); + if (su->sufailover == false) { + uint32_t rc = avnd_di_oper_send(avnd_cb, su, SA_AMF_NODE_SWITCHOVER); + osafassert(NCSCC_RC_SUCCESS == rc); + } + } + +} + /**************************************************************************** Name : avnd_su_pres_fsm_run @@ -1392,14 +1432,26 @@ uint32_t avnd_su_pres_fsm_run(AVND_CB *c /* Since all components got successfully terminated, finish sufailover at amfnd by deleting SUSIs at amfnd and informing amfd about sufailover.*/ LOG_NO("Terminated all components in '%s'", su->name.value); - LOG_NO("Informing director of sufailover"); - rc = avnd_di_oper_send(avnd_cb, su, AVSV_ERR_RCVR_SU_FAILOVER); + if (cb->term_state == AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) { + LOG_NO("Sending Nodeswitchover request to AMFD as '%s' " + "got failed", su->name.value); + rc = avnd_di_oper_send(avnd_cb, su, SA_AMF_NODE_SWITCHOVER); + } + else { + LOG_NO("Informing director of sufailover"); + rc = avnd_di_oper_send(avnd_cb, su, AVSV_ERR_RCVR_SU_FAILOVER); + } osafassert(NCSCC_RC_SUCCESS == rc); avnd_su_si_del(avnd_cb, &su->name); if (!m_AVND_SU_IS_PREINSTANTIABLE(su)) avnd_su_pres_state_set(su, SA_AMF_PRESENCE_UNINSTANTIATED); goto done; - } + } else if ((cb->term_state == 
AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) && + (cb->oper_state == SA_AMF_OPERATIONAL_DISABLED) && + (su->sufailover == false)) { + perform_pending_nodeswitchover(); + } + /* process state change */ if (prv_st != final_st) ------------------------------------------------------------------------------ HPCC Systems Open Source Big Data Platform from LexisNexis Risk Solutions Find What Matters Most in Your Big Data with HPCC Systems Open Source. Fast. Scalable. Simple. Ideal for Dirty Data. Leverages Graph Analysis for Fast Processing & Easy Data Exploration http://p.sf.net/sfu/hpccsystems _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel