From: Anand Sundararaj <s.an...@gethighavailability.com> --- src/amf/amfd/node.cc | 56 +++++++++++++++++++++++++++++++++++++++++++- src/amf/amfd/node.h | 2 ++ src/amf/amfd/sgproc.cc | 18 ++++++++++++++ src/amf/amfnd/avnd_su.h | 1 + src/amf/amfnd/di.cc | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ src/amf/amfnd/err.cc | 2 +- src/amf/amfnd/su.cc | 2 +- 7 files changed, 140 insertions(+), 3 deletions(-)
diff --git a/src/amf/amfd/node.cc b/src/amf/amfd/node.cc index 2c04fca..4c63055 100644 --- a/src/amf/amfd/node.cc +++ b/src/amf/amfd/node.cc @@ -108,6 +108,7 @@ void AVD_AVND::initialize() { saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED; admin_node_pend_cbk = {}; su_cnt_admin_oper = {}; + su_cnt_admin_repair = {}; node_state = AVD_AVND_STATE_ABSENT; list_of_ncs_su = {}; list_of_su = {}; @@ -1247,6 +1248,23 @@ void AVD_AVND::node_sus_termstate_set(bool term_state) const { } /** + * Update count of application SUs of the given node. + * + * @param node pointer + * @return None + */ +static uint32_t application_sus_count(AVD_AVND *node) { + uint32_t count = 0; + /* Count the applications SUs hosted on this node. */ + for (const auto &su : node->list_of_su) { + if (su->su_is_external == false) + count++; + } + TRACE("count '%u'", count); + return count; +} + +/** * Handle admin operations on SaAmfNode objects. * * @param immOiHandle @@ -1532,9 +1550,44 @@ static void node_admin_op_cb(SaImmOiHandleT immOiHandle, } break; + case SA_AMF_ADMIN_REPAIRED: + if (node->saAmfNodeOperState == SA_AMF_OPERATIONAL_ENABLED) { + report_admin_op_error( + immOiHandle, invocation, SA_AIS_ERR_NO_OP, nullptr, + "Admin repair request for '%s', op state already enabled", + node->name.c_str()); + goto done; + } + + if (node->node_info.member == false) { + LOG_NO("'%s' ADMIN_REPAIRED: CLM node is not member", + node->name.c_str()); + avd_saImmOiAdminOperationResult(immOiHandle, invocation, SA_AIS_ERR_TRY_AGAIN); + goto done; + } + + if (0 == application_sus_count(node)) { + // Node is configured without appl su, return from here. + avd_node_oper_state_set(node, SA_AMF_OPERATIONAL_ENABLED); + avd_saImmOiAdminOperationResult(immOiHandle, invocation, SA_AIS_OK); + goto done; + } + + /* forward the admin op req to the node director */ + if (avd_admin_op_msg_snd(node->name, AVSV_SA_AMF_NODE, SA_AMF_ADMIN_REPAIRED, + node) == NCSCC_RC_SUCCESS) { + node->admin_node_pend_cbk.admin_oper = SA_AMF_ADMIN_REPAIRED; + node->admin_node_pend_cbk.invocation = invocation; + node->su_cnt_admin_repair = application_sus_count(node); + } else { + report_admin_op_error(immOiHandle, invocation, SA_AIS_ERR_TIMEOUT, nullptr, + "Admin op request send failed '%s'", node->name.c_str()); + } + break; + default: report_admin_op_error(immOiHandle, invocation, SA_AIS_ERR_NOT_SUPPORTED, - nullptr, "UNSUPPORTED ADMIN OPERATION (%llu)", + nullptr, "UNSUPPORTED ADMIN OPERATION (%llu)", operationId); break; } @@ -1591,6 +1644,7 @@ void node_reset_su_try_inst_counter(const AVD_AVND *node) { su->sg_of_su->try_inst_counter = 0; } } + /** * @brief Checks all nodegroup of nodes are in UNLOCKED state. * @param ptr to Node (AVD_AVND). diff --git a/src/amf/amfd/node.h b/src/amf/amfd/node.h index 097f54b..37b8ce9 100644 --- a/src/amf/amfd/node.h +++ b/src/amf/amfd/node.h @@ -108,6 +108,8 @@ class AVD_AVND { callbacks on this node */ uint32_t su_cnt_admin_oper; /* count to keep track SUs on this node undergoing node admin op */ + uint32_t su_cnt_admin_repair; /* Count to keep track SUs on this node + being repaired by node admin repair op */ /************ AMF B.04 **************************************************/ diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc index 1d7e62a..335d043 100644 --- a/src/amf/amfd/sgproc.cc +++ b/src/amf/amfd/sgproc.cc @@ -1025,6 +1025,23 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb, AVD_EVT *evt) { } } } else { /* if(su->sg_of_su->sg_ncs_spec == true) */ + // This is a case of Node repair + if ((n2d_msg->msg_info.n2d_opr_state.node_oper_state == + SA_AMF_OPERATIONAL_ENABLED) && + (node->admin_node_pend_cbk.invocation) && + (node->admin_node_pend_cbk.admin_oper == SA_AMF_ADMIN_REPAIRED)) { + avd_node_oper_state_set(node, SA_AMF_OPERATIONAL_ENABLED); + if (node->su_cnt_admin_repair > 0) node->su_cnt_admin_repair--; + if (node->su_cnt_admin_repair == 0) { + /* if this is the last SU then send out the pending callback */ + avd_saImmOiAdminOperationResult( + cb->immOiHandle, node->admin_node_pend_cbk.invocation, + SA_AIS_OK); + node->admin_node_pend_cbk.invocation = 0; + node->admin_node_pend_cbk.admin_oper = + static_cast<SaAmfAdminOperationIdT>(0); + } + } /* If oper state of Uninstantiated SU got ENABLED so try to instantiate it after evaluating SG. */ if (su->saAmfSUPresenceState == SA_AMF_PRESENCE_UNINSTANTIATED) { @@ -2326,6 +2343,7 @@ void avd_node_down_mw_susi_failover(AVD_CL_CB *cb, AVD_AVND *avnd) { SA_AIS_ERR_REPAIR_PENDING, &avnd->admin_node_pend_cbk, "node failure"); avnd->su_cnt_admin_oper = 0; + avnd->su_cnt_admin_repair = 0; } TRACE_LEAVE(); diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h index 45effd6..46a711f 100644 --- a/src/amf/amfnd/avnd_su.h +++ b/src/amf/amfnd/avnd_su.h @@ -461,5 +461,6 @@ extern AVND_SU *avnd_sudb_rec_get_next(AmfDb<std::string, AVND_SU> &sudb, extern void sudb_rec_comp_add(AVND_SU *su, AVND_COMP *comp, uint32_t *rc); uint32_t avnd_evt_avd_compcsi_evh(struct avnd_cb_tag *cb, struct avnd_evt_tag *evt); +extern bool comp_in_term_failed_state(void); #endif // AMF_AMFND_AVND_SU_H_ diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc index 7b01868..5bff121 100644 --- a/src/amf/amfnd/di.cc +++ b/src/amf/amfnd/di.cc @@ -92,6 +92,68 @@ static uint32_t avnd_evt_node_admin_op_req(AVND_CB *cb, AVND_EVT *evt) { cb->rcv_msg_id = info->msg_id; switch (info->oper_id) { + case SA_AMF_ADMIN_REPAIRED: { + /* Node has been repaired. Reset states of this node and all DISABLED + * SUs/comps. And update AMF director accordingly. + * No need to reset states of ENABLED SUs(in case of + * SA_AMF_NODE_SWITCHOVER, only faulty components/SU are disabled, + * other SUs assignments are simply switched over.) + * But for ENABLED Sus, send oper state message, so that, they + * are evaluated for assignments. + */ + LOG_NO("Repair request for Node'%s'", Amf::to_string(&info->dn).c_str()); + + // Reset this node state first, so that when SU enabled message + // reaches at Amfd, it starts giving assignments to those SUs. + cb->oper_state = SA_AMF_OPERATIONAL_ENABLED; + cb->term_state = AVND_TERM_STATE_UP; + cb->node_err_esc_level = AVND_ERR_ESC_LEVEL_0; + cb->failed_su = nullptr; + cb->su_failover_cnt = 0; + + /* Reset all faulty SUs and Comps */ + for (AVND_SU *su = avnd_sudb_rec_get_next(cb->sudb, ""); su != nullptr; + su = avnd_sudb_rec_get_next(cb->sudb, su->name)) { + if (su->is_ncs || su->su_is_external) + continue; + su->comp_restart_cnt = 0; + su->su_restart_cnt = 0; + su->su_err_esc_level = AVND_ERR_ESC_LEVEL_0; + if (m_AVND_SU_OPER_STATE_IS_ENABLED(su)){ + rc = avnd_di_oper_send(cb, su, 0); + continue; + } + // Reset the components states. + for (AVND_COMP *comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET( + m_NCS_DBLIST_FIND_FIRST(&su->comp_list)); + comp; comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET( + m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) { + comp->admin_oper = false; + m_AVND_COMP_STATE_RESET(comp); + avnd_comp_pres_state_set(cb, comp, SA_AMF_PRESENCE_UNINSTANTIATED); + + m_AVND_COMP_OPER_STATE_SET(comp, SA_AMF_OPERATIONAL_ENABLED); + avnd_di_uns32_upd_send(AVSV_SA_AMF_COMP, saAmfCompOperState_ID, + comp->name, comp->oper); + } + + // Reset the SUs states. + if ((su->pres == SA_AMF_PRESENCE_TERMINATION_FAILED) && + (comp_in_term_failed_state() == false)) + avnd_failed_state_file_delete(); + + su->admin_op_Id = static_cast<SaAmfAdminOperationIdT>(0); + reset_suRestart_flag(su); + m_AVND_SU_STATE_RESET(su); + m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_ENABLED); + avnd_di_uns32_upd_send(AVSV_SA_AMF_SU, saAmfSUOperState_ID, su->name, + su->oper); + avnd_su_pres_state_set(cb, su, SA_AMF_PRESENCE_UNINSTANTIATED); + rc = avnd_di_oper_send(cb, su, 0); + } // End of for loop for su + break; + } // End of case SA_AMF_ADMIN_REPAIRED: + default: LOG_NO("%s: unsupported adm op %u", __FUNCTION__, info->oper_id); rc = NCSCC_RC_FAILURE; diff --git a/src/amf/amfnd/err.cc b/src/amf/amfnd/err.cc index aeec373..c549047 100644 --- a/src/amf/amfnd/err.cc +++ b/src/amf/amfnd/err.cc @@ -1135,7 +1135,7 @@ uint32_t avnd_err_rcvr_node_failover(AVND_CB *cb, AVND_SU *failed_su, for (comp = avnd_compdb_rec_get_next(cb->compdb, ""); comp != nullptr; comp = avnd_compdb_rec_get_next(cb->compdb, comp->name)) { if (comp->su->is_ncs || comp->su->su_is_external) continue; - + rc = avnd_comp_curr_info_del(cb, comp); rc = avnd_comp_clc_fsm_run(cb, comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); if (rc != NCSCC_RC_SUCCESS) { LOG_ER("'%s' termination failed", comp->name.c_str()); diff --git a/src/amf/amfnd/su.cc b/src/amf/amfnd/su.cc index d8e388a..cb4928d 100644 --- a/src/amf/amfnd/su.cc +++ b/src/amf/amfnd/su.cc @@ -639,7 +639,7 @@ done: return rc; } -static bool comp_in_term_failed_state(void) { +bool comp_in_term_failed_state(void) { for (AVND_COMP *comp = avnd_compdb_rec_get_next(avnd_cb->compdb, ""); comp != nullptr; comp = avnd_compdb_rec_get_next(avnd_cb->compdb, comp->name)) { -- 2.7.4 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel