src/amf/amfnd/avnd_su.h | 1 + src/amf/amfnd/clc.cc | 3 --- src/amf/amfnd/di.cc | 12 +++++++++++- src/amf/amfnd/susm.cc | 32 +++++++++++++++++++++++++++++--- 4 files changed, 41 insertions(+), 7 deletions(-)
In case of a component failover, the faulty component is terminated. When its reinstantiation is done, amfnd sends su_oper_message (enabled) to amfd, concurrently with the ongoing component failover. In the reported problem, if su_oper_message (enabled) reaches amfd before the quiesced assignment response (which is part of the component failover sequence), that quiesced assignment response is ignored, and thus the component failover never finishes. The problem is in function susi_success_sg_realign with act=5, state=3: amfd always assumes that the su containing the faulty component is OUT_OF_SERVICE. This assumption holds most of the time, namely when su_oper_message (enabled) arrives a little later than the quiesced assignment response. In fact, su_oper_message (enabled) is not designed as part of the component failover sequence, so it can arrive at any time during the failover. If amfd gets a bit busier with RTA updates, the faulty component has enough time to reinstantiate, so amfnd sends su_oper_message (enabled) before the quiesced assignment response, and the reported problem is seen. This patch hardens the component failover sequence by ensuring that su_oper_message (enabled) is sent only after the su has finished removing its assignments. This approach follows the similar handling in su failover, where su_oper_message (enabled) is sent in the repair phase. 
diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h --- a/src/amf/amfnd/avnd_su.h +++ b/src/amf/amfnd/avnd_su.h @@ -393,6 +393,7 @@ extern struct avnd_su_si_rec *avnd_silis extern struct avnd_su_si_rec *avnd_silist_getprev(const struct avnd_su_si_rec *); extern struct avnd_su_si_rec *avnd_silist_getlast(void); extern bool sufailover_in_progress(const AVND_SU *su); +extern bool componentfailover_in_progress(const AVND_SU *su); extern bool sufailover_during_nodeswitchover(const AVND_SU *su); extern bool all_csis_in_removed_state(const AVND_SU *su); extern void su_reset_restart_count_in_comps(const struct avnd_cb_tag *cb, const AVND_SU *su); diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc --- a/src/amf/amfnd/clc.cc +++ b/src/amf/amfnd/clc.cc @@ -2381,9 +2381,6 @@ uint32_t avnd_comp_clc_terming_cleansucc (m_AVND_SU_IS_FAILOVER(su))) { /* yes, request director to orchestrate component failover */ rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER); - - //Reset component-failover here. SU failover is reset as part of REPAIRED admin op. 
- m_AVND_SU_FAILOVER_RESET(su); } /* diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc --- a/src/amf/amfnd/di.cc +++ b/src/amf/amfnd/di.cc @@ -894,7 +894,17 @@ uint32_t avnd_di_susi_resp_send(AVND_CB } m_AVND_SU_ALL_SI_RESET(su); } - + if (componentfailover_in_progress(su)) { + if (all_csis_in_removed_state(su) == true) { + bool is_en; + m_AVND_SU_IS_ENABLED(su, is_en); + if (is_en) { + if (avnd_di_oper_send(cb, su, 0) == NCSCC_RC_SUCCESS) { + m_AVND_SU_FAILOVER_RESET(su); + } + } + } + } /* free the contents of avnd message */ avnd_msg_content_free(cb, &msg); diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc --- a/src/amf/amfnd/susm.cc +++ b/src/amf/amfnd/susm.cc @@ -1633,10 +1633,22 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C m_AVND_SU_IS_ENABLED(su, is_en); if (true == is_en) { TRACE("SU oper state is enabled"); + // do not send su_oper state if component failover is in progress m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_ENABLED); - rc = avnd_di_oper_send(cb, su, 0); - if (NCSCC_RC_SUCCESS != rc) - goto done; + if (componentfailover_in_progress(su) == true) { + si = reinterpret_cast<AVND_SU_SI_REC*> + (m_NCS_DBLIST_FIND_FIRST(&su->si_list)); + if (si == nullptr || all_csis_in_removed_state(su)) { + rc = avnd_di_oper_send(cb, su, 0); + if (rc != NCSCC_RC_SUCCESS) + goto done; + m_AVND_SU_FAILOVER_RESET(su); + } + } else { + rc = avnd_di_oper_send(cb, su, 0); + if (NCSCC_RC_SUCCESS != rc) + goto done; + } } else TRACE("SU oper state is disabled"); @@ -3551,6 +3563,20 @@ bool sufailover_in_progress(const AVND_S } /** + * This function checks if the componentfailover is going on. + * @param su: ptr to the SU . + * + * @return true/false. 
+ */ +bool componentfailover_in_progress(const AVND_SU *su) { + if ((su->sufailover == false) && (!m_AVND_SU_IS_RESTART(su)) && + (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) && (!su->is_ncs) && + m_AVND_SU_IS_FAILOVER(su)) + return true; + return false; +} + +/** * This function checks if the sufailover and node switchover are going on. * @param su: ptr to the SU . * ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel