Hi all,
Have you had time to review this patch?
It changes the component failover sequence, so I think we need more time
to look at it.
Thanks,
Minh
On 23/01/17 12:28, Minh Hon Chau wrote:
> src/amf/amfnd/avnd_su.h | 1 +
> src/amf/amfnd/clc.cc | 3 ---
> src/amf/amfnd/di.cc | 12 +++++++++++-
> src/amf/amfnd/susm.cc | 32 +++++++++++++++++++++++++++++---
> 4 files changed, 41 insertions(+), 7 deletions(-)
>
>
> In case of component failover, the faulty component will be terminated. When the
> reinstantiation
> is done, amfnd will send su_oper_message (enabled) to amfd which is running
> along with
> component failover. In the reported problem, if su_oper_message (enabled)
> comes to amfd
> before the quiesced assignment response (as part of component failover
> sequence) comes to
> amfd, then this quiesced assignment response is ignored, thus component
> failover will not
> finish.
>
> The problem is in function susi_success_sg_realign with act=5, state=3, amfd
> always assumes
> su having faulty component is OUT_OF_SERVICE. This assumption is true most
> of the time
> when su_oper_message (enabled) comes a little later than quiesced assignment
> response. In fact
> the su_oper_message (enabled) is not designed as part of component failover
> sequence, thus it
> can come any time during the failover. If amfd is getting a bit busier with
> RTA update then
> the faulty component has enough time to reinstantiate so that amfnd sends
> su_oper_message (enabled)
> before quiesced assignment response, the reported problem will be seen.
>
> This patch hardens the component failover sequence by ensuring that the
> su_oper_message (enabled) is
> sent after the su completes removing its assignment. This approach comes from the
> similarity in
> su failover, where the su_oper_message (enabled) is sent in repair phase.
>
> diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h
> --- a/src/amf/amfnd/avnd_su.h
> +++ b/src/amf/amfnd/avnd_su.h
> @@ -393,6 +393,7 @@ extern struct avnd_su_si_rec *avnd_silis
> extern struct avnd_su_si_rec *avnd_silist_getprev(const struct
> avnd_su_si_rec *);
> extern struct avnd_su_si_rec *avnd_silist_getlast(void);
> extern bool sufailover_in_progress(const AVND_SU *su);
> +extern bool componentfailover_in_progress(const AVND_SU *su);
> extern bool sufailover_during_nodeswitchover(const AVND_SU *su);
> extern bool all_csis_in_removed_state(const AVND_SU *su);
> extern void su_reset_restart_count_in_comps(const struct avnd_cb_tag *cb,
> const AVND_SU *su);
> diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
> --- a/src/amf/amfnd/clc.cc
> +++ b/src/amf/amfnd/clc.cc
> @@ -2381,9 +2381,6 @@ uint32_t avnd_comp_clc_terming_cleansucc
> (m_AVND_SU_IS_FAILOVER(su))) {
> /* yes, request director to orchestrate component failover */
> rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
> -
> - //Reset component-failover here. SU failover is reset as part
> of REPAIRED admin op.
> - m_AVND_SU_FAILOVER_RESET(su);
> }
>
> /*
> diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc
> --- a/src/amf/amfnd/di.cc
> +++ b/src/amf/amfnd/di.cc
> @@ -894,7 +894,17 @@ uint32_t avnd_di_susi_resp_send(AVND_CB
> }
> m_AVND_SU_ALL_SI_RESET(su);
> }
> -
> + if (componentfailover_in_progress(su)) {
> + if (all_csis_in_removed_state(su) == true) {
> + bool is_en;
> + m_AVND_SU_IS_ENABLED(su, is_en);
> + if (is_en) {
> + if (avnd_di_oper_send(cb, su, 0) ==
> NCSCC_RC_SUCCESS) {
> + m_AVND_SU_FAILOVER_RESET(su);
> + }
> + }
> + }
> + }
> /* free the contents of avnd message */
> avnd_msg_content_free(cb, &msg);
>
> diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc
> --- a/src/amf/amfnd/susm.cc
> +++ b/src/amf/amfnd/susm.cc
> @@ -1633,10 +1633,22 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C
> m_AVND_SU_IS_ENABLED(su, is_en);
> if (true == is_en) {
> TRACE("SU oper state is enabled");
> + // do not send su_oper state if component
> failover is in progress
> m_AVND_SU_OPER_STATE_SET(su,
> SA_AMF_OPERATIONAL_ENABLED);
> - rc = avnd_di_oper_send(cb, su, 0);
> - if (NCSCC_RC_SUCCESS != rc)
> - goto done;
> + if (componentfailover_in_progress(su) == true) {
> + si = reinterpret_cast<AVND_SU_SI_REC*>
> +
> (m_NCS_DBLIST_FIND_FIRST(&su->si_list));
> + if (si == nullptr ||
> all_csis_in_removed_state(su)) {
> + rc = avnd_di_oper_send(cb, su,
> 0);
> + if (rc != NCSCC_RC_SUCCESS)
> + goto done;
> + m_AVND_SU_FAILOVER_RESET(su);
> + }
> + } else {
> + rc = avnd_di_oper_send(cb, su, 0);
> + if (NCSCC_RC_SUCCESS != rc)
> + goto done;
> + }
> }
> else
> TRACE("SU oper state is disabled");
> @@ -3551,6 +3563,20 @@ bool sufailover_in_progress(const AVND_S
> }
>
> /**
> + * This function checks if the componentfailover is going on.
> + * @param su: ptr to the SU .
> + *
> + * @return true/false.
> + */
> +bool componentfailover_in_progress(const AVND_SU *su) {
> + if ((su->sufailover == false) && (!m_AVND_SU_IS_RESTART(su)) &&
> + (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) &&
> (!su->is_ncs) &&
> + m_AVND_SU_IS_FAILOVER(su))
> + return true;
> + return false;
> +}
> +
> +/**
> * This function checks if the sufailover and node switchover are going on.
> * @param su: ptr to the SU .
> *
>
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel