Ack. I think this change needs documentation. Reported #739 while testing this.
Thanks Praveen On 10-Jan-14 2:44 PM, Hans Feldt wrote: > osaf/services/saf/amf/amfnd/comp.cc | 23 > +++++++++++++++++++++-- > osaf/services/saf/amf/amfnd/err.cc | 15 +++++++++++---- > osaf/services/saf/amf/amfnd/include/avnd_comp.h | 1 + > osaf/services/saf/amf/amfnd/su.cc | 1 + > osaf/services/saf/amf/amfnd/util.cc | 5 ++--- > 5 files changed, 36 insertions(+), 9 deletions(-) > > > When a node or SU is locked, AMF assigns the QUIESCED HA state to ACTIVE > sa-aware components. This state is intended to transfer state to a peer > component assigned STANDBY. If the ACTIVE component fails in the QUIESCED > state, AMF will restart it and reassign it QUIESCED. This is an invalid state > transition according to picture 3 in the B.04 AMF spec and makes no sense > since > the state is gone anyway (because the component failed). > > The problem was likely reintroduced when fixing #3083; before that, special > handling for comp errors in QUIESCED state was there. However, that solution had > other problems and was removed. So in the current code there is no special > handling for errors in QUIESCED state, which results in re-assignment of the > QUIESCED HA state after component restart. > > This patch changes the error escalation logic so that instead of component > restart, component failover is performed as the recovery action. 
> > diff --git a/osaf/services/saf/amf/amfnd/comp.cc > b/osaf/services/saf/amf/amfnd/comp.cc > --- a/osaf/services/saf/amf/amfnd/comp.cc > +++ b/osaf/services/saf/amf/amfnd/comp.cc > @@ -32,9 +32,7 @@ > > ****************************************************************************** > */ > > -#include <stdbool.h> > #include "avnd.h" > -#include <stdbool.h> > #include <immutil.h> > > /*** Static function declarations ***/ > @@ -2766,3 +2764,24 @@ void avnd_comp_pres_state_set(AVND_COMP > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, comp, AVND_CKPT_COMP_PRES_STATE); > } > > +/** > + * Returns true if the HA state for any CSI assignment is QUIESCED/QUIESCING > + * @param su > + */ > +bool comp_has_quiesced_assignment(const AVND_COMP *comp) > +{ > + const AVND_COMP_CSI_REC *csi; > + > + for (csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET( > + m_NCS_DBLIST_FIND_FIRST(&comp->csi_list)); > + csi != NULL; > + csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET( > + m_NCS_DBLIST_FIND_NEXT(&csi->comp_dll_node))) { > + > + if ((csi->si->curr_state == SA_AMF_HA_QUIESCED) || > + (csi->si->curr_state == SA_AMF_HA_QUIESCING)) > + return true; > + } > + > + return false; > +} > diff --git a/osaf/services/saf/amf/amfnd/err.cc > b/osaf/services/saf/amf/amfnd/err.cc > --- a/osaf/services/saf/amf/amfnd/err.cc > +++ b/osaf/services/saf/amf/amfnd/err.cc > @@ -411,10 +411,17 @@ uint32_t avnd_err_escalate(AVND_CB *cb, > if (*io_esc_rcvr == SA_AMF_NO_RECOMMENDATION) > *io_esc_rcvr = comp->err_info.def_rec; > > - /* disallow comp-restart if it's disabled */ > - if ((SA_AMF_COMPONENT_RESTART == *io_esc_rcvr) && > m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) { > - LOG_NO("saAmfCompDisableRestart is true for > '%s'",comp->name.value); > - *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER; > + if (*io_esc_rcvr == SA_AMF_COMPONENT_RESTART) { > + if (m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) { > + LOG_NO("saAmfCompDisableRestart is true for > '%s'",comp->name.value); > + LOG_NO("recovery action 'comp restart' 
escalated to > 'comp failover'"); > + *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER; > + } else if (comp_has_quiesced_assignment(comp) == true) { > + /* Cannot re-assign QUIESCED, escalate to failover */ > + LOG_NO("component with QUIESCED/QUIESCING assignment > failed"); > + LOG_NO("recovery action 'comp restart' escalated to > 'comp failover'"); > + *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER; > + } > } > > if ((SA_AMF_COMPONENT_FAILOVER== *io_esc_rcvr) && (su->sufailover) && > (!su->is_ncs)) { > diff --git a/osaf/services/saf/amf/amfnd/include/avnd_comp.h > b/osaf/services/saf/amf/amfnd/include/avnd_comp.h > --- a/osaf/services/saf/amf/amfnd/include/avnd_comp.h > +++ b/osaf/services/saf/amf/amfnd/include/avnd_comp.h > @@ -876,6 +876,7 @@ extern unsigned int avnd_comp_config_get > extern int avnd_comp_config_reinit(AVND_COMP *comp); > extern void avnd_comp_delete(AVND_COMP *comp); > extern void avnd_comp_pres_state_set(AVND_COMP *comp, SaAmfPresenceStateT > newstate); > +bool comp_has_quiesced_assignment(const AVND_COMP *comp); > > /** > * Initiate restart of a component. > diff --git a/osaf/services/saf/amf/amfnd/su.cc > b/osaf/services/saf/amf/amfnd/su.cc > --- a/osaf/services/saf/amf/amfnd/su.cc > +++ b/osaf/services/saf/amf/amfnd/su.cc > @@ -481,6 +481,7 @@ uint32_t avnd_evt_su_admin_op_req(AVND_C > AVND_COMP *comp; > > /* SU has been repaired. Reset states and update AMF director > accordingly. 
*/ > + LOG_NO("Repair request for '%s'", su->name.value); > > for (comp = > m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list)); > comp; > diff --git a/osaf/services/saf/amf/amfnd/util.cc > b/osaf/services/saf/amf/amfnd/util.cc > --- a/osaf/services/saf/amf/amfnd/util.cc > +++ b/osaf/services/saf/amf/amfnd/util.cc > @@ -236,9 +236,8 @@ void avnd_failed_state_file_create(void) > */ > void avnd_failed_state_file_delete(void) > { > - if (unlink(failed_state_file_name) == -1) > - LOG_ER("cannot unlink failed state file %s: %s", > - failed_state_file_name, strerror(errno)); > + // file might not exist in some cases, ignore errors > + (void) unlink(failed_state_file_name); > } > > /** ------------------------------------------------------------------------------ CenturyLink Cloud: The Leader in Enterprise Cloud Services. Learn Why More Businesses Are Choosing CenturyLink Cloud For Critical Workloads, Development Environments & Everything In Between. Get a Quote or Start a Free Trial Today. http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel