Ack.
I think this change needs documentation.
Reported #739 while testing this.

Thanks
Praveen
On 10-Jan-14 2:44 PM, Hans Feldt wrote:
>   osaf/services/saf/amf/amfnd/comp.cc             |  23 +++++++++++++++++++++--
>   osaf/services/saf/amf/amfnd/err.cc              |  15 +++++++++++----
>   osaf/services/saf/amf/amfnd/include/avnd_comp.h |   1 +
>   osaf/services/saf/amf/amfnd/su.cc               |   1 +
>   osaf/services/saf/amf/amfnd/util.cc             |   5 ++---
>   5 files changed, 36 insertions(+), 9 deletions(-)
>
>
> When a node or SU is locked, AMF assigns the QUIESCED HA state to ACTIVE
> sa-aware components. This state is intended to transfer state to a peer
> component assigned STANDBY. If the ACTIVE component fails in the QUIESCED
> state, AMF will restart it and reassign it QUIESCED. This is an invalid state
> transition according to picture 3 in the B.04 AMF spec, and it makes no sense
> since the state is gone anyway (because the component failed).
>
> The problem was likely reintroduced when fixing #3083. Before that fix there
> was special handling for comp errors in the QUIESCED state; however, that
> solution had other problems and was removed. So in the current code there is
> no special handling for errors in the QUIESCED state, which results in
> re-assignment of the QUIESCED HA state after component restart.
>
> This patch changes the error escalation logic so that, when a component with
> a QUIESCED/QUIESCING assignment fails, component failover is performed as the
> recovery action instead of component restart.
>
> diff --git a/osaf/services/saf/amf/amfnd/comp.cc 
> b/osaf/services/saf/amf/amfnd/comp.cc
> --- a/osaf/services/saf/amf/amfnd/comp.cc
> +++ b/osaf/services/saf/amf/amfnd/comp.cc
> @@ -32,9 +32,7 @@
>   
> ******************************************************************************
>   */
>   
> -#include <stdbool.h>
>   #include "avnd.h"
> -#include <stdbool.h>
>   #include <immutil.h>
>   
>   /*** Static function declarations ***/
> @@ -2766,3 +2764,24 @@ void avnd_comp_pres_state_set(AVND_COMP
>       m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, comp, AVND_CKPT_COMP_PRES_STATE);
>   }
>   
> +/**
> + * Returns true if the HA state for any CSI assignment is QUIESCED/QUIESCING
> + * @param comp component whose CSI assignments are checked
> + */
> +bool comp_has_quiesced_assignment(const AVND_COMP *comp)
> +{
> +     const AVND_COMP_CSI_REC *csi;
> +
> +     for (csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(
> +                     m_NCS_DBLIST_FIND_FIRST(&comp->csi_list));
> +             csi != NULL;
> +             csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(
> +                     m_NCS_DBLIST_FIND_NEXT(&csi->comp_dll_node))) {
> +
> +             if ((csi->si->curr_state == SA_AMF_HA_QUIESCED) ||
> +                             (csi->si->curr_state == SA_AMF_HA_QUIESCING))
> +                     return true;
> +     }
> +
> +     return false;
> +}
> diff --git a/osaf/services/saf/amf/amfnd/err.cc 
> b/osaf/services/saf/amf/amfnd/err.cc
> --- a/osaf/services/saf/amf/amfnd/err.cc
> +++ b/osaf/services/saf/amf/amfnd/err.cc
> @@ -411,10 +411,17 @@ uint32_t avnd_err_escalate(AVND_CB *cb,
>       if (*io_esc_rcvr == SA_AMF_NO_RECOMMENDATION)
>               *io_esc_rcvr = comp->err_info.def_rec;
>   
> -     /* disallow comp-restart if it's disabled */
> -     if ((SA_AMF_COMPONENT_RESTART == *io_esc_rcvr) && 
> m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) {
> -             LOG_NO("saAmfCompDisableRestart is true for 
> '%s'",comp->name.value);
> -             *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER;
> +     if (*io_esc_rcvr == SA_AMF_COMPONENT_RESTART) {
> +             if (m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) {
> +                     LOG_NO("saAmfCompDisableRestart is true for 
> '%s'",comp->name.value);
> +                     LOG_NO("recovery action 'comp restart' escalated to 
> 'comp failover'");
> +                     *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER;
> +             } else if (comp_has_quiesced_assignment(comp) == true) {
> +                     /* Cannot re-assign QUIESCED, escalate to failover */
> +                     LOG_NO("component with QUIESCED/QUIESCING assignment 
> failed");
> +                     LOG_NO("recovery action 'comp restart' escalated to 
> 'comp failover'");
> +                     *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER;
> +             }
>       }
>   
>       if ((SA_AMF_COMPONENT_FAILOVER== *io_esc_rcvr) && (su->sufailover) && 
> (!su->is_ncs)) {
> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_comp.h 
> b/osaf/services/saf/amf/amfnd/include/avnd_comp.h
> --- a/osaf/services/saf/amf/amfnd/include/avnd_comp.h
> +++ b/osaf/services/saf/amf/amfnd/include/avnd_comp.h
> @@ -876,6 +876,7 @@ extern unsigned int avnd_comp_config_get
>   extern int avnd_comp_config_reinit(AVND_COMP *comp);
>   extern void avnd_comp_delete(AVND_COMP *comp);
>   extern void avnd_comp_pres_state_set(AVND_COMP *comp, SaAmfPresenceStateT 
> newstate);
> +bool comp_has_quiesced_assignment(const AVND_COMP *comp);
>   
>   /**
>    * Initiate restart of a component.
> diff --git a/osaf/services/saf/amf/amfnd/su.cc 
> b/osaf/services/saf/amf/amfnd/su.cc
> --- a/osaf/services/saf/amf/amfnd/su.cc
> +++ b/osaf/services/saf/amf/amfnd/su.cc
> @@ -481,6 +481,7 @@ uint32_t avnd_evt_su_admin_op_req(AVND_C
>               AVND_COMP *comp;
>   
>               /* SU has been repaired. Reset states and update AMF director 
> accordingly. */
> +             LOG_NO("Repair request for '%s'", su->name.value);
>   
>               for (comp = 
> m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
>                     comp;
> diff --git a/osaf/services/saf/amf/amfnd/util.cc 
> b/osaf/services/saf/amf/amfnd/util.cc
> --- a/osaf/services/saf/amf/amfnd/util.cc
> +++ b/osaf/services/saf/amf/amfnd/util.cc
> @@ -236,9 +236,8 @@ void avnd_failed_state_file_create(void)
>    */
>   void avnd_failed_state_file_delete(void)
>   {
> -     if (unlink(failed_state_file_name) == -1)
> -             LOG_ER("cannot unlink failed state file %s: %s",
> -                             failed_state_file_name, strerror(errno));
> +     // file might not exist in some cases, ignore errors
> +     (void) unlink(failed_state_file_name);
>   }
>   
>   /**


------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to