Hi,

I have been testing this patch for PI SUs.
The patch works fine for component failover of PI SUs.
However, when the recovery policy is configured as nodeswitchover, the patch
causes the node to be rebooted before the assignments are removed. Thus the
repair is being done before the recovery.
This happens because the cleanup script is also invoked during node
switchover escalation. When the cleanup completes successfully, AMFND
requests a component failover from AMFD, assuming that the cleanup was
part of a component failover. This should not be done in the node
switchover case.
Below is the modification to the if condition needed to fix it. Here the
oper state of avnd is also checked to differentiate between nodeswitchover
and component failover:

  /* determine if this is a case of component failover */
         if (m_AVND_COMP_IS_FAILED(comp) && m_AVND_SU_IS_FAILED(su) &&
                         m_AVND_SU_IS_PREINSTANTIABLE(su) && 
(su->sufailover == false) &&
                         (avnd_cb->oper_state != 
SA_AMF_OPERATIONAL_DISABLED) ) {
                 /* yes, request director to orchestrate component 
failover */
                 rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
         }

Will test with NPI SUs.
Thanks,
Praveen



On 14-Feb-14 6:43 PM, Hans Feldt wrote:
>   osaf/services/saf/amf/amfnd/clc.cc            |  9 +++++++++
>   osaf/services/saf/amf/amfnd/di.cc             |  2 +-
>   osaf/services/saf/amf/amfnd/err.cc            |  9 +++++----
>   osaf/services/saf/amf/amfnd/include/avnd_di.h |  2 +-
>   4 files changed, 16 insertions(+), 6 deletions(-)
>
>
> If a component error is detected and the recovery action is 
> COMPONENT_FAILOVER,
> it is possible that a standby component gets the active assignment before the
> erroneous component has been terminated. This can cause a split brain on
> application level.
>
> The reason for this is that when the error is detected amfnd starts two
> parallel activities, component cleanup and inform director. When the director
> receives the information it starts the process of failing over the workload
> of the erroneous component.
>
> This patch informs the director after successful termination has been 
> performed.
>
> diff --git a/osaf/services/saf/amf/amfnd/clc.cc 
> b/osaf/services/saf/amf/amfnd/clc.cc
> --- a/osaf/services/saf/amf/amfnd/clc.cc
> +++ b/osaf/services/saf/amf/amfnd/clc.cc
> @@ -2024,6 +2024,7 @@ uint32_t avnd_comp_clc_terming_termfail_
>   
> ******************************************************************************/
>   uint32_t avnd_comp_clc_terming_cleansucc_hdler(AVND_CB *cb, AVND_COMP *comp)
>   {
> +     const AVND_SU *su = comp->su;
>       uint32_t rc = NCSCC_RC_SUCCESS;
>       TRACE_ENTER2("'%s': Cleanup success event in the terminating state", 
> comp->name.value);
>   
> @@ -2074,6 +2075,14 @@ uint32_t avnd_comp_clc_terming_cleansucc
>               m_AVND_COMP_REG_PARAM_RESET(cb, comp);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, comp, 
> AVND_CKPT_COMP_CONFIG);
>       }
> +
> +     /* determine if this is a case of component failover */
> +     if (m_AVND_COMP_IS_FAILED(comp) && m_AVND_SU_IS_FAILED(su) &&
> +                     m_AVND_SU_IS_PREINSTANTIABLE(su) && (su->sufailover == 
> false)) {
> +             /* yes, request director to orchestrate component failover */
> +             rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
> +     }
> +
>       TRACE_LEAVE();
>       return rc;
>   }
> diff --git a/osaf/services/saf/amf/amfnd/di.cc 
> b/osaf/services/saf/amf/amfnd/di.cc
> --- a/osaf/services/saf/amf/amfnd/di.cc
> +++ b/osaf/services/saf/amf/amfnd/di.cc
> @@ -476,7 +476,7 @@ uint32_t avnd_evt_mds_avd_dn_evh(AVND_CB
>    
>     Notes         : None.
>   
> ******************************************************************************/
> -uint32_t avnd_di_oper_send(AVND_CB *cb, AVND_SU *su, uint32_t rcvr)
> +uint32_t avnd_di_oper_send(AVND_CB *cb, const AVND_SU *su, uint32_t rcvr)
>   {
>       AVND_MSG msg;
>       uint32_t rc = NCSCC_RC_SUCCESS;
> diff --git a/osaf/services/saf/amf/amfnd/err.cc 
> b/osaf/services/saf/amf/amfnd/err.cc
> --- a/osaf/services/saf/amf/amfnd/err.cc
> +++ b/osaf/services/saf/amf/amfnd/err.cc
> @@ -702,9 +702,6 @@ uint32_t avnd_err_rcvr_comp_failover(AVN
>       m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_DISABLED);
>       m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, AVND_CKPT_SU_OPER_STATE);
>   
> -     /* inform AvD */
> -     rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
> -
>       /*
>        *  su-sis may be in assigning/removing state. signal csi
>        * assign/remove done so that su-si assignment/removal algo can proceed.
> @@ -722,11 +719,15 @@ uint32_t avnd_err_rcvr_comp_failover(AVN
>       if (NCSCC_RC_SUCCESS != rc)
>               goto done;
>   
> -     /* clean the failed comp */
> +     // TODO: there should be no difference between PI/NPI comps
>       if (m_AVND_SU_IS_PREINSTANTIABLE(su)) {
> +             /* clean the failed comp */
>               rc = avnd_comp_clc_fsm_run(cb, failed_comp, 
> AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
>               if (NCSCC_RC_SUCCESS != rc)
>                       goto done;
> +     } else  {
> +             /* request director to orchestrate component failover */
> +             rc = avnd_di_oper_send(cb, failed_comp->su, 
> AVSV_ERR_RCVR_SU_FAILOVER);
>       }
>   
>    done:
> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_di.h 
> b/osaf/services/saf/amf/amfnd/include/avnd_di.h
> --- a/osaf/services/saf/amf/amfnd/include/avnd_di.h
> +++ b/osaf/services/saf/amf/amfnd/include/avnd_di.h
> @@ -68,7 +68,7 @@
>   
>   struct avnd_cb_tag;
>   
> -uint32_t avnd_di_oper_send(struct avnd_cb_tag *, AVND_SU *, uint32_t);
> +uint32_t avnd_di_oper_send(struct avnd_cb_tag *, const AVND_SU *, uint32_t);
>   uint32_t avnd_di_susi_resp_send(struct avnd_cb_tag *, AVND_SU *, 
> AVND_SU_SI_REC *);
>   uint32_t avnd_di_object_upd_send(struct avnd_cb_tag *, AVSV_PARAM_INFO *);
>   uint32_t avnd_di_pg_act_send(struct avnd_cb_tag *, SaNameT *, 
> AVSV_PG_TRACK_ACT, bool);
>
> ------------------------------------------------------------------------------
> Android apps run on BlackBerry 10
> Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
> Now with support for Jelly Bean, Bluetooth, Mapview and more.
> Get your Android app in front of a whole new audience.  Start now.
> http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
> _______________________________________________
> Opensaf-devel mailing list
> Opensaf-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/opensaf-devel


------------------------------------------------------------------------------
Managing the Performance of Cloud-Based Applications
Take advantage of what the Cloud has to offer - Avoid Common Pitfalls.
Read the Whitepaper.
http://pubads.g.doubleclick.net/gampad/clk?id=121054471&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to