Ack, minor comments inline. Not tested
/Hans

> -----Original Message-----
> From: praveen.malv...@oracle.com [mailto:praveen.malv...@oracle.com]
> Sent: den 9 juni 2014 16:09
> To: Hans Feldt; nagendr...@oracle.com; Hans Nordebäck
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: [PATCH 1 of 1] amf : perform node-switchover after component cleanup 
> [#387]
> 
>  osaf/services/saf/amf/amfd/sgproc.cc |  153 
> ++++++++++++++++------------------
>  osaf/services/saf/amf/amfnd/err.cc   |    8 +-
>  osaf/services/saf/amf/amfnd/susm.cc  |   58 ++++++++++++-
>  3 files changed, 131 insertions(+), 88 deletions(-)
> 
> 
> AMF performs node-switchover before the clean up of components.
> 
> AMFND sends node-switchover request to  AMFD before initiating cleanup
> of failed component. Because of this standby becomes active before
> cleanup of components.
> 
> Send node-switchover request to AMFD after cleanup of failed component.
> 
> diff --git a/osaf/services/saf/amf/amfd/sgproc.cc 
> b/osaf/services/saf/amf/amfd/sgproc.cc
> --- a/osaf/services/saf/amf/amfd/sgproc.cc
> +++ b/osaf/services/saf/amf/amfd/sgproc.cc
> @@ -368,7 +368,73 @@ static uint32_t su_recover_from_fault(AV
>       return rc;
>  }
> 
> +/**
> + * @brief       Try to repair node by sending reboot message to node
> + *              director of the node.
> + *
> + * @param[in]   ptr to node.
> + *
> + **/
> 
> +static void try_node_repair(AVD_AVND *node)
> +{
> +     TRACE_ENTER2("'%s'", node->name.value);
> +
> +     if (node->saAmfNodeAutoRepair) {
> +             LOG_NO("Ordering reboot of '%s' as node fail/switch-over repair 
> action",
> +                             node->name.value);
> +             saflog(LOG_NOTICE, amfSvcUsrName,
> +                     "Ordering reboot of '%s' as node fail/switch-over 
> repair action",
> +                     node->name.value);
> +             avd_d2n_reboot_snd(node);
> +     } else {
> +             LOG_NO("NodeAutorepair disabled for '%s', no reboot ordered",
> +                             node->name.value);
> +             saflog(LOG_NOTICE, amfSvcUsrName,
> +                     "NodeAutorepair disabled for '%s', NO reboot ordered",
> +                     node->name.value);
> +
> +     }
> +     TRACE_LEAVE();
> +}
> +
> +/**
> + * @brief       Performs node-switchover recovery.
> + *
> + * @param[in]   ptr to failed su.
> + *
> + **/
> +static void node_nodeswtichover_recovery(AVD_AVND *node)
[Hans] wrong spelling in name..., I am missing a verb in the function name, 
perform?

> +{
> +     bool node_reboot = true;
> +     TRACE_ENTER2("'%s'", node->name.value);
> +
> +     AVD_SU *i_su = node->list_of_su;
[Hans] isn't it enough with just "su" as var name?

> +     for (;i_su != NULL; i_su = i_su->avnd_list_su_next) {
> +             i_su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE);
> +
> +             if (i_su->list_of_susi == NULL)
> +                     continue;
> +
> +             if (su_recover_from_fault(i_su) == NCSCC_RC_FAILURE) {
> +                     LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, 
> i_su->name.value);
> +                     goto done;
> +             }
> +
> +             if (i_su->list_of_susi != NULL)
> +                     node_reboot = false;
> +
> +             if (avd_sg_app_su_inst_func(avd_cb, i_su->sg_of_su) == 
> NCSCC_RC_FAILURE) {
> +                     LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, 
> i_su->name.value);
> +                     goto done;
> +             }
> +     }
> +
> +     if (node_reboot == true)
> +             try_node_repair(node);
> +done:
> +     TRACE_LEAVE();
> +}
>  
> /*****************************************************************************
>   * Function: avd_su_oper_state_func
>   *
> @@ -532,81 +598,19 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb
>                                               
> avd_node_down_appl_susi_failover(avd_cb, node);
>                                               break;
>                                       }
> +                                     break;
>                               case SA_AMF_NODE_SWITCHOVER:
> -                                     i_su = node->list_of_su;
> -                                     while (i_su != NULL) {
> -                                             
> i_su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE);
> -                                             if (i_su->list_of_susi != 
> AVD_SU_SI_REL_NULL) {
> -                                                     /* Delay Node reboot if:
> -                                                             a)Faulted SU 
> has saAmfSUFailover set but
> -                                                                     other 
> healthy SUs are present on node.
> -                                                             b)Only faulted 
> SU exists on the node and its
> -                                                                     
> saAmfSUFailover is false.
> -                                                      */
> -                                                     node_reboot_req = false;
> -                                                     if (((i_su == su) && 
> (!i_su->saAmfSUFailover)) ||
> -                                                                     ((i_su 
> != su) &&
> -                                                              
> (i_su->saAmfSUOperState == SA_AMF_OPERATIONAL_ENABLED))) {
> -
> -                                                             /* Since 
> assignments exists call the SG FSM.*/
> -                                                             if 
> (su_recover_from_fault(i_su) == NCSCC_RC_FAILURE) {
> -                                                                     
> LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, i_su-
> >name.value);
> -                                                                     goto 
> done;
> -                                                             }
> -                                                     }
> -                                             }
> -
> -                                             /* Verify the SG to check if 
> any instantiations need
> -                                              * to be done for the SG on 
> which this SU exists.
> -                                              */
> -                                             if ((!su->saAmfSUFailover))
> -                                                     if 
> (avd_sg_app_su_inst_func(cb, i_su->sg_of_su) == NCSCC_RC_FAILURE) {
> -                                                             LOG_ER("%s:%d 
> %s", __FUNCTION__, __LINE__, i_su->name.value);
> -                                                             goto done;
> -                                                     }
> -
> -                                             i_su = i_su->avnd_list_su_next;
> -                                     }
> -                                     break;
> -                             case AVSV_ERR_RCVR_SU_FAILOVER:
> -                                     /* This is a case when node switchover 
> and su failover are happening together.
> -                                        Reboot node only:
> -                                        a) After gracefully removing all the 
> assignments (SUs with saAmfSUFailover set).
> -                                        b) After termination of all 
> components in faulted SU and after performing its failover
> -                                             as a single entity 
> (saAmfSUFailover set).
> -                                      */
> -                                     if (su_recover_from_fault(su) == 
> NCSCC_RC_FAILURE) {
> -                                             LOG_ER("%s:%d %s", 
> __FUNCTION__, __LINE__, su->name.value);
> -                                             goto done;
> -                                     }
> -                                     if (avd_sg_app_su_inst_func(cb, 
> su->sg_of_su) == NCSCC_RC_FAILURE) {
> -                                             LOG_ER("%s:%d %s", 
> __FUNCTION__, __LINE__, su->name.value);
> -                                             goto done;
> -                                     }
> -                                     for (i_su = node->list_of_su; i_su; 
> i_su = i_su->avnd_list_su_next) {
> -                                             if (i_su->list_of_susi != 
> AVD_SU_SI_REL_NULL) {
> -                                                     node_reboot_req = false;
> -                                                     break;
> -                                             }
> -                                     }
> +                                     
> node_nodeswtichover_recovery(su->su_on_node);
> +                                     goto done;
>                                       break;
>                               default :
>                                       break;
>                               }
> 
> -                             if (node_reboot_req) {
> -                                     if (node->saAmfNodeAutoRepair) {
> -                                             saflog(LOG_NOTICE, 
> amfSvcUsrName,
> -                                                             "Ordering 
> reboot of '%s' as node fail/switch-over repair action",
> -                                                             
> node->name.value);
> -                                             avd_d2n_reboot_snd(node);
> -                                     } else {
> -                                             saflog(LOG_NOTICE, 
> amfSvcUsrName,
> -                                                             "NodeAutorepair 
> disabled for '%s', NO reboot ordered",
> -                                                             
> node->name.value);
> -
> -                                     }
> -                             }
> +                             if (node_reboot_req)
> +                                     try_node_repair(node);
> +
> +
>                       } else { /* if 
> (n2d_msg->msg_info.n2d_opr_state.node_oper_state == 
> SA_AMF_OPERATIONAL_DISABLED) */
> 
>                               if (su->list_of_susi != AVD_SU_SI_REL_NULL) {
> @@ -1235,16 +1239,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb,
>               }
>               if (true == all_su_unassigned) {
>                       /* All app su got unassigned, Safe to reboot the blade 
> now. */
> -                     if (node->saAmfNodeAutoRepair) {
> -                             saflog(LOG_NOTICE, amfSvcUsrName,
> -                                     "Ordering reboot of '%s' as node 
> fail/switch-over repair action",
> -                                     node->name.value);
> -                             avd_d2n_reboot_snd(node);
> -                     } else {
> -                             saflog(LOG_NOTICE, amfSvcUsrName,
> -                                     "NodeAutorepair disabled for '%s', NO 
> reboot ordered",
> -                                     node->name.value);
> -                     }
> +                     try_node_repair(node);
>               }
>       }
>       /* Free the messages */
> diff --git a/osaf/services/saf/amf/amfnd/err.cc 
> b/osaf/services/saf/amf/amfnd/err.cc
> --- a/osaf/services/saf/amf/amfnd/err.cc
> +++ b/osaf/services/saf/amf/amfnd/err.cc
> @@ -814,7 +814,9 @@ uint32_t avnd_err_rcvr_node_switchover(A
>               m_AVND_SU_FAILED_SET(failed_su);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, failed_su, 
> AVND_CKPT_SU_FLAG_CHANGE);
>       }
> +
>       cb->term_state = AVND_TERM_STATE_NODE_SWITCHOVER_STARTED;
> +     cb->failed_su = failed_su;
> 
>       /* transition the su oper state to disabled */
>       m_AVND_SU_OPER_STATE_SET(failed_su, SA_AMF_OPERATIONAL_DISABLED);
> @@ -825,11 +827,6 @@ uint32_t avnd_err_rcvr_node_switchover(A
>       if(SA_AMF_OPERATIONAL_DISABLED != cb->oper_state) {
>               /* transition the node oper state to disabled */
>               cb->oper_state = SA_AMF_OPERATIONAL_DISABLED;
> -
> -             /* inform avd */
> -             rc = avnd_di_oper_send(cb, failed_su, SA_AMF_NODE_SWITCHOVER);
> -             if (NCSCC_RC_SUCCESS != rc)
> -                     goto done;
>       }
> 
>       /* We are now in the context of failover, forget the restart */
> @@ -865,7 +862,6 @@ uint32_t avnd_err_rcvr_node_switchover(A
>                               goto done;
>                       }
>               }
> -             avnd_su_si_del(cb, &failed_comp->su->name);
>       }
>       else {
>               /* terminate the failed comp */
> diff --git a/osaf/services/saf/amf/amfnd/susm.cc 
> b/osaf/services/saf/amf/amfnd/susm.cc
> --- a/osaf/services/saf/amf/amfnd/susm.cc
> +++ b/osaf/services/saf/amf/amfnd/susm.cc
> @@ -1346,6 +1346,46 @@ static bool all_comps_terminated_in_su(c
>         return true;
>  }
> 
> +static void perform_pending_nodeswitchover()
> +{
> +     bool nodeswitchover = true;
> +
> +     /* Reverify if nodeswitchover is really pending */
> +     if ((avnd_cb->term_state != AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) ||
> +                     (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED))
> +             return;
> +
> +     AVND_COMP *comp;
> +     AVND_SU *su = avnd_cb->failed_su;
> +     for (comp = 
> m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
> +                     comp;
> +             comp = 
> m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) 
> {
> +
> +             if ((comp->pres == SA_AMF_PRESENCE_INSTANTIATING) ||
> +                             (comp->pres == SA_AMF_PRESENCE_TERMINATING) ||
> +                             (comp->pres == SA_AMF_PRESENCE_RESTARTING) ||
> +                             (comp->pres == 
> SA_AMF_PRESENCE_TERMINATION_FAILED)) {
> +                     nodeswitchover= false;
> +                     break;
> +             }
> +     }
> +
> +
> +     if (nodeswitchover == true) {
> +             /* Now send nodeswitchover request to AMFD as cleanup of failed 
> component is
> +                completed in faulted SU.
> +              */
> +
> +             LOG_NO("Sending Nodeswitchover request to AMFD as '%s' "
> +                             "got failed", su->name.value);
[Hans] This log is consistent with the one for SU failover

> +             if (su->sufailover == false) {
> +                     uint32_t rc = avnd_di_oper_send(avnd_cb, su, 
> SA_AMF_NODE_SWITCHOVER);
> +                     osafassert(NCSCC_RC_SUCCESS == rc);
> +             }
> +     }
> +
> +}
> +
>  /****************************************************************************
>    Name          : avnd_su_pres_fsm_run
> 
> @@ -1392,14 +1432,26 @@ uint32_t avnd_su_pres_fsm_run(AVND_CB *c
>                  /* Since all components got successfully terminated, finish 
> sufailover at amfnd
>                     by deleting SUSIs at amfnd and informing amfd about 
> sufailover.*/
>                  LOG_NO("Terminated all components in '%s'", su->name.value);
> -                LOG_NO("Informing director of sufailover");
> -                rc = avnd_di_oper_send(avnd_cb, su, 
> AVSV_ERR_RCVR_SU_FAILOVER);
> +             if (cb->term_state == AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) {
> +                     LOG_NO("Sending Nodeswitchover request to AMFD as '%s' "
> +                                     "got failed", su->name.value);
[Hans] This log is consistent with the one for SU failover

> +                     rc = avnd_di_oper_send(avnd_cb, su, 
> SA_AMF_NODE_SWITCHOVER);
> +             }
> +             else {
> +                     LOG_NO("Informing director of sufailover");
> +                     rc = avnd_di_oper_send(avnd_cb, su, 
> AVSV_ERR_RCVR_SU_FAILOVER);
> +             }
>                  osafassert(NCSCC_RC_SUCCESS == rc);
>                  avnd_su_si_del(avnd_cb, &su->name);
>               if (!m_AVND_SU_IS_PREINSTANTIABLE(su))
>                       avnd_su_pres_state_set(su, 
> SA_AMF_PRESENCE_UNINSTANTIATED);
>               goto done;
> -        }
> +        } else if ((cb->term_state == 
> AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) &&
> +                     (cb->oper_state == SA_AMF_OPERATIONAL_DISABLED) &&
> +                     (su->sufailover == false)) {
> +             perform_pending_nodeswitchover();
> +     }
> +
[Hans] looks like two blank lines here

> 
>       /* process state change */
>       if (prv_st != final_st)

------------------------------------------------------------------------------
Open source business process management suite built on Java and Eclipse
Turn processes into business applications with Bonita BPM Community Edition
Quickly connect people, data, and systems into organized workflows
Winner of BOSSIE, CODIE, OW2 and Gartner awards
http://p.sf.net/sfu/Bonitasoft
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to