osaf/services/saf/amf/amfd/sgproc.cc |  153 ++++++++++++++++------------------
 osaf/services/saf/amf/amfnd/err.cc   |    8 +-
 osaf/services/saf/amf/amfnd/susm.cc  |   58 ++++++++++++-
 3 files changed, 131 insertions(+), 88 deletions(-)


AMF performs node-switchover before the cleanup of components.

AMFND sends the node-switchover request to AMFD before initiating cleanup
of the failed component. Because of this, the standby becomes active before
the components have been cleaned up.

Send the node-switchover request to AMFD after cleanup of the failed component.

diff --git a/osaf/services/saf/amf/amfd/sgproc.cc b/osaf/services/saf/amf/amfd/sgproc.cc
--- a/osaf/services/saf/amf/amfd/sgproc.cc
+++ b/osaf/services/saf/amf/amfd/sgproc.cc
@@ -368,7 +368,73 @@ static uint32_t su_recover_from_fault(AV
        return rc;
 }
 
+/**
+ * @brief       Try to repair node by sending reboot message to node
+ *              director of the node.
+ *
+ * @param[in]   ptr to node. 
+ *
+ **/
 
+static void try_node_repair(AVD_AVND *node)
+{
+       TRACE_ENTER2("'%s'", node->name.value);
+
+       if (node->saAmfNodeAutoRepair) {
+               LOG_NO("Ordering reboot of '%s' as node fail/switch-over repair 
action",
+                               node->name.value);
+               saflog(LOG_NOTICE, amfSvcUsrName,
+                       "Ordering reboot of '%s' as node fail/switch-over 
repair action",
+                       node->name.value);
+               avd_d2n_reboot_snd(node);
+       } else {
+               LOG_NO("NodeAutorepair disabled for '%s', no reboot ordered",
+                               node->name.value);
+               saflog(LOG_NOTICE, amfSvcUsrName,
+                       "NodeAutorepair disabled for '%s', NO reboot ordered",
+                       node->name.value);
+
+       }
+       TRACE_LEAVE();
+}
+
+/**
+ * @brief       Performs node-switchover recovery. 
+ *
+ * @param[in]   ptr to node on which node-switchover recovery is performed.
+ *
+ **/
+static void node_nodeswtichover_recovery(AVD_AVND *node)
+{
+       bool node_reboot = true;
+       TRACE_ENTER2("'%s'", node->name.value);
+
+       AVD_SU *i_su = node->list_of_su;
+       for (;i_su != NULL; i_su = i_su->avnd_list_su_next) {
+               i_su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE);
+
+               if (i_su->list_of_susi == NULL)
+                       continue;
+
+               if (su_recover_from_fault(i_su) == NCSCC_RC_FAILURE) {
+                       LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, 
i_su->name.value);
+                       goto done;
+               }
+
+               if (i_su->list_of_susi != NULL)
+                       node_reboot = false;
+
+               if (avd_sg_app_su_inst_func(avd_cb, i_su->sg_of_su) == 
NCSCC_RC_FAILURE) {
+                       LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, 
i_su->name.value);
+                       goto done;
+               }
+       }
+
+       if (node_reboot == true)
+               try_node_repair(node);
+done:
+       TRACE_LEAVE();
+}
 /*****************************************************************************
  * Function: avd_su_oper_state_func
  *
@@ -532,81 +598,19 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb
                                                
avd_node_down_appl_susi_failover(avd_cb, node);
                                                break;
                                        }
+                                       break;
                                case SA_AMF_NODE_SWITCHOVER:
-                                       i_su = node->list_of_su;
-                                       while (i_su != NULL) {
-                                               
i_su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE);
-                                               if (i_su->list_of_susi != 
AVD_SU_SI_REL_NULL) {
-                                                       /* Delay Node reboot if:
-                                                               a)Faulted SU 
has saAmfSUFailover set but 
-                                                                       other 
healthy SUs are present on node.
-                                                               b)Only faulted 
SU exists on the node and its 
-                                                                       
saAmfSUFailover is false.
-                                                        */
-                                                       node_reboot_req = false;
-                                                       if (((i_su == su) && 
(!i_su->saAmfSUFailover)) ||
-                                                                       ((i_su 
!= su) &&
-                                                                
(i_su->saAmfSUOperState == SA_AMF_OPERATIONAL_ENABLED))) {
-
-                                                               /* Since 
assignments exists call the SG FSM.*/
-                                                               if 
(su_recover_from_fault(i_su) == NCSCC_RC_FAILURE) {
-                                                                       
LOG_ER("%s:%d %s", __FUNCTION__, __LINE__, i_su->name.value);
-                                                                       goto 
done;
-                                                               }
-                                                       }
-                                               }
-
-                                               /* Verify the SG to check if 
any instantiations need
-                                                * to be done for the SG on 
which this SU exists.
-                                                */
-                                               if ((!su->saAmfSUFailover))
-                                                       if 
(avd_sg_app_su_inst_func(cb, i_su->sg_of_su) == NCSCC_RC_FAILURE) {
-                                                               LOG_ER("%s:%d 
%s", __FUNCTION__, __LINE__, i_su->name.value);
-                                                               goto done;
-                                                       }
-
-                                               i_su = i_su->avnd_list_su_next;
-                                       }
-                                       break;
-                               case AVSV_ERR_RCVR_SU_FAILOVER:
-                                       /* This is a case when node switchover 
and su failover are happening together.
-                                          Reboot node only:
-                                          a) After gracefully removing all the 
assignments (SUs with saAmfSUFailover set).
-                                          b) After termination of all 
components in faulted SU and after performing its failover
-                                               as a single entity 
(saAmfSUFailover set). 
-                                        */
-                                       if (su_recover_from_fault(su) == 
NCSCC_RC_FAILURE) {
-                                               LOG_ER("%s:%d %s", 
__FUNCTION__, __LINE__, su->name.value);
-                                               goto done;
-                                       }
-                                       if (avd_sg_app_su_inst_func(cb, 
su->sg_of_su) == NCSCC_RC_FAILURE) {
-                                               LOG_ER("%s:%d %s", 
__FUNCTION__, __LINE__, su->name.value);
-                                               goto done;
-                                       }
-                                       for (i_su = node->list_of_su; i_su; 
i_su = i_su->avnd_list_su_next) {
-                                               if (i_su->list_of_susi != 
AVD_SU_SI_REL_NULL) {
-                                                       node_reboot_req = false;
-                                                       break;
-                                               }
-                                       }
+                                       
node_nodeswtichover_recovery(su->su_on_node);
+                                       goto done;
                                        break;
                                default :
                                        break;
                                }
 
-                               if (node_reboot_req) {
-                                       if (node->saAmfNodeAutoRepair) {
-                                               saflog(LOG_NOTICE, 
amfSvcUsrName,
-                                                               "Ordering 
reboot of '%s' as node fail/switch-over repair action",
-                                                               
node->name.value);
-                                               avd_d2n_reboot_snd(node);
-                                       } else {
-                                               saflog(LOG_NOTICE, 
amfSvcUsrName,
-                                                               "NodeAutorepair 
disabled for '%s', NO reboot ordered",
-                                                               
node->name.value);
-                                               
-                                       }
-                               }
+                               if (node_reboot_req)
+                                       try_node_repair(node);
+
+
                        } else { /* if 
(n2d_msg->msg_info.n2d_opr_state.node_oper_state == 
SA_AMF_OPERATIONAL_DISABLED) */
 
                                if (su->list_of_susi != AVD_SU_SI_REL_NULL) {
@@ -1235,16 +1239,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb,
                }
                if (true == all_su_unassigned) {
                        /* All app su got unassigned, Safe to reboot the blade 
now. */
-                       if (node->saAmfNodeAutoRepair) {
-                               saflog(LOG_NOTICE, amfSvcUsrName,
-                                       "Ordering reboot of '%s' as node 
fail/switch-over repair action",
-                                       node->name.value);
-                               avd_d2n_reboot_snd(node);
-                       } else {
-                               saflog(LOG_NOTICE, amfSvcUsrName,
-                                       "NodeAutorepair disabled for '%s', NO 
reboot ordered",
-                                       node->name.value);
-                       }
+                       try_node_repair(node);
                }
        }
        /* Free the messages */
diff --git a/osaf/services/saf/amf/amfnd/err.cc b/osaf/services/saf/amf/amfnd/err.cc
--- a/osaf/services/saf/amf/amfnd/err.cc
+++ b/osaf/services/saf/amf/amfnd/err.cc
@@ -814,7 +814,9 @@ uint32_t avnd_err_rcvr_node_switchover(A
                m_AVND_SU_FAILED_SET(failed_su);
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, failed_su, 
AVND_CKPT_SU_FLAG_CHANGE);
        }
+
        cb->term_state = AVND_TERM_STATE_NODE_SWITCHOVER_STARTED;
+       cb->failed_su = failed_su;
 
        /* transition the su oper state to disabled */
        m_AVND_SU_OPER_STATE_SET(failed_su, SA_AMF_OPERATIONAL_DISABLED);
@@ -825,11 +827,6 @@ uint32_t avnd_err_rcvr_node_switchover(A
        if(SA_AMF_OPERATIONAL_DISABLED != cb->oper_state) {
                /* transition the node oper state to disabled */
                cb->oper_state = SA_AMF_OPERATIONAL_DISABLED;
-
-               /* inform avd */
-               rc = avnd_di_oper_send(cb, failed_su, SA_AMF_NODE_SWITCHOVER);
-               if (NCSCC_RC_SUCCESS != rc)
-                       goto done;
        }
 
        /* We are now in the context of failover, forget the restart */
@@ -865,7 +862,6 @@ uint32_t avnd_err_rcvr_node_switchover(A
                                goto done;
                        }
                }
-               avnd_su_si_del(cb, &failed_comp->su->name);
        }
        else {
                /* terminate the failed comp */
diff --git a/osaf/services/saf/amf/amfnd/susm.cc b/osaf/services/saf/amf/amfnd/susm.cc
--- a/osaf/services/saf/amf/amfnd/susm.cc
+++ b/osaf/services/saf/amf/amfnd/susm.cc
@@ -1346,6 +1346,46 @@ static bool all_comps_terminated_in_su(c
        return true;
 }
 
+static void perform_pending_nodeswitchover()
+{
+       bool nodeswitchover = true;
+
+       /* Reverify if nodeswitchover is really pending */      
+       if ((avnd_cb->term_state != AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) || 
 
+                       (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED))
+               return; 
+
+       AVND_COMP *comp;
+       AVND_SU *su = avnd_cb->failed_su;
+       for (comp = 
m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
+                       comp;
+               comp = 
m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) {
+
+               if ((comp->pres == SA_AMF_PRESENCE_INSTANTIATING) || 
+                               (comp->pres == SA_AMF_PRESENCE_TERMINATING) ||
+                               (comp->pres == SA_AMF_PRESENCE_RESTARTING) ||
+                               (comp->pres == 
SA_AMF_PRESENCE_TERMINATION_FAILED)) {
+                       nodeswitchover= false;
+                       break;
+               }
+       }
+
+
+       if (nodeswitchover == true) {
+               /* Now send nodeswitchover request to AMFD as cleanup of failed 
component is 
+                  completed in faulted SU.
+                */
+
+               LOG_NO("Sending Nodeswitchover request to AMFD as '%s' "
+                               "got failed", su->name.value);
+               if (su->sufailover == false) {
+                       uint32_t rc = avnd_di_oper_send(avnd_cb, su, 
SA_AMF_NODE_SWITCHOVER);
+                       osafassert(NCSCC_RC_SUCCESS == rc);
+               }
+       }
+
+}
+
 /****************************************************************************
   Name          : avnd_su_pres_fsm_run
  
@@ -1392,14 +1432,26 @@ uint32_t avnd_su_pres_fsm_run(AVND_CB *c
                 /* Since all components got successfully terminated, finish 
sufailover at amfnd
                    by deleting SUSIs at amfnd and informing amfd about 
sufailover.*/
                 LOG_NO("Terminated all components in '%s'", su->name.value);
-                LOG_NO("Informing director of sufailover");
-                rc = avnd_di_oper_send(avnd_cb, su, AVSV_ERR_RCVR_SU_FAILOVER);
+               if (cb->term_state == AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) {
+                       LOG_NO("Sending Nodeswitchover request to AMFD as '%s' "
+                                       "got failed", su->name.value);
+                       rc = avnd_di_oper_send(avnd_cb, su, 
SA_AMF_NODE_SWITCHOVER);
+               }
+               else {
+                       LOG_NO("Informing director of sufailover");
+                       rc = avnd_di_oper_send(avnd_cb, su, 
AVSV_ERR_RCVR_SU_FAILOVER);
+               }
                 osafassert(NCSCC_RC_SUCCESS == rc);
                 avnd_su_si_del(avnd_cb, &su->name);
                if (!m_AVND_SU_IS_PREINSTANTIABLE(su))
                        avnd_su_pres_state_set(su, 
SA_AMF_PRESENCE_UNINSTANTIATED);
                goto done;
-        }
+        } else if ((cb->term_state == AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) 
&&
+                       (cb->oper_state == SA_AMF_OPERATIONAL_DISABLED) && 
+                       (su->sufailover == false)) {
+               perform_pending_nodeswitchover();
+       }
+               
 
        /* process state change */
        if (prv_st != final_st)

------------------------------------------------------------------------------
HPCC Systems Open Source Big Data Platform from LexisNexis Risk Solutions
Find What Matters Most in Your Big Data with HPCC Systems
Open Source. Fast. Scalable. Simple. Ideal for Dirty Data.
Leverages Graph Analysis for Fast Processing & Easy Data Exploration
http://p.sf.net/sfu/hpccsystems
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to