osaf/services/saf/amf/amfd/cluster.cc |  15 ++++++++++++++
 osaf/services/saf/amf/amfd/ndfsm.cc   |  36 ++++++++++++++++++++++++++--------
 osaf/services/saf/amf/amfnd/clm.cc    |   2 -
 osaf/services/saf/amf/amfnd/di.cc     |   9 +++----
 osaf/services/saf/amf/amfnd/term.cc   |   6 +++-
 5 files changed, 50 insertions(+), 18 deletions(-)


In scenario of admin continuation after headless, if saAmfClusterStartupTimeout
configures with a pretty small value, then the admin continuation will initiate
when saAmfClusterStartupTimeout expires but the SU(s) are still in OUT OF 
SERVICE.
The eventual result is failure of admin operation after headless.

When saAmfClusterStartupTimeout expires, AMFD needs:
. ensure that all veteran nodes finish joining cluster.
. if any veteran nodes comes after the saAmfClusterStartupTimeout's expiry, this
veteran will be reboot.
. signal all veteran AMFND(s) that application assignment can be started.

diff --git a/osaf/services/saf/amf/amfd/cluster.cc 
b/osaf/services/saf/amf/amfd/cluster.cc
--- a/osaf/services/saf/amf/amfd/cluster.cc
+++ b/osaf/services/saf/amf/amfd/cluster.cc
@@ -54,9 +54,11 @@ void avd_cluster_tmr_init_evh(AVD_CL_CB 
 {
        TRACE_ENTER();
        AVD_SU *su = nullptr;
+       AVD_AVND *node = nullptr;
        saflog(LOG_NOTICE, amfSvcUsrName, "Cluster startup timeout, assigning 
SIs to SUs");
 
        osafassert(evt->info.tmr.type == AVD_TMR_CL_INIT);
+       LOG_NO("Cluster startup is done");
 
        if (avd_cluster->saAmfClusterAdminState != SA_AMF_ADMIN_UNLOCKED) {
                LOG_WA("Admin state of cluster is locked");
@@ -72,6 +74,19 @@ void avd_cluster_tmr_init_evh(AVD_CL_CB 
        cb->init_state = AVD_APP_STATE;
        m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, cb, AVSV_CKPT_AVD_CB_CONFIG);
 
+       // Resend set_leds message to all veteran nodes after cluster startup
+       // to waits for all veteran nodes becoming ENABLED
+       // This set_leds message will enables AMFND starting sending susi 
assignment
+       // message to AMFD
+       for (std::map<std::string, AVD_AVND *>::const_iterator it = 
node_name_db->begin();
+                       it != node_name_db->end(); it++) {
+               node = it->second;
+               if (node->node_state == AVD_AVND_STATE_PRESENT &&
+                       node->node_info.nodeId != cb->node_id_avd &&
+                       node->node_info.nodeId != cb->node_id_avd_other)
+                       avd_snd_set_leds_msg(cb, node);
+       }
+
        /* call the realignment routine for each of the SGs in the
         * system that are not NCS specific.
         */
diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc 
b/osaf/services/saf/amf/amfd/ndfsm.cc
--- a/osaf/services/saf/amf/amfd/ndfsm.cc
+++ b/osaf/services/saf/amf/amfd/ndfsm.cc
@@ -400,19 +400,24 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_
        if (n2d_msg->msg_info.n2d_node_up.leds_set == true) {
                TRACE("node %x is already up", avnd->node_info.nodeId);
 
-               if (cb->node_sync_window_closed == true && 
avnd->node_up_msg_count == 0) {
-                       LOG_WA("Received new node_up_msg from node:%s after 
node sync window, "
-                               "sending node reboot order to target node",
+               if (avnd->reboot) {
+                       LOG_WA("Sending node reboot order to node:%s, due to 
nodeFailFast during headless",
                                
osaf_extended_name_borrow(&n2d_msg->msg_info.n2d_node_up.node_name));
-                               avd_d2n_reboot_snd(avnd);
-                               goto done;
-               } else if (avnd->reboot) {
-                       // delayed node failfast
+               } else if (cb->node_sync_window_closed == true && 
avnd->node_up_msg_count == 0) {
+                       LOG_WA("Sending node reboot order to node:%s, due to 
first node_up_msg after node sync window",
+                               
osaf_extended_name_borrow(&n2d_msg->msg_info.n2d_node_up.node_name));
+                       avnd->reboot = true;
+               } else if (cb->init_state == AVD_APP_STATE) {
+                       LOG_WA("Sending node reboot order to node:%s, due to 
late node_up_msg after cluster startup timeout",
+                               
osaf_extended_name_borrow(&n2d_msg->msg_info.n2d_node_up.node_name));
+                       avnd->reboot = true;
+               }
+
+               if (avnd->reboot) {
                        avd_d2n_reboot_snd(avnd);
                        avnd->reboot = false;
                        goto done;
-               }
-               else {
+               } else {
                        // this node is already up
                        avd_node_state_set(avnd, AVD_AVND_STATE_PRESENT);
                        avd_node_oper_state_set(avnd, 
SA_AMF_OPERATIONAL_ENABLED);
@@ -435,6 +440,19 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_
                                if (su->is_in_service())
                                        
su->set_readiness_state(SA_AMF_READINESS_IN_SERVICE);
                        }
+                       // Start/Restart cluster init timer to wait for all 
node becomes PRESENT
+                       // and all SUs are IN-SERVICE
+                       // NOTE: start cluster init timer with 
NODE_SYNC_PERIOD. This NODE_SYNC_PERIOD
+                       // is defined to sync node_up message of one node. The 
reason is using
+                       // NODE_SYNC_PERIOD is that saAmfClusterStartupTimeout 
can be configured with a
+                       // too large or even too small, that will cause the 
presence state synchronization
+                       // of all nodes unstable.
+                       cb->amf_init_tmr.type = AVD_TMR_CL_INIT;
+                       avd_start_tmr(cb,&(cb->amf_init_tmr), 
AVSV_DEF_NODE_SYNC_PERIOD);
+                       if (cluster_su_instantiation_done(cb, nullptr) == true) 
{
+                               avd_stop_tmr(cb, &cb->amf_init_tmr);
+                               cluster_startup_expiry_event_generate(cb);
+                       }
                        goto node_joined;
                }
        }
diff --git a/osaf/services/saf/amf/amfnd/clm.cc 
b/osaf/services/saf/amf/amfnd/clm.cc
--- a/osaf/services/saf/amf/amfnd/clm.cc
+++ b/osaf/services/saf/amf/amfnd/clm.cc
@@ -182,8 +182,6 @@ uint32_t avnd_evt_avd_node_up_evh(AVND_C
        cb->su_failover_max = info->su_failover_max;
        cb->su_failover_prob = info->su_failover_prob;
 
-       cb->amfd_sync_required = false;
-
        TRACE_LEAVE();
        return rc;
 }
diff --git a/osaf/services/saf/amf/amfnd/di.cc 
b/osaf/services/saf/amf/amfnd/di.cc
--- a/osaf/services/saf/amf/amfnd/di.cc
+++ b/osaf/services/saf/amf/amfnd/di.cc
@@ -548,7 +548,7 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB
                 * but only sync info is sent for recovery
                 */
                if (evt->info.mds.i_change == NCSMDS_UP) {
-                       if (cb->amfd_sync_required && cb->led_state == 
AVND_LED_STATE_GREEN) {
+                       if (cb->is_avd_down && cb->led_state == 
AVND_LED_STATE_GREEN) {
                                avnd_sync_sisu(cb);
                                avnd_sync_csicomp(cb);
                        }
@@ -560,7 +560,7 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB
                 * provided that the no-active timer in MDS has not expired. We 
only want to send
                 * node_up/sync info in case of recovery.
                 */
-               if (evt->info.mds.i_change == NCSMDS_NEW_ACTIVE && 
cb->amfd_sync_required) {
+               if (evt->info.mds.i_change == NCSMDS_NEW_ACTIVE && 
cb->is_avd_down) {
                        if (cb->led_state == AVND_LED_STATE_GREEN) {
                                LOG_NO("Sending node up due to 
NCSMDS_NEW_ACTIVE");
 
@@ -570,9 +570,9 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB
                                avnd_send_node_up_msg();
                        }
                }
+               cb->is_avd_down = false;
        }
 
-       cb->is_avd_down = false;
        if (m_AVND_TMR_IS_ACTIVE(cb->sc_absence_tmr))
                avnd_stop_tmr(cb, &cb->sc_absence_tmr);
 
@@ -871,7 +871,7 @@ uint32_t avnd_di_susi_resp_send(AVND_CB 
                                su->name.c_str());
         }
 
-        if (cb->is_avd_down == true) {
+        if (cb->is_avd_down == true || cb->amfd_sync_required == true) {
                // We are in headless, buffer this msg
                msg.info.avd->msg_info.n2d_su_si_assign.msg_id = 0;
                if (avnd_diq_rec_add(cb, &msg) == nullptr) {
@@ -1270,7 +1270,6 @@ void avnd_diq_rec_del(AVND_CB *cb, AVND_
        /* stop the AvD msg response timer */
        if (m_AVND_TMR_IS_ACTIVE(rec->resp_tmr)) {
                m_AVND_TMR_MSG_RESP_STOP(cb, *rec);
-               avnd_diq_rec_send_buffered_msg(cb);
                /* resend pg start track */
                avnd_di_resend_pg_start_track(cb);
        }
diff --git a/osaf/services/saf/amf/amfnd/term.cc 
b/osaf/services/saf/amf/amfnd/term.cc
--- a/osaf/services/saf/amf/amfnd/term.cc
+++ b/osaf/services/saf/amf/amfnd/term.cc
@@ -203,14 +203,16 @@ uint32_t avnd_evt_avd_set_leds_evh(AVND_
 
        avnd_msgid_assert(info->msg_id);
        cb->rcv_msg_id = info->msg_id;
-
+       cb->amfd_sync_required = false;
        if (cb->led_state == AVND_LED_STATE_GREEN) {
-               /* Nothing to be done we have already got this msg */
+               // Resend buffered headless msg
+               avnd_diq_rec_send_buffered_msg(cb);
                goto done;
        }
 
        cb->led_state = AVND_LED_STATE_GREEN;
 
+
        /* Notify the NIS script/deamon that we have fully come up */
        rc = nid_notify(const_cast<char*>("AMFND"), NCSCC_RC_SUCCESS, nullptr);
 

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to