osaf/services/saf/amf/amfd/cluster.cc | 15 ++++++++++++++
osaf/services/saf/amf/amfd/ndfsm.cc | 36 ++++++++++++++++++++++++++--------
osaf/services/saf/amf/amfnd/clm.cc | 2 -
osaf/services/saf/amf/amfnd/di.cc | 9 +++----
osaf/services/saf/amf/amfnd/term.cc | 6 +++-
5 files changed, 50 insertions(+), 18 deletions(-)
In scenario of admin continuation after headless, if saAmfClusterStartupTimeout
configures with a pretty small value, then the admin continuation will initiate
when saAmfClusterStartupTimeout expires but the SU(s) are still in OUT OF
SERVICE.
The eventual result is failure of admin operation after headless.
When saAmfClusterStartupTimeout expires, AMFD needs:
. ensure that all veteran nodes finish joining cluster.
. if any veteran nodes comes after the saAmfClusterStartupTimeout's expiry, this
veteran will be reboot.
. signal all veteran AMFND(s) that application assignment can be started.
diff --git a/osaf/services/saf/amf/amfd/cluster.cc
b/osaf/services/saf/amf/amfd/cluster.cc
--- a/osaf/services/saf/amf/amfd/cluster.cc
+++ b/osaf/services/saf/amf/amfd/cluster.cc
@@ -54,9 +54,11 @@ void avd_cluster_tmr_init_evh(AVD_CL_CB
{
TRACE_ENTER();
AVD_SU *su = nullptr;
+ AVD_AVND *node = nullptr;
saflog(LOG_NOTICE, amfSvcUsrName, "Cluster startup timeout, assigning
SIs to SUs");
osafassert(evt->info.tmr.type == AVD_TMR_CL_INIT);
+ LOG_NO("Cluster startup is done");
if (avd_cluster->saAmfClusterAdminState != SA_AMF_ADMIN_UNLOCKED) {
LOG_WA("Admin state of cluster is locked");
@@ -72,6 +74,19 @@ void avd_cluster_tmr_init_evh(AVD_CL_CB
cb->init_state = AVD_APP_STATE;
m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, cb, AVSV_CKPT_AVD_CB_CONFIG);
+ // Resend set_leds message to all veteran nodes after cluster startup
+ // to waits for all veteran nodes becoming ENABLED
+ // This set_leds message will enables AMFND starting sending susi
assignment
+ // message to AMFD
+ for (std::map<std::string, AVD_AVND *>::const_iterator it =
node_name_db->begin();
+ it != node_name_db->end(); it++) {
+ node = it->second;
+ if (node->node_state == AVD_AVND_STATE_PRESENT &&
+ node->node_info.nodeId != cb->node_id_avd &&
+ node->node_info.nodeId != cb->node_id_avd_other)
+ avd_snd_set_leds_msg(cb, node);
+ }
+
/* call the realignment routine for each of the SGs in the
* system that are not NCS specific.
*/
diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc
b/osaf/services/saf/amf/amfd/ndfsm.cc
--- a/osaf/services/saf/amf/amfd/ndfsm.cc
+++ b/osaf/services/saf/amf/amfd/ndfsm.cc
@@ -400,19 +400,24 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_
if (n2d_msg->msg_info.n2d_node_up.leds_set == true) {
TRACE("node %x is already up", avnd->node_info.nodeId);
- if (cb->node_sync_window_closed == true &&
avnd->node_up_msg_count == 0) {
- LOG_WA("Received new node_up_msg from node:%s after
node sync window, "
- "sending node reboot order to target node",
+ if (avnd->reboot) {
+ LOG_WA("Sending node reboot order to node:%s, due to
nodeFailFast during headless",
osaf_extended_name_borrow(&n2d_msg->msg_info.n2d_node_up.node_name));
- avd_d2n_reboot_snd(avnd);
- goto done;
- } else if (avnd->reboot) {
- // delayed node failfast
+ } else if (cb->node_sync_window_closed == true &&
avnd->node_up_msg_count == 0) {
+ LOG_WA("Sending node reboot order to node:%s, due to
first node_up_msg after node sync window",
+
osaf_extended_name_borrow(&n2d_msg->msg_info.n2d_node_up.node_name));
+ avnd->reboot = true;
+ } else if (cb->init_state == AVD_APP_STATE) {
+ LOG_WA("Sending node reboot order to node:%s, due to
late node_up_msg after cluster startup timeout",
+
osaf_extended_name_borrow(&n2d_msg->msg_info.n2d_node_up.node_name));
+ avnd->reboot = true;
+ }
+
+ if (avnd->reboot) {
avd_d2n_reboot_snd(avnd);
avnd->reboot = false;
goto done;
- }
- else {
+ } else {
// this node is already up
avd_node_state_set(avnd, AVD_AVND_STATE_PRESENT);
avd_node_oper_state_set(avnd,
SA_AMF_OPERATIONAL_ENABLED);
@@ -435,6 +440,19 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_
if (su->is_in_service())
su->set_readiness_state(SA_AMF_READINESS_IN_SERVICE);
}
+ // Start/Restart cluster init timer to wait for all
node becomes PRESENT
+ // and all SUs are IN-SERVICE
+ // NOTE: start cluster init timer with
NODE_SYNC_PERIOD. This NODE_SYNC_PERIOD
+ // is defined to sync node_up message of one node. The
reason is using
+ // NODE_SYNC_PERIOD is that saAmfClusterStartupTimeout
can be configured with a
+ // too large or even too small, that will cause the
presence state synchronization
+ // of all nodes unstable.
+ cb->amf_init_tmr.type = AVD_TMR_CL_INIT;
+ avd_start_tmr(cb,&(cb->amf_init_tmr),
AVSV_DEF_NODE_SYNC_PERIOD);
+ if (cluster_su_instantiation_done(cb, nullptr) == true)
{
+ avd_stop_tmr(cb, &cb->amf_init_tmr);
+ cluster_startup_expiry_event_generate(cb);
+ }
goto node_joined;
}
}
diff --git a/osaf/services/saf/amf/amfnd/clm.cc
b/osaf/services/saf/amf/amfnd/clm.cc
--- a/osaf/services/saf/amf/amfnd/clm.cc
+++ b/osaf/services/saf/amf/amfnd/clm.cc
@@ -182,8 +182,6 @@ uint32_t avnd_evt_avd_node_up_evh(AVND_C
cb->su_failover_max = info->su_failover_max;
cb->su_failover_prob = info->su_failover_prob;
- cb->amfd_sync_required = false;
-
TRACE_LEAVE();
return rc;
}
diff --git a/osaf/services/saf/amf/amfnd/di.cc
b/osaf/services/saf/amf/amfnd/di.cc
--- a/osaf/services/saf/amf/amfnd/di.cc
+++ b/osaf/services/saf/amf/amfnd/di.cc
@@ -548,7 +548,7 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB
* but only sync info is sent for recovery
*/
if (evt->info.mds.i_change == NCSMDS_UP) {
- if (cb->amfd_sync_required && cb->led_state ==
AVND_LED_STATE_GREEN) {
+ if (cb->is_avd_down && cb->led_state ==
AVND_LED_STATE_GREEN) {
avnd_sync_sisu(cb);
avnd_sync_csicomp(cb);
}
@@ -560,7 +560,7 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB
* provided that the no-active timer in MDS has not expired. We
only want to send
* node_up/sync info in case of recovery.
*/
- if (evt->info.mds.i_change == NCSMDS_NEW_ACTIVE &&
cb->amfd_sync_required) {
+ if (evt->info.mds.i_change == NCSMDS_NEW_ACTIVE &&
cb->is_avd_down) {
if (cb->led_state == AVND_LED_STATE_GREEN) {
LOG_NO("Sending node up due to
NCSMDS_NEW_ACTIVE");
@@ -570,9 +570,9 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB
avnd_send_node_up_msg();
}
}
+ cb->is_avd_down = false;
}
- cb->is_avd_down = false;
if (m_AVND_TMR_IS_ACTIVE(cb->sc_absence_tmr))
avnd_stop_tmr(cb, &cb->sc_absence_tmr);
@@ -871,7 +871,7 @@ uint32_t avnd_di_susi_resp_send(AVND_CB
su->name.c_str());
}
- if (cb->is_avd_down == true) {
+ if (cb->is_avd_down == true || cb->amfd_sync_required == true) {
// We are in headless, buffer this msg
msg.info.avd->msg_info.n2d_su_si_assign.msg_id = 0;
if (avnd_diq_rec_add(cb, &msg) == nullptr) {
@@ -1270,7 +1270,6 @@ void avnd_diq_rec_del(AVND_CB *cb, AVND_
/* stop the AvD msg response timer */
if (m_AVND_TMR_IS_ACTIVE(rec->resp_tmr)) {
m_AVND_TMR_MSG_RESP_STOP(cb, *rec);
- avnd_diq_rec_send_buffered_msg(cb);
/* resend pg start track */
avnd_di_resend_pg_start_track(cb);
}
diff --git a/osaf/services/saf/amf/amfnd/term.cc
b/osaf/services/saf/amf/amfnd/term.cc
--- a/osaf/services/saf/amf/amfnd/term.cc
+++ b/osaf/services/saf/amf/amfnd/term.cc
@@ -203,14 +203,16 @@ uint32_t avnd_evt_avd_set_leds_evh(AVND_
avnd_msgid_assert(info->msg_id);
cb->rcv_msg_id = info->msg_id;
-
+ cb->amfd_sync_required = false;
if (cb->led_state == AVND_LED_STATE_GREEN) {
- /* Nothing to be done we have already got this msg */
+ // Resend buffered headless msg
+ avnd_diq_rec_send_buffered_msg(cb);
goto done;
}
cb->led_state = AVND_LED_STATE_GREEN;
+
/* Notify the NIS script/deamon that we have fully come up */
rc = nid_notify(const_cast<char*>("AMFND"), NCSCC_RC_SUCCESS, nullptr);
------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel