Hi Nagu, I'm running the tests with this configuration and will get back to you.
Thanks, Minh On 09/09/16 22:26, Nagendra Kumar wrote: > Hi Minh, > I am using 1725_pending_review.tgz > (1725_02_V2_bugfix_01_resend_buffer_in_set_leds.diff, > 1725_02_V2_bugfix_02_honor_clusterinit_nodesync_timer.diff, > 1725_02_V2_bugfix_03_restore_ng_admin.diff, > 1725_03_V4_failover_absent_susi_longDn.diff, > 1725_04_V2_headless_validation.diff, 1725_05_V2_resend_oper_state.diff, > 1725_06a_fullscope_escalation_headless.diff). > > I am doing basic node reboot validation testing with no faults. > > Configuration: SU1(act) and SU2(standby) both on PL-3. > > TC #1: Start SC-1, PL-3 and PL-5: Unlock SU1 and SU2. Stop SC-1 and stop > PL-3, start PL-3 and start SC-1. > After SC-1 and PL-3 come back, ideally SU1 and SU2 should get assignments as > Act and Std, but no assignments are being given to SUs on PL-3 and it shows > the following in status: > > Only SU2 has Std assignment. > > safSISU=safSu=SC-1\,safSg=NoRed\,safApp=OpenSAF,safSi=NoRed1,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > safSISU=safSu=PL-5\,safSg=NoRed\,safApp=OpenSAF,safSi=NoRed2,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > safSISU=safSu=SU2\,safSg=AmfDemo_2N\,safApp=AmfDemo1,safSi=AmfDemo1,safApp=AmfDemo1 > saAmfSISUHAState=STANDBY(2) > safSISU=safSu=SC-1\,safSg=2N\,safApp=OpenSAF,safSi=SC-2N,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > safSISU=safSu=PL-3\,safSg=NoRed\,safApp=OpenSAF,safSi=NoRed3,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > > TC #2: Configuration same as TC#1. Stop PL-3 and don't start it. 
The same issue: > safSISU=safSu=PL-5\,safSg=NoRed\,safApp=OpenSAF,safSi=NoRed3,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > safSISU=safSu=SU2\,safSg=AmfDemo_2N\,safApp=AmfDemo1,safSi=AmfDemo1,safApp=AmfDemo1 > saAmfSISUHAState=STANDBY(2) > safSISU=safSu=SC-1\,safSg=NoRed\,safApp=OpenSAF,safSi=NoRed2,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > safSISU=safSu=SC-1\,safSg=2N\,safApp=OpenSAF,safSi=SC-2N,safApp=OpenSAF > saAmfSISUHAState=ACTIVE(1) > > TC #3: Configured SU1(Act) on PL-3 and SU2(Std) on PL-4. > Stop SC-1, stop PL-3 and PL-4, but PL-5 is running. Start SC-1, the same > issue. > > TC #4: Same as TC #3, but SU3 configured on PL-5 as spare. SU3 doesn't get > any assignment and SG is unstable. > > Thanks > -Nagu > >> -----Original Message----- >> From: Minh Hon Chau [mailto:minh.c...@dektech.com.au] >> Sent: 18 August 2016 05:46 >> To: hans.nordeb...@ericsson.com; Nagendra Kumar; Praveen Malviya; >> gary....@dektech.com.au; long.hb.ngu...@dektech.com.au; >> minh.c...@dektech.com.au >> Cc: opensaf-devel@lists.sourceforge.net >> Subject: [PATCH 2 of 4] AMFND: Admin operation continuation if csi >> completes during headless [#1725 part 1] V1 >> >> osaf/services/saf/amf/amfnd/di.cc | 199 >> +++++++++++++++++-------- >> osaf/services/saf/amf/amfnd/include/avnd_di.h | 1 + >> 2 files changed, 134 insertions(+), 66 deletions(-) >> >> >> There are basically two options by which AMFD can continue an admin operation with >> completed csi(s) >> >> First: AMFD can use the sync SUSI fsm state as latest, AMFD then has to >> explore its SUSI assignments with adminStates of relevant entities to >> determine which SU should be on call of susi_success(). A deeper level of >> exploration is needed for csi addition. It also depends on SG Fsm state which is being >> used variously in different SG types. 
>> >> Second: AMFD uses the SUSI fsm state read from IMM as latest, and AMFND >> needs to resend susi_resp messages which were deferred during headless so >> that AMFD can continue the admin operation sequence. Both cases of csi >> completion [during or after] headless can run in the same code flow. >> >> The patch buffers susi_resp_msg during headless stage and resend it to >> AMFD after headless. There could be a chance that AMFND sent out susi >> response message but AMFD could not receive or process it. This case could >> be seen as a defect, which can be fixed by securing the result of sending >> susi_resp message from AMFND toward AMFD. >> >> diff --git a/osaf/services/saf/amf/amfnd/di.cc >> b/osaf/services/saf/amf/amfnd/di.cc >> --- a/osaf/services/saf/amf/amfnd/di.cc >> +++ b/osaf/services/saf/amf/amfnd/di.cc >> @@ -805,11 +805,6 @@ uint32_t avnd_di_susi_resp_send(AVND_CB >> if (cb->term_state == >> AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED) >> return rc; >> >> - if (cb->is_avd_down == true) { >> - m_AVND_SU_ALL_SI_RESET(su); >> - return rc; >> - } >> - >> // should be in assignment pending state to be here >> osafassert(m_AVND_SU_IS_ASSIGN_PEND(su)); >> >> @@ -820,64 +815,76 @@ uint32_t avnd_di_susi_resp_send(AVND_CB >> TRACE_ENTER2("Sending Resp su=%s, si=%s, curr_state=%u, >> prv_state=%u", su->name.value, curr_si->name.value,curr_si- >>> curr_state,curr_si->prv_state); >> /* populate the susi resp msg */ >> msg.info.avd = new AVSV_DND_MSG(); >> - msg.type = AVND_MSG_AVD; >> - msg.info.avd->msg_type = AVSV_N2D_INFO_SU_SI_ASSIGN_MSG; >> - msg.info.avd->msg_info.n2d_su_si_assign.msg_id = ++(cb- >>> snd_msg_id); >> - msg.info.avd->msg_info.n2d_su_si_assign.node_id = cb- >>> node_info.nodeId; >> - if (si) { >> - msg.info.avd->msg_info.n2d_su_si_assign.single_csi = >> - ((si->single_csi_add_rem_in_si == >> AVSV_SUSI_ACT_BASE) ? 
>> false : true); >> - } >> - TRACE("curr_assign_state '%u'", curr_si->curr_assign_state); >> - msg.info.avd->msg_info.n2d_su_si_assign.msg_act = >> - (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNED(curr_si) || >> - m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNING(curr_si)) ? >> - ((!curr_si->prv_state) ? AVSV_SUSI_ACT_ASGN : >> AVSV_SUSI_ACT_MOD) : AVSV_SUSI_ACT_DEL; >> - msg.info.avd->msg_info.n2d_su_si_assign.su_name = su->name; >> - if (si) { >> - msg.info.avd->msg_info.n2d_su_si_assign.si_name = si->name; >> - if (AVSV_SUSI_ACT_ASGN == si->single_csi_add_rem_in_si) { >> - TRACE("si->curr_assign_state '%u'", curr_si- >>> curr_assign_state); >> - msg.info.avd->msg_info.n2d_su_si_assign.msg_act = >> - >> (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNED(curr_si) || >> - >> m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNING(curr_si)) ? >> - AVSV_SUSI_ACT_ASGN : AVSV_SUSI_ACT_DEL; >> - } >> - } >> - msg.info.avd->msg_info.n2d_su_si_assign.ha_state = >> - (SA_AMF_HA_QUIESCING == curr_si->curr_state) ? >> SA_AMF_HA_QUIESCED : curr_si->curr_state; >> - msg.info.avd->msg_info.n2d_su_si_assign.error = >> - (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNED(curr_si) || >> - m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_REMOVED(curr_si)) ? >> NCSCC_RC_SUCCESS : NCSCC_RC_FAILURE; >> + msg.type = AVND_MSG_AVD; >> + msg.info.avd->msg_type = AVSV_N2D_INFO_SU_SI_ASSIGN_MSG; >> + msg.info.avd->msg_info.n2d_su_si_assign.node_id = cb- >>> node_info.nodeId; >> + if (si) { >> + msg.info.avd->msg_info.n2d_su_si_assign.single_csi = >> + ((si->single_csi_add_rem_in_si == >> AVSV_SUSI_ACT_BASE) ? false : true); >> + } >> + TRACE("curr_assign_state '%u'", curr_si->curr_assign_state); >> + msg.info.avd->msg_info.n2d_su_si_assign.msg_act = >> + >> (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNED(curr_si) || >> + >> m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNING(curr_si)) ? >> + ((!curr_si->prv_state) ? 
>> AVSV_SUSI_ACT_ASGN : AVSV_SUSI_ACT_MOD) : AVSV_SUSI_ACT_DEL; >> + msg.info.avd->msg_info.n2d_su_si_assign.su_name = su->name; >> + if (si) { >> + msg.info.avd->msg_info.n2d_su_si_assign.si_name = si- >>> name; >> + if (AVSV_SUSI_ACT_ASGN == si->single_csi_add_rem_in_si) { >> + TRACE("si->curr_assign_state '%u'", curr_si- >>> curr_assign_state); >> + msg.info.avd- >>> msg_info.n2d_su_si_assign.msg_act = >> + >> (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNED(curr_si) || >> + >> m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNING(curr_si)) ? >> + AVSV_SUSI_ACT_ASGN : >> AVSV_SUSI_ACT_DEL; >> + } >> + } >> + msg.info.avd->msg_info.n2d_su_si_assign.ha_state = >> + (SA_AMF_HA_QUIESCING == curr_si->curr_state) ? >> SA_AMF_HA_QUIESCED : curr_si->curr_state; >> + msg.info.avd->msg_info.n2d_su_si_assign.error = >> + >> (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_ASSIGNED(curr_si) || >> + >> m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_REMOVED(curr_si)) ? >> +NCSCC_RC_SUCCESS : NCSCC_RC_FAILURE; >> >> - if (msg.info.avd->msg_info.n2d_su_si_assign.msg_act == >> AVSV_SUSI_ACT_ASGN) >> - osafassert(si); >> + if (msg.info.avd->msg_info.n2d_su_si_assign.msg_act == >> AVSV_SUSI_ACT_ASGN) >> + osafassert(si); >> >> - /* send the msg to AvD */ >> - TRACE("Sending. msg_id'%u', node_id'%u', msg_act'%u', su'%s', >> si'%s', >> ha_state'%u', error'%u', single_csi'%u'", >> - msg.info.avd->msg_info.n2d_su_si_assign.msg_id, msg.info.avd- >>> msg_info.n2d_su_si_assign.node_id, >> - msg.info.avd->msg_info.n2d_su_si_assign.msg_act, >> msg.info.avd- >>> msg_info.n2d_su_si_assign.su_name.value, >> - msg.info.avd->msg_info.n2d_su_si_assign.si_name.value, >> msg.info.avd->msg_info.n2d_su_si_assign.ha_state, >> - msg.info.avd->msg_info.n2d_su_si_assign.error, msg.info.avd- >>> msg_info.n2d_su_si_assign.single_csi); >> + /* send the msg to AvD */ >> + TRACE("Sending. 
msg_id'%u', node_id'%u', msg_act'%u', su'%s', >> si'%s', ha_state'%u', error'%u', single_csi'%u'", >> + msg.info.avd->msg_info.n2d_su_si_assign.msg_id, >> msg.info.avd->msg_info.n2d_su_si_assign.node_id, >> + msg.info.avd->msg_info.n2d_su_si_assign.msg_act, >> msg.info.avd->msg_info.n2d_su_si_assign.su_name.value, >> + msg.info.avd->msg_info.n2d_su_si_assign.si_name.value, >> msg.info.avd->msg_info.n2d_su_si_assign.ha_state, >> + msg.info.avd->msg_info.n2d_su_si_assign.error, >> +msg.info.avd->msg_info.n2d_su_si_assign.single_csi); >> >> - if ((su->si_list.n_nodes > 1) && (si == nullptr)) { >> - if (msg.info.avd->msg_info.n2d_su_si_assign.msg_act == >> AVSV_SUSI_ACT_DEL) >> - LOG_NO("Removed 'all SIs' from '%s'", >> su->name.value); >> + if ((su->si_list.n_nodes > 1) && (si == nullptr)) { >> + if (msg.info.avd->msg_info.n2d_su_si_assign.msg_act == >> AVSV_SUSI_ACT_DEL) >> + LOG_NO("Removed 'all SIs' from '%s'", su- >>> name.value); >> - if (msg.info.avd->msg_info.n2d_su_si_assign.msg_act == >> AVSV_SUSI_ACT_MOD) >> - LOG_NO("Assigned 'all SIs' %s of '%s'", >> - ha_state[msg.info.avd- >>> msg_info.n2d_su_si_assign.ha_state], >> - su->name.value); >> - } >> + if (msg.info.avd->msg_info.n2d_su_si_assign.msg_act == >> AVSV_SUSI_ACT_MOD) >> + LOG_NO("Assigned 'all SIs' %s of '%s'", >> + ha_state[msg.info.avd- >>> msg_info.n2d_su_si_assign.ha_state], >> + su->name.value); >> + } >> >> - rc = avnd_di_msg_send(cb, &msg); >> - if (NCSCC_RC_SUCCESS == rc) >> - msg.info.avd = 0; >> - >> - /* we have completed the SU SI msg processing */ >> - if (su_assign_state_is_stable(su)) >> - m_AVND_SU_ASSIGN_PEND_RESET(su); >> - m_AVND_SU_ALL_SI_RESET(su); >> + if (cb->is_avd_down == true) { >> + // We are in headless, buffer this msg >> + msg.info.avd->msg_info.n2d_su_si_assign.msg_id = 0; >> + if (avnd_diq_rec_add(cb, &msg) == nullptr) { >> + rc = NCSCC_RC_FAILURE; >> + } >> + m_AVND_SU_ALL_SI_RESET(su); >> + LOG_NO("avnd_di_susi_resp_send() deferred as AMF >> director is offline"); 
>> + } else { >> + // We are in normal cluster, send msg to director >> + msg.info.avd->msg_info.n2d_su_si_assign.msg_id = ++(cb- >>> snd_msg_id); >> + /* send the msg to AvD */ >> + rc = avnd_di_msg_send(cb, &msg); >> + if (NCSCC_RC_SUCCESS == rc) >> + msg.info.avd = 0; >> + /* we have completed the SU SI msg processing */ >> + if (su_assign_state_is_stable(su)) { >> + m_AVND_SU_ASSIGN_PEND_RESET(su); >> + } >> + m_AVND_SU_ALL_SI_RESET(su); >> + } >> >> /* free the contents of avnd message */ >> avnd_msg_content_free(cb, &msg); >> @@ -1256,14 +1263,7 @@ void avnd_diq_rec_del(AVND_CB *cb, AVND_ >> /* stop the AvD msg response timer */ >> if (m_AVND_TMR_IS_ACTIVE(rec->resp_tmr)) { >> m_AVND_TMR_MSG_RESP_STOP(cb, *rec); >> - // Resend msgs from queue because amfd dropped during >> sync >> - if ((cb->dnd_list.head != nullptr)) { >> - TRACE("retransmit message to amfd"); >> - AVND_DND_MSG_LIST *pending_rec = 0; >> - for (pending_rec = cb->dnd_list.head; pending_rec != >> nullptr; pending_rec = pending_rec->next) { >> - avnd_diq_rec_send(cb, pending_rec); >> - } >> - } >> + avnd_diq_rec_send_buffered_msg(cb); >> /* resend pg start track */ >> avnd_di_resend_pg_start_track(cb); >> } >> @@ -1276,6 +1276,73 @@ void avnd_diq_rec_del(AVND_CB *cb, AVND_ >> TRACE_LEAVE(); >> return; >> } >> +/************************************************************ >> **************** >> + Name : avnd_diq_rec_send_buffered_msg >> + >> + Description : Resend buffered msg >> + >> + Arguments : cb - ptr to the AvND control block >> + >> + Return Values : None. >> + >> + Notes : None. 
>> +************************************************************* >> ********** >> +*******/ void avnd_diq_rec_send_buffered_msg(AVND_CB *cb) { >> + TRACE_ENTER(); >> + // Resend msgs from queue because amfnd dropped during headless >> + // or headless-synchronization >> + if ((cb->dnd_list.head != nullptr)) { >> + AVND_DND_MSG_LIST *pending_rec = 0; >> + TRACE("Attach msg_id of buffered msg"); >> + bool found = true; >> + while (found) { >> + found = false; >> + for (pending_rec = cb->dnd_list.head; pending_rec != >> nullptr; pending_rec = pending_rec->next) { >> + if (pending_rec->msg.type == >> AVND_MSG_AVD) { >> + // At this moment, only oper_state >> msg needs to report to director >> + if (pending_rec->msg.info.avd- >>> msg_type == AVSV_N2D_INFO_SU_SI_ASSIGN_MSG && >> + pending_rec->msg.info.avd- >>> msg_info.n2d_su_si_assign.msg_id == 0) { >> + m_AVND_DIQ_REC_POP(cb, >> pending_rec); #if 0 >> + // only resend if this SUSI >> does exist >> + AVND_SU *su = >> m_AVND_SUDB_REC_GET(cb->sudb, >> + pending_rec- >>> msg.info.avd->msg_info.n2d_su_si_assign.su_name); >> + if (su != nullptr && su- >>> si_list.n_nodes > 0) { #endif >> + pending_rec- >>> msg.info.avd->msg_info.n2d_su_si_assign.msg_id = ++(cb->snd_msg_id); >> + >> m_AVND_DIQ_REC_PUSH(cb, pending_rec); >> + LOG_NO("Found and >> resend buffered su_si_assign msg for SU:'%s', " >> + >> "SI:'%s', ha_state:'%u', msg_act:'%u', single_csi:'%u', " >> + >> "error:'%u', msg_id:'%u'", >> + >> pending_rec->msg.info.avd- >>> msg_info.n2d_su_si_assign.su_name.value, >> + >> pending_rec->msg.info.avd- >>> msg_info.n2d_su_si_assign.si_name.value, >> + >> pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.ha_state, >> + >> pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.msg_act, >> + >> pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.single_csi, >> + >> pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.error, >> + >> pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.msg_id); >> + >> +#if 0 >> + } else { 
>> + >> avnd_msg_content_free(cb, &pending_rec->msg); >> + delete pending_rec; >> + pending_rec = cb- >>> dnd_list.head; >> + } >> +#endif >> + found = true; >> + } >> + } >> + } >> + } >> + TRACE("retransmit message to amfd"); >> + for (pending_rec = cb->dnd_list.head; pending_rec != nullptr; >> pending_rec = pending_rec->next) { >> + avnd_diq_rec_send(cb, pending_rec); >> + } >> + } >> + TRACE_LEAVE(); >> + return; >> +} >> >> >> /************************************************************* >> *************** >> Name : avnd_diq_rec_send >> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_di.h >> b/osaf/services/saf/amf/amfnd/include/avnd_di.h >> --- a/osaf/services/saf/amf/amfnd/include/avnd_di.h >> +++ b/osaf/services/saf/amf/amfnd/include/avnd_di.h >> @@ -79,6 +79,7 @@ void avnd_di_msg_ack_process(struct avnd void >> avnd_diq_del(struct avnd_cb_tag *); AVND_DND_MSG_LIST >> *avnd_diq_rec_add(struct avnd_cb_tag *cb, AVND_MSG *msg); void >> avnd_diq_rec_del(struct avnd_cb_tag *cb, AVND_DND_MSG_LIST *rec); >> +void avnd_diq_rec_send_buffered_msg(struct avnd_cb_tag *cb); >> uint32_t avnd_diq_rec_send(struct avnd_cb_tag *cb, AVND_DND_MSG_LIST >> *rec); uint32_t avnd_di_reg_su_rsp_snd(struct avnd_cb_tag *cb, SaNameT >> *su_name, uint32_t ret_code); uint32_t avnd_di_ack_nack_msg_send(struct >> avnd_cb_tag *cb, uint32_t rcv_id, uint32_t view_num); ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel