Hi Minh, All patches are not received. Please attached them in the ticket. Thanks, Praveen
On 18-Aug-16 5:45 AM, Minh Hon Chau wrote: > osaf/services/saf/amf/amfd/include/sg.h | 4 +- > osaf/services/saf/amf/amfd/include/susi.h | 2 + > osaf/services/saf/amf/amfd/ndfsm.cc | 15 ++++++- > osaf/services/saf/amf/amfd/sg.cc | 37 ++++++++++++++++++- > osaf/services/saf/amf/amfd/siass.cc | 59 > +++++++++++++++++++++++++++++- > osaf/services/saf/amf/amfd/su.cc | 12 ++++++ > 6 files changed, 121 insertions(+), 8 deletions(-) > > > Since headless interuption is unplanned action and writing rta to IMM > is currently queued up in AMFD implemenentation. That can result into > inappropriate states of SG fsm state, SUSI fsm state, ha state, > SUOperationList, etc. Eventually, AMFD will run into SG unstable, false > assertion, or even SUSIs become permanently PARTIALLY, which is hard > to debug (even harder without trace) > > This patch adds a validation routine to check headless cached RTAs read > from IMM, more validation rule to be added. Also, a TODO is left for > discussion about what's a action should be taken if validation is failed. > > diff --git a/osaf/services/saf/amf/amfd/include/sg.h > b/osaf/services/saf/amf/amfd/include/sg.h > --- a/osaf/services/saf/amf/amfd/include/sg.h > +++ b/osaf/services/saf/amf/amfd/include/sg.h > @@ -418,7 +418,7 @@ public: > bool any_assignment_absent(); > void failover_absent_assignment(); > bool ng_using_saAmfSGAdminState; > - > + bool headless_validation; > uint32_t term_su_list_in_reverse(); > //Runtime calculates value of saAmfSGNumCurrAssignedSUs; > uint32_t curr_assigned_sus() const; > @@ -579,7 +579,7 @@ private: > #define m_AVD_CHK_OPLIST(i_su,flag) (flag) = > (i_su)->sg_of_su->in_su_oper_list(i_su) > > void avd_sg_read_headless_cached_rta(AVD_CL_CB *cb); > - > +bool avd_sg_validate_headless_cached_rta(AVD_CL_CB *cb); > extern void avd_sg_delete(AVD_SG *sg); > extern void avd_sg_db_add(AVD_SG *sg); > extern void avd_sg_db_remove(AVD_SG *sg); > diff --git a/osaf/services/saf/amf/amfd/include/susi.h > b/osaf/services/saf/amf/amfd/include/susi.h > --- a/osaf/services/saf/amf/amfd/include/susi.h > +++ b/osaf/services/saf/amf/amfd/include/susi.h > @@ -143,6 +143,8 @@ AVD_SU_SI_REL *avd_susi_create(AVD_CL_CB > AVD_SU_SI_STATE default_fsm = > AVD_SU_SI_STATE_ABSENT); > AVD_SU_SI_REL *avd_susi_find(AVD_CL_CB *cb, const SaNameT *su_name, const > SaNameT *si_name); > void avd_susi_update_fsm(AVD_SU_SI_REL *susi, AVD_SU_SI_STATE new_fsm_state); > +bool avd_susi_validate_headless_cached_rta(AVD_SU_SI_REL *present_susi, > + SaAmfHAStateT ha_fr_imm, AVD_SU_SI_STATE fsm_fr_imm); > void avd_susi_read_headless_cached_rta(AVD_CL_CB *cb); > extern void avd_susi_update(AVD_SU_SI_REL *susi, SaAmfHAStateT ha_state); > > diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc > b/osaf/services/saf/amf/amfd/ndfsm.cc > --- a/osaf/services/saf/amf/amfd/ndfsm.cc > +++ b/osaf/services/saf/amf/amfd/ndfsm.cc > @@ -127,13 +127,22 @@ void avd_process_state_info_queue(AVD_CL > > // Read cached rta from Imm, the order of calling > // below functions is IMPORTANT. > - // Reading sg must be after reading susi > - // Cleanup compcsi must be after reading sg > if (found_state_info == true) { > + LOG_NO("Enter restore headless cached RTAs from IMM"); > + // Read all cached susi, includes ABSENT SUSI with IMM fsm state > avd_susi_read_headless_cached_rta(cb); > + // Read SUSwitch of SU, validate toggle depends on SUSI fsm > state > + avd_su_read_headless_cached_rta(cb); > + // Read SUOperationList, set ABSENT fsm state for ABSENT SUSI > avd_sg_read_headless_cached_rta(cb); > + // Clean compcsi object of ABSENT SUSI > avd_compcsi_cleanup_imm_object(cb); > - avd_su_read_headless_cached_rta(cb); > + // Last, validate all > + bool valid = avd_sg_validate_headless_cached_rta(cb); > + if (valid) > + LOG_NO("Leave reading headless cached RTAs from IMM: > SUCCESS"); > + else > + LOG_ER("Leave reading headless cached RTAs from IMM: > FAILED"); > } > done: > TRACE("queue_size after processing: %lu", (unsigned long) > cb->evt_queue.size()); > diff --git a/osaf/services/saf/amf/amfd/sg.cc > b/osaf/services/saf/amf/amfd/sg.cc > --- a/osaf/services/saf/amf/amfd/sg.cc > +++ b/osaf/services/saf/amf/amfd/sg.cc > @@ -124,7 +124,8 @@ AVD_SG::AVD_SG(): > max_assigned_su(nullptr), > min_assigned_su(nullptr), > si_tobe_redistributed(nullptr), > - try_inst_counter(0) > + try_inst_counter(0), > + headless_validation(true) > { > adminOp = static_cast<SaAmfAdminOperationIdT>(0); > memset(&name, 0, sizeof(SaNameT)); > @@ -2115,6 +2116,9 @@ void avd_sg_read_headless_cached_rta(AVD > (SaImmAttrValuesT_2 ***)&attributes)) > == SA_AIS_OK) { > sg = sg_db->find(Amf::to_string(&sg_dn)); > if (sg && sg->sg_ncs_spec == false) { > + if (sg->headless_validation == false) { > + continue; > + } > // Read sg fsm state > rc = > immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSGFsmState"), > attributes, 0, &imm_sg_fsm_state); > @@ -2159,6 +2163,37 @@ done: > TRACE_LEAVE(); > } > > +/** > + * @brief Validate all cached RTAs read from IMM after headless. > + This validation is necessary. If AMFD doesn't have this > + validation routine and the cached RTAs are invalid, > + that would lead into *unpredictably* wrong states, which > + is hard to debug (harder if no trace) > + * @param Control block (AVD_CL_CB). > + * @Return true if valid, false otherwise. > +*/ > +bool avd_sg_validate_headless_cached_rta(AVD_CL_CB *cb) { > + TRACE_ENTER(); > + bool valid = true; > + for (std::map<std::string, AVD_SG*>::const_iterator it = sg_db->begin(); > + it != sg_db->end(); it++) { > + AVD_SG *i_sg = it->second; > + if (i_sg->sg_ncs_spec == true) { > + continue; > + } > + > + if (i_sg->headless_validation == false) { > + //TODO: AMFD should make all SUs of this SG faulty to > remove > + //all assignments, clean up IMM headless cached RTA. > + //Just assert for now > + //osafassert(false); > + valid = false; > + } > + } > + TRACE_LEAVE2("%u", valid); > + return valid; > +} > + > void AVD_SG::failover_absent_assignment() { > > TRACE_ENTER2("SG:'%s'", Amf::to_string(&name).c_str()); > diff --git a/osaf/services/saf/amf/amfd/siass.cc > b/osaf/services/saf/amf/amfd/siass.cc > --- a/osaf/services/saf/amf/amfd/siass.cc > +++ b/osaf/services/saf/amf/amfd/siass.cc > @@ -214,11 +214,17 @@ void avd_susi_read_headless_cached_rta(A > susi = avd_su_susi_find(cb, su, &si->name); > rc = immutil_getAttr("osafAmfSISUFsmState", attributes, 0, > &imm_susi_fsm); > osafassert(rc == SA_AIS_OK); > + rc = immutil_getAttr("saAmfSISUHAState", attributes, 0, > &imm_ha_state); > + osafassert(rc == SA_AIS_OK); > > if (susi) { // FOR PRESENT SUSI found in AMFND(s) > TRACE("SISU:'%s', old(imm) fsm state: %d, new(sync) fsm > state: %d", > Amf::to_string(&dn).c_str(), imm_susi_fsm, > susi->fsm); > > + if (avd_susi_validate_headless_cached_rta(susi, > imm_ha_state, > + imm_susi_fsm) == false) { > + continue; > + } > #if 1 > // If remove the below line in this #if block, AMFD > will use > // the synced fsm state, which is latest. That means, in > @@ -255,8 +261,6 @@ void avd_susi_read_headless_cached_rta(A > > } else { // For ABSENT SUSI > if (su->sg_of_su->sg_ncs_spec == false) { > - rc = immutil_getAttr("saAmfSISUHAState", > attributes, 0, &imm_ha_state); > - osafassert(rc == SA_AIS_OK); > TRACE("Absent SUSI, ha_state:'%u', > fsm_state:'%u'", imm_ha_state, imm_susi_fsm); > if (imm_susi_fsm != AVD_SU_SI_STATE_UNASGN) { > absent_susi = avd_susi_create(avd_cb, > si, su, imm_ha_state, false, AVSV_SUSI_ACT_BASE); > @@ -288,6 +292,57 @@ void avd_susi_read_headless_cached_rta(A > done: > TRACE_LEAVE(); > } > +/** > + * Validate cached RTA read from IMM > + * @param present_susi > + * @param ha_fr_imm: Ha state of @present_susi read from IMM > + * @param fsm_fr_imm: Fsm state of @present susi read from IMM > + * @return: true of valid, false otherwise > + */ > +bool avd_susi_validate_headless_cached_rta(AVD_SU_SI_REL *present_susi, > + SaAmfHAStateT ha_fr_imm, AVD_SU_SI_STATE fsm_fr_imm) { > + std::string dn = Amf::to_string(&present_susi->si->name) + "," + > + Amf::to_string(&present_susi->su->name); > + TRACE_ENTER2("SISU:'%s'", dn.c_str()); > + bool valid = true; > + // rule 1: valid ha state > + if (ha_fr_imm != present_susi->state) { > + if (ha_fr_imm == SA_AMF_HA_QUIESCING || > + ha_fr_imm == SA_AMF_HA_QUIESCED) { > + // That's fine > + ; > + } else { > + LOG_ER("SISU:'%s', old(imm) ha state: %d, new(sync) ha > state: %d", > + dn.c_str(), ha_fr_imm, present_susi->state); > + valid = false; > + goto done; > + } > + } > + // rule 2: if ha_fr_imm is QUIESCING, one of relevant entities must > + // have adminState is SHUTTINGDOWN > + if (ha_fr_imm == SA_AMF_HA_QUIESCING) { > + if (present_susi->su->saAmfSUAdminState == > SA_AMF_ADMIN_SHUTTING_DOWN || > + present_susi->si->saAmfSIAdminState == > SA_AMF_ADMIN_SHUTTING_DOWN || > + present_susi->su->sg_of_su->saAmfSGAdminState == > SA_AMF_ADMIN_SHUTTING_DOWN || > + present_susi->su->su_on_node->saAmfNodeAdminState == > SA_AMF_ADMIN_SHUTTING_DOWN) { > + // That's fine > + ; > + } else { > + LOG_ER("SISU:'%s', ha:'%u', but one of [node/sg/su/si] > is not in SHUTTING_DOWN", > + dn.c_str(), ha_fr_imm); > + valid = false; > + goto done; > + } > + } > + // TODO: more rules to be added when issue is found in reality due to > writing > + // cached RTA to IMM > +done: > + if (valid == false) > + present_susi->su->sg_of_su->headless_validation = valid; > + > + TRACE_LEAVE2("%u, %u", valid, > present_susi->su->sg_of_su->headless_validation); > + return present_susi->su->sg_of_su->headless_validation; > +} > > /***************************************************************************** > * Function: avd_susi_create > * > diff --git a/osaf/services/saf/amf/amfd/su.cc > b/osaf/services/saf/amf/amfd/su.cc > --- a/osaf/services/saf/amf/amfd/su.cc > +++ b/osaf/services/saf/amf/amfd/su.cc > @@ -1964,6 +1964,18 @@ void avd_su_read_headless_cached_rta(AVD > rc = > immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSUSwitch"), > attributes, 0, &su_toggle); > osafassert(rc == SA_AIS_OK); > + if (su_toggle == AVSV_SI_TOGGLE_SWITCH) { > + // 2N, if toggle but no pending assignment -> > bad state > + if (su->sg_of_su->sg_redundancy_model == > SA_AMF_2N_REDUNDANCY_MODEL && > + > su->sg_of_su->any_assignment_in_progress() == false){ > + LOG_ER("SG'%s', osafAmfSUSwitch:'%u', > but no pending assignment", > + > Amf::to_string(&su->sg_of_su->name).c_str(), > + su_toggle); > + su->sg_of_su->headless_validation = > false; > + } > + if (su->sg_of_su->headless_validation == false) > + continue; > + } > su->set_su_switch(su_toggle, false); > } > } > ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel