osaf/services/saf/amf/amfd/include/sg.h | 4 +- osaf/services/saf/amf/amfd/include/susi.h | 2 + osaf/services/saf/amf/amfd/ndfsm.cc | 15 ++++++- osaf/services/saf/amf/amfd/sg.cc | 37 ++++++++++++++++++- osaf/services/saf/amf/amfd/siass.cc | 59 +++++++++++++++++++++++++++++- osaf/services/saf/amf/amfd/su.cc | 12 ++++++ 6 files changed, 121 insertions(+), 8 deletions(-)
Since a headless interruption is an unplanned action and the writing of RTAs to IMM is currently queued up in the AMFD implementation, the RTAs cached in IMM can end up in inappropriate states: SG fsm state, SUSI fsm state, ha state, SUOperationList, etc. Eventually, AMFD will run into an unstable SG, a false assertion, or even SUSIs becoming permanently PARTIALLY assigned, which is hard to debug (even harder without trace). This patch adds a validation routine to check the headless cached RTAs read from IMM; more validation rules are to be added. Also, a TODO is left for discussion about what action should be taken if validation fails. diff --git a/osaf/services/saf/amf/amfd/include/sg.h b/osaf/services/saf/amf/amfd/include/sg.h --- a/osaf/services/saf/amf/amfd/include/sg.h +++ b/osaf/services/saf/amf/amfd/include/sg.h @@ -418,7 +418,7 @@ public: bool any_assignment_absent(); void failover_absent_assignment(); bool ng_using_saAmfSGAdminState; - + bool headless_validation; uint32_t term_su_list_in_reverse(); //Runtime calculates value of saAmfSGNumCurrAssignedSUs; uint32_t curr_assigned_sus() const; @@ -579,7 +579,7 @@ private: #define m_AVD_CHK_OPLIST(i_su,flag) (flag) = (i_su)->sg_of_su->in_su_oper_list(i_su) void avd_sg_read_headless_cached_rta(AVD_CL_CB *cb); - +bool avd_sg_validate_headless_cached_rta(AVD_CL_CB *cb); extern void avd_sg_delete(AVD_SG *sg); extern void avd_sg_db_add(AVD_SG *sg); extern void avd_sg_db_remove(AVD_SG *sg); diff --git a/osaf/services/saf/amf/amfd/include/susi.h b/osaf/services/saf/amf/amfd/include/susi.h --- a/osaf/services/saf/amf/amfd/include/susi.h +++ b/osaf/services/saf/amf/amfd/include/susi.h @@ -143,6 +143,8 @@ AVD_SU_SI_REL *avd_susi_create(AVD_CL_CB AVD_SU_SI_STATE default_fsm = AVD_SU_SI_STATE_ABSENT); AVD_SU_SI_REL *avd_susi_find(AVD_CL_CB *cb, const SaNameT *su_name, const SaNameT *si_name); void avd_susi_update_fsm(AVD_SU_SI_REL *susi, AVD_SU_SI_STATE new_fsm_state); +bool avd_susi_validate_headless_cached_rta(AVD_SU_SI_REL *present_susi, + SaAmfHAStateT ha_fr_imm, 
AVD_SU_SI_STATE fsm_fr_imm); void avd_susi_read_headless_cached_rta(AVD_CL_CB *cb); extern void avd_susi_update(AVD_SU_SI_REL *susi, SaAmfHAStateT ha_state); diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc b/osaf/services/saf/amf/amfd/ndfsm.cc --- a/osaf/services/saf/amf/amfd/ndfsm.cc +++ b/osaf/services/saf/amf/amfd/ndfsm.cc @@ -127,13 +127,22 @@ void avd_process_state_info_queue(AVD_CL // Read cached rta from Imm, the order of calling // below functions is IMPORTANT. - // Reading sg must be after reading susi - // Cleanup compcsi must be after reading sg if (found_state_info == true) { + LOG_NO("Enter restore headless cached RTAs from IMM"); + // Read all cached susi, includes ABSENT SUSI with IMM fsm state avd_susi_read_headless_cached_rta(cb); + // Read SUSwitch of SU, validate toggle depends on SUSI fsm state + avd_su_read_headless_cached_rta(cb); + // Read SUOperationList, set ABSENT fsm state for ABSENT SUSI avd_sg_read_headless_cached_rta(cb); + // Clean compcsi object of ABSENT SUSI avd_compcsi_cleanup_imm_object(cb); - avd_su_read_headless_cached_rta(cb); + // Last, validate all + bool valid = avd_sg_validate_headless_cached_rta(cb); + if (valid) + LOG_NO("Leave reading headless cached RTAs from IMM: SUCCESS"); + else + LOG_ER("Leave reading headless cached RTAs from IMM: FAILED"); } done: TRACE("queue_size after processing: %lu", (unsigned long) cb->evt_queue.size()); diff --git a/osaf/services/saf/amf/amfd/sg.cc b/osaf/services/saf/amf/amfd/sg.cc --- a/osaf/services/saf/amf/amfd/sg.cc +++ b/osaf/services/saf/amf/amfd/sg.cc @@ -124,7 +124,8 @@ AVD_SG::AVD_SG(): max_assigned_su(nullptr), min_assigned_su(nullptr), si_tobe_redistributed(nullptr), - try_inst_counter(0) + try_inst_counter(0), + headless_validation(true) { adminOp = static_cast<SaAmfAdminOperationIdT>(0); memset(&name, 0, sizeof(SaNameT)); @@ -2115,6 +2116,9 @@ void avd_sg_read_headless_cached_rta(AVD (SaImmAttrValuesT_2 ***)&attributes)) == SA_AIS_OK) { sg = 
sg_db->find(Amf::to_string(&sg_dn)); if (sg && sg->sg_ncs_spec == false) { + if (sg->headless_validation == false) { + continue; + } // Read sg fsm state rc = immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSGFsmState"), attributes, 0, &imm_sg_fsm_state); @@ -2159,6 +2163,37 @@ done: TRACE_LEAVE(); } +/** + * @brief Validate all cached RTAs read from IMM after headless. + This validation is necessary. If AMFD doesn't have this + validation routine and the cached RTAs are invalid, + that would lead into *unpredictably* wrong states, which + is hard to debug (harder if no trace) + * @param Control block (AVD_CL_CB). + * @Return true if valid, false otherwise. +*/ +bool avd_sg_validate_headless_cached_rta(AVD_CL_CB *cb) { + TRACE_ENTER(); + bool valid = true; + for (std::map<std::string, AVD_SG*>::const_iterator it = sg_db->begin(); + it != sg_db->end(); it++) { + AVD_SG *i_sg = it->second; + if (i_sg->sg_ncs_spec == true) { + continue; + } + + if (i_sg->headless_validation == false) { + //TODO: AMFD should make all SUs of this SG faulty to remove + //all assignments, clean up IMM headless cached RTA. 
+ //Just assert for now + //osafassert(false); + valid = false; + } + } + TRACE_LEAVE2("%u", valid); + return valid; +} + void AVD_SG::failover_absent_assignment() { TRACE_ENTER2("SG:'%s'", Amf::to_string(&name).c_str()); diff --git a/osaf/services/saf/amf/amfd/siass.cc b/osaf/services/saf/amf/amfd/siass.cc --- a/osaf/services/saf/amf/amfd/siass.cc +++ b/osaf/services/saf/amf/amfd/siass.cc @@ -214,11 +214,17 @@ void avd_susi_read_headless_cached_rta(A susi = avd_su_susi_find(cb, su, &si->name); rc = immutil_getAttr("osafAmfSISUFsmState", attributes, 0, &imm_susi_fsm); osafassert(rc == SA_AIS_OK); + rc = immutil_getAttr("saAmfSISUHAState", attributes, 0, &imm_ha_state); + osafassert(rc == SA_AIS_OK); if (susi) { // FOR PRESENT SUSI found in AMFND(s) TRACE("SISU:'%s', old(imm) fsm state: %d, new(sync) fsm state: %d", Amf::to_string(&dn).c_str(), imm_susi_fsm, susi->fsm); + if (avd_susi_validate_headless_cached_rta(susi, imm_ha_state, + imm_susi_fsm) == false) { + continue; + } #if 1 // If remove the below line in this #if block, AMFD will use // the synced fsm state, which is latest. 
That means, in @@ -255,8 +261,6 @@ void avd_susi_read_headless_cached_rta(A } else { // For ABSENT SUSI if (su->sg_of_su->sg_ncs_spec == false) { - rc = immutil_getAttr("saAmfSISUHAState", attributes, 0, &imm_ha_state); - osafassert(rc == SA_AIS_OK); TRACE("Absent SUSI, ha_state:'%u', fsm_state:'%u'", imm_ha_state, imm_susi_fsm); if (imm_susi_fsm != AVD_SU_SI_STATE_UNASGN) { absent_susi = avd_susi_create(avd_cb, si, su, imm_ha_state, false, AVSV_SUSI_ACT_BASE); @@ -288,6 +292,57 @@ void avd_susi_read_headless_cached_rta(A done: TRACE_LEAVE(); } +/** + * Validate cached RTA read from IMM + * @param present_susi + * @param ha_fr_imm: Ha state of @present_susi read from IMM + * @param fsm_fr_imm: Fsm state of @present susi read from IMM + * @return: true of valid, false otherwise + */ +bool avd_susi_validate_headless_cached_rta(AVD_SU_SI_REL *present_susi, + SaAmfHAStateT ha_fr_imm, AVD_SU_SI_STATE fsm_fr_imm) { + std::string dn = Amf::to_string(&present_susi->si->name) + "," + + Amf::to_string(&present_susi->su->name); + TRACE_ENTER2("SISU:'%s'", dn.c_str()); + bool valid = true; + // rule 1: valid ha state + if (ha_fr_imm != present_susi->state) { + if (ha_fr_imm == SA_AMF_HA_QUIESCING || + ha_fr_imm == SA_AMF_HA_QUIESCED) { + // That's fine + ; + } else { + LOG_ER("SISU:'%s', old(imm) ha state: %d, new(sync) ha state: %d", + dn.c_str(), ha_fr_imm, present_susi->state); + valid = false; + goto done; + } + } + // rule 2: if ha_fr_imm is QUIESCING, one of relevant entities must + // have adminState is SHUTTINGDOWN + if (ha_fr_imm == SA_AMF_HA_QUIESCING) { + if (present_susi->su->saAmfSUAdminState == SA_AMF_ADMIN_SHUTTING_DOWN || + present_susi->si->saAmfSIAdminState == SA_AMF_ADMIN_SHUTTING_DOWN || + present_susi->su->sg_of_su->saAmfSGAdminState == SA_AMF_ADMIN_SHUTTING_DOWN || + present_susi->su->su_on_node->saAmfNodeAdminState == SA_AMF_ADMIN_SHUTTING_DOWN) { + // That's fine + ; + } else { + LOG_ER("SISU:'%s', ha:'%u', but one of [node/sg/su/si] is not in 
SHUTTING_DOWN", + dn.c_str(), ha_fr_imm); + valid = false; + goto done; + } + } + // TODO: more rules to be added when issue is found in reality due to writing + // cached RTA to IMM +done: + if (valid == false) + present_susi->su->sg_of_su->headless_validation = valid; + + TRACE_LEAVE2("%u, %u", valid, present_susi->su->sg_of_su->headless_validation); + return present_susi->su->sg_of_su->headless_validation; +} /***************************************************************************** * Function: avd_susi_create * diff --git a/osaf/services/saf/amf/amfd/su.cc b/osaf/services/saf/amf/amfd/su.cc --- a/osaf/services/saf/amf/amfd/su.cc +++ b/osaf/services/saf/amf/amfd/su.cc @@ -1964,6 +1964,18 @@ void avd_su_read_headless_cached_rta(AVD rc = immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSUSwitch"), attributes, 0, &su_toggle); osafassert(rc == SA_AIS_OK); + if (su_toggle == AVSV_SI_TOGGLE_SWITCH) { + // 2N, if toggle but no pending assignment -> bad state + if (su->sg_of_su->sg_redundancy_model == SA_AMF_2N_REDUNDANCY_MODEL && + su->sg_of_su->any_assignment_in_progress() == false){ + LOG_ER("SG'%s', osafAmfSUSwitch:'%u', but no pending assignment", + Amf::to_string(&su->sg_of_su->name).c_str(), + su_toggle); + su->sg_of_su->headless_validation = false; + } + if (su->sg_of_su->headless_validation == false) + continue; + } su->set_su_switch(su_toggle, false); } } ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel