Ack. -----Original Message----- From: Thuan Tran <thuan.t...@dektech.com.au> Sent: Friday, December 18, 2020 10:41 AM To: Minh Hon Chau <minh.c...@dektech.com.au>; Thang Duc Nguyen <thang.d.ngu...@dektech.com.au> Cc: opensaf-devel@lists.sourceforge.net; Thuan Tran <thuan.t...@dektech.com.au> Subject: [PATCH 1/1] amf: fix cluster stuck unhealthy when SCs brutal reboot [#3241]
When AMFND sees AMFD UP/NEW_ACTIVE while its AMFD down state is TRUE, it should send sync info if any NCS SU is assigned. After the node_up msg is acked, resend the buffered headless msgs for NCS SUs. --- src/amf/amfnd/avnd_cb.h | 1 + src/amf/amfnd/avnd_di.h | 2 +- src/amf/amfnd/di.cc | 46 +++++++++++++++++++++++++++++++---------- src/amf/amfnd/main.cc | 1 + src/amf/amfnd/susm.cc | 1 + src/amf/amfnd/term.cc | 2 +- 6 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h index a2e521359..8af5e5fe1 100644 --- a/src/amf/amfnd/avnd_cb.h +++ b/src/amf/amfnd/avnd_cb.h @@ -120,6 +120,7 @@ typedef struct avnd_cb_tag { bool reboot_in_progress; AVND_SU *failed_su; bool cont_reboot_in_progress; + bool is_ncs_su_assigned; /* the duration that amfnd should tolerate absence of any SC */ SaTimeT scs_absence_max_duration; diff --git a/src/amf/amfnd/avnd_di.h b/src/amf/amfnd/avnd_di.h index 9870ad774..f9471aa6b 100644 --- a/src/amf/amfnd/avnd_di.h +++ b/src/amf/amfnd/avnd_di.h @@ -46,7 +46,7 @@ void avnd_di_msg_ack_process(struct avnd_cb_tag *, uint32_t); void avnd_diq_rec_check_buffered_msg(struct avnd_cb_tag *); AVND_DND_MSG_LIST *avnd_diq_rec_add(struct avnd_cb_tag *cb, AVND_MSG *msg); void avnd_diq_rec_del(struct avnd_cb_tag *cb, AVND_DND_MSG_LIST *rec); -void avnd_diq_rec_send_buffered_msg(struct avnd_cb_tag *cb); +void avnd_diq_rec_send_buffered_msg(struct avnd_cb_tag *cb, bool +only_ncs); uint32_t avnd_diq_rec_send(struct avnd_cb_tag *cb, AVND_DND_MSG_LIST *rec); uint32_t avnd_di_reg_su_rsp_snd(struct avnd_cb_tag *cb, const std::string &su_name, uint32_t ret_code); diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc index 5bff12104..20e752146 100644 --- a/src/amf/amfnd/di.cc +++ b/src/amf/amfnd/di.cc @@ -543,8 +543,7 @@ void avnd_send_node_up_msg(void) { msg.type = AVND_MSG_AVD; msg.info.avd->msg_type = AVSV_N2D_NODE_UP_MSG; msg.info.avd->msg_info.n2d_node_up.msg_id = ++(cb->snd_msg_id); - msg.info.avd->msg_info.n2d_node_up.leds_set = - 
cb->led_state == AVND_LED_STATE_GREEN ? true : false; + msg.info.avd->msg_info.n2d_node_up.leds_set = cb->is_ncs_su_assigned; osaf_extended_name_alloc(cb->amf_nodeName.c_str(), &msg.info.avd->msg_info.n2d_node_up.node_name); msg.info.avd->msg_info.n2d_node_up.node_id = cb->node_info.nodeId; @@ -652,7 +651,7 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB *cb, AVND_EVT *evt) { * node_up in both cases but only sync info is sent for recovery */ if (evt->info.mds.i_change == NCSMDS_UP) { - if (cb->is_avd_down && cb->led_state == AVND_LED_STATE_GREEN) { + if (cb->is_avd_down && cb->is_ncs_su_assigned) { avnd_sync_sisu(cb); avnd_sync_csicomp(cb); } @@ -665,7 +664,7 @@ uint32_t avnd_evt_mds_avd_up_evh(AVND_CB *cb, AVND_EVT *evt) { * only want to send node_up/sync info in case of recovery. */ if (evt->info.mds.i_change == NCSMDS_NEW_ACTIVE && cb->is_avd_down) { - if (cb->led_state == AVND_LED_STATE_GREEN) { + if (cb->is_ncs_su_assigned) { // node_up, sync sisu, compcsi info to AVND for recovery avnd_sync_sisu(cb); avnd_sync_csicomp(cb); @@ -1376,6 +1375,12 @@ void avnd_di_msg_ack_process(AVND_CB *cb, uint32_t mid) { // then perform last step clean up avnd_stop_tmr(cb, &rec->resp_tmr); avnd_last_step_clean(cb); + } else if (rec->msg.info.avd->msg_type == AVSV_N2D_NODE_UP_MSG) { + TRACE("msg node_up acked"); + // Resend buffered headless msg for NCS SUs + if (cb->is_ncs_su_assigned) { + avnd_diq_rec_send_buffered_msg(cb, true); + } } TRACE("remove msg %u from queue", msg_id); avnd_diq_rec_del(cb, rec); @@ -1541,15 +1546,17 @@ void avnd_diq_rec_del(AVND_CB *cb, AVND_DND_MSG_LIST *rec) { Description : Resend buffered msg Arguments : cb - ptr to the AvND control block + only_ncs - only send msg for NCS SUs Return Values : None. Notes : None. 
******************************************************************************/ -void avnd_diq_rec_send_buffered_msg(AVND_CB *cb) { +void avnd_diq_rec_send_buffered_msg(AVND_CB *cb, bool only_ncs) { TRACE_ENTER(); // Resend msgs from queue because amfnd dropped during headless // or headless-synchronization + std::vector<AVND_DND_MSG_LIST*> tmp_dnd_list; for (auto iter = cb->dnd_list.begin(); iter != cb->dnd_list.end();) { auto pending_rec = *iter; @@ -1564,6 +1571,10 @@ void avnd_diq_rec_send_buffered_msg(AVND_CB *cb) { // only resend if this SUSI does exist AVND_SU *su = cb->sudb.find(Amf::to_string( &pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.su_name)); + if (only_ncs && su && !su->is_ncs) { + ++iter; + continue; + } if (su != nullptr && su->si_list.n_nodes > 0) { pending_rec->msg.info.avd->msg_info.n2d_su_si_assign.msg_id = ++(cb->snd_msg_id); @@ -1586,12 +1597,18 @@ void avnd_diq_rec_send_buffered_msg(AVND_CB *cb) { iter = cb->dnd_list.erase(iter); avnd_msg_content_free(cb, &pending_rec->msg); delete pending_rec; + continue; } } else if (pending_rec->msg.info.avd->msg_type == AVSV_N2D_OPERATION_STATE_MSG && pending_rec->msg.info.avd->msg_info.n2d_opr_state.msg_id == 0) { + AVND_SU *su = cb->sudb.find(Amf::to_string( + &pending_rec->msg.info.avd->msg_info.n2d_opr_state.su_name)); + if (only_ncs && su && !su->is_ncs) { + ++iter; + continue; + } pending_rec->msg.info.avd->msg_info.n2d_opr_state.msg_id = ++(cb->snd_msg_id); - LOG_NO( "Found and resend buffered oper_state msg for SU:'%s', " "su_oper_state:'%u', node_oper_state:'%u', recovery:'%u'", @@ -1606,22 +1623,29 @@ void avnd_diq_rec_send_buffered_msg(AVND_CB *cb) { ++iter; } else if (pending_rec->msg.info.avd->msg_type == AVSV_N2D_DATA_REQUEST_MSG && pending_rec->msg.info.avd->msg_info.n2d_data_req.msg_id == 0) { + AVND_SU *su = cb->sudb.find(Amf::to_string( + &pending_rec->msg.info.avd->msg_info.n2d_data_req.param_info.name)); + if (only_ncs && su && !su->is_ncs) { + ++iter; + continue; + } 
pending_rec->msg.info.avd->msg_info.n2d_data_req.msg_id = ++(cb->snd_msg_id); - LOG_NO( "Found and resend buffered Data Req msg for SU:'%s', msg_id:'%u'", osaf_extended_name_borrow(&pending_rec->msg.info.avd->msg_info .n2d_data_req.param_info.name), pending_rec->msg.info.avd->msg_info.n2d_data_req.msg_id); ++iter; - } else { - ++iter; - } + } else { + ++iter; + if (only_ncs) continue; + } + tmp_dnd_list.push_back(pending_rec); } TRACE("retransmit message to amfd"); - for (auto pending_rec : cb->dnd_list) { + for (auto pending_rec : tmp_dnd_list) { avnd_diq_rec_send(cb, pending_rec); } TRACE_LEAVE(); diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc index e0ede1161..265907917 100644 --- a/src/amf/amfnd/main.cc +++ b/src/amf/amfnd/main.cc @@ -317,6 +317,7 @@ AVND_CB *avnd_cb_create() { cb->oper_state = SA_AMF_OPERATIONAL_ENABLED; cb->term_state = AVND_TERM_STATE_UP; cb->led_state = AVND_LED_STATE_RED; + cb->is_ncs_su_assigned = false; /* assign the default timeout values (in nsec) */ cb->msg_resp_intv = AVND_AVD_MSG_RESP_TIME * 1000000; diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc index 80b35ea8f..67d4f06e6 100644 --- a/src/amf/amfnd/susm.cc +++ b/src/amf/amfnd/susm.cc @@ -1153,6 +1153,7 @@ uint32_t avnd_su_si_oper_done(AVND_CB *cb, AVND_SU *su, AVND_SU_SI_REC *si) { AVND_SU_SI_ASSIGN_STATE_ASSIGNED); LOG_NO("Assigned '%s' %s to '%s'", curr_si->name.c_str(), ha_state[curr_si->curr_state], su->name.c_str()); + if (su->is_ncs) cb->is_ncs_su_assigned = true; } else if (m_AVND_SU_SI_CURR_ASSIGN_STATE_IS_REMOVING(curr_si)) { m_AVND_SU_SI_CURR_ASSIGN_STATE_SET(curr_si, AVND_SU_SI_ASSIGN_STATE_REMOVED); diff --git a/src/amf/amfnd/term.cc b/src/amf/amfnd/term.cc index cad5de478..84503238f 100644 --- a/src/amf/amfnd/term.cc +++ b/src/amf/amfnd/term.cc @@ -205,7 +205,7 @@ uint32_t avnd_evt_avd_set_leds_evh(AVND_CB *cb, AVND_EVT *evt) { cb->amfd_sync_required = false; if (cb->led_state == AVND_LED_STATE_GREEN) { // Resend buffered headless msg - 
avnd_diq_rec_send_buffered_msg(cb); + avnd_diq_rec_send_buffered_msg(cb, false); goto done; } -- 2.25.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel