When the PBE hangs, amfd can process events in the following order when a node is started, stopped, and then started again: - clm_track_cb for the node down event - clm_track_cb for the second node up event - avd_mds_avnd_down_evh is called to process the amfnd down event
This prevents the node from joining the cluster. --- src/amf/amfd/ndfsm.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc index ee47582de..e2235b2e9 100644 --- a/src/amf/amfd/ndfsm.cc +++ b/src/amf/amfd/ndfsm.cc @@ -800,6 +800,23 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) { daemon_exit(); } + if (node->node_state == AVD_AVND_STATE_ABSENT) { + bool after_headless = false; + for (const auto &i_su : node->list_of_ncs_su) { + if (i_su->saAmfSUOperState == SA_AMF_OPERATIONAL_ENABLED) { + after_headless = true; + break; + } + } + if (after_headless != true) { + // Ignore amfnd down event in late after clm cb node left then joined + // But not ignore if after headless + LOG_WA("Ignore '%s' amfnd down event", node->node_name.c_str()); + TRACE_LEAVE(); + return; + } + } + if (cb->failover_list.find(evt->info.node_id) != cb->failover_list.end()) { std::shared_ptr<NodeStateMachine> failed_node = cb->failover_list.at(evt->info.node_id); -- 2.17.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel