Hi Minh, See my replies inline.
Best Regards, ThuanTr -----Original Message----- From: Minh Hon Chau <minh.c...@dektech.com.au> Sent: Tuesday, March 31, 2020 8:05 AM To: Thuan Tran <thuan.t...@dektech.com.au>; Thang Duc Nguyen <thang.d.ngu...@dektech.com.au> Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: fix mds_tipc_fctrl_shutdown() stuck forever [#3169] Hi Thuan, More comments inline. Thanks Minh On 30/3/20 9:19 pm, Thuan Tran wrote: > Hi Minh, > > Thanks for reviewing. See my replies inline. > > Best Regards, > ThuanTr > > -----Original Message----- > From: Minh Hon Chau <minh.c...@dektech.com.au> > Sent: Monday, March 30, 2020 5:00 PM > To: Thuan Tran <thuan.t...@dektech.com.au>; Thang Duc Nguyen > <thang.d.ngu...@dektech.com.au> > Cc: opensaf-devel@lists.sourceforge.net > Subject: Re: [PATCH 1/1] mds: fix mds_tipc_fctrl_shutdown() stuck forever > [#3169] > > Hi Thuan, > > Please find my comments inline [M] > > Thanks > > Minh > > On 23/3/20 8:59 pm, thuan.tran wrote: >> - Deadlock of portid_map_mutex locking: mds_tipc_fctrl_shutdown() >> take lock then wait for thread process_all_events() to be canceled. >> But that thread also want get lock then it is keep waiting for lock. >> - Create safe method to cancel process_all_events() thread similar >> as the way MDS destroy legacy receiving thread. > [M] I could miss it but I don't find where MDS destroys the legacy > receiving thread by sending event to the legacy receiving thread to > cancel the thread. Is it mdtm_tipc_destroy() you mean? > [T] Yes, before calling that function, check mds_main.c/ line: status = > mds_mdtm_destroy(); [M]: OK I see it, thanks > >> And getting portid_map_mutex lock only after that thread released. >> --- >> src/mds/mds_tipc_fctrl_intf.cc | 24 ++++++++++++++++++++++-- >> src/mds/mds_tipc_fctrl_msg.h | 8 ++++++++ >> 2 files changed, 30 insertions(+), 2 deletions(-) >> >> diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc >> index 6ce00782e..93bfce51c 100644 >> --- a/src/mds/mds_tipc_fctrl_intf.cc >> +++ b/src/mds/mds_tipc_fctrl_intf.cc >> @@ -27,6 +27,8 @@ >> >> #include "base/ncssysf_def.h" >> #include "base/ncssysf_tsk.h" >> +#include "base/ncs_osprm.h" >> +#include "base/osaf_poll.h" >> >> #include "mds/mds_log.h" >> #include "mds/mds_tipc_fctrl_portid.h" >> @@ -194,6 +196,7 @@ bool mds_fctrl_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT >> msg) { >> } >> >> uint32_t process_all_events(void) { >> + bool running = true; >> enum { FD_FCTRL = 0, NUM_FDS }; >> >> int poll_tmo = chunk_ack_timeout; >> @@ -203,7 +206,7 @@ uint32_t process_all_events(void) { >> ncs_ipc_get_sel_obj(&mbx_events).rmv_obj; >> pfd[FD_FCTRL].events = POLLIN; >> >> - while (true) { >> + while (running) { >> int pollres; >> >> pollres = poll(pfd, NUM_FDS, poll_tmo); >> @@ -231,9 +234,13 @@ uint32_t process_all_events(void) { >> if (evt->IsFlowEvent()) { >> process_flow_event(*evt); >> } >> + if (evt->IsShutDownEvent()) { >> + running = false; >> + } >> >> delete evt; >> portid_map_mutex.unlock(); >> + if (!running) m_NCS_SEL_OBJ_IND(&evt->destroy_ack_obj_); >> } >> } >> // timeout, scan all portid and send ack msgs >> @@ -243,6 +250,7 @@ uint32_t process_all_events(void) { >> portid_map_mutex.unlock(); >> } >> } /* while */ >> + m_MDS_LOG_DBG("FCTRL: process_all_events() thread end"); >> return NCSCC_RC_SUCCESS; >> } >> >> @@ -305,7 +313,18 @@ uint32_t mds_tipc_fctrl_initialize(int dgramsock, >> struct tipc_portid id, >> uint32_t mds_tipc_fctrl_shutdown(void) { >> if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS; >> >> - portid_map_mutex.lock(); >> + NCS_SEL_OBJ destroy_ack_obj; >> + m_NCS_SEL_OBJ_CREATE(&destroy_ack_obj); >> + Event* pevt = new Event(Event::Type::kEvtShutDown, destroy_ack_obj); >> + if (m_NCS_IPC_SEND(&mbx_events, pevt, >> + NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { >> + m_MDS_LOG_ERR("FCTRL: Failed to send shutdown, Error[%s]", >> + strerror(errno)); >> + abort(); >> + } >> + osaf_poll_one_fd(m_GET_FD_FROM_SEL_OBJ(destroy_ack_obj), 10000); >> + m_NCS_SEL_OBJ_DESTROY(&destroy_ack_obj); >> + memset(&destroy_ack_obj, 0, sizeof(destroy_ack_obj)); [M]: Don't you need to memset 0? [T]: Just clone it as legacy code. >> >> if (ncs_task_release(p_task_hdl) != NCSCC_RC_SUCCESS) { >> m_MDS_LOG_ERR("FCTRL: Stop of the Created Task-failed, Error[%s]", >> @@ -315,6 +334,7 @@ uint32_t mds_tipc_fctrl_shutdown(void) { >> m_NCS_IPC_DETACH(&mbx_events, mds_fctrl_mbx_cleanup, nullptr); >> m_NCS_IPC_RELEASE(&mbx_events, nullptr); >> >> + portid_map_mutex.lock(); >> for (auto i : portid_map) delete i.second; >> portid_map.clear(); > [M] Moving the lock() down here is not enough? > [T] It's not enough, it will continue stuck due to lock inside mailbox > handling, m_NCS_IPC_DETACH() can stuck forever. [M] I suspect another place in process_all_event() taking the lock too early, before retrieving msg in mbx, that caused those threads locked up? [T] Inside the thread, during proceeding mailbox, mailbox lock can be acquired then the thread get cancelled. I cause deadlock in later DETACH(). --- a/src/mds/mds_tipc_fctrl_intf.cc +++ b/src/mds/mds_tipc_fctrl_intf.cc @@ -216,11 +216,12 @@ uint32_t process_all_events(void) { if (pollres > 0) { if (pfd[FD_FCTRL].revents == POLLIN) { - portid_map_mutex.lock(); Event *evt = reinterpret_cast<Event*>(ncs_ipc_non_blk_recv( &mbx_events)); + portid_map_mutex.lock(); + if (evt == nullptr) { portid_map_mutex.unlock(); continue; >> >> diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h >> index c4641ed4e..1ba625650 100644 >> --- a/src/mds/mds_tipc_fctrl_msg.h >> +++ b/src/mds/mds_tipc_fctrl_msg.h >> @@ -49,10 +49,13 @@ class Event { >> kEvtTmrAll, >> kEvtTmrTxProb, // event that tx probation timer expired for once >> kEvtTmrChunkAck, // event to send the chunk ack >> + kEvtShutDown, // event to shutdown flow control thread >> }; >> NCS_IPC_MSG next_{0}; >> Type type_; >> >> + // Used for shutdown >> + NCS_SEL_OBJ destroy_ack_obj_; >> // Used for flow event only >> struct tipc_portid id_; >> uint16_t svc_id_{0}; >> @@ -69,10 +72,15 @@ class Event { >> mseq_(mseq), mfrag_(mfrag), fseq_(f_seq_num) { >> type_ = type; >> } >> + Event(Type type, NCS_SEL_OBJ destroy_ack_obj): >> + destroy_ack_obj_(destroy_ack_obj) { >> + type_ = type; >> + } >> bool IsTimerEvent() const { return (type_ > Type::kEvtTmrAll); } >> bool IsFlowEvent() const { >> return (Type::kEvtDataFlowAll < type_ && type_ < Type::kEvtTmrAll); >> } >> + bool IsShutDownEvent() const { return (type_ == Type::kEvtShutDown); } >> }; >> >> class BaseMessage { _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel