Hi Minh, Thanks for reviewing. See my replies inline.
Best Regards, ThuanTr -----Original Message----- From: Minh Hon Chau <minh.c...@dektech.com.au> Sent: Monday, March 30, 2020 5:00 PM To: Thuan Tran <thuan.t...@dektech.com.au>; Thang Duc Nguyen <thang.d.ngu...@dektech.com.au> Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: fix mds_tipc_fctrl_shutdown() stuck forever [#3169] Hi Thuan, Please find my comments inline [M] Thanks Minh On 23/3/20 8:59 pm, thuan.tran wrote: > - Deadlock of portid_map_mutex locking: mds_tipc_fctrl_shutdown() > take lock then wait for thread process_all_events() to be canceled. > But that thread also want get lock then it is keep waiting for lock. > - Create safe method to cancel process_all_events() thread similar > as the way MDS destroy legacy receiving thread. [M] I could miss it but I don't find where MDS destroys the legacy receiving thread by sending event to the legacy receiving thread to cancel the thread. Is it mdtm_tipc_destroy() you mean? [T] Yes, before calling that function, check mds_main.c/ line: status = mds_mdtm_destroy(); > And getting portid_map_mutex lock only after that thread released. > --- > src/mds/mds_tipc_fctrl_intf.cc | 24 ++++++++++++++++++++++-- > src/mds/mds_tipc_fctrl_msg.h | 8 ++++++++ > 2 files changed, 30 insertions(+), 2 deletions(-) > > diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc > index 6ce00782e..93bfce51c 100644 > --- a/src/mds/mds_tipc_fctrl_intf.cc > +++ b/src/mds/mds_tipc_fctrl_intf.cc > @@ -27,6 +27,8 @@ > > #include "base/ncssysf_def.h" > #include "base/ncssysf_tsk.h" > +#include "base/ncs_osprm.h" > +#include "base/osaf_poll.h" > > #include "mds/mds_log.h" > #include "mds/mds_tipc_fctrl_portid.h" > @@ -194,6 +196,7 @@ bool mds_fctrl_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT > msg) { > } > > uint32_t process_all_events(void) { > + bool running = true; > enum { FD_FCTRL = 0, NUM_FDS }; > > int poll_tmo = chunk_ack_timeout; > @@ -203,7 +206,7 @@ uint32_t process_all_events(void) { > ncs_ipc_get_sel_obj(&mbx_events).rmv_obj; > pfd[FD_FCTRL].events = POLLIN; > > - while (true) { > + while (running) { > int pollres; > > pollres = poll(pfd, NUM_FDS, poll_tmo); > @@ -231,9 +234,13 @@ uint32_t process_all_events(void) { > if (evt->IsFlowEvent()) { > process_flow_event(*evt); > } > + if (evt->IsShutDownEvent()) { > + running = false; > + } > > delete evt; > portid_map_mutex.unlock(); > + if (!running) m_NCS_SEL_OBJ_IND(&evt->destroy_ack_obj_); > } > } > // timeout, scan all portid and send ack msgs > @@ -243,6 +250,7 @@ uint32_t process_all_events(void) { > portid_map_mutex.unlock(); > } > } /* while */ > + m_MDS_LOG_DBG("FCTRL: process_all_events() thread end"); > return NCSCC_RC_SUCCESS; > } > > @@ -305,7 +313,18 @@ uint32_t mds_tipc_fctrl_initialize(int dgramsock, struct > tipc_portid id, > uint32_t mds_tipc_fctrl_shutdown(void) { > if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS; > > - portid_map_mutex.lock(); > + NCS_SEL_OBJ destroy_ack_obj; > + m_NCS_SEL_OBJ_CREATE(&destroy_ack_obj); > + Event* pevt = new Event(Event::Type::kEvtShutDown, destroy_ack_obj); > + if (m_NCS_IPC_SEND(&mbx_events, pevt, > + NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { > + m_MDS_LOG_ERR("FCTRL: Failed to send shutdown, Error[%s]", > + strerror(errno)); > + abort(); > + } > + osaf_poll_one_fd(m_GET_FD_FROM_SEL_OBJ(destroy_ack_obj), 10000); > + m_NCS_SEL_OBJ_DESTROY(&destroy_ack_obj); > + memset(&destroy_ack_obj, 0, sizeof(destroy_ack_obj)); > > if (ncs_task_release(p_task_hdl) != NCSCC_RC_SUCCESS) { > m_MDS_LOG_ERR("FCTRL: Stop of the Created Task-failed, Error[%s]", > @@ -315,6 +334,7 @@ uint32_t mds_tipc_fctrl_shutdown(void) { > m_NCS_IPC_DETACH(&mbx_events, mds_fctrl_mbx_cleanup, nullptr); > m_NCS_IPC_RELEASE(&mbx_events, nullptr); > > + portid_map_mutex.lock(); > for (auto i : portid_map) delete i.second; > portid_map.clear(); [M] Moving the lock() down here is not enough? [T] It's not enough, it will continue stuck due to lock inside mailbox handling, m_NCS_IPC_DETACH() can stuck forever. > > diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h > index c4641ed4e..1ba625650 100644 > --- a/src/mds/mds_tipc_fctrl_msg.h > +++ b/src/mds/mds_tipc_fctrl_msg.h > @@ -49,10 +49,13 @@ class Event { > kEvtTmrAll, > kEvtTmrTxProb, // event that tx probation timer expired for once > kEvtTmrChunkAck, // event to send the chunk ack > + kEvtShutDown, // event to shutdown flow control thread > }; > NCS_IPC_MSG next_{0}; > Type type_; > > + // Used for shutdown > + NCS_SEL_OBJ destroy_ack_obj_; > // Used for flow event only > struct tipc_portid id_; > uint16_t svc_id_{0}; > @@ -69,10 +72,15 @@ class Event { > mseq_(mseq), mfrag_(mfrag), fseq_(f_seq_num) { > type_ = type; > } > + Event(Type type, NCS_SEL_OBJ destroy_ack_obj): > + destroy_ack_obj_(destroy_ack_obj) { > + type_ = type; > + } > bool IsTimerEvent() const { return (type_ > Type::kEvtTmrAll); } > bool IsFlowEvent() const { > return (Type::kEvtDataFlowAll < type_ && type_ < Type::kEvtTmrAll); > } > + bool IsShutDownEvent() const { return (type_ == Type::kEvtShutDown); } > }; > > class BaseMessage { _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel