Hi Minh,

See my replies inline.

Best Regards,
ThuanTr

-----Original Message-----
From: Minh Hon Chau <minh.c...@dektech.com.au> 
Sent: Tuesday, March 31, 2020 8:05 AM
To: Thuan Tran <thuan.t...@dektech.com.au>; Thang Duc Nguyen 
<thang.d.ngu...@dektech.com.au>
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: fix mds_tipc_fctrl_shutdown() stuck forever 
[#3169]

Hi Thuan,

More comments inline.

Thanks

Minh

On 30/3/20 9:19 pm, Thuan Tran wrote:
> Hi Minh,
>
> Thanks for reviewing. See my replies inline.
>
> Best Regards,
> ThuanTr
>
> -----Original Message-----
> From: Minh Hon Chau <minh.c...@dektech.com.au>
> Sent: Monday, March 30, 2020 5:00 PM
> To: Thuan Tran <thuan.t...@dektech.com.au>; Thang Duc Nguyen 
> <thang.d.ngu...@dektech.com.au>
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: Re: [PATCH 1/1] mds: fix mds_tipc_fctrl_shutdown() stuck forever 
> [#3169]
>
> Hi Thuan,
>
> Please find my comments inline [M]
>
> Thanks
>
> Minh
>
> On 23/3/20 8:59 pm, thuan.tran wrote:
>> - Deadlock of portid_map_mutex locking: mds_tipc_fctrl_shutdown()
>> take lock then wait for thread process_all_events() to be canceled.
>> But that thread also want get lock then it is keep waiting for lock.
>> - Create safe method to cancel process_all_events() thread similar
>> as the way MDS destroy legacy receiving thread.
> [M] I could miss it but I don't find where MDS destroys the legacy
> receiving thread by sending event to the legacy receiving thread to
> cancel the thread. Is it mdtm_tipc_destroy() you mean?
> [T] Yes — before calling that function, see mds_main.c, at the line: status = 
> mds_mdtm_destroy();
[M]: OK I see it, thanks
>
>> And getting portid_map_mutex lock only after that thread released.
>> ---
>>    src/mds/mds_tipc_fctrl_intf.cc | 24 ++++++++++++++++++++++--
>>    src/mds/mds_tipc_fctrl_msg.h   |  8 ++++++++
>>    2 files changed, 30 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
>> index 6ce00782e..93bfce51c 100644
>> --- a/src/mds/mds_tipc_fctrl_intf.cc
>> +++ b/src/mds/mds_tipc_fctrl_intf.cc
>> @@ -27,6 +27,8 @@
>>    
>>    #include "base/ncssysf_def.h"
>>    #include "base/ncssysf_tsk.h"
>> +#include "base/ncs_osprm.h"
>> +#include "base/osaf_poll.h"
>>    
>>    #include "mds/mds_log.h"
>>    #include "mds/mds_tipc_fctrl_portid.h"
>> @@ -194,6 +196,7 @@ bool mds_fctrl_mbx_cleanup(NCSCONTEXT arg, NCSCONTEXT 
>> msg) {
>>    }
>>    
>>    uint32_t process_all_events(void) {
>> +  bool running = true;
>>      enum { FD_FCTRL = 0, NUM_FDS };
>>    
>>      int poll_tmo = chunk_ack_timeout;
>> @@ -203,7 +206,7 @@ uint32_t process_all_events(void) {
>>          ncs_ipc_get_sel_obj(&mbx_events).rmv_obj;
>>      pfd[FD_FCTRL].events = POLLIN;
>>    
>> -  while (true) {
>> +  while (running) {
>>        int pollres;
>>    
>>        pollres = poll(pfd, NUM_FDS, poll_tmo);
>> @@ -231,9 +234,13 @@ uint32_t process_all_events(void) {
>>            if (evt->IsFlowEvent()) {
>>              process_flow_event(*evt);
>>            }
>> +        if (evt->IsShutDownEvent()) {
>> +          running = false;
>> +        }
>>    
>>            delete evt;
>>            portid_map_mutex.unlock();
>> +        if (!running) m_NCS_SEL_OBJ_IND(&evt->destroy_ack_obj_);
>>          }
>>        }
>>        // timeout, scan all portid and send ack msgs
>> @@ -243,6 +250,7 @@ uint32_t process_all_events(void) {
>>          portid_map_mutex.unlock();
>>        }
>>      }  /* while */
>> +  m_MDS_LOG_DBG("FCTRL: process_all_events() thread end");
>>      return NCSCC_RC_SUCCESS;
>>    }
>>    
>> @@ -305,7 +313,18 @@ uint32_t mds_tipc_fctrl_initialize(int dgramsock, 
>> struct tipc_portid id,
>>    uint32_t mds_tipc_fctrl_shutdown(void) {
>>      if (is_fctrl_enabled == false) return NCSCC_RC_SUCCESS;
>>    
>> -  portid_map_mutex.lock();
>> +  NCS_SEL_OBJ destroy_ack_obj;
>> +  m_NCS_SEL_OBJ_CREATE(&destroy_ack_obj);
>> +  Event* pevt = new Event(Event::Type::kEvtShutDown, destroy_ack_obj);
>> +  if (m_NCS_IPC_SEND(&mbx_events, pevt,
>> +      NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) {
>> +    m_MDS_LOG_ERR("FCTRL: Failed to send shutdown, Error[%s]",
>> +        strerror(errno));
>> +    abort();
>> +  }
>> +  osaf_poll_one_fd(m_GET_FD_FROM_SEL_OBJ(destroy_ack_obj), 10000);
>> +  m_NCS_SEL_OBJ_DESTROY(&destroy_ack_obj);
>> +  memset(&destroy_ack_obj, 0, sizeof(destroy_ack_obj));
[M]: Don't you need to memset 0?
[T]: Just cloned it from the legacy code.
>>    
>>      if (ncs_task_release(p_task_hdl) != NCSCC_RC_SUCCESS) {
>>        m_MDS_LOG_ERR("FCTRL: Stop of the Created Task-failed, Error[%s]",
>> @@ -315,6 +334,7 @@ uint32_t mds_tipc_fctrl_shutdown(void) {
>>      m_NCS_IPC_DETACH(&mbx_events, mds_fctrl_mbx_cleanup, nullptr);
>>      m_NCS_IPC_RELEASE(&mbx_events, nullptr);
>>    
>> +  portid_map_mutex.lock();
>>      for (auto i : portid_map) delete i.second;
>>      portid_map.clear();
> [M] Moving the lock() down here is not enough?
> [T] It's not enough; it will still get stuck because of the lock inside the 
> mailbox handling — m_NCS_IPC_DETACH() can get stuck forever.

[M] I suspect another place in process_all_events() is taking the lock too 
early, before retrieving the msg from the mbx — could that be what caused those 
threads to lock up?
[T] Inside the thread, while processing the mailbox, the mailbox lock can be 
acquired and then the thread gets cancelled while still holding it.
That causes a deadlock in the later DETACH().

--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -216,11 +216,12 @@ uint32_t process_all_events(void) {

      if (pollres > 0) {
        if (pfd[FD_FCTRL].revents == POLLIN) {
-        portid_map_mutex.lock();

          Event *evt = reinterpret_cast<Event*>(ncs_ipc_non_blk_recv(
              &mbx_events));

+        portid_map_mutex.lock();
+
          if (evt == nullptr) {
            portid_map_mutex.unlock();
            continue;

>>    
>> diff --git a/src/mds/mds_tipc_fctrl_msg.h b/src/mds/mds_tipc_fctrl_msg.h
>> index c4641ed4e..1ba625650 100644
>> --- a/src/mds/mds_tipc_fctrl_msg.h
>> +++ b/src/mds/mds_tipc_fctrl_msg.h
>> @@ -49,10 +49,13 @@ class Event {
>>        kEvtTmrAll,
>>        kEvtTmrTxProb,    // event that tx probation timer expired for once
>>        kEvtTmrChunkAck,  // event to send the chunk ack
>> +    kEvtShutDown,     // event to shutdown flow control thread
>>      };
>>      NCS_IPC_MSG next_{0};
>>      Type type_;
>>    
>> +  // Used for shutdown
>> +  NCS_SEL_OBJ destroy_ack_obj_;
>>      // Used for flow event only
>>      struct tipc_portid id_;
>>      uint16_t svc_id_{0};
>> @@ -69,10 +72,15 @@ class Event {
>>        mseq_(mseq), mfrag_(mfrag), fseq_(f_seq_num) {
>>        type_ = type;
>>      }
>> +  Event(Type type, NCS_SEL_OBJ destroy_ack_obj):
>> +    destroy_ack_obj_(destroy_ack_obj) {
>> +    type_ = type;
>> +  }
>>      bool IsTimerEvent() const { return (type_ > Type::kEvtTmrAll); }
>>      bool IsFlowEvent() const {
>>        return (Type::kEvtDataFlowAll < type_ && type_ < Type::kEvtTmrAll);
>>      }
>> +  bool IsShutDownEvent() const { return (type_ == Type::kEvtShutDown); }
>>    };
>>    
>>    class BaseMessage {

_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to