Re: [devel] [PATCH 1/1] mds: fix ckpt 20 11 failure [#3127]

2019-12-09 Thread Tran Thuan



Best Regards,
ThuanTr

-Original Message-
From: Minh Hon Chau  
Sent: Tuesday, December 10, 2019 9:51 AM
To: thuan.tran ; 'Nguyen Minh Vu' 
; 'thang . d . nguyen' 
; gary@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] mds: fix ckpt 20 11 failure [#3127]

Hi Thuan,

- We could give the patch title a bit more meanings than "fix ckpt 20 
11..", for example, something as "Using timer to continue sending queued 
message".
[Thuan] OK, will update short commit message.

- And a few comments inline

Thanks

Minh

On 5/12/19 3:05 pm, thuan.tran wrote:
> - In overflow, receive chunk ack may stuck in retrying to send pending
> messages then later chunk ack comming cannot proceed.
> - Instead of retrying to send pending messages, reuse timer send chunk
> ack to trigger send pending messages if any. By this, even no more Nack
> or ChunkAck event comming, pending messages will be sent by timer.
> ---
>   src/mds/mds_dt_tipc.c| 12 ++---
>   src/mds/mds_tipc_fctrl_intf.cc   | 10 
>   src/mds/mds_tipc_fctrl_portid.cc | 88 ++--
>   src/mds/mds_tipc_fctrl_portid.h  |  1 +
>   4 files changed, 56 insertions(+), 55 deletions(-)
>
> diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
> index 9b3290833..6b30846a1 100644
> --- a/src/mds/mds_dt_tipc.c
> +++ b/src/mds/mds_dt_tipc.c
> @@ -3183,13 +3183,13 @@ ssize_t mds_retry_sendto(int sockfd, const void *buf, 
> size_t len, int flags,
>   {
>   int retry = 5;
>   ssize_t send_len = 0;
> - while (retry >= 0) {
> + while (retry-- >= 0) {
>   send_len = sendto(sockfd, buf, len, flags, dest_addr, addrlen);
>   if (send_len == len) {
>   return send_len;
> - } else if (retry-- > 0) {
> - if (errno != ENOMEM &&
> - errno != ENOBUFS &&
> + } else if (retry >= 0) {
> + if (errno != EAGAIN && errno != EWOULDBLOCK &&
> + errno != ENOMEM && errno != ENOBUFS &&
>   errno != EINTR)
>   break;
>   osaf_nanosleep();

[Minh] We may need to do error-log the strerror and errno in case of 
failure in mds_retry_sendto(). Also,
[Thuan] error-log already done by upper callers.

uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) {

...

   m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno));
}

this logging "sendto()" should be now "TipcPortId::Send()"
[Thuan] OK, will update this log message.

> @@ -3242,7 +3242,7 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t 
> buff_len,
>   if (mds_tipc_fctrl_trysend(id, buffer, buff_len, is_queued)
>   == NCSCC_RC_SUCCESS) {
>   send_len = mds_retry_sendto(
> - tipc_cb.BSRsock, buffer, buff_len, 0,
> + tipc_cb.BSRsock, buffer, buff_len, MSG_DONTWAIT,
>   (struct sockaddr *)_addr, 
> sizeof(server_addr));
[Minh] There must be a reason that you want to use non-blocking with 
MSG_DONTWAIT?
[Thuan] without that flag, ckpt 20 11 hang in sendto() then fctrl control 
thread cannot handle anything.

>   if (send_len == buff_len) {
>   m_MDS_LOG_INFO("MDTM: Successfully sent message");
> @@ -3289,7 +3289,7 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t 
> size,
>   /*This can be scope-down to dest_svc_id  server_inst TBD*/
>   server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE);
>   ssize_t send_len =
> - mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0,
> + mds_retry_sendto(tipc_cb.BSRsock, buffer, size, MSG_DONTWAIT,
>  (struct sockaddr *)_addr, sizeof(server_addr));
>   
>   if (send_len == size) {
> diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
> index 7d0571e7c..b20205686 100644
> --- a/src/mds/mds_tipc_fctrl_intf.cc
> +++ b/src/mds/mds_tipc_fctrl_intf.cc
> @@ -102,6 +102,8 @@ void tmr_exp_cbk(void* uarg) {
>   
>   void process_timer_event(const Event& evt) {
> bool txprob_restart = false;
> +  m_MDS_LOG_DBG("FCTRL: process timer event start [evt:%d]",
> +static_cast(evt.type_));
> for (auto i : portid_map) {
>   TipcPortId* portid = i.second;
>   
> @@ -113,16 +115,20 @@ void process_timer_event(const Event& evt) {
>   
>   if (evt.type_ == Event::Type::kEvtTmrChunkAck) {
> portid->ReceiveTmrChunkAck();
> +  portid->SendUnsentMsg();
>   }
[Minh] The idea now is using ChunkAck timer to continue sending unsent 
message. This fix comes from a situation that we failed in the middle of 
sending unsent message due to "Cannot allocate memory...". In the 
scenario without such error "Cannot allocate ...", the function 
SendUnsentMsg() here will be sending extra messages from the "receiving 
channel" as ChunkAck timer apart from 

Re: [devel] [PATCH 1/1] mds: fix ckpt 20 11 failure [#3127]

2019-12-09 Thread Minh Hon Chau

Hi Thuan,

- We could give the patch title a bit more meanings than "fix ckpt 20 
11..", for example, something as "Using timer to continue sending queued 
message".


- And a few comments inline

Thanks

Minh

On 5/12/19 3:05 pm, thuan.tran wrote:

- In overflow, receive chunk ack may stuck in retrying to send pending
messages then later chunk ack comming cannot proceed.
- Instead of retrying to send pending messages, reuse timer send chunk
ack to trigger send pending messages if any. By this, even no more Nack
or ChunkAck event comming, pending messages will be sent by timer.
---
  src/mds/mds_dt_tipc.c| 12 ++---
  src/mds/mds_tipc_fctrl_intf.cc   | 10 
  src/mds/mds_tipc_fctrl_portid.cc | 88 ++--
  src/mds/mds_tipc_fctrl_portid.h  |  1 +
  4 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 9b3290833..6b30846a1 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -3183,13 +3183,13 @@ ssize_t mds_retry_sendto(int sockfd, const void *buf, 
size_t len, int flags,
  {
int retry = 5;
ssize_t send_len = 0;
-   while (retry >= 0) {
+   while (retry-- >= 0) {
send_len = sendto(sockfd, buf, len, flags, dest_addr, addrlen);
if (send_len == len) {
return send_len;
-   } else if (retry-- > 0) {
-   if (errno != ENOMEM &&
-   errno != ENOBUFS &&
+   } else if (retry >= 0) {
+   if (errno != EAGAIN && errno != EWOULDBLOCK &&
+   errno != ENOMEM && errno != ENOBUFS &&
errno != EINTR)
break;
osaf_nanosleep();


[Minh] We may need to do error-log the strerror and errno in case of 
failure in mds_retry_sendto(). Also,


uint32_t TipcPortId::Send(uint8_t* data, uint16_t length) {

...

  m_MDS_LOG_ERR("FCTRL: sendto() failed, Error[%s]", strerror(errno));
}

this logging "sendto()" should be now "TipcPortId::Send()"


@@ -3242,7 +3242,7 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t 
buff_len,
if (mds_tipc_fctrl_trysend(id, buffer, buff_len, is_queued)
== NCSCC_RC_SUCCESS) {
send_len = mds_retry_sendto(
-   tipc_cb.BSRsock, buffer, buff_len, 0,
+   tipc_cb.BSRsock, buffer, buff_len, MSG_DONTWAIT,
(struct sockaddr *)_addr, 
sizeof(server_addr));
[Minh] There must be a reason that you want to use non-blocking with 
MSG_DONTWAIT?

if (send_len == buff_len) {
m_MDS_LOG_INFO("MDTM: Successfully sent message");
@@ -3289,7 +3289,7 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t 
size,
/*This can be scope-down to dest_svc_id  server_inst TBD*/
server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE);
ssize_t send_len =
-   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0,
+   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, MSG_DONTWAIT,
   (struct sockaddr *)_addr, sizeof(server_addr));
  
  	if (send_len == size) {

diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index 7d0571e7c..b20205686 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -102,6 +102,8 @@ void tmr_exp_cbk(void* uarg) {
  
  void process_timer_event(const Event& evt) {

bool txprob_restart = false;
+  m_MDS_LOG_DBG("FCTRL: process timer event start [evt:%d]",
+static_cast(evt.type_));
for (auto i : portid_map) {
  TipcPortId* portid = i.second;
  
@@ -113,16 +115,20 @@ void process_timer_event(const Event& evt) {
  
  if (evt.type_ == Event::Type::kEvtTmrChunkAck) {

portid->ReceiveTmrChunkAck();
+  portid->SendUnsentMsg();
  }
[Minh] The idea now is using ChunkAck timer to continue sending unsent 
message. This fix comes from a situation that we failed in the middle of 
sending unsent message due to "Cannot allocate memory...". In the 
scenario without such error "Cannot allocate ...", the function 
SendUnsentMsg() here will be sending extra messages from the "receiving 
channel" as ChunkAck timer apart from the "sending channel" as 
ReceiveChunkAck(). That would cause more undeliverable messages (the 
ones are now sent from ChunkAck timer) if the overloading starts to 
happen and sender keeps pushing more messages to send (more message 
pushes into queue).

}
if (txprob_restart) {
  txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk);
  m_MDS_LOG_DBG("FCTRL: Restart txprob");
}
+  m_MDS_LOG_DBG("FCTRL: process timer event end");
  }
  
  uint32_t process_flow_event(const Event& evt) {

uint32_t rc = NCSCC_RC_SUCCESS;
+  m_MDS_LOG_DBG("FCTRL: process flow event start [evt:%d]",
+static_cast(evt.type_));

[devel] [PATCH 1/1] mds: fix ckpt 20 11 failure [#3127]

2019-12-04 Thread thuan.tran
- In overflow, receive chunk ack may stuck in retrying to send pending
messages then later chunk ack comming cannot proceed.
- Instead of retrying to send pending messages, reuse timer send chunk
ack to trigger send pending messages if any. By this, even no more Nack
or ChunkAck event comming, pending messages will be sent by timer.
---
 src/mds/mds_dt_tipc.c| 12 ++---
 src/mds/mds_tipc_fctrl_intf.cc   | 10 
 src/mds/mds_tipc_fctrl_portid.cc | 88 ++--
 src/mds/mds_tipc_fctrl_portid.h  |  1 +
 4 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/src/mds/mds_dt_tipc.c b/src/mds/mds_dt_tipc.c
index 9b3290833..6b30846a1 100644
--- a/src/mds/mds_dt_tipc.c
+++ b/src/mds/mds_dt_tipc.c
@@ -3183,13 +3183,13 @@ ssize_t mds_retry_sendto(int sockfd, const void *buf, 
size_t len, int flags,
 {
int retry = 5;
ssize_t send_len = 0;
-   while (retry >= 0) {
+   while (retry-- >= 0) {
send_len = sendto(sockfd, buf, len, flags, dest_addr, addrlen);
if (send_len == len) {
return send_len;
-   } else if (retry-- > 0) {
-   if (errno != ENOMEM &&
-   errno != ENOBUFS &&
+   } else if (retry >= 0) {
+   if (errno != EAGAIN && errno != EWOULDBLOCK &&
+   errno != ENOMEM && errno != ENOBUFS &&
errno != EINTR)
break;
osaf_nanosleep();
@@ -3242,7 +3242,7 @@ static uint32_t mdtm_sendto(uint8_t *buffer, uint16_t 
buff_len,
if (mds_tipc_fctrl_trysend(id, buffer, buff_len, is_queued)
== NCSCC_RC_SUCCESS) {
send_len = mds_retry_sendto(
-   tipc_cb.BSRsock, buffer, buff_len, 0,
+   tipc_cb.BSRsock, buffer, buff_len, MSG_DONTWAIT,
(struct sockaddr *)_addr, 
sizeof(server_addr));
if (send_len == buff_len) {
m_MDS_LOG_INFO("MDTM: Successfully sent message");
@@ -3289,7 +3289,7 @@ static uint32_t mdtm_mcast_sendto(void *buffer, size_t 
size,
/*This can be scope-down to dest_svc_id  server_inst TBD*/
server_addr.addr.nameseq.upper = HTONL(MDS_MDTM_UPPER_INSTANCE);
ssize_t send_len =
-   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, 0,
+   mds_retry_sendto(tipc_cb.BSRsock, buffer, size, MSG_DONTWAIT,
   (struct sockaddr *)_addr, sizeof(server_addr));
 
if (send_len == size) {
diff --git a/src/mds/mds_tipc_fctrl_intf.cc b/src/mds/mds_tipc_fctrl_intf.cc
index 7d0571e7c..b20205686 100644
--- a/src/mds/mds_tipc_fctrl_intf.cc
+++ b/src/mds/mds_tipc_fctrl_intf.cc
@@ -102,6 +102,8 @@ void tmr_exp_cbk(void* uarg) {
 
 void process_timer_event(const Event& evt) {
   bool txprob_restart = false;
+  m_MDS_LOG_DBG("FCTRL: process timer event start [evt:%d]",
+static_cast(evt.type_));
   for (auto i : portid_map) {
 TipcPortId* portid = i.second;
 
@@ -113,16 +115,20 @@ void process_timer_event(const Event& evt) {
 
 if (evt.type_ == Event::Type::kEvtTmrChunkAck) {
   portid->ReceiveTmrChunkAck();
+  portid->SendUnsentMsg();
 }
   }
   if (txprob_restart) {
 txprob_timer.Start(kBaseTimerInt, tmr_exp_cbk);
 m_MDS_LOG_DBG("FCTRL: Restart txprob");
   }
+  m_MDS_LOG_DBG("FCTRL: process timer event end");
 }
 
 uint32_t process_flow_event(const Event& evt) {
   uint32_t rc = NCSCC_RC_SUCCESS;
+  m_MDS_LOG_DBG("FCTRL: process flow event start [evt:%d]",
+static_cast(evt.type_));
   TipcPortId *portid = portid_lookup(evt.id_);
   if (portid == nullptr) {
 // the null portid normally should not happen; however because the
@@ -176,6 +182,7 @@ uint32_t process_flow_event(const Event& evt) {
   portid->ReceiveIntro();
 }
   }
+  m_MDS_LOG_DBG("FCTRL: process flow event end");
   return rc;
 }
 
@@ -495,6 +502,7 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t 
len,
   // if mds support flow control
   if (header.IsControlMessage()) {
 if (header.msg_type_ == ChunkAck::kChunkAckMsgType) {
+  m_MDS_LOG_DBG("FCTRL: Receive ChunkAck");
   // receive single ack message
   ChunkAck ack;
   ack.Decode(buffer);
@@ -508,6 +516,7 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t 
len,
 strerror(errno));
   }
 } else if (header.msg_type_ == Nack::kNackMsgType) {
+  m_MDS_LOG_DBG("FCTRL: Receive Nack");
   // receive nack message
   Nack nack;
   nack.Decode(buffer);
@@ -520,6 +529,7 @@ uint32_t mds_tipc_fctrl_rcv_data(uint8_t *buffer, uint16_t 
len,
 strerror(errno));
   }
 } else if (header.msg_type_ == Intro::kIntroMsgType) {
+  m_MDS_LOG_DBG("FCTRL: Receive Intro");
   // no need to decode intro message
   // the decoding intro message type is done in header