Hi Thuan

Would this be simpler?

+  while (retry < 1) {
+    rc = clms_mds_msg_send(cb, &clm_msg, &evt->fr_dest, &evt->mds_ctxt,
+                          MDS_SEND_PRIORITY_HIGH, NCSMDS_SVC_ID_CLMNA);
+    if (rc != NCSCC_RC_SUCCESS) {
+      ...
+      osaf_nanosleep(&kTenMilliseconds);
+       ++retry;
+    } else {
+      break;
+    }
+  }

Thanks
Gary

________________________________
From: Thuan Tran <thuan.t...@dektech.com.au>
Sent: 18 February 2020 17:38
To: Vu Minh Nguyen <vu.m.ngu...@dektech.com.au>; Minh Hon Chau 
<minh.c...@dektech.com.au>; Thang Duc Nguyen <thang.d.ngu...@dektech.com.au>; 
Gary Lee <gary....@dektech.com.au>
Cc: opensaf-devel@lists.sourceforge.net <opensaf-devel@lists.sourceforge.net>; 
Thuan Tran <thuan.t...@dektech.com.au>
Subject: [PATCH 1/1] clmd: retry once to send message to clmna [#3156]

- When a node reboots, clmd may receive the join request before the
clmna svc_up event has arrived, so sending the response back to clmna
fails. This causes amfnd to time out while initializing the CLM agent,
which delays sending node up. amfd may then order a reboot of that node
if the node up delay (osafAmfDelayNodeFailoverNodeWaitTimeout) is set
smaller than the total time amfnd spends retrying until the CLM agent
is initialized successfully.
- Retrying the send to clmna once helps avoid this scenario.
---
 src/clm/clmd/clms_evt.cc | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/clm/clmd/clms_evt.cc b/src/clm/clmd/clms_evt.cc
index 1059c6cfa..59e9c4156 100644
--- a/src/clm/clmd/clms_evt.cc
+++ b/src/clm/clmd/clms_evt.cc
@@ -34,6 +34,7 @@
 #include "base/logtrace.h"
 #include "base/ncsgl_defs.h"
 #include "base/osaf_utility.h"
+#include "base/osaf_time.h"
 #include "clm/clmd/clms.h"

 static uint32_t process_api_evt(CLMSV_CLMS_EVT *evt);
@@ -535,6 +536,7 @@ uint32_t proc_node_up_msg(CLMS_CB *cb, CLMSV_CLMS_EVT *evt) {
   SaNameT node_name = {0};
   CLMSV_MSG clm_msg;
   SaBoolT check_member;
+  int retry = 0;

   TRACE_ENTER2("Node up mesg for nodename length %d %s",
                nodeup_info->node_name.length, nodeup_info->node_name.value);
@@ -636,8 +638,20 @@ uint32_t proc_node_up_msg(CLMS_CB *cb, CLMSV_CLMS_EVT *evt) {
   clm_msg.info.api_resp_info.type = CLMSV_CLUSTER_JOIN_RESP;
   clm_msg.info.api_resp_info.param.node_name = node_name;
   /*rc will be updated down in the positive flow */
-  rc = clms_mds_msg_send(cb, &clm_msg, &evt->fr_dest, &evt->mds_ctxt,
-                         MDS_SEND_PRIORITY_HIGH, NCSMDS_SVC_ID_CLMNA);
+  do {
+    rc = clms_mds_msg_send(cb, &clm_msg, &evt->fr_dest, &evt->mds_ctxt,
+                          MDS_SEND_PRIORITY_HIGH, NCSMDS_SVC_ID_CLMNA);
+    if (rc != NCSCC_RC_SUCCESS && retry < 1) {
+      /* When a node reboots, clmd may receive the join request before
+       * the clmna svc_up has arrived, so the response to clmna fails.
+       * amfnd then times out initializing the CLM agent and node up is
+       * delayed. amfd may order a reboot of that node if the node up delay
+       * (osafAmfDelayNodeFailoverNodeWaitTimeout) is smaller than the
+       * total time amfnd retries until CLM agent init succeeds.
+       * Retrying the send once here helps avoid this scenario. */
+      osaf_nanosleep(&kTenMilliseconds);
+    }
+  } while (rc != NCSCC_RC_SUCCESS && retry++ < 1);
   /*if mds send failed, we need to report failure */
   if (rc != NCSCC_RC_SUCCESS) {
     LOG_NO("%s: send failed. dest:%" PRIx64, __FUNCTION__, evt->fr_dest);
--
2.17.1


_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to