During node startup, the node loses its connection with the active SC.
At that time CLM joins the cluster and AMFND tries to convert the
CLM node to an AMF node. But IMMND is down at that time
(i.e., it unregisters and then registers again with MDS). The IMM OM API
returns immediately, and this causes the coredump.

The solution is to retry converting the CLM node to an AMF node
if IMMND is down.
---
 src/amf/amfnd/avnd_cb.h |  1 +
 src/amf/amfnd/clm.cc    | 23 +++++++++++++++++++----
 src/amf/amfnd/main.cc   |  2 ++
 src/amf/amfnd/mds.cc    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h
index a2e521359..c7f9ee8d2 100644
--- a/src/amf/amfnd/avnd_cb.h
+++ b/src/amf/amfnd/avnd_cb.h
@@ -120,6 +120,7 @@ typedef struct avnd_cb_tag {
   bool reboot_in_progress;
   AVND_SU *failed_su;
   bool cont_reboot_in_progress;
+  bool is_immnd_up;
 
   /* the duration that amfnd should tolerate absence of any SC */
   SaTimeT scs_absence_max_duration;
diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
index 317674f10..cf3e49166 100644
--- a/src/amf/amfnd/clm.cc
+++ b/src/amf/amfnd/clm.cc
@@ -138,7 +138,7 @@ done:
   TRACE_LEAVE();
 }
 
-static void clm_to_amf_node(void) {
+static SaAisErrorT clm_to_amf_node(void) {
   SaAisErrorT error;
   SaImmSearchHandleT searchHandle;
   SaNameT amfdn, clmdn;
@@ -157,8 +157,7 @@ static void clm_to_amf_node(void) {
 
   error = saImmOmInitialize_cond(&immOmHandle, nullptr, &immVersion);
   if (SA_AIS_OK != error) {
-    LOG_WA("saImmOmInitialize failed. Use previous value of nodeName.");
-    osafassert(avnd_cb->amf_nodeName.empty() == false);
+    LOG_WA("saImmOmInitialize failed: %u", error);
     goto done1;
   }
 
@@ -192,6 +191,7 @@ done:
   immutil_saImmOmFinalize(immOmHandle);
 done1:
   TRACE_LEAVE2("%u", error);
+  return error;
 }
 
 /****************************************************************************
@@ -230,6 +230,8 @@ static void clm_track_cb(
     SaClmChangeStepT step, SaTimeT timeSupervision, SaAisErrorT error) {
   SaClmClusterNotificationT_4 *notifItem;
   uint32_t i;
+  uint32_t rc = SA_AIS_OK;
+
   TRACE_ENTER2("'%llu' '%u' '%u'", invocation, step, error);
 
   if (error != SA_AIS_OK) {
@@ -279,7 +281,20 @@ static void clm_track_cb(
           memcpy(&(avnd_cb->node_info), &(notifItem->clusterNode),
                  sizeof(SaClmClusterNodeT_4));
           /*get the amf node from clm node name */
-          if (avnd_cb->amf_nodeName.empty()) clm_to_amf_node();
+          if (avnd_cb->amf_nodeName.empty()) {
+            int count = 0;
+            rc = clm_to_amf_node();
+            while ((rc == SA_AIS_ERR_TRY_AGAIN) && ( ++count < 3 ) &&
+                (avnd_cb->is_immnd_up == false)) {
+              TRACE("IMMND not up, ivoke clm_to_amf_node() again");
+              osaf_nanosleep(&kOneSecond);
+              rc = clm_to_amf_node();
+            }
+          }
+          if (rc != SA_AIS_OK) {
+            LOG_ER("IMMND is not UP");
+            exit(EXIT_FAILURE);
+          }
           avnd_send_node_up_msg();
           avnd_cb->first_time_up = false;
         } else {
diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc
index 6d9ee95d4..1df5b3f82 100644
--- a/src/amf/amfnd/main.cc
+++ b/src/amf/amfnd/main.cc
@@ -349,6 +349,8 @@ AVND_CB *avnd_cb_create() {
 
   cb->amf_nodeName = "";
 
+  cb->is_immnd_up = false;
+
   /*** initialize avnd dbs ***/
 
   avnd_silist_init(cb);
diff --git a/src/amf/amfnd/mds.cc b/src/amf/amfnd/mds.cc
index 86d207c29..a0d0d96c6 100644
--- a/src/amf/amfnd/mds.cc
+++ b/src/amf/amfnd/mds.cc
@@ -184,6 +184,21 @@ uint32_t avnd_mds_reg(AVND_CB *cb) {
   }
   TRACE("MDS subscription for Controller AVND vdest success");
 
+  /*  Subscribe to IMMND up/down events */
+  mds_info.i_op = MDS_SUBSCRIBE;
+  mds_info.info.svc_subscribe.i_svc_ids = svc_ids;
+
+  /* subscribe to events from IMMND */
+  mds_info.info.svc_subscribe.i_scope = NCSMDS_SCOPE_NONE;
+  mds_info.info.svc_subscribe.i_num_svcs = 1;
+  svc_ids[0] = NCSMDS_SVC_ID_IMMND;
+  rc = ncsmds_api(&mds_info);
+  if (NCSCC_RC_SUCCESS != rc) {
+    LOG_CR("MDS subscription for IMMND failed");
+    goto done;
+  }
+  TRACE("MDS subscription for IMMND success");
+
   /* get the handle from MDS */
 
   memset(&ada_info, 0, sizeof(ada_info));
@@ -578,6 +593,14 @@ uint32_t avnd_mds_svc_evt(AVND_CB *cb, MDS_CALLBACK_SVC_EVENT_INFO *evt_info) {
           return rc;
           break;
 
+        case NCSMDS_SVC_ID_IMMND:
+          /* IMMND is up */
+          if (evt_info->i_node_id == ncs_get_node_id()) {
+            TRACE("Local IMMND up");
+            cb->is_immnd_up = true;
+          }
+          break;
+
         default:
           osafassert(0);
       }
@@ -618,6 +641,14 @@ uint32_t avnd_mds_svc_evt(AVND_CB *cb, MDS_CALLBACK_SVC_EVENT_INFO *evt_info) {
         case NCSMDS_SVC_ID_AVND_CNTLR:
           break;
 
+        case NCSMDS_SVC_ID_IMMND:
+          /* IMMND is down */
+          if (evt_info->i_node_id == ncs_get_node_id()) {
+            TRACE("Local IMMND down");
+            cb->is_immnd_up = false;
+          }
+          break;
+
         default:
           osafassert(0);
       }
-- 
2.17.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to