Re: [users] Two nodes got into an unrecoverable bad state

Mathivanan Naickan Palanivelu Fri, 21 Feb 2014 07:59:34 -0800

Hi,

I would recommend migrating to at least 4.3.1, and also planning to migrate to 4.3.2
or 4.4 once the GA release is made available
at the beginning of March.


Attached is the patch that Neel mentioned. You could try applying it, or
migrate to 4.3.1
(http://sourceforge.net/projects/opensaf/files/releases/opensaf-4.3.1.tar.gz/download)

Also, could you provide the output of the gdb command "thread apply all bt full" for the
assertions below in AMF and CLM?

Thanks,
Mathi.

----- [email protected] wrote:

> OK I’ll try the patch (can it be applied to 4.3.0 or do I need to take
> a snapshot from the dev branch)?
> 
> Also found the following cores from osafamfd last night.  Would these
> be related to this problem also or is this something else?
> 
> thanks
> —
> tony
> 
> — Assertion in osafamfd ---
> 
> #3  0x0000000000440fba in avd_mds_qsd_role_evh (cb=0x6c4d40 <_avd_cb>,
> evt=0x7f87240080a0) at avd_role.c:575
> 575 osafassert(0);
> (gdb) p cb
> $4 = (AVD_CL_CB *) 0x6c4d40 <_avd_cb>
> (gdb) p *cb
> $5 = {avd_mbx = 4291821569, avd_hb_mbx = 0, mds_handle = 0x0,
> init_state = AVD_APP_STATE, avd_fover_state = false, avail_state_avd =
> SA_AMF_HA_ACTIVE, vaddr_pwe_hdl = 65537, vaddr_hdl = 1, adest_hdl =
> 131071,
>   vaddr = 1, other_avd_adest = 0, local_avnd_adest = 298033400332316,
> nd_msg_queue_list = {nd_msg_queue = 0x0, tail = 0x0}, evt_queue =
> {evt_msg_queue = 0x0, tail = 0x0}, mbcsv_hdl = 4293918753,
>   ckpt_hdl = 4292870177, mbcsv_sel_obj = 13, stby_sync_state =
> AVD_STBY_IN_SYNC, synced_reo_type = 0, async_updt_cnt = {cb_updt = 61,
> node_updt = 8101, app_updt = 78, sg_updt = 3980, su_updt = 5980,
> si_updt = 2420,
>     sg_su_oprlist_updt = 1210, sg_admin_si_updt = 0, siass_updt =
> 1437, comp_updt = 6488, csi_updt = 0, compcstype_updt = 0,
> si_trans_updt = 0}, sync_required = true, async_updt_msgs =
> {async_updt_queue = 0x0,
>     tail = 0x0}, edu_hdl = {is_inited = true, tree = {root_node = {bit
> = -1, left = 0x8dc760, right = 0x6c4e08 <_avd_cb+200>, key_info =
> 0x8dc650 ""}, params = {key_size = 8, info_size = 0, actual_key_size =
> 0,
>         node_size = 0}, n_nodes = 22}, to_version = 6}, mds_edu_hdl =
> {is_inited = true, tree = {root_node = {bit = -1, left = 0x8dbbe0,
> right = 0x6c4e50 <_avd_cb+272>, key_info = 0x8ca840 ""}, params = {
>         key_size = 8, info_size = 393, actual_key_size = 0, node_size
> = 0}, n_nodes = 19}, to_version = 4}, cluster_init_time = 0,
> node_id_avd = 69391, node_id_avd_other = 69647, node_avd_failed = 0,
> node_list = {
>     root_node = {bit = -1, left = 0x6c4ea8 <_avd_cb+360>, right =
> 0x6c4ea8 <_avd_cb+360>, key_info = 0x8ca820 ""}, params = {key_size =
> 4, info_size = 0, actual_key_size = 0, node_size = 0}, n_nodes = 0},
>   amf_init_tmr = {tmr_id = 0x7f8724004020, type = AVD_TMR_CL_INIT,
> node_id = 0, spons_si_name = {length = 0, value = '\000' <repeats 255
> times>}, dep_si_name = {length = 0, value = '\000' <repeats 255
> times>},
>     is_active = false}, heartbeat_tmr = {tmr_id = 0x7f8724006f70, type
> = AVD_TMR_SND_HB, node_id = 0, spons_si_name = {length = 0, value =
> '\000' <repeats 255 times>}, dep_si_name = {length = 0,
>       value = '\000' <repeats 255 times>}, is_active = true},
> heartbeat_tmr_period = 10000000000, nodes_exit_cnt = 15, ntfHandle =
> 4279238657, ext_comp_info = {local_avnd_node = 0x0, ext_comp_hlt_check
> = 0x0},
>   peer_msg_fmt_ver = 4, avd_peer_ver = 6, immOiHandle = 77309480719,
> immOmHandle = 81604448015, imm_sel_obj = 17, is_implementer = true,
> clmHandle = 4285530113, clm_sel_obj = 15, swap_switch = SA_FALSE,
>   active_services_exist = true}
> (gdb) p evt
> $6 = (AVD_EVT *) 0x7f87240080a0
> (gdb) p *evt
> $7 = {next = {next = 0x0}, rcv_evt = AVD_EVT_MDS_QSD_ACK, info =
> {avnd_msg = 0x0, avd_msg = 0x0, node_id = 0, tmr = {tmr_id = 0x0, type
> = AVD_TMR_SND_HB, node_id = 0, spons_si_name = {length = 0,
>         value = '\000' <repeats 255 times>}, dep_si_name = {length =
> 0, value = '\000' <repeats 255 times>}, is_active = false}}}
> 
> 
> Core was generated by `/usr/lib64/opensaf/osafamfd osafamfd'.
> Program terminated with signal 6, Aborted.
> #0  0x0000003e42234bb5 in __GI_raise (sig=<optimized out>) at
> ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> 64   return INLINE_SYSCALL (tgkill, 3, pid, selftid, sig);
> (gdb) bt
> #0  0x0000003e42234bb5 in __GI_raise (sig=<optimized out>) at
> ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> #1  0x0000003e42237d13 in __GI_abort () at abort.c:91
> #2  0x0000003e4361a602 in __osafassert_fail (__file=0x4ac056
> "avd_role.c", __line=575, __func=0x4acca0 <__FUNCTION__.12339>
> "avd_mds_qsd_role_evh", __assertion=0x4ac5e0 "0") at sysf_def.c:301
> #3  0x0000000000440fba in avd_mds_qsd_role_evh (cb=0x6c4d40 <_avd_cb>,
> evt=0x7f87240080a0) at avd_role.c:575
> #4  0x000000000043fd56 in avd_process_event (cb_now=0x6c4d40
> <_avd_cb>, evt=0x7f87240080a0) at avd_proc.c:591
> #5  0x000000000043fab7 in avd_main_proc () at avd_proc.c:507
> #6  0x0000000000409e79 in main (argc=2, argv=0x7fffec8e6648) at
> amfd_main.c:47
> 
> 
> 
> — Assertion in clm —
> 
> (gdb) p nodeAddress
> $1 = (SaClmNodeAddressT *) 0x7f916c005b44
> (gdb) p *nodeAddress
> $2 = {family = (unknown: 0), length = 19365, value =
> "\000\000\000\000\000\000\017\001\001\000\001", '\000' <repeats 52
> times>}
> 
> 
> #0  0x0000003e42234bb5 in __GI_raise (sig=<optimized out>) at
> ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> 64   return INLINE_SYSCALL (tgkill, 3, pid, selftid, sig);
> (gdb) bt
> #0  0x0000003e42234bb5 in __GI_raise (sig=<optimized out>) at
> ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> #1  0x0000003e42237d13 in __GI_abort () at abort.c:91
> #2  0x0000003e4361a602 in __osafassert_fail (__file=0x425d55
> "clms_mds.c", __line=307, __func=0x4263a0 <__FUNCTION__.9929>
> "encodeNodeAddressT", __assertion=0x425e5a "0") at sysf_def.c:301
> #3  0x000000000041dd5e in encodeNodeAddressT (uba=0x7f9173ffe6c8,
> nodeAddress=0x7f916c005b44) at clms_mds.c:307
> #4  0x000000000041de72 in clms_enc_node_get_msg (uba=0x7f9173ffe6c8,
> msg=0x7f916c005b40) at clms_mds.c:332
> #5  0x000000000041e23d in clms_enc_cluster_ntf_buf_msg
> (uba=0x7f9173ffe6c8, notify_info=0x7f9173ffeb88) at clms_mds.c:418
> #6  0x000000000041e57b in clms_enc_track_cbk_msg (uba=0x7f9173ffe6c8,
> msg=0x7f9173ffeb70) at clms_mds.c:533
> #7  0x000000000041ecf7 in clms_mds_enc (info=0x7f9173ffe700) at
> clms_mds.c:724
> #8  0x000000000041f411 in clms_mds_enc_flat (info=0x7f9173ffe700) at
> clms_mds.c:908
> #9  0x000000000041fb0f in clms_mds_callback (info=0x7f9173ffe700) at
> clms_mds.c:1184
> #10 0x0000003e4364e6b7 in mcm_msg_encode_full_or_flat_and_send (to=2
> '\002', to_msg=0x7f9173ffe8c0, to_svc_id=35, svc_cb=0x6307d0,
> adest=299135479218219, dest_vdest_id=65535, snd_type=0, xch_id=0,
>     pri=MDS_SEND_PRIORITY_MEDIUM) at mds_c_sndrcv.c:1417
> #11 0x0000003e4364d96f in mds_mcm_send_msg_enc (to=2 '\002',
> svc_cb=0x6307d0, to_msg=0x7f9173ffe8c0, to_svc_id=35,
> dest_vdest_id=65535, req=0x7f9173ffe980, xch_id=0,
> dest=299135479218219, pri=MDS_SEND_PRIORITY_MEDIUM)
>     at mds_c_sndrcv.c:1084
> #12 0x0000003e4364d6b3 in mcm_pvt_normal_snd_process_common
> (env_hdl=65552, fr_svc_id=34, to_msg=..., to_dest=299135479218219,
> to_svc_id=35, req=0x7f9173ffe980, pri=MDS_SEND_PRIORITY_MEDIUM,
> xch_id=0)
>     at mds_c_sndrcv.c:1033
> #13 0x0000003e4364d1f8 in mcm_pvt_normal_svc_snd (env_hdl=65552,
> fr_svc_id=34, msg=0x7f9173ffeb70, to_dest=299135479218219,
> to_svc_id=35, req=0x7f9173ffe980, pri=MDS_SEND_PRIORITY_MEDIUM) at
> mds_c_sndrcv.c:890
> #14 0x0000003e4364cc8b in mds_mcm_send (info=0x7f9173ffeab0) at
> mds_c_sndrcv.c:675
> #15 0x0000003e4364c2a6 in mds_send (info=0x7f9173ffeab0) at
> mds_c_sndrcv.c:384
> #16 0x0000003e4364bf12 in ncsmds_api (svc_to_mds_info=0x7f9173ffeab0)
> at mds_papi.c:104
> #17 0x00000000004201d7 in clms_mds_msg_send (cb=0x62ac40 <_clms_cb>,
> msg=0x7f9173ffeb70, dest=0x65a668, mds_ctxt=0x0,
> prio=MDS_SEND_PRIORITY_MEDIUM, svc_id=NCSMDS_SVC_ID_CLMA) at
> clms_mds.c:1453
> #18 0x000000000040f499 in clms_prep_and_send_track (cb=0x62ac40
> <_clms_cb>, node=0x654ae0, client=0x65a640,
> step=SA_CLM_CHANGE_COMPLETED, notify=0x7f916c0009a0) at
> clms_imm.c:1064
> #19 0x000000000040e8db in clms_send_track (cb=0x62ac40 <_clms_cb>,
> node=0x654ae0, step=SA_CLM_CHANGE_COMPLETED) at clms_imm.c:835
> #20 0x0000000000409430 in clms_track_send_node_down (node=0x654ae0) at
> clms_evt.c:428
> #21 0x000000000040ca38 in imm_impl_set_node_down_proc (_cb=0x62ac40
> <_clms_cb>) at clms_imm.c:93
> #22 0x0000003e42a07e18 in start_thread (arg=0x7f9173fff700) at
> pthread_create.c:309
> #23 0x0000003e422e88bd in clone () at
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
> On Feb 21, 2014, at 7:35 AM, Neelakanta Reddy
> <[email protected]<mailto:[email protected]>>
> wrote:
> 
> Hi,
> 
> Comments inline.
> 
> /Neel.
> On Friday 21 February 2014 05:45 PM, Tony Hart wrote:
> Hi Neel,
> Thanks for the analysis.  It seems that multiple components in this
> case tripped on the race condition, I assume from your description
> that the fix was only applied to CLM?  Also in this case the node
> didn’t recover despite multiple restarts - does that fit with the
> scenario in ticket 528?
> Apply the patch for CLM and test. If it is reproducible, please share
> the syslogs of both the controllers.
> Is this reproducible? - not sure yet, this is the first time I’ve seen
> this particular crash, but we recently started testing on bigger
> systems and that could be a factor.
> 
> We really need a fix for this - should I open a ticket?
> 
> thanks
> —
> tony
> 
> On Feb 21, 2014, at 5:36 AM, Neelakanta Reddy
> <[email protected]<mailto:[email protected]>>
> wrote:
> 
> Hi,
> 
> The same problem is observed in CLM, and is fixed in
> sourceforge.net/p/opensaf/tickets/528<http://sourceforge.net/p/opensaf/tickets/528>
> .
> It is fixed in changeset 4622 for opensaf-4.3.x.
> 
> For other services, the problem is not yet fixed.
> 
> Can you please confirm whether it is always reproducible?
> 
> /Neel.
> 
> 
> On Friday 21 February 2014 05:30 AM, Tony Hart wrote:
> 4.3.0
> 
> BTW is there a way to tell at runtime what version is installed?
> 
> 
> On Feb 20, 2014, at 4:03 AM, Neelakanta Reddy
> <[email protected]<mailto:[email protected]>>
> wrote:
> 
> Hi,
> 
> Which version of OpenSAF is used? It looks to be an older release.
> 
> /Neel.
> 
> On Wednesday 19 February 2014 08:42 PM, Tony Hart wrote:
> Hi Neel,
> Thanks for the reply, I’ve attached a fuller log (just the osaf
> message) from SCM2,  unfortunately the logs from SCM1 are not
> available.
> 
> —
> tony
> 
> 
> 
> ------------------------------------------------------------------------------
> Managing the Performance of Cloud-Based Applications
> Take advantage of what the Cloud has to offer - Avoid Common
> Pitfalls.
> Read the Whitepaper.
> http://pubads.g.doubleclick.net/gampad/clk?id=121054471&iu=/4140/ostg.clktrk
> _______________________________________________
> Opensaf-users mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/opensaf-users

diff --git a/osaf/services/saf/clmsv/clms/clms_amf.c b/osaf/services/saf/clmsv/clms/clms_amf.c
--- a/osaf/services/saf/clmsv/clms/clms_amf.c
+++ b/osaf/services/saf/clmsv/clms/clms_amf.c
@@ -255,17 +255,24 @@ static void clms_amf_csi_set_callback(Sa
 		role_change = false;
 
 	if (role_change == true) {
-
-		if(clms_cb->ha_state == SA_AMF_HA_ACTIVE)
+		if(clms_cb->ha_state == SA_AMF_HA_ACTIVE) {
 			clms_imm_impl_set(clms_cb);
+			/* Unconditionally refresh IMM for runtime attributes */
+			clms_switchon_all_pending_rtupdates();
+		}
 
 		if ((rc = clms_mds_change_role(clms_cb)) != NCSCC_RC_SUCCESS) {
 			LOG_ER("clms_mds_change_role FAILED");
 			error = SA_AIS_ERR_FAILED_OPERATION;
 		}
+
 		/* Inform MBCSV of HA state change */
 		if (NCSCC_RC_SUCCESS != (error = clms_mbcsv_change_HA_state(clms_cb)))
 			error = SA_AIS_ERR_FAILED_OPERATION;
+
+		/* Clear up any pending rtu updates, the active will take care of it */
+		if (clms_cb->ha_state == SA_AMF_HA_STANDBY)
+			clms_switchoff_all_pending_rtupdates();
 	}
 
  response:
diff --git a/osaf/services/saf/clmsv/clms/clms_cb.h b/osaf/services/saf/clmsv/clms/clms_cb.h
--- a/osaf/services/saf/clmsv/clms/clms_cb.h
+++ b/osaf/services/saf/clmsv/clms/clms_cb.h
@@ -40,6 +40,7 @@ typedef struct cluster_db_t {
 	SaNameT name;
 	SaUint32T num_nodes;
 	SaTimeT init_time;
+	SaBoolT rtu_pending; /* Flag to indicate whether an RTU failed and is pending */
 	/*struct cluster_db_t *next; */	/* Multiple cluster is not supported as of now */
 } CLMS_CLUSTER_INFO;
 
@@ -69,6 +70,8 @@ typedef struct cluster_node_t {
 	ADMIN_OP admin_op;	/*plm or clm operation */
 	timer_t lock_timerid;	/*Timer id for admin lock operation */
 	SaInvocationT plm_invid;	/*plmtrack callback invocation id */
+	SaBoolT rtu_pending; /* Flag to mark whether an IMM RunTime attribute Update is pending and to be retried */
+	SaBoolT admin_rtu_pending; /* Flag to mark whether an IMM RunTime attribute Update is pending and to be retried */
 	struct cluster_node_t *dep_node_list;	/*Dependent nodes list - in case of plm operation */
 	struct cluster_node_t *next;
 } CLMS_CLUSTER_NODE;
@@ -172,6 +175,7 @@ typedef struct clms_cb_t {
 	SaSelectionObjectT plm_sel_obj;	/* PLMSv selection object */
 	SaNtfHandleT ntf_hdl;	/* Handled obtained from NTFSv */
 	SaBoolT reg_with_plm;	/*plm present in system */
+	SaBoolT rtu_pending; /* Global flag to determine a pending RTU update and the poll timeout */
 	CLMA_DOWN_LIST *clma_down_list_head;	/* CLMA down reccords - Fix for Failover missed 
 						   down events Processing */
 	CLMA_DOWN_LIST *clma_down_list_tail;
diff --git a/osaf/services/saf/clmsv/clms/clms_imm.c b/osaf/services/saf/clmsv/clms/clms_imm.c
--- a/osaf/services/saf/clmsv/clms/clms_imm.c
+++ b/osaf/services/saf/clmsv/clms/clms_imm.c
@@ -22,6 +22,7 @@
 extern struct ImmutilWrapperProfile immutilWrapperProfile;
 
 void clms_all_node_rattr_update(void);
+static SaBoolT clms_is_any_rtu_pending(void);
 SaAisErrorT clms_node_ccb_comp_cb(CcbUtilOperationData_t * opdata);
 uint32_t clms_imm_node_unlock(CLMS_CLUSTER_NODE * nodeop);
 uint32_t clms_imm_node_lock(CLMS_CLUSTER_NODE * nodeop);
@@ -32,6 +33,10 @@ static uint32_t clms_lock_send_no_start_
 
 static SaVersionT immVersion = { 'A', 2, 1 };
 
+
+const unsigned int sleep_delay_ms = 500;
+const unsigned int max_waiting_time_ms = 60 * 1000;     /* 60 seconds */
+
 /**
 * Initialize the track response patricia tree for the node
 * @param[in] node node to initialize trackresponse tree
@@ -64,22 +69,49 @@ static void *imm_impl_set_node_down_proc
 	NODE_DOWN_LIST *node_down_rec = NULL;
 	NODE_DOWN_LIST *temp_node_down_rec = NULL;
 	CLMS_CLUSTER_NODE *node = NULL;
+	int msecs_waited;
 
 	TRACE_ENTER();
 
 	/* Update IMM */
-	if ((rc = immutil_saImmOiImplementerSet(cb->immOiHandle, IMPLEMENTER_NAME)) != SA_AIS_OK) {
-		LOG_ER("saImmOiImplementerSet failed rc:%u, exiting", rc);
+
+	msecs_waited = 0;
+	rc = saImmOiImplementerSet(cb->immOiHandle, IMPLEMENTER_NAME);
+	while (((rc == SA_AIS_ERR_TRY_AGAIN) || (rc == SA_AIS_ERR_EXIST)) &&
+						(msecs_waited < max_waiting_time_ms)) {
+		usleep(sleep_delay_ms * 1000);
+		msecs_waited += sleep_delay_ms;
+		rc = saImmOiImplementerSet(cb->immOiHandle, IMPLEMENTER_NAME);
+	}
+	if (rc != SA_AIS_OK) {
+		/* We have tried enough, now just exit */
+		LOG_ER("saImmOiImplementerSet failed, rc = %u", rc);
 		exit(EXIT_FAILURE);
 	}
-
-	if ((rc = immutil_saImmOiClassImplementerSet(cb->immOiHandle, "SaClmNode")) != SA_AIS_OK) {
-		LOG_ER("saImmOiClassImplementerSet failed for class SaClmNode rc:%u, exiting", rc);
+	
+	msecs_waited = 0;
+	rc = saImmOiClassImplementerSet(cb->immOiHandle, "SaClmNode");
+	while (((rc == SA_AIS_ERR_TRY_AGAIN) || (rc == SA_AIS_ERR_EXIST)) &&
+						(msecs_waited < max_waiting_time_ms)) {
+		usleep(sleep_delay_ms * 1000);
+		msecs_waited += sleep_delay_ms;
+		rc = saImmOiClassImplementerSet(cb->immOiHandle, "SaClmNode");
+	}
+	if (rc != SA_AIS_OK) {
+		LOG_ER("saImmOiClassImplementerSet failed for class SaClmNode, rc = %u", rc);
 		exit(EXIT_FAILURE);
 	}
-
-	if ((rc = immutil_saImmOiClassImplementerSet(cb->immOiHandle, "SaClmCluster")) != SA_AIS_OK) {
-		LOG_ER("saImmOiClassImplementerSet failed for class SaClmCluster rc:%u, exiting", rc);
+	
+	msecs_waited = 0;
+	rc = saImmOiClassImplementerSet(cb->immOiHandle, "SaClmCluster");
+	while (((rc == SA_AIS_ERR_TRY_AGAIN) || (rc == SA_AIS_ERR_EXIST)) &&
+						(msecs_waited < max_waiting_time_ms)) {
+		usleep(sleep_delay_ms * 1000);
+		msecs_waited += sleep_delay_ms;
+		rc = saImmOiClassImplementerSet(cb->immOiHandle, "SaClmCluster");
+	}
+	if (rc != SA_AIS_OK) {
+		LOG_ER("saImmOiClassImplementerSet failed for class SaClmCluster, rc = %u,", rc);
 		exit(EXIT_FAILURE);
 	}
 
@@ -175,7 +207,10 @@ CLMS_CLUSTER_NODE *clms_node_new(SaNameT
 	node->node_name.length = name->length;
 	node->node_addr.family = 1;
 	node->admin_state = SA_CLM_ADMIN_UNLOCKED;
+	node->rtu_pending = false;
+	node->admin_rtu_pending = false;
 
+	TRACE("RTU pending flag is switched off");
 	TRACE("nodename %s", node->node_name.value);
 
 	while ((attr = attrs[i++]) != NULL) {
@@ -350,6 +385,9 @@ SaAisErrorT clms_cluster_config_get(void
 		if (clms_cb->ha_state == SA_AMF_HA_ACTIVE) {
 			osaf_cluster->init_time = clms_get_SaTime();
 		}
+
+		osaf_cluster->rtu_pending = false;
+		TRACE("RTU pending flag is switched off");
 	}
 	rc = SA_AIS_OK;
  done2:
@@ -405,8 +443,8 @@ SaAisErrorT clms_imm_activate(CLMS_CB *c
 			goto done;
 		}
 
+		cb->is_impl_set = true;
 		clms_all_node_rattr_update();
-		cb->is_impl_set = true;
 	}
 
 	rc = SA_AIS_OK;
@@ -428,6 +466,13 @@ void clms_admin_state_update_rattr(CLMS_
 
 	TRACE_ENTER2("Admin state %d update for node %s", nd->admin_state, nd->node_name.value);
 
+	CLMS_CLUSTER_NODE *node = NULL;
+	/* If this update was attempted for a node down and as a part of try-again-later, then
+	 * we need to lookup using name, because the node_id record would
+	 * have been deleted as a part of node down processing
+	 */
+	osafassert((node = clms_node_get_by_name(&nd->node_name)));
+
 	SaImmAttrValueT attrUpdateValue[] = { &nd->admin_state };
 	const SaImmAttrModificationT_2 *attrMods[] = {
 		&attr_Mod[0],
@@ -440,13 +485,34 @@ void clms_admin_state_update_rattr(CLMS_
 	attr_Mod[0].modAttr.attrValueType = SA_IMM_ATTR_SAUINT32T;
 	attr_Mod[0].modAttr.attrValues = attrUpdateValue;
 
-	int errorsAreFatal = immutilWrapperProfile.errorsAreFatal;
-	immutilWrapperProfile.errorsAreFatal = 0;
-	rc = immutil_saImmOiRtObjectUpdate_2(clms_cb->immOiHandle, &nd->node_name, attrMods);
-	immutilWrapperProfile.errorsAreFatal = errorsAreFatal;
+	rc = saImmOiRtObjectUpdate_2(clms_cb->immOiHandle, &nd->node_name, attrMods);
+	if (rc == SA_AIS_OK) {
+		node->admin_rtu_pending = false;
+		/* Walk through all nodes to find out if any other rtu is pending
+		 * and accordingly turn off the global flag in cb.
+		 */
+		if (clms_cb->rtu_pending == true) {
+			if (clms_is_any_rtu_pending() == false) {
+				clms_cb->rtu_pending = false;
+				TRACE("RTUpdate success. Turning off flag");
+			}
+		}
+	} else if ((rc == SA_AIS_ERR_TRY_AGAIN) || (rc == SA_AIS_ERR_TIMEOUT)) {
+		LOG_IN("saImmOiRtObjectUpdate for %s failed with rc = %u. Trying again", node->node_name.value, rc);
+		node->admin_rtu_pending = true;
+		clms_cb->rtu_pending = true;
+	} else {
+		/* Right now, there is no guarantee on IMM error codes. So Reinit for everyother error code */
+		LOG_IN("saImmOiRtObjectUpdate for %s failed with rc = %u. Reinit with IMM", node->node_name.value, rc);
+		node->admin_rtu_pending = true;
+		clms_cb->rtu_pending = true;
 
-	if (rc != SA_AIS_OK) {
-		LOG_ER("saImmOiRtObjectUpdate FAILED %u, '%s'", rc, nd->node_name.value);
+		saImmOiFinalize(clms_cb->immOiHandle);
+		clms_cb->immOiHandle = 0;
+		clms_cb->is_impl_set = false;
+
+		/* Initiate IMM reinitializtion in the background */
+		clm_imm_reinit_bg(clms_cb);
 	}
 
 	TRACE_LEAVE();
@@ -473,7 +539,18 @@ void clms_node_update_rattr(CLMS_CLUSTER
 		NULL
 	};
 
+	CLMS_CLUSTER_NODE *node = NULL;
+
 	TRACE_ENTER();
+	osafassert((node = clms_node_get_by_name(&nd->node_name)));
+
+	if (clms_cb->is_impl_set == false) {
+		TRACE("Implementer not yet set: Switching on the tryagain flag");
+		node->rtu_pending = true;
+		clms_cb->rtu_pending = true;
+		TRACE_LEAVE();
+		return;
+	}
 
 	attr_Mod[0].modType = SA_IMM_ATTR_VALUES_REPLACE;
 	attr_Mod[0].modAttr.attrName = "saClmNodeIsMember";
@@ -499,20 +576,41 @@ void clms_node_update_rattr(CLMS_CLUSTER
 	attr_Mod[3].modAttr.attrValueType = SA_IMM_ATTR_SAUINT64T;
 	attr_Mod[3].modAttr.attrValues = attrUpdateValue3;
 
-	int errorsAreFatal = immutilWrapperProfile.errorsAreFatal;
-	immutilWrapperProfile.errorsAreFatal = 0;
-	rc = immutil_saImmOiRtObjectUpdate_2(clms_cb->immOiHandle, &nd->node_name, attrMods);
-	immutilWrapperProfile.errorsAreFatal = errorsAreFatal;
+	rc = saImmOiRtObjectUpdate_2(clms_cb->immOiHandle, &nd->node_name, attrMods);
 
-	if (rc != SA_AIS_OK) {
-		LOG_ER("saImmOiRtObjectUpdate FAILED %u, '%s'", rc, nd->node_name.value);
+	if (rc == SA_AIS_OK) {
+		node->rtu_pending = false;
+		/* Walk through all nodes to find out if any other rtu is pending
+		 * and accordingly turn off the global flag in cb.
+		 */
+		if (clms_cb->rtu_pending == true) {
+			if (clms_is_any_rtu_pending() == false) {
+				clms_cb->rtu_pending = false;
+				TRACE("RTUpdate success. Turning off flag");
+			}
+		}
+	} else if ((rc == SA_AIS_ERR_TRY_AGAIN) || (rc == SA_AIS_ERR_TIMEOUT)) {
+		LOG_IN("saImmOiRtObjectUpdate for %s failed with rc = %u. Trying again", node->node_name.value, rc);
+		node->rtu_pending = true;
+		clms_cb->rtu_pending = true;
+	} else {
+		LOG_IN("saImmOiRtObjectUpdate for %s failed with rc = %u. Reinit with IMM", node->node_name.value, rc);
+		node->rtu_pending = true;
+		clms_cb->rtu_pending = true;
+
+		saImmOiFinalize(clms_cb->immOiHandle);
+		clms_cb->immOiHandle = 0;
+		clms_cb->is_impl_set = false;
+
+		/* Initiate IMM reinitializtion in the background */
+		clm_imm_reinit_bg(clms_cb);
 	}
 
 	TRACE_LEAVE();
 }
 
 /** 
-* Update IMMSv the runtime info of all node 
+* Update IMMSv the runtime info of all nodes 
 */
 void clms_all_node_rattr_update(void)
 {
@@ -528,6 +626,97 @@ void clms_all_node_rattr_update(void)
 }
 
 /** 
+*  Process all pending runtime attribute updates toward IMM
+*/
+void clms_retry_pending_rtupdates(void)
+{
+	CLMS_CLUSTER_NODE *node = NULL;
+	SaNameT nodename = {0};
+	TRACE_ENTER();
+
+	if (clms_cb->is_impl_set == false) {
+		TRACE_LEAVE2("Implementerset yet to happen, try later");
+		return;
+	}
+	for (node = clms_node_getnext_by_name(&nodename); node != NULL; node = clms_node_getnext_by_name(&nodename)) {
+		if (node->rtu_pending == true)
+			clms_node_update_rattr(node);
+		if (node->admin_rtu_pending == true)
+			clms_admin_state_update_rattr(node);
+		memcpy(&nodename, &node->node_name, sizeof(SaNameT));
+	}
+
+	if (osaf_cluster->rtu_pending == true)
+		clms_cluster_update_rattr(osaf_cluster);
+	TRACE_LEAVE();
+}
+
+/** 
+*  As a standby, clear all pending runtime attribute updates toward IMM
+*  The new active will take care of it.
+*/
+void clms_switchoff_all_pending_rtupdates(void)
+{
+	CLMS_CLUSTER_NODE *node = NULL;
+	SaNameT nodename = {0};
+	TRACE_ENTER();
+
+	for (node = clms_node_getnext_by_name(&nodename); node != NULL; node = clms_node_getnext_by_name(&nodename)) {
+		TRACE("Switching on the tryagain flag");
+		node->rtu_pending = false;
+		node->admin_rtu_pending = false;
+		memcpy(&nodename, &node->node_name, sizeof(SaNameT));
+	}
+	osaf_cluster->rtu_pending = false;
+	clms_cb->rtu_pending = false;
+	TRACE_LEAVE();
+}
+
+/** 
+*  As a active, mark runtime attribute updates pending for all nodes.
+*/
+void clms_switchon_all_pending_rtupdates(void)
+{
+	CLMS_CLUSTER_NODE *node = NULL;
+	SaNameT nodename = {0};
+	TRACE_ENTER();
+
+	for (node = clms_node_getnext_by_name(&nodename); node != NULL; node = clms_node_getnext_by_name(&nodename)) {
+		TRACE("Switching on the pending RTUs");
+		node->rtu_pending = true;
+		node->admin_rtu_pending = true;
+		memcpy(&nodename, &node->node_name, sizeof(SaNameT));
+	}
+	osaf_cluster->rtu_pending = true;
+	clms_cb->rtu_pending = true;
+	TRACE_LEAVE();
+}
+
+/** 
+*  Check whether any runtime attribute update toward IMM is still pending,
+*  either for the cluster object or for any node.
+*/
+static SaBoolT clms_is_any_rtu_pending(void)
+{
+	CLMS_CLUSTER_NODE *node = NULL;
+	SaNameT nodename = {0};
+	TRACE_ENTER();
+
+	if (osaf_cluster->rtu_pending == true)
+		return true;
+
+	for (node = clms_node_getnext_by_name(&nodename); node != NULL; node = clms_node_getnext_by_name(&nodename)) {
+		if ((node->rtu_pending == true) || (node->admin_rtu_pending == true)) {
+			TRACE_LEAVE2("There is a pending RTU");
+			return true;
+		}
+		memcpy(&nodename, &node->node_name, sizeof(SaNameT));
+	}
+	TRACE_LEAVE2("There are no pending RTUs");
+	return false;
+}
+
+/** 
 * Update IMMSv with the runtime info of the osaf cluster 
 * @param[in] osaf_cluster pointer to CLM Cluster
 */
@@ -546,6 +735,14 @@ void clms_cluster_update_rattr(CLMS_CLUS
 
 	TRACE_ENTER();
 
+	if (clms_cb->is_impl_set == false) {
+		TRACE("Implementer is not set. Switching on flag in %s", __FUNCTION__);
+		osaf_cluster->rtu_pending = true;
+		clms_cb->rtu_pending = true;
+		TRACE_LEAVE();
+		return;
+	}
+
 	attr_Mod[0].modType = SA_IMM_ATTR_VALUES_REPLACE;
 	attr_Mod[0].modAttr.attrName = "saClmClusterNumNodes";
 	attr_Mod[0].modAttr.attrValuesNumber = 1;
@@ -558,13 +755,31 @@ void clms_cluster_update_rattr(CLMS_CLUS
 	attr_Mod[1].modAttr.attrValueType = SA_IMM_ATTR_SATIMET;
 	attr_Mod[1].modAttr.attrValues = attrUpdateValue1;
 
-	int errorsAreFatal = immutilWrapperProfile.errorsAreFatal;
-	immutilWrapperProfile.errorsAreFatal = 0;
-	rc = immutil_saImmOiRtObjectUpdate_2(clms_cb->immOiHandle, &osaf_cluster->name, attrMods);
-	immutilWrapperProfile.errorsAreFatal = errorsAreFatal;
+	rc = saImmOiRtObjectUpdate_2(clms_cb->immOiHandle, &osaf_cluster->name, attrMods);
 
-	if (rc != SA_AIS_OK) {
-		LOG_ER("saImmOiRtObjectUpdate FAILED %u, '%s'", rc, osaf_cluster->name.value);
+	if (rc == SA_AIS_OK){
+		osaf_cluster->rtu_pending = false;
+		if (clms_cb->rtu_pending == true) {
+			if (clms_is_any_rtu_pending() == false) {
+				clms_cb->rtu_pending = false;
+				TRACE("RTU success, Switching off");
+			}
+		}
+	} else if ((rc == SA_AIS_ERR_TRY_AGAIN) || (rc == SA_AIS_ERR_TIMEOUT)) {
+		LOG_IN("saImmOiRtObjectUpdate failed for cluster object with rc = %u. Trying again", rc);
+		osaf_cluster->rtu_pending = true;
+		clms_cb->rtu_pending = true;
+	} else {
+		LOG_IN("saImmOiRtObjectUpdate failed for cluster object with rc = %u. Reinit with IMM", rc);
+		osaf_cluster->rtu_pending = true;
+		clms_cb->rtu_pending = true;
+
+		saImmOiFinalize(clms_cb->immOiHandle);
+		clms_cb->immOiHandle = 0;
+		clms_cb->is_impl_set = false;
+
+		/* Initiate IMM reinitializtion in the background */
+		clm_imm_reinit_bg(clms_cb);
 	}
 
 	/* TBD: We need to handle a case where there's only one node,
diff --git a/osaf/services/saf/clmsv/clms/clms_imm.h b/osaf/services/saf/clmsv/clms/clms_imm.h
--- a/osaf/services/saf/clmsv/clms/clms_imm.h
+++ b/osaf/services/saf/clmsv/clms/clms_imm.h
@@ -45,4 +45,7 @@ extern uint32_t clms_prep_and_send_track
 extern uint32_t clms_send_track_local(CLMS_CLUSTER_NODE * node, CLMS_CLIENT_INFO * client, 
 				     SaClmChangeStepT step);
 extern void clms_trackresp_patricia_init(CLMS_CLUSTER_NODE * node);
+extern void clms_switchoff_all_pending_rtupdates(void);
+extern void clms_switchon_all_pending_rtupdates(void);
+extern void clms_retry_pending_rtupdates(void);
 #endif
diff --git a/osaf/services/saf/clmsv/clms/clms_main.c b/osaf/services/saf/clmsv/clms/clms_main.c
--- a/osaf/services/saf/clmsv/clms/clms_main.c
+++ b/osaf/services/saf/clmsv/clms/clms_main.c
@@ -227,6 +227,7 @@ uint32_t clms_cb_init(CLMS_CB * clms_cb)
 	clms_cb->curr_invid = 1;
 	clms_cb->immOiHandle = 0;
 	clms_cb->is_impl_set = false;
+	clms_cb->rtu_pending = false; /* Flag to control try-again of rt-updates */
 
 	/* Assign Version. Currently, hardcoded, This will change later */
 	clms_cb->clm_ver.releaseCode = CLM_RELEASE_CODE;
@@ -378,6 +379,7 @@ int main(int argc, char *argv[])
 	SaAisErrorT error = SA_AIS_OK;
 	uint32_t rc;
 	osaf_cluster = NULL;
+	int timeout = -1;
 
 	daemonize(argc, argv);
 
@@ -405,6 +407,13 @@ int main(int argc, char *argv[])
 
 	while (1) {
 
+		if (clms_cb->rtu_pending == true) {
+			TRACE("There is an IMM task to be tried again. setting poll time out to 500");
+			timeout = 500;
+		}else {
+			timeout = -1;
+		}
+
 		if ((clms_cb->immOiHandle != 0) && (clms_cb->is_impl_set == true)) {
 			fds[FD_IMM].fd = clms_cb->imm_sel_obj;
 			fds[FD_IMM].events = POLLIN;
@@ -412,8 +421,7 @@ int main(int argc, char *argv[])
 		} else {
 			nfds = NUM_FD - 1;
 		}
-
-		int ret = poll(fds, nfds, -1);
+		int ret = poll(fds, nfds, timeout);
 
 		if (ret == -1) {
 			if (errno == EINTR)
@@ -422,6 +430,14 @@ int main(int argc, char *argv[])
 			LOG_ER("poll failed - %s", strerror(errno));
 			break;
 		}
+
+		if (ret == 0) {
+			/* Process any/all pending RTAttribute updates to IMM */
+			TRACE("poll time out processing pending updates");
+			clms_retry_pending_rtupdates();
+			continue;
+		}
+
 		if (fds[FD_AMF].revents & POLLIN) {
 			if (clms_cb->amf_hdl != 0) {
 				if ((error = saAmfDispatch(clms_cb->amf_hdl, SA_DISPATCH_ALL)) != SA_AIS_OK) {
@@ -496,7 +512,10 @@ int main(int argc, char *argv[])
 				}
 			}
 		}
-	}
+		/* Retry any pending updates */
+		if (clms_cb->rtu_pending == true)
+			clms_retry_pending_rtupdates();
+	} /* End while (1) */
 
  done:
 	LOG_ER("Failed, exiting...");

------------------------------------------------------------------------------
Managing the Performance of Cloud-Based Applications
Take advantage of what the Cloud has to offer - Avoid Common Pitfalls.
Read the Whitepaper.
http://pubads.g.doubleclick.net/gampad/clk?id=121054471&iu=/4140/ostg.clktrk

_______________________________________________
Opensaf-users mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-users

Re: [users] Two nodes got into an unrecoverable bad state

Reply via email to