Re: [Openais] [PATCH] fix delayed shutdown

2009-05-18 Thread David Teigland
On Mon, May 18, 2009 at 01:44:50PM +0100, Chrissie Caulfield wrote:
> Steven Dake wrote:
> > I don't think this will be backwards compatible with whitetank.  IMO use
> > the memb_join_message_send function as outlined.  If you can show it
> > works with whitetank then looks good for commit.
> > 
> 
> OK, here's a new patch that doesn't create a new message type. The
> reason I had that in before was due to another bug I hadn't spotted :S
> 
> I've tested this against whitetank and it works fine.

Thanks, this works nicely.
Dave



Re: [Openais] [PATCH] fix delayed shutdown

2009-05-18 Thread Steven Dake
Great work, good for merge.

regards
-steve

On Mon, 2009-05-18 at 13:44 +0100, Chrissie Caulfield wrote:
> Steven Dake wrote:
> > I don't think this will be backwards compatible with whitetank.  IMO use
> > the memb_join_message_send function as outlined.  If you can show it
> > works with whitetank then looks good for commit.
> > 
> 
> OK, here's a new patch that doesn't create a new message type. The
> reason I had that in before was due to another bug I hadn't spotted :S
> 
> I've tested this against whitetank and it works fine.
> 
> Chrissie



Re: [Openais] [PATCH] fix delayed shutdown

2009-05-18 Thread Chrissie Caulfield
Steven Dake wrote:
> I don't think this will be backwards compatible with whitetank.  IMO use
> the memb_join_message_send function as outlined.  If you can show it
> works with whitetank then looks good for commit.
> 

OK, here's a new patch that doesn't create a new message type. The
reason I had that in before was due to another bug I hadn't spotted :S

I've tested this against whitetank and it works fine.

Chrissie
Index: services/cfg.c
===================================================================
--- services/cfg.c	(revision 2183)
+++ services/cfg.c	(working copy)
@@ -621,7 +621,7 @@
 
 	log_printf(LOGSYS_LEVEL_NOTICE, "Node %d was shut down by sysadmin\n", nodeid);
 	if (nodeid == api->totem_nodeid_get()) {
-		corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT);
+		api->request_shutdown();
 	}
 	LEAVE();
 }
Index: exec/totemnet.c
===================================================================
--- exec/totemnet.c	(revision 2183)
+++ exec/totemnet.c	(working copy)
@@ -1113,6 +1113,20 @@
 
 	worker_thread_group_exit (&instance->worker_thread_group);
 
+	if (instance->totemnet_sockets.mcast_recv > 0) {
+		close (instance->totemnet_sockets.mcast_recv);
+	 	poll_dispatch_delete (instance->totemnet_poll_handle,
+			instance->totemnet_sockets.mcast_recv);
+	}
+	if (instance->totemnet_sockets.mcast_send > 0) {
+		close (instance->totemnet_sockets.mcast_send);
+	}
+	if (instance->totemnet_sockets.token > 0) {
+		close (instance->totemnet_sockets.token);
+		poll_dispatch_delete (instance->totemnet_poll_handle,
+			instance->totemnet_sockets.token);
+	}
+
 	hdb_handle_put (&totemnet_instance_database, handle);
 
 error_exit:
Index: exec/totemsrp.c
===================================================================
--- exec/totemsrp.c	(revision 2183)
+++ exec/totemsrp.c	(working copy)
@@ -91,6 +91,7 @@
 #define MAXIOVS	5
 #define RETRANSMIT_ENTRIES_MAX			30
 #define TOKEN_SIZE_MAX			64000 /* bytes */
+#define LEAVE_DUMMY_NODEID  0
 
 /*
  * Rollover handling:
@@ -560,6 +561,8 @@
 
 static int srp_addr_equal (const struct srp_addr *a, const struct srp_addr *b);
 
+static void memb_leave_message_send (struct totemsrp_instance *instance);
+
 static void memb_ring_id_create_or_load (struct totemsrp_instance *, struct memb_ring_id *);
 
 static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type);
@@ -872,6 +875,7 @@
 	if (res != 0) {
 		return;
 	}
+	memb_leave_message_send (instance);
 
 	hdb_handle_put (&totemsrp_instance_database, handle);
 }
@@ -1096,6 +1100,9 @@
 	int found = 0;
 	int i;
 
+	if (addr->addr[0].nodeid == LEAVE_DUMMY_NODEID)
+		return;
+
 	for (i = 0; i < instance->consensus_list_entries; i++) {
 		if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) {
 			found = 1;
@@ -2873,6 +2880,75 @@
 		addr_idx);
 }
 
+static void memb_leave_message_send (struct totemsrp_instance *instance)
+{
+	char memb_join_data[40000];
+	struct memb_join *memb_join = (struct memb_join *)memb_join_data;
+	char *addr;
+	unsigned int addr_idx;
+	int active_memb_entries;
+	struct srp_addr active_memb[PROCESSOR_COUNT_MAX];
+
+	log_printf (instance->totemsrp_log_level_debug,
+		"sending join/leave message\n");
+
+	/*
+	 * add us to the failed list, and remove us from
+	 * the members list
+	 */
+	memb_set_merge(
+		   &instance->my_id, 1,
+		   instance->my_failed_list, &instance->my_failed_list_entries);
+
+	memb_set_subtract (active_memb, &active_memb_entries,
+			   instance->my_proc_list, instance->my_proc_list_entries,
+			   &instance->my_id, 1);
+
+
+	memb_join->header.type = MESSAGE_TYPE_MEMB_JOIN;
+	memb_join->header.endian_detector = ENDIAN_LOCAL;
+	memb_join->header.encapsulated = 0;
+	memb_join->header.nodeid = LEAVE_DUMMY_NODEID;
+
+	memb_join->ring_seq = instance->my_ring_id.seq;
+	memb_join->proc_list_entries = active_memb_entries;
+	memb_join->failed_list_entries = instance->my_failed_list_entries;
+	srp_addr_copy (&memb_join->system_from, &instance->my_id);
+	memb_join->system_from.addr[0].nodeid = LEAVE_DUMMY_NODEID;
+
+	// TODO: CC Maybe use the actual join send routine.
+	/*
+	 * This mess adds the joined and failed processor lists into the join
+	 * message
+	 */
+	addr = (char *)memb_join;
+	addr_idx = sizeof (struct memb_join);
+	memcpy (&addr[addr_idx],
+		active_memb,
+		active_memb_entries *
+			sizeof (struct srp_addr));
+	addr_idx += 
+		active_memb_entries *
+		sizeof (struct srp_addr);
+	memcpy (&addr[addr_idx],
+		instance->my_failed_list,
+		instance->my_failed_list_entries *
+		sizeof (struct srp_addr));
+	addr_idx += 
+		instance->my_failed_list_entries *
+		sizeof (struct srp_addr);
+
+
+	if (instance->totem_config->send_join_timeout) {
+		usleep (random() % (instance->totem_config->send_join_timeout * 1000));
+	}
+
+	totemrrp_mcast_flush_send (
+		instance->totemrrp_handle,
+		memb_join,
+		addr_idx);
+}
+
 static void memb_merge_detect_transmit (struct totemsrp_instance *instance)
 {
 	st
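
The core trick in memb_leave_message_send() above is easiest to see in isolation. The sketch below is a simplified, self-contained model, not the real corosync types (srp_addr and memb_join are reduced here to the fields the trick depends on): the leaving node sends an ordinary join message whose system_from nodeid is the reserved LEAVE_DUMMY_NODEID (0), with itself subtracted from the proc list and added to the failed list. A patched receiver skips consensus bookkeeping for nodeid 0, while an unpatched whitetank receiver simply processes a join that reports the sender as failed, which is why the scheme stays backward compatible.

#include <stdio.h>
#include <string.h>

#define LEAVE_DUMMY_NODEID 0

struct srp_addr {		/* toy stand-in for totemsrp's srp_addr */
	unsigned int nodeid;
};

struct memb_join {		/* toy stand-in; the real header has more fields */
	struct srp_addr system_from;
	unsigned int proc_list_entries;
	unsigned int failed_list_entries;
	/* followed on the wire by proc_list[] then failed_list[] */
};

/* Serialize the header plus both address lists into buf; returns bytes used. */
static size_t leave_message_build (char *buf,
	const struct srp_addr *proc, unsigned int proc_n,
	const struct srp_addr *failed, unsigned int failed_n)
{
	struct memb_join *memb_join = (struct memb_join *)buf;
	size_t addr_idx = sizeof (struct memb_join);

	memb_join->system_from.nodeid = LEAVE_DUMMY_NODEID;
	memb_join->proc_list_entries = proc_n;
	memb_join->failed_list_entries = failed_n;
	memcpy (buf + addr_idx, proc, proc_n * sizeof (struct srp_addr));
	addr_idx += proc_n * sizeof (struct srp_addr);
	memcpy (buf + addr_idx, failed, failed_n * sizeof (struct srp_addr));
	addr_idx += failed_n * sizeof (struct srp_addr);
	return addr_idx;
}

int main (void)
{
	/* node 3 leaves a ring of {1,2,3}: proc list minus self, self failed */
	struct srp_addr proc[] = { {1}, {2} };
	struct srp_addr failed[] = { {3} };
	char buf[1024];
	size_t len = leave_message_build (buf, proc, 2, failed, 1);
	const struct memb_join *rx = (const struct memb_join *)buf;

	/* receiver-side guard, as in the patched memb_consensus_set() */
	if (rx->system_from.nodeid == LEAVE_DUMMY_NODEID)
		printf ("leave message (%zu bytes): skip consensus update\n", len);
	return 0;
}

One small observation on the totemnet.c hunk: the sockets are close()d before poll_dispatch_delete() removes them from the poll loop; deleting the dispatch entry first would arguably be the more defensive ordering, though it makes little practical difference when the whole instance is being finalized anyway.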

Re: [Openais] [PATCH] fix delayed shutdown

2009-05-14 Thread Steven Dake
I don't think this will be backwards compatible with whitetank.  IMO use
the memb_join_message_send function as outlined.  If you can show it
works with whitetank then looks good for commit.

Regards
-steve

On Thu, 2009-05-14 at 14:50 +0100, Chrissie Caulfield wrote:
> Steven Dake wrote:
> > On Thu, 2009-04-16 at 08:17 +0100, Chrissie Caulfield wrote:
> >> Fabio M. Di Nitto wrote:
> >>> On Wed, 2009-04-15 at 14:35 -0500, David Teigland wrote:
> >>>> If I run 'cman_tool leave' on four nodes in parallel, node1 will leave
> >>>> right away, but the other three nodes don't leave until the token timeout
> >>>> expires for node1, causing a confchg for it, after which the other three
> >>>> all leave right away.  This has only been annoying me recently, so I think
> >>>> it must have been some recent change.
> >>> I have seen this in the form of:
> >>>
> >>> all nodes - 1 leave right away
> >>> the lonely node takes a while
> >>>
> >>> but this is triggered by "cman_tool leave remove" as invoked in the init
> >>> script.
> >>>
> >>> Agreed it's annoying.
> >>>
> >> It's a consequence of how the messaging works. Shutdown sends a message
> >> to all the other nodes telling them that it is going down. When that
> >> message is received back by the node itself, it actually shuts down.
> >>
> >> If you shut down a lot of nodes at the same time then one will go down
> >> right away as you observed, because the ring is fully active. Once
> >> that's happened the others will all wait for the token timeout for that
> >> node, before the rest of their messages can be delivered. And that
> >> happens for each node that is shut down.
> >>
> >> I suppose the solution is to hook the shutdown mechanism into totem
> >> somehow so that it can remove the node as soon as it receives the
> >> shutdown message and not have to wait for the token to time out too (try
> >> saying that with your teeth out!). I'm not sure how hard/politically
> >> incorrect this is though...
> >>
> >>
> > 
> > I think it's a great idea if you are willing to work out a patch.
> > 
> > totempg_finalize already exists and calls down through the stack to
> > totemsrp_finalize.  totemsrp_finalize is a noop atm.  totemnet needs a
> > finalize call which removes file descriptors from the poll mechanism and
> > shuts down those file descriptors.
> > 
> > A new abi is needed for orderly shutdown in the coroapi (which your
> > shutdown mechanism would call).  It should be linked to corosync_exit()
> > which is already implemented.
> > 
> > The trick is what to do in totemsrp_finalize() to trigger a new
> > membership phase.  One possibility is to create a function similar to
> > memb_join_message_send() except that the memb_join.proc_list_entries and
> > memb_join.failed_list_entries is equal to the current lists minus the
> > instance->my_id.  You can use memb_set_subtract to generate a new proc
> > and failed list.  memb_join.system_from would have to be some dummy
> > initialized node id (0 since it isn't allowed as a node id) and
> > memb_consensus_set() would have to check for that node id and then do no
> > operation if node id is 0.  After sending the message, call
> > totemnet_finalize().
> > 
> > I can't guarantee this will work, but it's worth investigating and would
> > be a nice improvement.  If you want to add this, please hurry as we are
> > approaching freeze.
> > 
> 
> OK, here's my first attempt at a patch to speed up shutdown using a
> variant of Steve's suggested method:
> 
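
Steve's recipe above leans on the membership set helpers, in particular memb_set_subtract(), which the final patch earlier in this thread calls with the same argument order used below. As a behavioral sketch only (a toy address type, not the struct srp_addr version that lives in totemsrp.c), "subtract myself from the current proc list" looks like this:

#include <stdio.h>

struct addr {			/* toy stand-in for struct srp_addr */
	unsigned int nodeid;
};

/* out = full minus subtract; mirrors how memb_set_subtract() is used */
static void set_subtract (struct addr *out, int *out_entries,
	const struct addr *full, int full_entries,
	const struct addr *subtract, int subtract_entries)
{
	int i, j, found;

	*out_entries = 0;
	for (i = 0; i < full_entries; i++) {
		found = 0;
		for (j = 0; j < subtract_entries; j++) {
			if (full[i].nodeid == subtract[j].nodeid) {
				found = 1;
				break;
			}
		}
		if (!found)
			out[(*out_entries)++] = full[i];
	}
}

int main (void)
{
	/* the leave message's proc list: current members minus my_id */
	struct addr members[] = { {1}, {2}, {3} };
	struct addr me[] = { {2} };
	struct addr active[3];
	int active_entries, i;

	set_subtract (active, &active_entries, members, 3, me, 1);
	for (i = 0; i < active_entries; i++)
		printf ("still active: node %u\n", active[i].nodeid);
	return 0;
}

The dummy node id 0 does double duty: it keeps memb_consensus_set() from tracking a node that no longer exists, and because 0 is never a valid node id, peers cannot confuse the leave message with a join from a real member.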



[Openais] [PATCH] fix delayed shutdown

2009-05-14 Thread Chrissie Caulfield
Steven Dake wrote:
> On Thu, 2009-04-16 at 08:17 +0100, Chrissie Caulfield wrote:
>> Fabio M. Di Nitto wrote:
>>> On Wed, 2009-04-15 at 14:35 -0500, David Teigland wrote:
>>>> If I run 'cman_tool leave' on four nodes in parallel, node1 will leave
>>>> right away, but the other three nodes don't leave until the token timeout
>>>> expires for node1, causing a confchg for it, after which the other three
>>>> all leave right away.  This has only been annoying me recently, so I think
>>>> it must have been some recent change.
>>> I have seen this in the form of:
>>>
>>> all nodes - 1 leave right away
>>> the lonely node takes a while
>>>
>>> but this is triggered by "cman_tool leave remove" as invoked in the init
>>> script.
>>>
>>> Agreed it's annoying.
>>>
>> It's a consequence of how the messaging works. Shutdown sends a message
>> to all the other nodes telling them that it is going down. When that
>> message is received back by the node itself, it actually shuts down.
>>
>> If you shut down a lot of nodes at the same time then one will go down
>> right away as you observed, because the ring is fully active. Once
>> that's happened the others will all wait for the token timeout for that
>> node, before the rest of their messages can be delivered. And that
>> happens for each node that is shut down.
>>
>> I suppose the solution is to hook the shutdown mechanism into totem
>> somehow so that it can remove the node as soon as it receives the
>> shutdown message and not have to wait for the token to time out too (try
>> saying that with your teeth out!). I'm not sure how hard/politically
>> incorrect this is though...
>>
>>
> 
> I think it's a great idea if you are willing to work out a patch.
> 
> totempg_finalize already exists and calls down through the stack to
> totemsrp_finalize.  totemsrp_finalize is a noop atm.  totemnet needs a
> finalize call which removes file descriptors from the poll mechanism and
> shuts down those file descriptors.
> 
> A new abi is needed for orderly shutdown in the coroapi (which your
> shutdown mechanism would call).  It should be linked to corosync_exit()
> which is already implemented.
> 
> The trick is what to do in totemsrp_finalize() to trigger a new
> membership phase.  One possibility is to create a function similar to
> memb_join_message_send() except that the memb_join.proc_list_entries and
> memb_join.failed_list_entries is equal to the current lists minus the
> instance->my_id.  You can use memb_set_subtract to generate a new proc
> and failed list.  memb_join.system_from would have to be some dummy
> initialized node id (0 since it isn't allowed as a node id) and
> memb_consensus_set() would have to check for that node id and then do no
> operation if node id is 0.  After sending the message, call
> totemnet_finalize().
> 
> I can't guarantee this will work, but it's worth investigating and would
> be a nice improvement.  If you want to add this, please hurry as we are
> approaching freeze.
> 

OK, here's my first attempt at a patch to speed up shutdown using a
variant of Steve's suggested method:

-- 

Chrissie
Index: services/cfg.c
===================================================================
--- services/cfg.c	(revision 2183)
+++ services/cfg.c	(working copy)
@@ -621,7 +621,7 @@
 
 	log_printf(LOGSYS_LEVEL_NOTICE, "Node %d was shut down by sysadmin\n", nodeid);
 	if (nodeid == api->totem_nodeid_get()) {
-		corosync_fatal_error(COROSYNC_FATAL_ERROR_EXIT);
+		api->request_shutdown();
 	}
 	LEAVE();
 }
Index: exec/totemnet.c
===================================================================
--- exec/totemnet.c	(revision 2183)
+++ exec/totemnet.c	(working copy)
@@ -1113,6 +1113,20 @@
 
 	worker_thread_group_exit (&instance->worker_thread_group);
 
+	if (instance->totemnet_sockets.mcast_recv > 0) {
+		close (instance->totemnet_sockets.mcast_recv);
+	 	poll_dispatch_delete (instance->totemnet_poll_handle,
+			instance->totemnet_sockets.mcast_recv);
+	}
+	if (instance->totemnet_sockets.mcast_send > 0) {
+		close (instance->totemnet_sockets.mcast_send);
+	}
+	if (instance->totemnet_sockets.token > 0) {
+		close (instance->totemnet_sockets.token);
+		poll_dispatch_delete (instance->totemnet_poll_handle,
+			instance->totemnet_sockets.token);
+	}
+
 	hdb_handle_put (&totemnet_instance_database, handle);
 
 error_exit:
Index: exec/totemsrp.c
===================================================================
--- exec/totemsrp.c	(revision 2183)
+++ exec/totemsrp.c	(working copy)
@@ -91,6 +91,7 @@
 #define MAXIOVS	5
 #define RETRANSMIT_ENTRIES_MAX			30
 #define TOKEN_SIZE_MAX			64000 /* bytes */
+#define LEAVE_DUMMY_NODEID  0
 
 /*
  * Rollover handling:
@@ -132,12 +133,13 @@
 #define ENDIAN_LOCAL	 0xff22
 
 enum message_type {
-	MESSAGE_TYPE_ORF_TOKEN = 0,		/* Ordering, Reliability, Flow (ORF) control Token */
+MESSAGE_TYPE_ORF_TOKEN = 0,		/* Ordering, Reliability, Flow (ORF) control Token */
 	MESSAGE_TYPE_MCAST = 1,			/* ring or