Hi,

"Andrew Beekhof" <[EMAIL PROTECTED]> writes:

> On Wed, Apr 16, 2008 at 1:31 PM, HIDEO YAMAUCHI
> <[EMAIL PROTECTED]> wrote:
>> Hi Andrew,
>>
>>
>>  > I asked for the right function but the wrong frame number - I should
>>  > have asked for frame 2.  Sorry :(
>>
>>  (gdb) frame 2
>>  #2  0x0000000000416c74 in stop_recurring_action_by_rsc (key=0x755f60, 
>> value=0x755f40,
>>  user_data=0x545a10) at lrm.c:1442
>>  1442            if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) {
>>  (gdb) print *rsc
>>  Variable "rsc" is not available.
>>  (gdb) print *op
>>  No symbol "op" in current context.
>>
>>  Is what or my operation a mistake?
>
> Looks like gcc is being too clever for it's own good (by optimizing
> away some of the variables) :-(
>
> Can you try the following patch please?
>
> diff -r be12cb83cd2d crmd/lrm.c
> --- a/crmd/lrm.c      Wed Apr 16 10:46:59 2008 +0200
> +++ b/crmd/lrm.c      Wed Apr 16 15:02:16 2008 +0200
> @@ -1451,7 +1451,7 @@ stop_recurring_action_by_rsc(gpointer ke
>  {
>       lrm_rsc_t *rsc = user_data;
>       struct recurring_op_s *op = (struct recurring_op_s*)value;
> -     
> +     crm_info("op->rsc=%s (%p), rsc=%s (%p)", crm_str(op->rsc_id),
> op->rsc_id, crm_str(rsc->id), rsc->id);
>       if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) {
>               cancel_op(rsc, key, op->call_id, FALSE);
>       }



I think I found the cause of this issue.
I attached the additional log with your patch (a bit different though)
and the stacktrace.

Here's my observation:

 - An element of pending_ops is removed at lrm.c:L497
 - It is called inside from g_has_table_foreach() at L1475
 - This is violating the usage of g_has_table_foreach() according
   to the glib manual.
 - Therefore the iteration can not proceed correctly and would
   try to refer to a removed element.

http://hg.linux-ha.org/lha-2.1/annotate/333aef5bd4ed/crm/crmd/lrm.c
(...)
946             /* not doing this will block the node from shutting down */
947             g_hash_table_remove(pending_ops, key);
(...)
1475            g_hash_table_foreach(pending_ops, stop_recurring_action_by_rsc, 
rsc);

http://library.gnome.org/devel/glib/stable/glib-Hash-Tables.html#g-hash-table-foreach
(...)
The hash table may not be modified while iterating over it (you can't 
add/remove items).


I also attached my suggested patch, although I can not guarantee
the correctness but just to show you the idea.

Thanks,

-- 
Keisuke MORI
NTT DATA Intellilink Corporation

Attachment: ms-additional-log-20080422.tar.gz
Description: ms-additional-log-20080422.tar.gz

diff -r 333aef5bd4ed -r 36c0fd90691d crm/crmd/lrm.c
--- a/crm/crmd/lrm.c	Thu Apr 17 18:55:57 2008 +0200
+++ b/crm/crmd/lrm.c	Tue Apr 22 17:48:47 2008 +0900
@@ -943,8 +943,9 @@ cancel_op(lrm_rsc_t *rsc, const char *ke
 		if(key && remove) {
 			delete_op_entry(NULL, rsc->id, key, op);
 		}
+		/* return FALSE to be removed from pending_ops */
 		/* not doing this will block the node from shutting down */
-		g_hash_table_remove(pending_ops, key);
+		return FALSE;
 	}
 	
 	return TRUE;
@@ -954,15 +955,20 @@ gboolean cancel_done = FALSE;
 gboolean cancel_done = FALSE;
 lrm_rsc_t *cancel_rsc = NULL;
 
-static void
+static gboolean
 cancel_action_by_key(gpointer key, gpointer value, gpointer user_data)
 {
 	struct recurring_op_s *op = (struct recurring_op_s*)value;
 	
 	if(safe_str_eq(op->op_key, cancel_key)) {
 		cancel_done = TRUE;
-		cancel_op(cancel_rsc, key, op->call_id, TRUE);
-	}
+		if (!cancel_op(cancel_rsc, key, op->call_id, TRUE)) {
+			/* return TRUE to be removed from pending_ops */
+			/* when the cancellation failed */
+			return TRUE;
+		}
+	}
+	return FALSE;
 }
 
 static gboolean
@@ -976,7 +982,7 @@ cancel_op_key(lrm_rsc_t *rsc, const char
 
 	CRM_CHECK(key != NULL, return FALSE);
 	
-	g_hash_table_foreach(pending_ops, cancel_action_by_key, NULL);
+	g_hash_table_foreach_remove(pending_ops, cancel_action_by_key, NULL);
 
 	if(cancel_done == FALSE && remove) {
 		crm_err("No known %s operation to cancel", key);
@@ -1433,15 +1439,21 @@ send_direct_ack(const char *to_host, con
 	free_xml(update);
 }
 
-static void
+static gboolean
 stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data)
 {
 	lrm_rsc_t *rsc = user_data;
 	struct recurring_op_s *op = (struct recurring_op_s*)value;
 	
 	if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) {
-		cancel_op(rsc, key, op->call_id, FALSE);
-	}
+		if (!cancel_op(rsc, key, op->call_id, FALSE)) {
+			/* return TRUE to be removed from pending_ops */
+			/* when the cancellation failed */
+			return TRUE;
+		}
+	}
+
+	return FALSE;
 }
 
 void
@@ -1472,7 +1484,7 @@ do_lrm_rsc_op(lrm_rsc_t *rsc, const char
 	   || crm_str_eq(operation, CRMD_ACTION_DEMOTE, TRUE)
 	   || crm_str_eq(operation, CRMD_ACTION_PROMOTE, TRUE)
 	   || crm_str_eq(operation, CRMD_ACTION_MIGRATE, TRUE)) {
-		g_hash_table_foreach(pending_ops, stop_recurring_action_by_rsc, rsc);
+		g_hash_table_foreach_remove(pending_ops, stop_recurring_action_by_rsc, rsc);
 	}
 	
 	/* now do the op */
_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

Reply via email to