Hi, "Andrew Beekhof" <[EMAIL PROTECTED]> writes:
> On Wed, Apr 16, 2008 at 1:31 PM, HIDEO YAMAUCHI > <[EMAIL PROTECTED]> wrote: >> Hi Andrew, >> >> >> > I asked for the right function but the wrong frame number - I should >> > have asked for frame 2. Sorry :( >> >> (gdb) frame 2 >> #2 0x0000000000416c74 in stop_recurring_action_by_rsc (key=0x755f60, >> value=0x755f40, >> user_data=0x545a10) at lrm.c:1442 >> 1442 if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) { >> (gdb) print *rsc >> Variable "rsc" is not available. >> (gdb) print *op >> No symbol "op" in current context. >> >> Is what or my operation a mistake? > > Looks like gcc is being too clever for it's own good (by optimizing > away some of the variables) :-( > > Can you try the following patch please? > > diff -r be12cb83cd2d crmd/lrm.c > --- a/crmd/lrm.c Wed Apr 16 10:46:59 2008 +0200 > +++ b/crmd/lrm.c Wed Apr 16 15:02:16 2008 +0200 > @@ -1451,7 +1451,7 @@ stop_recurring_action_by_rsc(gpointer ke > { > lrm_rsc_t *rsc = user_data; > struct recurring_op_s *op = (struct recurring_op_s*)value; > - > + crm_info("op->rsc=%s (%p), rsc=%s (%p)", crm_str(op->rsc_id), > op->rsc_id, crm_str(rsc->id), rsc->id); > if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) { > cancel_op(rsc, key, op->call_id, FALSE); > } I think I found the cause of this issue. I attached the additional log with your patch (a bit different though) and the stacktrace. Here's my observation: - An element of pending_ops is removed at lrm.c:L497 - It is called inside from g_has_table_foreach() at L1475 - This is violating the usage of g_has_table_foreach() according to the glib manual. - Therefore the iteration can not proceed correctly and would try to refer to a removed element. http://hg.linux-ha.org/lha-2.1/annotate/333aef5bd4ed/crm/crmd/lrm.c (...) 946 /* not doing this will block the node from shutting down */ 947 g_hash_table_remove(pending_ops, key); (...) 1475 g_hash_table_foreach(pending_ops, stop_recurring_action_by_rsc, rsc); http://library.gnome.org/devel/glib/stable/glib-Hash-Tables.html#g-hash-table-foreach (...) The hash table may not be modified while iterating over it (you can't add/remove items). I also attached my suggested patch, although I can not guarantee the correctness but just to show you the idea. Thanks, -- Keisuke MORI NTT DATA Intellilink Corporation
ms-additional-log-20080422.tar.gz
Description: ms-additional-log-20080422.tar.gz
diff -r 333aef5bd4ed -r 36c0fd90691d crm/crmd/lrm.c --- a/crm/crmd/lrm.c Thu Apr 17 18:55:57 2008 +0200 +++ b/crm/crmd/lrm.c Tue Apr 22 17:48:47 2008 +0900 @@ -943,8 +943,9 @@ cancel_op(lrm_rsc_t *rsc, const char *ke if(key && remove) { delete_op_entry(NULL, rsc->id, key, op); } + /* return FALSE to be removed from pending_ops */ /* not doing this will block the node from shutting down */ - g_hash_table_remove(pending_ops, key); + return FALSE; } return TRUE; @@ -954,15 +955,20 @@ gboolean cancel_done = FALSE; gboolean cancel_done = FALSE; lrm_rsc_t *cancel_rsc = NULL; -static void +static gboolean cancel_action_by_key(gpointer key, gpointer value, gpointer user_data) { struct recurring_op_s *op = (struct recurring_op_s*)value; if(safe_str_eq(op->op_key, cancel_key)) { cancel_done = TRUE; - cancel_op(cancel_rsc, key, op->call_id, TRUE); - } + if (!cancel_op(cancel_rsc, key, op->call_id, TRUE)) { + /* return TRUE to be removed from pending_ops */ + /* when the cancellation failed */ + return TRUE; + } + } + return FALSE; } static gboolean @@ -976,7 +982,7 @@ cancel_op_key(lrm_rsc_t *rsc, const char CRM_CHECK(key != NULL, return FALSE); - g_hash_table_foreach(pending_ops, cancel_action_by_key, NULL); + g_hash_table_foreach_remove(pending_ops, cancel_action_by_key, NULL); if(cancel_done == FALSE && remove) { crm_err("No known %s operation to cancel", key); @@ -1433,15 +1439,21 @@ send_direct_ack(const char *to_host, con free_xml(update); } -static void +static gboolean stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data) { lrm_rsc_t *rsc = user_data; struct recurring_op_s *op = (struct recurring_op_s*)value; if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) { - cancel_op(rsc, key, op->call_id, FALSE); - } + if (!cancel_op(rsc, key, op->call_id, FALSE)) { + /* return TRUE to be removed from pending_ops */ + /* when the cancellation failed */ + return TRUE; + } + } + + return FALSE; } void @@ -1472,7 +1484,7 @@ do_lrm_rsc_op(lrm_rsc_t *rsc, const char || crm_str_eq(operation, CRMD_ACTION_DEMOTE, TRUE) || crm_str_eq(operation, CRMD_ACTION_PROMOTE, TRUE) || crm_str_eq(operation, CRMD_ACTION_MIGRATE, TRUE)) { - g_hash_table_foreach(pending_ops, stop_recurring_action_by_rsc, rsc); + g_hash_table_foreach_remove(pending_ops, stop_recurring_action_by_rsc, rsc); } /* now do the op */
_______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/