Hi, The current tip of heartbeat causes an assertion failure in pacemaker-1.0 and generates a core dump: {{{ Oct 25 17:15:08 srv02 cib: [31333]: ERROR: crm_abort: crm_glib_handler: Forked child 31338 to record non-fatal assert at utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed Oct 25 17:15:08 srv02 cib: [31333]: ERROR: crm_abort: crm_glib_handler: Forked child 31339 to record non-fatal assert at utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed Oct 25 17:15:11 srv02 crmd: [31337]: ERROR: crm_abort: crm_glib_handler: Forked child 31341 to record non-fatal assert at utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed Oct 25 17:15:11 srv02 crmd: [31337]: ERROR: crm_abort: crm_glib_handler: Forked child 31342 to record non-fatal assert at utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed }}}
This seems to have been introduced by the following changeset: http://hg.linux-ha.org/dev/rev/231b0b8555be The stack trace and my suggested patch are attached. The changeset in question switched this code to use get_next_random(), which eventually calls g_main_loop_is_running(); that call may fail because the g_main_loop is not yet initialized in cib/crmd. My suggested patch simply restores the old behavior, keeping only the changed per-node delay of 50 ms. Thanks, -- Keisuke MORI
(gdb) where #0 0x00669410 in __kernel_vsyscall () #1 0x00692df0 in raise () from /lib/libc.so.6 #2 0x00694701 in abort () from /lib/libc.so.6 #3 0x00c0d82f in crm_abort (file=0xc26955 "utils.c", function=0xc26dda "crm_glib_handler", line=449, assert_condition=0x8933d58 "g_main_loop_is_running: assertion `loop != NULL' failed", do_core=1, do_fork=1) at utils.c:1382 #4 0x00c09f05 in crm_glib_handler (log_domain=0x167686 "GLib", flags=G_LOG_LEVEL_CRITICAL, message=0x8933d58 "g_main_loop_is_running: assertion `loop != NULL' failed", user_data=0x0) at utils.c:449 #5 0x00143b67 in g_logv () from /lib/libglib-2.0.so.0 #6 0x00143d39 in g_log () from /lib/libglib-2.0.so.0 #7 0x00143e1b in g_return_if_fail_warning () from /lib/libglib-2.0.so.0 #8 0x0013981b in g_main_loop_is_running () from /lib/libglib-2.0.so.0 #9 0x00880811 in get_more_random () at cl_random.c:95 #10 0x00880945 in cl_init_random () at cl_random.c:128 #11 0x00880644 in gen_a_random () at cl_random.c:68 #12 0x00880896 in get_next_random () at cl_random.c:106 #13 0x00fdbabb in get_clientstatus (lcl=0x8931bd8, host=0x0, clientid=0x805b779 "cib", timeout=-1) at client_lib.c:974 #14 0x080557ee in cib_init () at main.c:461 #15 0x08054c4b in main (argc=1, argv=0xbfcd6124) at main.c:218 (gdb)
# HG changeset patch # User Keisuke MORI <kskm...@intellilink.co.jp> # Date 1288003477 -32400 # Node ID 96b67422b12814f64dc7dd61c670801c7ba213b6 # Parent 82fc843fbcf9733e50bbc169c95e51b6c7f97c54 Medium: reduce max delay in get_client_status (revised 231b0b8555be) revert the old code to avoid calling g_main_loop_is_running() which may fail when used in Pacemaker cib/crmd. diff -r 82fc843fbcf9 -r 96b67422b128 lib/hbclient/client_lib.c --- a/lib/hbclient/client_lib.c Mon Oct 04 22:12:37 2010 +0200 +++ b/lib/hbclient/client_lib.c Mon Oct 25 19:44:37 2010 +0900 @@ -966,16 +966,6 @@ get_nodesite(ll_cluster_t* lcl, const ch * Return the status of the given client. */ -#ifndef HAVE_CL_RAND_FROM_INTERVAL -/* you should grab latest glue headers! */ -static inline int cl_rand_from_interval(const int a, const int b) -{ - /* RAND_MAX may be INT_MAX, or (b-a) may be huge. */ - long long r = get_next_random(); - return a + (r * (b-a) + RAND_MAX/2)/RAND_MAX; -} -#endif - static const char * get_clientstatus(ll_cluster_t* lcl, const char *host , const char *clientid, int timeout) @@ -1027,8 +1017,9 @@ get_clientstatus(ll_cluster_t* lcl, cons * in a 100-node cluster, the max delay is 5 seconds */ num_nodes = get_num_nodes(lcl); - max_delay = num_nodes * 50000; - delay = cl_rand_from_interval(0, max_delay); + max_delay = num_nodes * 50000; /* in microsecond*/ + srand(cl_randseed()); + delay = (1.0* rand()/RAND_MAX)*max_delay; if (ANYDEBUG){ cl_log(LOG_DEBUG, "Delaying cstatus request for %d ms", delay/1000); }
_______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/