Hi,

The current tip of heartbeat causes an assertion failure in
pacemaker-1.0 and generates a core dump:
{{{
Oct 25 17:15:08 srv02 cib: [31333]: ERROR: crm_abort:
crm_glib_handler: Forked child 31338 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
Oct 25 17:15:08 srv02 cib: [31333]: ERROR: crm_abort:
crm_glib_handler: Forked child 31339 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
Oct 25 17:15:11 srv02 crmd: [31337]: ERROR: crm_abort:
crm_glib_handler: Forked child 31341 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
Oct 25 17:15:11 srv02 crmd: [31337]: ERROR: crm_abort:
crm_glib_handler: Forked child 31342 to record non-fatal assert at
utils.c:449 : g_main_loop_is_running: assertion `loop != NULL' failed
}}}


This seems to have been introduced by the following changeset:
http://hg.linux-ha.org/dev/rev/231b0b8555be

The stack trace and my suggested patch are attached.

The changeset in question switched this code to use get_next_random(),
which eventually calls g_main_loop_is_running(). That call can fail
because the g_main_loop is not initialized yet in cib/crmd at that point.

My suggested patch simply restores the old code (srand()/rand()); the
only change from the old behavior is that the per-node delay is 50 ms.

Thanks,

-- 
Keisuke MORI
(gdb) where
#0  0x00669410 in __kernel_vsyscall ()
#1  0x00692df0 in raise () from /lib/libc.so.6
#2  0x00694701 in abort () from /lib/libc.so.6
#3  0x00c0d82f in crm_abort (file=0xc26955 "utils.c", 
    function=0xc26dda "crm_glib_handler", line=449, 
    assert_condition=0x8933d58 "g_main_loop_is_running: assertion `loop != 
NULL' failed", do_core=1, do_fork=1) at utils.c:1382
#4  0x00c09f05 in crm_glib_handler (log_domain=0x167686 "GLib", 
    flags=G_LOG_LEVEL_CRITICAL, 
    message=0x8933d58 "g_main_loop_is_running: assertion `loop != NULL' 
failed", user_data=0x0) at utils.c:449
#5  0x00143b67 in g_logv () from /lib/libglib-2.0.so.0
#6  0x00143d39 in g_log () from /lib/libglib-2.0.so.0
#7  0x00143e1b in g_return_if_fail_warning () from /lib/libglib-2.0.so.0
#8  0x0013981b in g_main_loop_is_running () from /lib/libglib-2.0.so.0
#9  0x00880811 in get_more_random () at cl_random.c:95
#10 0x00880945 in cl_init_random () at cl_random.c:128
#11 0x00880644 in gen_a_random () at cl_random.c:68
#12 0x00880896 in get_next_random () at cl_random.c:106
#13 0x00fdbabb in get_clientstatus (lcl=0x8931bd8, host=0x0, 
    clientid=0x805b779 "cib", timeout=-1) at client_lib.c:974
#14 0x080557ee in cib_init () at main.c:461
#15 0x08054c4b in main (argc=1, argv=0xbfcd6124) at main.c:218
(gdb) 
# HG changeset patch
# User Keisuke MORI <kskm...@intellilink.co.jp>
# Date 1288003477 -32400
# Node ID 96b67422b12814f64dc7dd61c670801c7ba213b6
# Parent  82fc843fbcf9733e50bbc169c95e51b6c7f97c54
Medium: reduce max delay in get_client_status (revised 231b0b8555be)
revert the old code to avoid calling g_main_loop_is_running()
which may fail when used in Pacemaker cib/crmd.

diff -r 82fc843fbcf9 -r 96b67422b128 lib/hbclient/client_lib.c
--- a/lib/hbclient/client_lib.c	Mon Oct 04 22:12:37 2010 +0200
+++ b/lib/hbclient/client_lib.c	Mon Oct 25 19:44:37 2010 +0900
@@ -966,16 +966,6 @@ get_nodesite(ll_cluster_t* lcl, const ch
 * Return the status of the given client.
 */
 
-#ifndef HAVE_CL_RAND_FROM_INTERVAL
-/* you should grab latest glue headers! */
-static inline int cl_rand_from_interval(const int a, const int b)
-{
-	/* RAND_MAX may be INT_MAX, or (b-a) may be huge. */
-	long long r = get_next_random();
-	return a + (r * (b-a) + RAND_MAX/2)/RAND_MAX;
-}
-#endif
-
 static const char *
 get_clientstatus(ll_cluster_t* lcl, const char *host
 ,		const char *clientid, int timeout)
@@ -1027,8 +1017,9 @@ get_clientstatus(ll_cluster_t* lcl, cons
 		 * in a 100-node cluster, the max delay is 5 seconds
 		 */
 		num_nodes = get_num_nodes(lcl);
-		max_delay = num_nodes * 50000;
-		delay = cl_rand_from_interval(0, max_delay);
+		max_delay = num_nodes * 50000; /* in microsecond*/
+		srand(cl_randseed());
+		delay = (1.0* rand()/RAND_MAX)*max_delay;
 		if (ANYDEBUG){
 			cl_log(LOG_DEBUG, "Delaying cstatus request for %d ms", delay/1000);
 		}
_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

Reply via email to