what: guile-2.2-stable, from git. I've got a large, complex, heavily multi-threaded guile program that hangs during garbage collection; usually after running for half a day. It hangs in a tight loop in scm_i_vm_mark_stack, spinning at 100% of CPU.
This is due to the for-loop line fp = SCM_FRAME_DYNAMIC_LINK (fp)) at libguile/vm.c line 975 failing to advance the frame pointer. There's no "obvious" corruption in the stack; it simply looks like the frame was incompletely set up, and so incrementing to the next fp does not go anywhere. I have recompiled guile with VM_ENABLE_ASSERTIONS and am trying to reproduce the bug now. The rest of this email is a record of a long debug session isolating the problem, and showing that, overall, the thread and stack data look more-or-less correct and uncorrupoted, except for the inability to walk forward in the frame. -- linas (gdb) bt #0 scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:1011 #1 0x00007f3e9db8835e in GC_mark_from (mark_stack_top=0x7f3e9b783ee0, mark_stack_top@entry=0x7f3e9b783f00, mark_stack=mark_stack@entry=0x7f3e9b783eb0, mark_stack_limit=mark_stack_limit@entry=0x7f3e9b793eb0) at ../mark.c:772 #2 0x00007f3e9db8897e in GC_do_local_mark (local_mark_stack=0x7f3e9b783eb0, local_top=0x7f3e9b783f00) at ../mark.c:1037 #3 0x00007f3e9db88b98 in GC_mark_local ( local_mark_stack=local_mark_stack@entry=0x7f3e9b783eb0, id=id@entry=4) at ../mark.c:1170 #4 0x00007f3e9db88eaa in GC_help_marker (my_mark_no=my_mark_no@entry=80003) at ../mark.c:1238 #5 0x00007f3e9db92e3c in GC_mark_thread (id=<optimized out>) at ../pthread_support.c:380 #6 0x00007f3e9e2ae6ba in start_thread (arg=0x7f3e9b794700) at pthread_create.c:333 #7 0x00007f3e9dfdd82d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 (gdb) step #0 find_slot_map (cache=0x7f3e9b783ab0, ip=0x1) at ../../libguile/vm.c:935 935 if (cache->entries[slot].ip == ip) (gdb) print slot $1 = 0 (gdb) print cache->entries[slot].ip $2 = (scm_t_uint32 *) 0x1 (gdb) print cache->entries[slot].map $4 = (const scm_t_uint8 *) 0x0 (gdb) step scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:1011 1011 slot_map = find_slot_map (SCM_FRAME_RETURN_ADDRESS (fp), &cache); (gdb) print fp $5 = (union scm_vm_stack_element *) 0x7f3e94cdee38 #define SCM_FRAME_RETURN_ADDRESS(fp) ((fp)[0].as_ip) (gdb) print (fp)[0].as_ip $6 = (scm_t_uint32 *) 0x1 OK that looks weird ... is this corrupted ?? but whatever, because the returned slot_map is never used ... because ... (gdb) step scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:975 975 fp = SCM_FRAME_DYNAMIC_LINK (fp)) (gdb) print fp $7 = (union scm_vm_stack_element *) 0x7f3e94cdee38 frames.h:#define SCM_FRAME_DYNAMIC_LINK(fp) ((fp) + (fp)[1].as_uint) (gdb) print (fp)[1].as_uint $8 = 0 OK, that seems bad, because now fp never advances, it just repeats over and over with this same value. (gdb) 979 for (slot = nlocals - 1; sp < fp; sp++, slot--) (gdb) print nlocals $10 = -2 (gdb) print sp $11 = (union scm_vm_stack_element *) 0x7f3e94cdee48 #define SCM_FRAME_NUM_LOCALS(fp, sp) ((fp) - (sp)) (gdb) print fp $12 = (union scm_vm_stack_element *) 0x7f3e94cdee38 (gdb) print ((fp) - (sp)) $13 = -2 Ohh .. its not -16 because its -2 * sizeof (union scm_vm_stack_element *) so that's OK. So for loops is skipped, it should go to. sp = SCM_FRAME_PREVIOUS_SP (fp); frames.h:#define SCM_FRAME_PREVIOUS_SP(fp) ((fp) + 2) and so now it loops around and repeats. (gdb) print cache $19 = {entries = {{ip = 0x1, map = 0x0}, {ip = 0x0, map = 0x0} <repeats 23 times>, {ip = 0x1a1b5e0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}}} (gdb) print &cache $20 = (struct slot_map_cache *) 0x7f3e9b783ab0 and so the loop repeates forever, because fp = SCM_FRAME_DYNAMIC_LINK (fp) never advances fp, because (fp)[1].as_uint is zero. So where is fp pointing to? recall fp == 0x7f3e94cdee38 (gdb) x/20x 0x7f3e94cdee00 0x7f3e94cdee00: 0x0000000000000904 0x000000000000200c 0x7f3e94cdee10: 0x0000000001a1b7e0 0x0000000001a1b5e0 0x7f3e94cdee20: 0x0000000000000004 0x000000000000200c 0x7f3e94cdee30: 0x0000000000169bd6 0x0000000000000001 0x7f3e94cdee40: 0x0000000000000000 0x000000000192acd0 0x7f3e94cdee50: 0x0000000001fa4bc0 0x0000000001a1c6d0 0x7f3e94cdee60: 0x0000000044507950 0x0000000000000002 0x7f3e94cdee70: 0x000000000005a6f5 0x00000000018febd0 0x7f3e94cdee80: 0x000000006d490d00 0x00007f3e9e97241c 0x7f3e94cdee90: 0x0000000000000002 0x00007f3e9e97241c (gdb) x/s 0x7f3e94cdee60 0x7f3e94cdee60: "PyPD" <<< ?? is this a meaningul string? (gdb) x/s 0x7f3e94cdee81 0x7f3e94cdee81: "\rIm" << how about this ??? So this does not look healthy. How did we get here? (gdb) print vp $21 = (struct scm_vm *) 0x755c1bd0 (gdb) print mark_stack_ptr $22 = (struct GC_ms_entry *) 0x7f3e9b783f40 (gdb) print mark_stack_limit $23 = (struct GC_ms_entry *) 0x7f3e9b793eb0 (gdb) print upper $24 = (void *) 0x7ffdd986 (gdb) print GC_greatest_plausible_heap_addr $25 = (void *) 0x7ffdd986 (gdb) print lower $26 = (void *) 0x17a1ff8 (gdb) print vp->fp $27 = (union scm_vm_stack_element *) 0x7f3e94cdee18 (gdb) print vp->sp $28 = (union scm_vm_stack_element *) 0x7f3e94cdee00 (gdb) print vp->stack_top $29 = (union scm_vm_stack_element *) 0x7f3e94cdf000 So lets walk manually. frames.h:#define SCM_FRAME_DYNAMIC_LINK(fp) ((fp) + (fp)[1].as_uint) (gdb) print (vp->fp) + (vp->fp)[1].as_uint $31 = (union scm_vm_stack_element *) 0x7f3e94cdee38 So we are instanly in trouble! once through the loop, and we've got the broken fp So how did we get here? (gdb) print vp $32 = (struct scm_vm *) 0x755c1bd0 (gdb) print *vp $33 = {ip = 0x7f3e9e5a6378 <subr_stub_code+56>, sp = 0x7f3e94cdee00, fp = 0x7f3e94cdee18, stack_limit = 0x7f3e94cde000, trace_level = 0, sp_min_since_gc = 0x7f3e94cdee00, stack_size = 512, stack_bottom = 0x7f3e94cde000, stack_top = 0x7f3e94cdf000, overflow_handler_stack = 0x304, hooks = {0x4, 0x4, 0x4, 0x4, 0x4}, resumable_prompt_cookie = 0x7f3d397f9440, engine = 1} Looks plausible, not corrupted. Why is engine=1 ?? #define SCM_VM_REGULAR_ENGINE 0 #define SCM_VM_DEBUG_ENGINE 1 #define SCM_VM_NUM_ENGINES 2 # define SCM_I_CURRENT_THREAD (scm_i_current_thread) (gdb) print scm_i_current_thread $34 = (scm_i_thread *) 0x0 OK, that's not excellent. (gdb) print thread_count $35 = 161 (gdb) bt #0 scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:1011 #1 0x00007f3e9db8835e in GC_mark_from (mark_stack_top=0x7f3e9b783ee0, mark_stack_top@entry=0x7f3e9b783f00, mark_stack=mark_stack@entry=0x7f3e9b783eb0, mark_stack_limit=mark_stack_limit@entry=0x7f3e9b793eb0) at ../mark.c:772 #2 0x00007f3e9db8897e in GC_do_local_mark (local_mark_stack=0x7f3e9b783eb0, local_top=0x7f3e9b783f00) at ../mark.c:1037 #3 0x00007f3e9db88b98 in GC_mark_local ( local_mark_stack=local_mark_stack@entry=0x7f3e9b783eb0, id=id@entry=4) at ../mark.c:1170 #4 0x00007f3e9db88eaa in GC_help_marker (my_mark_no=my_mark_no@entry=80003) at ../mark.c:1238 #5 0x00007f3e9db92e3c in GC_mark_thread (id=<optimized out>) at ../pthread_support.c:380 #6 0x00007f3e9e2ae6ba in start_thread (arg=0x7f3e9b794700) at pthread_create.c:333 #7 0x00007f3e9dfdd82d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 so bdwgc is version gc7_6_0 So ../mark.c:772 is GET_HDR and GET_HDR is in include/private/gc_hdrs.h define GET_HDR(p, hhdr) ... ((hhdr) = HDR_INNER(p) # define HDR_INNER(p) HDR_FROM_BI(BI(p),p) #define HDR_FROM_BI(bi, p) \ ((bi)->index[((word)(p) >> LOG_HBLKSIZE) & (BOTTOM_SZ - 1)]) This seems all wrong.. wtf .... is it called by thread_mark ?? in libguile/thread.c: thread_gc_kind = GC_new_kind (GC_new_free_list (), GC_MAKE_PROC (GC_new_proc (thread_mark), 0), 0, 1) in bdwgc/misc.c GC_new_proc GC_mark_procs[result] = proc #define GC_mark_procs GC_arrays._mark_procs #define PROC(descr) \ (GC_mark_procs[((descr) >> GC_DS_TAG_BITS) & (GC_MAX_MARK_PROCS-1)]) GC_obj_kinds[result].ok_descriptor = descr; <<< this is where MAKE_PROC goes. anyway ... in mark.c line 733 case GC_DS_PROC: mark_stack_top--; credit -= GC_PROC_BYTES; mark_stack_top = (*PROC(descr))((word *)current_p, mark_stack_top, mark_stack_limit, ENV(descr)); continue; But that's the wrong line number. line 772 is in case GC_DS_PER_OBJECT: if ((signed_word)descr >= 0) { /* Descriptor is in the object. */ descr = *(word *)(current_p + descr - GC_DS_PER_OBJECT); Oh, it goes to retry, which loops around and calls PROC which is thread_mark which is called with args mark_stack_top = (*PROC(descr))((word *)current_p, mark_stack_top, mark_stack_limit, ENV(descr)); (gdb) print current_p $39 = (ptr_t) 0x7225d380 "@\325%r" so back in libguile/threads.c line 84: const struct scm_i_thread *t = (struct scm_i_thread *) addr; (gdb) print *(struct scm_i_thread *) current_p $40 = {next_thread = 0x7225d540, handle = 0x5763f3b0, pthread = 139900934399744, result = 0x4, exited = 0, guile_mode = 1, needs_unregister = 1, wake = 0x0, sleep_cond = {__data = {__lock = 0, __futex = 0, __total_seq = 0, __wakeup_seq = 0, __woken_seq = 0, __mutex = 0x0, __nwaiters = 0, __broadcast_seq = 0}, __size = '\000' <repeats 47 times>, __align = 0}, sleep_pipe = {151, 197}, freelists = 0x71cfe800, pointerless_freelists = 0x71cfe780, dynamic_state = 0x71de7900, dynstack = { base = 0x698c3800, top = 0x698c3940, limit = 0x698c3a00}, pending_asyncs = 0x304, block_asyncs = 0, continuation_root = 0x5763f340, continuation_base = 0x7f3d397f9c90, vp = 0x755c1bd0, base = 0x7f3d397f9da0, regs = {{__jmpbuf = {139905204221696, 139906923931021, 1, 139894281647232, 139906913276512, 0, 139906918327551, 8388608}, __mask_was_saved = 964664752, __saved_mask = {__val = {0, 139900934399744, 139900934399744, 139900934397344, 139906923929400, 139893936840720, 1804170656, 139906913276512, 139900934399744, 139900934397344, 139900934397504, 139893936840720, 139906910859134, 139906913276484, 139906910857112, 27401856}}}}} (gdb) print ((struct scm_i_thread *) current_p)->vp $41 = (struct scm_vm *) 0x755c1bd0 which is exactly the vp we had before, so this is the correct thread. So ... all the values in *(struct scm_i_thread *) current_p look more or less OK; its not corrupted. And all the values in *vp look more or less OK, its not corrrupted except that engine=1 looks wrong. So what's up with the weird stack looping? Again: >From before, we had: (gdb) print *vp $42 = {ip = 0x7f3e9e5a6378 <subr_stub_code+56>, sp = 0x7f3e94cdee00, fp = 0x7f3e94cdee18, stack_limit = 0x7f3e94cde000, trace_level = 0, sp_min_since_gc = 0x7f3e94cdee00, stack_size = 512, stack_bottom = 0x7f3e94cde000, stack_top = 0x7f3e94cdf000, overflow_handler_stack = 0x304, hooks = {0x4, 0x4, 0x4, 0x4, 0x4}, resumable_prompt_cookie = 0x7f3d397f9440, engine = 1} so fp, sp, stack_bottom, stack_top all are self-consistent with each other. The problem is that fp = SCM_FRAME_DYNAMIC_LINK (fp) goes nowhere, after the first iteration. Is the frame corrupted? How? what does the rest of the stack look like? (gdb) x/32x 0x7f3e94cdee00 0x7f3e94cdee00: 0x0000000000000904 0x000000000000200c 0x7f3e94cdee10: 0x0000000001a1b7e0 0x0000000001a1b5e0 0x7f3e94cdee20: 0x0000000000000004 0x000000000000200c <<< theres the 4 0x7f3e94cdee30: 0x0000000000169bd6 0x0000000000000001 <<< there's the new fp 0x7f3e94cdee40: 0x0000000000000000 0x000000000192acd0 <<< there's the zero. 0x7f3e94cdee50: 0x0000000001fa4bc0 0x0000000001a1c6d0 0x7f3e94cdee60: 0x0000000044507950 0x0000000000000002 0x7f3e94cdee70: 0x000000000005a6f5 0x00000000018febd0 0x7f3e94cdee80: 0x000000006d490d00 0x00007f3e9e97241c 0x7f3e94cdee90: 0x0000000000000002 0x00007f3e9e97241c 0x7f3e94cdeea0: 0x0000000000000002 0x00007f3e8d797630 0x7f3e94cdeeb0: 0x0000000000000002 0x00007f3e86effacc 0x7f3e94cdeec0: 0x0000000000000003 0x000000006d490d00 0x7f3e94cdeed0: 0x00007f3e86eb8698 0x0000000000000002 0x7f3e94cdeee0: 0x00007f3e8e51f450 0x0000000000000008 0x7f3e94cdeef0: 0x0000000001a37c10 0x000000000192acd0 So... who sets up thread->vp ? /* #define VM_ENABLE_ASSERTIONS */ However, defining this breaks the build. diff --git a/libguile/vm-engine.c b/libguile/vm-engine.c index 6c88ebf..f0f8164 100644 --- a/libguile/vm-engine.c +++ b/libguile/vm-engine.c @@ -2050,11 +2050,11 @@ VM_NAME (scm_i_thread *thread, struct scm_vm *vp, if (!scm_module_system_booted_p) { - ASSERT (scm_is_true + ASSERT (scm_is_true( scm_equal_p (modname, scm_list_2 (SCM_BOOL_T, - scm_from_utf8_symbol ("guile")))); + scm_from_utf8_symbol ("guile"))))); var = scm_lookup (sym); } else if (scm_is_true (SCM_CAR (modname))) diff --git a/libguile/vm.c b/libguile/vm.c index 18f2192..c26066e 100644 --- a/libguile/vm.c +++ b/libguile/vm.c @@ -63,7 +63,7 @@ static size_t page_size; /* The VM has a number of internal assertions that shouldn't normally be necessary, but might be if you think you found a bug in the VM. */ -/* #define VM_ENABLE_ASSERTIONS */ +#define VM_ENABLE_ASSERTIONS static void vm_expand_stack (struct scm_vm *vp, union scm_vm_stack_element *new_sp) SCM_NOINLINE; GEN guile-procedures.texi /bin/bash: line 1: 24078 Broken pipe ./meta/guile Aborted ./libguile/guile Aborted gdb libguile/.libs/lt-guile #0 0x00007ffff74f7428 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:54 #1 0x00007ffff74f902a in __GI_abort () at abort.c:89 #2 0x00007ffff7b707c4 in vm_regular_engine (thread=0x7282, vp=0x806f30, registers=0x6, resume=-145787864) at ../../libguile/vm-engine.c:2053 #3 0x00007ffff7b711d2 in scm_call_n (proc=0x832b30, argv=argv@entry=0x0, nargs=nargs@entry=0) at ../../libguile/vm.c:1257 #4 0x00007ffff7af14e9 in scm_call_0 (proc=<optimized out>) at ../../libguile/eval.c:481 #5 0x00007ffff7b10568 in scm_primitive_load_path (args=<optimized out>) at ../../libguile/load.c:1248 #6 0x00007ffff7b6e976 in vm_regular_engine (thread=0x7282, vp=0x806f30, registers=0x6, resume=-145787864) at ../../libguile/vm-engine.c:784 Oh. wtf. (define last-gc (gc-stats)) (define start-time (get-internal-real-time)) (define run-time (get-internal-run-time)) (define (avg-time-taken) (define now (get-internal-real-time)) (define run (get-internal-run-time)) (define cur (gc-stats)) (define gc-time-taken (* 1.0e-9 (- (cdar cur) (cdar last-gc)))) (define elapsed-time (* 1.0e-9 (- now start-time))) (define cpu-time (* 1.0e-9 (- run run-time))) (format #t "Elapsed time: ~5f secs. GC-time-taken: ~5f s or ~5f% cpu-usage: ~5f%\n" elapsed-time gc-time-taken (* 100 (/ gc-time-taken elapsed-time)) (* 100 (/ cpu-time elapsed-time)) ) (set! last-gc cur) (set! start-time now) (set! run-time run) ) linas@fanny: ~/src/novamente/src/opencog-git/opencog/nlp/learn $ vi x linas@fanny: ~/src/novamente/src/opencog-git/opencog/nlp/learn $ cat x #0 find_slot_map (cache=0x7f3e9b783ab0, ip=0x1) at ../../libguile/vm.c:935 935 if (cache->entries[slot].ip == ip) (gdb) print slot $1 = 0 (gdb) print cache->entries[slot].ip $2 = (scm_t_uint32 *) 0x1 (gdb) print cache->entries[slot].map $4 = (const scm_t_uint8 *) 0x0 (gdb) step scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:1011 1011 slot_map = find_slot_map (SCM_FRAME_RETURN_ADDRESS (fp), &cache); (gdb) print fp $5 = (union scm_vm_stack_element *) 0x7f3e94cdee38 #define SCM_FRAME_RETURN_ADDRESS(fp) ((fp)[0].as_ip) (gdb) print (fp)[0].as_ip $6 = (scm_t_uint32 *) 0x1 OK that looks weird ... is this corrupted ?? but whatever, because the returned slot_map is never used ... because ... (gdb) step scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:975 975 fp = SCM_FRAME_DYNAMIC_LINK (fp)) (gdb) print fp $7 = (union scm_vm_stack_element *) 0x7f3e94cdee38 frames.h:#define SCM_FRAME_DYNAMIC_LINK(fp) ((fp) + (fp)[1].as_uint) (gdb) print (fp)[1].as_uint $8 = 0 OK, that seems bad, because now fp never advances, it just repeats over and over with this same value. (gdb) 979 for (slot = nlocals - 1; sp < fp; sp++, slot--) (gdb) print nlocals $10 = -2 (gdb) print sp $11 = (union scm_vm_stack_element *) 0x7f3e94cdee48 #define SCM_FRAME_NUM_LOCALS(fp, sp) ((fp) - (sp)) (gdb) print fp $12 = (union scm_vm_stack_element *) 0x7f3e94cdee38 (gdb) print ((fp) - (sp)) $13 = -2 Ohh .. its not -16 because its -2 * sizeof (union scm_vm_stack_element *) so that's OK. So for loops is skipped, it should go to. sp = SCM_FRAME_PREVIOUS_SP (fp); frames.h:#define SCM_FRAME_PREVIOUS_SP(fp) ((fp) + 2) and so now it loops around and repeats. (gdb) print cache $19 = {entries = {{ip = 0x1, map = 0x0}, {ip = 0x0, map = 0x0} <repeats 23 times>, {ip = 0x1a1b5e0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}, {ip = 0x0, map = 0x0}}} (gdb) print &cache $20 = (struct slot_map_cache *) 0x7f3e9b783ab0 and so the loop repeates forever, because fp = SCM_FRAME_DYNAMIC_LINK (fp) never advances fp, because (fp)[1].as_uint is zero. So where is fp pointing to? recall fp == 0x7f3e94cdee38 (gdb) x/20x 0x7f3e94cdee00 0x7f3e94cdee00: 0x0000000000000904 0x000000000000200c 0x7f3e94cdee10: 0x0000000001a1b7e0 0x0000000001a1b5e0 0x7f3e94cdee20: 0x0000000000000004 0x000000000000200c 0x7f3e94cdee30: 0x0000000000169bd6 0x0000000000000001 0x7f3e94cdee40: 0x0000000000000000 0x000000000192acd0 0x7f3e94cdee50: 0x0000000001fa4bc0 0x0000000001a1c6d0 0x7f3e94cdee60: 0x0000000044507950 0x0000000000000002 0x7f3e94cdee70: 0x000000000005a6f5 0x00000000018febd0 0x7f3e94cdee80: 0x000000006d490d00 0x00007f3e9e97241c 0x7f3e94cdee90: 0x0000000000000002 0x00007f3e9e97241c (gdb) x/s 0x7f3e94cdee60 0x7f3e94cdee60: "PyPD" <<< ?? is this a meaningul string? (gdb) x/s 0x7f3e94cdee81 0x7f3e94cdee81: "\rIm" << how about this ??? So this does not look healthy. How did we get here? (gdb) print vp $21 = (struct scm_vm *) 0x755c1bd0 (gdb) print mark_stack_ptr $22 = (struct GC_ms_entry *) 0x7f3e9b783f40 (gdb) print mark_stack_limit $23 = (struct GC_ms_entry *) 0x7f3e9b793eb0 (gdb) print upper $24 = (void *) 0x7ffdd986 (gdb) print GC_greatest_plausible_heap_addr $25 = (void *) 0x7ffdd986 (gdb) print lower $26 = (void *) 0x17a1ff8 (gdb) print vp->fp $27 = (union scm_vm_stack_element *) 0x7f3e94cdee18 (gdb) print vp->sp $28 = (union scm_vm_stack_element *) 0x7f3e94cdee00 (gdb) print vp->stack_top $29 = (union scm_vm_stack_element *) 0x7f3e94cdf000 So lets walk manually. frames.h:#define SCM_FRAME_DYNAMIC_LINK(fp) ((fp) + (fp)[1].as_uint) (gdb) print (vp->fp) + (vp->fp)[1].as_uint $31 = (union scm_vm_stack_element *) 0x7f3e94cdee38 So we are instanly in trouble! once through the loop, and we've got the broken fp So how did we get here? (gdb) print vp $32 = (struct scm_vm *) 0x755c1bd0 (gdb) print *vp $33 = {ip = 0x7f3e9e5a6378 <subr_stub_code+56>, sp = 0x7f3e94cdee00, fp = 0x7f3e94cdee18, stack_limit = 0x7f3e94cde000, trace_level = 0, sp_min_since_gc = 0x7f3e94cdee00, stack_size = 512, stack_bottom = 0x7f3e94cde000, stack_top = 0x7f3e94cdf000, overflow_handler_stack = 0x304, hooks = {0x4, 0x4, 0x4, 0x4, 0x4}, resumable_prompt_cookie = 0x7f3d397f9440, engine = 1} Looks plausible, not corrupted. Why is engine=1 ?? #define SCM_VM_REGULAR_ENGINE 0 #define SCM_VM_DEBUG_ENGINE 1 #define SCM_VM_NUM_ENGINES 2 # define SCM_I_CURRENT_THREAD (scm_i_current_thread) (gdb) print scm_i_current_thread $34 = (scm_i_thread *) 0x0 OK, that's not excellent. (gdb) print thread_count $35 = 161 (gdb) bt #0 scm_i_vm_mark_stack (vp=0x755c1bd0, mark_stack_ptr=0x7f3e9b783f40, mark_stack_limit=0x7f3e9b793eb0) at ../../libguile/vm.c:1011 #1 0x00007f3e9db8835e in GC_mark_from (mark_stack_top=0x7f3e9b783ee0, mark_stack_top@entry=0x7f3e9b783f00, mark_stack=mark_stack@entry=0x7f3e9b783eb0, mark_stack_limit=mark_stack_limit@entry=0x7f3e9b793eb0) at ../mark.c:772 #2 0x00007f3e9db8897e in GC_do_local_mark (local_mark_stack=0x7f3e9b783eb0, local_top=0x7f3e9b783f00) at ../mark.c:1037 #3 0x00007f3e9db88b98 in GC_mark_local ( local_mark_stack=local_mark_stack@entry=0x7f3e9b783eb0, id=id@entry=4) at ../mark.c:1170 #4 0x00007f3e9db88eaa in GC_help_marker (my_mark_no=my_mark_no@entry=80003) at ../mark.c:1238 #5 0x00007f3e9db92e3c in GC_mark_thread (id=<optimized out>) at ../pthread_support.c:380 #6 0x00007f3e9e2ae6ba in start_thread (arg=0x7f3e9b794700) at pthread_create.c:333 #7 0x00007f3e9dfdd82d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 so bdwgc is version gc7_6_0 So ../mark.c:772 is GET_HDR and GET_HDR is in include/private/gc_hdrs.h define GET_HDR(p, hhdr) ... ((hhdr) = HDR_INNER(p) # define HDR_INNER(p) HDR_FROM_BI(BI(p),p) #define HDR_FROM_BI(bi, p) \ ((bi)->index[((word)(p) >> LOG_HBLKSIZE) & (BOTTOM_SZ - 1)]) This seems all wrong.. wtf .... is it called by thread_mark ?? in libguile/thread.c: thread_gc_kind = GC_new_kind (GC_new_free_list (), GC_MAKE_PROC (GC_new_proc (thread_mark), 0), 0, 1) in bdwgc/misc.c GC_new_proc GC_mark_procs[result] = proc #define GC_mark_procs GC_arrays._mark_procs #define PROC(descr) \ (GC_mark_procs[((descr) >> GC_DS_TAG_BITS) & (GC_MAX_MARK_PROCS-1)]) GC_obj_kinds[result].ok_descriptor = descr; <<< this is where MAKE_PROC goes. anyway ... in mark.c line 733 case GC_DS_PROC: mark_stack_top--; credit -= GC_PROC_BYTES; mark_stack_top = (*PROC(descr))((word *)current_p, mark_stack_top, mark_stack_limit, ENV(descr)); continue; But that's the wrong line number. line 772 is in case GC_DS_PER_OBJECT: if ((signed_word)descr >= 0) { /* Descriptor is in the object. */ descr = *(word *)(current_p + descr - GC_DS_PER_OBJECT); Oh, it goes to retry, which loops around and calls PROC which is thread_mark which is called with args mark_stack_top = (*PROC(descr))((word *)current_p, mark_stack_top, mark_stack_limit, ENV(descr)); (gdb) print current_p $39 = (ptr_t) 0x7225d380 "@\325%r" so back in libguile/threads.c line 84: const struct scm_i_thread *t = (struct scm_i_thread *) addr; (gdb) print *(struct scm_i_thread *) current_p $40 = {next_thread = 0x7225d540, handle = 0x5763f3b0, pthread = 139900934399744, result = 0x4, exited = 0, guile_mode = 1, needs_unregister = 1, wake = 0x0, sleep_cond = {__data = {__lock = 0, __futex = 0, __total_seq = 0, __wakeup_seq = 0, __woken_seq = 0, __mutex = 0x0, __nwaiters = 0, __broadcast_seq = 0}, __size = '\000' <repeats 47 times>, __align = 0}, sleep_pipe = {151, 197}, freelists = 0x71cfe800, pointerless_freelists = 0x71cfe780, dynamic_state = 0x71de7900, dynstack = { base = 0x698c3800, top = 0x698c3940, limit = 0x698c3a00}, pending_asyncs = 0x304, block_asyncs = 0, continuation_root = 0x5763f340, continuation_base = 0x7f3d397f9c90, vp = 0x755c1bd0, base = 0x7f3d397f9da0, regs = {{__jmpbuf = {139905204221696, 139906923931021, 1, 139894281647232, 139906913276512, 0, 139906918327551, 8388608}, __mask_was_saved = 964664752, __saved_mask = {__val = {0, 139900934399744, 139900934399744, 139900934397344, 139906923929400, 139893936840720, 1804170656, 139906913276512, 139900934399744, 139900934397344, 139900934397504, 139893936840720, 139906910859134, 139906913276484, 139906910857112, 27401856}}}}} (gdb) print ((struct scm_i_thread *) current_p)->vp $41 = (struct scm_vm *) 0x755c1bd0 which is exactly the vp we had before, so this is the correct thread. So ... all the values in *(struct scm_i_thread *) current_p look more or less OK; its not corrupted. And all the values in *vp look more or less OK, its not corrrupted except that engine=1 looks wrong. So what's up with the weird stack looping? Again: >From before, we had: (gdb) print *vp $42 = {ip = 0x7f3e9e5a6378 <subr_stub_code+56>, sp = 0x7f3e94cdee00, fp = 0x7f3e94cdee18, stack_limit = 0x7f3e94cde000, trace_level = 0, sp_min_since_gc = 0x7f3e94cdee00, stack_size = 512, stack_bottom = 0x7f3e94cde000, stack_top = 0x7f3e94cdf000, overflow_handler_stack = 0x304, hooks = {0x4, 0x4, 0x4, 0x4, 0x4}, resumable_prompt_cookie = 0x7f3d397f9440, engine = 1} so fp, sp, stack_bottom, stack_top all are self-consistent with each other. The problem is that fp = SCM_FRAME_DYNAMIC_LINK (fp) goes nowhere, after the first iteration. Is the frame corrupted? How? what does the rest of the stack look like? (gdb) x/32x 0x7f3e94cdee00 0x7f3e94cdee00: 0x0000000000000904 0x000000000000200c 0x7f3e94cdee10: 0x0000000001a1b7e0 0x0000000001a1b5e0 0x7f3e94cdee20: 0x0000000000000004 0x000000000000200c <<< theres the 4 0x7f3e94cdee30: 0x0000000000169bd6 0x0000000000000001 <<< there's the new fp 0x7f3e94cdee40: 0x0000000000000000 0x000000000192acd0 <<< there's the zero. 0x7f3e94cdee50: 0x0000000001fa4bc0 0x0000000001a1c6d0 0x7f3e94cdee60: 0x0000000044507950 0x0000000000000002 0x7f3e94cdee70: 0x000000000005a6f5 0x00000000018febd0 0x7f3e94cdee80: 0x000000006d490d00 0x00007f3e9e97241c 0x7f3e94cdee90: 0x0000000000000002 0x00007f3e9e97241c 0x7f3e94cdeea0: 0x0000000000000002 0x00007f3e8d797630 0x7f3e94cdeeb0: 0x0000000000000002 0x00007f3e86effacc 0x7f3e94cdeec0: 0x0000000000000003 0x000000006d490d00 0x7f3e94cdeed0: 0x00007f3e86eb8698 0x0000000000000002 0x7f3e94cdeee0: 0x00007f3e8e51f450 0x0000000000000008 0x7f3e94cdeef0: 0x0000000001a37c10 0x000000000192acd0 So... who sets up thread->vp ? /* #define VM_ENABLE_ASSERTIONS */ However, defining this breaks the build. diff --git a/libguile/vm-engine.c b/libguile/vm-engine.c index 6c88ebf..f0f8164 100644 --- a/libguile/vm-engine.c +++ b/libguile/vm-engine.c @@ -2050,11 +2050,11 @@ VM_NAME (scm_i_thread *thread, struct scm_vm *vp, if (!scm_module_system_booted_p) { - ASSERT (scm_is_true + ASSERT (scm_is_true( scm_equal_p (modname, scm_list_2 (SCM_BOOL_T, - scm_from_utf8_symbol ("guile")))); + scm_from_utf8_symbol ("guile"))))); var = scm_lookup (sym); } else if (scm_is_true (SCM_CAR (modname))) diff --git a/libguile/vm.c b/libguile/vm.c index 18f2192..c26066e 100644 --- a/libguile/vm.c +++ b/libguile/vm.c @@ -63,7 +63,7 @@ static size_t page_size; /* The VM has a number of internal assertions that shouldn't normally be necessary, but might be if you think you found a bug in the VM. */ -/* #define VM_ENABLE_ASSERTIONS */ +#define VM_ENABLE_ASSERTIONS static void vm_expand_stack (struct scm_vm *vp, union scm_vm_stack_element *new_sp) SCM_NOINLINE; GEN guile-procedures.texi /bin/bash: line 1: 24078 Broken pipe ./meta/guile Aborted ./libguile/guile Aborted gdb libguile/.libs/lt-guile #0 0x00007ffff74f7428 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:54 #1 0x00007ffff74f902a in __GI_abort () at abort.c:89 #2 0x00007ffff7b707c4 in vm_regular_engine (thread=0x7282, vp=0x806f30, registers=0x6, resume=-145787864) at ../../libguile/vm-engine.c:2053 #3 0x00007ffff7b711d2 in scm_call_n (proc=0x832b30, argv=argv@entry=0x0, nargs=nargs@entry=0) at ../../libguile/vm.c:1257 #4 0x00007ffff7af14e9 in scm_call_0 (proc=<optimized out>) at ../../libguile/eval.c:481 #5 0x00007ffff7b10568 in scm_primitive_load_path (args=<optimized out>) at ../../libguile/load.c:1248 #6 0x00007ffff7b6e976 in vm_regular_engine (thread=0x7282, vp=0x806f30, registers=0x6, resume=-145787864) at ../../libguile/vm-engine.c:784