Re: Hung threads

2008-11-16 Thread Linas Vepstas
Hi,

2008/11/14 Linas Vepstas [EMAIL PROTECTED]:
 Here's a deadlock I saw today.

Here's a different deadlock that is fully debugged. The
guilty code leading to the deadlock is in make_struct(),
in struct.c circa line 463, which tries to alloc memory
while holding a CRITICAL_SECTION lock.  Of course,
everything deadlocks in GC.

I am trying to figure out how to fix this now. its kind
of gnarly.

Summary:
thread 7 -- holding critical section lock, sleeping on scm_i_sweep_mutex
thread 5 -- holding heap_mutex, sleeping on critical section
thread 12 -- holding scm_i_sweep_mutex, sleeping on heap_mutex


^C
Program received signal SIGINT, Interrupt.
[Switching to Thread 0xf79f86c0 (LWP 10364)]
0xe425 in __kernel_vsyscall ()
(gdb) info threads
  12 Thread 0xf24fdb90 (LWP 10395)  0xe425 in __kernel_vsyscall ()
  10 Thread 0xf34ffb90 (LWP 10389)  0xe425 in __kernel_vsyscall ()
  9 Thread 0xf3e7db90 (LWP 10387)  0xe425 in __kernel_vsyscall ()
  7 Thread 0xf4e7fb90 (LWP 10380)  0xe425 in __kernel_vsyscall ()
  6 Thread 0xf5680b90 (LWP 10377)  0xe425 in __kernel_vsyscall ()
  5 Thread 0xf5ec4b90 (LWP 10374)  0xe425 in __kernel_vsyscall ()
  2 Thread 0xf76c7b90 (LWP 10365)  0xe425 in __kernel_vsyscall ()
* 1 Thread 0xf79f86c0 (LWP 10364)  0xe425 in __kernel_vsyscall ()

lock summary
thread 5 -- holding heap_mutex
sleeping on CRITICAL_SECTION in scm_c_catch

thread 6 -- holds no locks
sleeping on scm_i_sweep_mutex in increase_mtrigger

thread 7 -- holding critical section lock  in make_struct
then tries to alloc mem ... !
sleeping on scm_i_sweep_mutex in increase_mtrigger

thread 9 -- holding heap_mutex
sleeping on CRITICAL SECTION in scm_c_catch

thread 10 -- holding ?? in  increase_mtrigger
called from scm_gc_register_collectable_memory
sleeping on scm_i_sweep_mutex in increase_mtrigger

thread 12 --  trying to put everything to sleep,
 holding admin mutex, and most heap_mutexes
 sleeping on remaining heap_mutexes that are still held.

The guilty party is thread 7 which tries to alloc memory while holding
a critical section lock. This leads to a deadlock:

thread 7 -- holding critical section lock, sleeping on scm_i_sweep_mutex
thread 5 -- holding heap_mutex, sleeping on critical section
thread 12 -- holding scm_i_sweep_mutex, sleeping on heap_mutex


(gdb) call prt_lockholders()

Thread 0xf5680b90 -- thread 6

Thread 0xf34ffb90  -- thread 10
0: mutex (0xf7816d0c) in:
/usr/lib/libguile.so.17 [0xf778f9f1]
c_register_collectable_memory+0x2a) [0xf778fbca]
/libguile.so.17(scm_gc_malloc+0x40) [0xf7790010]
.so.17(scm_gc_calloc+0x2c) [0xf77901bc]

Thread 0xf24fdb90 -- thread 12
0: mutex (0xf7812780) in: -- this is the scm_i_sweep_mutex
/usr/lib/libguile.so.17(scm_i_thread_put_to_sleep+0x7f) [0xf77e395f]
f]
x19) [0xf778db29]
(scm_gc_register_collectable_memory+0x2a) [0xf778fbca]
1: thread_admin_mutex (0xf78191ec) in:
/usr/lib/libguile.so.17(scm_i_thread_put_to_sleep+0x7f) [0xf77e395f]
/usr/lib/libguile.so.17(scm_i_gc+0x19) [0xf778db29]
/usr/lib/libguile.so.17 [0xf778fa5c]
/usr/lib/libguile.so.17(scm_gc_register_collectable_memory+0x2a)
[0xf778fbca]
2: t-heap_mutex (0x8c11484) in:
/usr/lib/libguile.so.17(scm_i_thread_put_to_sleep+0x7f) [0xf77e395f]
/usr/lib/libguile.so.17(scm_i_gc+0x19) [0xf778db29]
/usr/lib/libguile.so.17 [0xf778fa5c]
/usr/lib/libguile.so.17(scm_gc_register_collectable_memory+0x2a)
[0xf778fbca]
3: t-heap_mutex (0x8c682bc) in:
/usr/lib/libguile.so.17(scm_i_thread_put_to_sleep+0x7f) [0xf77e395f]
/usr/lib/libguile.so.17(scm_i_gc+0x19) [0xf778db29]
/usr/lib/libguile.so.17 [0xf778fa5c]
/usr/lib/libguile.so.17(scm_gc_register_collectable_memory+0x2a)
[0xf778fbca]
4: t-heap_mutex (0xf3518c74) in:
/usr/lib/libguile.so.17(scm_i_thread_put_to_sleep+0x7f) [0xf77e395f]
/usr/lib/libguile.so.17(scm_i_gc+0x19) [0xf778db29]
/usr/lib/libguile.so.17 [0xf778fa5c]
/usr/lib/libguile.so.17(scm_gc_register_collectable_memory+0x2a)
[0xf778fbca]
5: t-heap_mutex (0x8c10e84) in:
/usr/lib/libguile.so.17(scm_i_thread_put_to_sleep+0x7f) [0xf77e395f]
/usr/lib/libguile.so.17(scm_i_gc+0x19) [0xf778db29]
/usr/lib/libguile.so.17 [0xf778fa5c]
/usr/lib/libguile.so.17(scm_gc_register_collectable_memory+0x2a)
[0xf778fbca]

Thread 0xf4e7fb90 -- thread 7
0: scm_i_critical_section_mutex (0xf781c808) in:
/usr/lib/libguile.so.17(scm_make_struct+0xe2) [0xf77e1d52]
/usr/lib/libguile.so.17(scm_make_stack+0x181) [0xf77c6b11]

/home/linas/src/novamente/src/opencog-stage4/staging/bin/opencog/guile/libsmob.so(_ZN7opencog10SchemeEval17preunwind_handlerEP17scm_unused_structS2_+0x26)
[0xf7879696]


[PATCH]: deadlock in make_struct()

2008-11-16 Thread Linas Vepstas
Hi,

Here's a deadlock patch. When committing this patch, please copy
the text below into the source code commit message, as it provides
a record of what the patch does, and why it was made.

--linas


The patch below fixes a deadlock in the multi-threading code.
It fixes this by simply removing the CRITICAL_SECTION lock.
There does not seem to be any need for a lock on this section,
since all variables accessed are local, and all other literals
are compile-time constants.

A typical deadlock that was witnessed was:

thread 7 -- holding critical section lock, sleeping on scm_i_sweep_mutex
held in scm_make_struct() at struct.c:463,
sleeping in increase_mtrigger() at  gc-malloc.c:234
thread 5 -- holding heap_mutex, sleeping on critical section
held because thread is in guile mode
sleeping in scm_c_catch() at throw.c:187
thread 12 -- holding scm_i_sweep_mutex, sleeping on heap_mutex
 held in increase_mtrigger() at gc-malloc.c:234
 sleeping in scm_i_thread_put_to_sleep() at threads.c:1621

Signed-off-by: Linas Vepstas [EMAIL PROTECTED]

---
 libguile/struct.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

Index: guile-1.8.5/libguile/struct.c
===
--- guile-1.8.5.orig/libguile/struct.c  2008-11-16 10:58:31.0 -0600
+++ guile-1.8.5/libguile/struct.c   2008-11-16 12:17:05.0 -0600
@@ -450,7 +450,14 @@ SCM_DEFINE (scm_make_struct, make-struc
 goto bad_tail;
 }

-  SCM_CRITICAL_SECTION_START;
+  /* In guile 1.8.5 and earlier, everything below was covered by a
+ CRITICAL_SECTION lock. This leads to deadlocks in garbage collection,
+ since other threads might be holding the heap_mutex, while sleeping
+ on the CRITICAL_SECTION lock. There does not seem to be any need
+ for a lock on the section below, as it does not access or update
+ any globals. vtable, basic_size, tail_elts are all local variables,
+ scm_tc3_struct and scm_struct_i_* are all compile-time consts.
+ So the lock has been removed. */
   if (SCM_STRUCT_DATA (vtable)[scm_struct_i_flags]  SCM_STRUCTF_ENTITY)
 {
   data = scm_alloc_struct (basic_size + tail_elts,
@@ -466,7 +473,6 @@ SCM_DEFINE (scm_make_struct, make-struc
   handle = scm_double_cell scm_t_bits) SCM_STRUCT_DATA (vtable))
 + scm_tc3_struct),
(scm_t_bits) data, 0, 0);
-  SCM_CRITICAL_SECTION_END;

   /* In guile 1.8.1 and earlier, the SCM_CRITICAL_SECTION_END above covered
  also the following scm_struct_init.  But that meant if scm_struct_init
Subject: [PATCH]: deadlock in make_struct()
Date: 16 Nov 2008
To: bug-guile@gnu.org

Hi, 

Here's a deadlock patch. When committing this patch, please copy
the text below into the source code commit message, as it provides
a record of what the patch does, and why it was made. 

--linas


The patch below fixes a deadlock in the multi-threading code.
It fixes this by simply removing the CRITICAL_SECTION lock.
There does not seem to be any need for a lock on this section,
since all variables accessed are local, and all other literals
are compile-time constants.

A typical deadlock that was witnessed was:

thread 7 -- holding critical section lock, sleeping on scm_i_sweep_mutex
held in scm_make_struct() at struct.c:463, 
sleeping in increase_mtrigger() at  gc-malloc.c:234
thread 5 -- holding heap_mutex, sleeping on critical section
held because thread is in guile mode
sleeping in scm_c_catch() at throw.c:187
thread 12 -- holding scm_i_sweep_mutex, sleeping on heap_mutex
 held in increase_mtrigger() at gc-malloc.c:234
 sleeping in scm_i_thread_put_to_sleep() at threads.c:1621

Signed-off-by: Linas Vepstas [EMAIL PROTECTED]

---
 libguile/struct.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

Index: guile-1.8.5/libguile/struct.c
===
--- guile-1.8.5.orig/libguile/struct.c	2008-11-16 10:58:31.0 -0600
+++ guile-1.8.5/libguile/struct.c	2008-11-16 12:17:05.0 -0600
@@ -450,7 +450,14 @@ SCM_DEFINE (scm_make_struct, make-struc
 goto bad_tail;
 }
 
-  SCM_CRITICAL_SECTION_START;
+  /* In guile 1.8.5 and earlier, everything below was covered by a
+ CRITICAL_SECTION lock. This leads to deadlocks in garbage collection,
+ since other threads might be holding the heap_mutex, while sleeping
+ on the CRITICAL_SECTION lock. There does not seem to be any need
+ for a lock on the section below, as it does not access or update
+ any globals. vtable, basic_size, tail_elts are all local variables,
+ scm_tc3_struct and scm_struct_i_* are all compile-time consts.
+ So the lock has been removed. */
   if (SCM_STRUCT_DATA (vtable)[scm_struct_i_flags]  SCM_STRUCTF_ENTITY)
 {
   

[PATCH] Final: thread lock nesting debugging

2008-11-16 Thread Linas Vepstas
I've been seeing all sorts of deadlocks in guile, and so I wrote a small
debugging utility to try to track down the problems.  I'd like to see
this patch included in future versions of guile.

I found one bug with this tool, and have submitted a patch for that
already.  It looks like there's another bug involving signals --
there is a probably deadlock in garbage collection if a signal
is sent at just the wrong time. The bug can be seen by enabling
this patch, and then running 'make check'.  I'm going to ignore
this, as I'm not worried about signals right now.

This is my final version of this patch, I'd sent a beta version
a few days ago. Its final because I'm not anticipating any
further changes.

--linas

Add a deadlock debugging facility to guile.

Signed-off-by: Linas Vepstas [EMAIL PROTECTED]

---
 config.h.in|3
 configure.in   |   11 +++
 libguile/Makefile.am   |2
 libguile/debug-locks.c |  159 +
 libguile/pthread-threads.h |   15 
 libguile/threads.c |   53 +++
 libguile/threads.h |8 ++
 7 files changed, 250 insertions(+), 1 deletion(-)

Index: guile-1.8.5/libguile/pthread-threads.h
===
--- guile-1.8.5.orig/libguile/pthread-threads.h 2008-11-16
18:57:19.0 -0600
+++ guile-1.8.5/libguile/pthread-threads.h  2008-11-16 18:57:48.0 
-0600
@@ -91,6 +91,21 @@ extern pthread_mutexattr_t scm_i_pthread
 #define scm_i_scm_pthread_cond_wait scm_pthread_cond_wait
 #define scm_i_scm_pthread_cond_timedwaitscm_pthread_cond_timedwait

+#ifdef GUILE_DEBUG_LOCKS
+#undef scm_i_pthread_mutex_lock
+#define scm_i_pthread_mutex_lock(ARG) scm_i_pthread_mutex_lock_dbg(ARG, #ARG)
+
+#undef scm_i_pthread_mutex_unlock
+#define scm_i_pthread_mutex_unlock(ARG)
scm_i_pthread_mutex_unlock_dbg(ARG, #ARG)
+
+int scm_i_pthread_mutex_lock_dbg(pthread_mutex_t *, const char *);
+int scm_i_pthread_mutex_unlock_dbg(pthread_mutex_t *, const char *);
+
+void prt_lockholders(void);
+void prt_this_lockholder(void);
+
+#endif
+
 #endif  /* SCM_PTHREADS_THREADS_H */

 /*
Index: guile-1.8.5/libguile/threads.c
===
--- guile-1.8.5.orig/libguile/threads.c 2008-11-16 18:57:19.0 -0600
+++ guile-1.8.5/libguile/threads.c  2008-11-16 18:57:48.0 -0600
@@ -441,6 +441,24 @@ guilify_self_1 (SCM_STACKITEM *base)
   SCM_SET_FREELIST_LOC (scm_i_freelist, t-freelist);
   SCM_SET_FREELIST_LOC (scm_i_freelist2, t-freelist2);

+#ifdef GUILE_DEBUG_LOCKS
+  int i, j;
+  for(i=0; iLOCK_STACK_DEPTH; i++)
+{
+  t-lockname[i] = NULL;
+  t-lockmutex[i] = NULL;
+  for(j=0; jTRACE_STACK_DEPTH; j++)
+{
+  t-lockholder[i][j] = NULL;
+}
+}
+  if (scm_initialized_p == 0)
+{
+  t-lockname[0] = scm_i_init_mutex;
+  t-lockmutex[0] = scm_i_init_mutex;
+}
+#endif
+
   scm_i_pthread_setspecific (scm_i_thread_key, t);

   scm_i_pthread_mutex_lock (t-heap_mutex);
@@ -1624,8 +1642,21 @@ scm_i_thread_wake_up ()
   scm_i_thread *t;

   scm_i_pthread_cond_broadcast (wake_up_cond);
+#ifndef GUILE_DEBUG_LOCKS
   for (t = all_threads; t; t = t-next_thread)
scm_i_pthread_mutex_unlock (t-heap_mutex);
+#else
+  /* Unlock in reverse order from locking */
+  scm_i_thread *tt = NULL;
+  while (tt != all_threads)
+{
+  scm_i_thread *tp = NULL;
+  for (t = all_threads; t != tt; t = t-next_thread)
+ tp = t;
+  scm_i_pthread_mutex_unlock (tp-heap_mutex);
+  tt = tp;
+}
+#endif
   scm_i_pthread_mutex_unlock (thread_admin_mutex);
   scm_enter_guile ((scm_t_guile_ticket) SCM_I_CURRENT_THREAD);
 }
@@ -1721,6 +1752,28 @@ scm_init_threads_default_dynamic_state (
   scm_i_default_dynamic_state = scm_permanent_object (state);
 }

+#ifdef GUILE_DEBUG_LOCKS
+extern int guile_do_abort_on_badlock;
+
+void prt_one_lockholder(scm_i_thread *);
+void prt_lockholders(void)
+{
+  scm_i_thread *t;
+
+  if (!guile_do_abort_on_badlock) return;
+
+  for (t = all_threads; t; t = t-next_thread)
+{
+  prt_one_lockholder(t);
+}
+}
+
+void prt_this_lockholder(void)
+{
+  prt_one_lockholder(SCM_I_CURRENT_THREAD);
+}
+#endif
+
 void
 scm_init_thread_procs ()
 {
Index: guile-1.8.5/libguile/threads.h
===
--- guile-1.8.5.orig/libguile/threads.h 2008-11-16 18:57:19.0 -0600
+++ guile-1.8.5/libguile/threads.h  2008-11-16 18:57:48.0 -0600
@@ -108,6 +108,14 @@ typedef struct scm_i_thread {
   SCM_STACKITEM *top;
   jmp_buf regs;

+#ifdef GUILE_DEBUG_LOCKS
+#define LOCK_STACK_DEPTH 250
+#define TRACE_STACK_DEPTH 4
+  const char *lockname[LOCK_STACK_DEPTH];
+  char *lockholder[LOCK_STACK_DEPTH][TRACE_STACK_DEPTH];
+  pthread_mutex_t *lockmutex[LOCK_STACK_DEPTH];
+#endif
+
 }