Re: crash in gc with upside-down stack

2008-11-13 Thread Linas Vepstas
Attached below is a debugging patch, and its output,
which shows that the stack bounds are frequently
upside-down, and are sometimes upside-down
when the GC runs, thus leading to a crash.

In the next email, I'll propose a patch that fixes the
problem.

The original problem report:

 2008/11/11 Linas Vepstas [EMAIL PROTECTED]:

 My stack below.

 Program received signal SIGSEGV, Segmentation fault.
 [Switching to Thread 0xf5333b90 (LWP 20587)]
 0xf7711ce3 in scm_mark_locations (x=0xf5333110, n=4294966782) at 
 gc-mark.c:435
 435   SCM obj = * (SCM *) x[m];
 Current language:  auto; currently c
 (gdb) bt
 #0  0xf7711ce3 in scm_mark_locations (x=0xf5333110, n=4294966782)
at gc-mark.c:435
 #1  0xf7766a12 in scm_threads_mark_stacks () at threads.c:1375
 #2  0xf7711d38 in scm_mark_all () at gc-mark.c:82
 #3  0xf7710d33 in scm_i_gc (what=0xf778602e cells) at gc.c:598


A debugging patch. Yes, it's ugly; it's intentionally ugly.
More of an eye-catcher that way.

Index: guile-1.8.5/libguile/threads.c
===
--- guile-1.8.5.orig/libguile/threads.c 2008-11-13 07:58:22.0 -0600
+++ guile-1.8.5/libguile/threads.c  2008-11-13 13:14:00.0 -0600
@@ -395,6 +395,10 @@ static scm_t_guile_ticket
 scm_leave_guile ()
 {
   scm_i_thread *t = suspend ();
+int sz=t-base - t-top;
+if(0sz) {
+printf(duuude scm_leav_guile backwards stack %d\n, sz);
+}
   scm_i_pthread_mutex_unlock (t-heap_mutex);
   return (scm_t_guile_ticket) t;
 }
@@ -694,7 +698,15 @@ scm_i_with_guile_and_parent (void *(*fun
   really_entered = scm_i_init_thread_for_guile (base_item, parent);
   res = scm_c_with_continuation_barrier (func, data);
   if (really_entered)
-scm_leave_guile ();
+{
+// scm_leave_guile ();
+scm_i_thread * t = (scm_i_thread *) scm_leave_guile ();
+int sz=t-base - t-top;
+int szb=t-base - base_item;
+if(0sz) {
+printf(duuude scm_leav_guile and parent %d %d\n, sz, szb);
+}
+}
   return res;
 }

@@ -704,6 +716,11 @@ scm_without_guile (void *(*func)(void *)
   void *res;
   scm_t_guile_ticket t;
   t = scm_leave_guile ();
+scm_i_thread * s = (scm_i_thread *) t;
+int sz=s-base - s-top;
+if(0sz) {
+printf(duuude scm_wo guile %d\n, sz);
+}
   res = func (data);
   scm_enter_guile (t);
   return res;
@@ -1371,8 +1388,15 @@ scm_threads_mark_stacks (void)

 #if SCM_STACK_GROWS_UP
   scm_mark_locations (t-base, t-top - t-base);
+
 #else
+int sz=t-base - t-top;
+if(0=sz) {
   scm_mark_locations (t-top, t-base - t-top);
+} else {
+printf (duude bugg!!\n);
+printf (duude stack top=%p base=%p sz=%d\n, t-top, t-base,
t-base - t-top);
+}
 #endif
   scm_mark_locations ((SCM_STACKITEM *) t-regs,
  ((size_t) sizeof(t-regs)
@@ -1441,6 +1465,11 @@ int
 scm_pthread_mutex_lock (scm_i_pthread_mutex_t *mutex)
 {
   scm_t_guile_ticket t = scm_leave_guile ();
+scm_i_thread * s = (scm_i_thread *) t;
+int sz=s-base - s-top;
+if(0sz) {
+printf(duuude scm_mutexe %d\n, sz);
+}
   int res = scm_i_pthread_mutex_lock (mutex);
   scm_enter_guile (t);
   return res;
@@ -1463,6 +1492,11 @@ int
 scm_pthread_cond_wait (scm_i_pthread_cond_t *cond,
scm_i_pthread_mutex_t *mutex)
 {
   scm_t_guile_ticket t = scm_leave_guile ();
+scm_i_thread * s = (scm_i_thread *) t;
+int sz=s-base - s-top;
+if(0sz) {
+printf(duuude scm_conde %d\n, sz);
+}
   int res = scm_i_pthread_cond_wait (cond, mutex);
   scm_enter_guile (t);
   return res;
@@ -1578,7 +1612,12 @@ scm_i_thread_put_to_sleep ()
 {
   scm_i_thread *t;

-  scm_leave_guile ();
+  // scm_leave_guile ();
+   t = (scm_i_thread *) scm_leave_guile ();
+int sz=t-base - t-top;
+if(0sz) {
+printf(duuude scm_leav_guile backwards was scm_i_thread_put_to_sleep
%d\n, sz);
+}
   scm_i_pthread_mutex_lock (thread_admin_mutex);

   /* Signal all threads to go to sleep
@@ -1620,6 +1659,10 @@ void
 scm_i_thread_sleep_for_gc ()
 {
   scm_i_thread *t = suspend ();
+int sz=t-base - t-top;
+if(0sz) {
+printf(duuude scm_i_thread_sleep_for_gc backwards stack %d\n, sz);
+}
   scm_i_pthread_cond_wait (wake_up_cond, t-heap_mutex);
   resume (t);
 }


Here is an example of the output generated:

duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile and parent -54 -76
duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile and parent -54 -76
duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile and parent -54 -76
duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile and parent -54 -76
duude bugg!!
duude stack top=0xf355b9e0 base=0xf355b908 sz=-54
duude bugg!!
duude stack top=0xf355b9e0 base=0xf355b908 sz=-54
duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile and parent -54 -76
duuude scm_leav_guile backwards stack -54
duuude scm_leav_guile and parent -54 -76
duuude scm_leav_guile backwards stack -54

Re: Does anyone actually use threads with guile?

2008-11-13 Thread Andy Wingo
Hi Linas,

[cc'ing them all, but followups to just one please -- bug-guile is
probably best]

For my part I apologize for not having the cycles to poke this more
thoroughly. Fortunately you are a good programmer and can figure things
out :)

History: pthread support was new with 1.8, as you probably know. A
number of people have used it, but it seems that perhaps you are
pounding it more than most.

On Thu 13 Nov 2008 05:56, Linas Vepstas [EMAIL PROTECTED] writes:

 Basically, at any given time, some thread might be
 in a critical section. Some other thread may be
 throwing an error for some utterly unrelated reason.
 Yet, when the error is thrown, this critical section
 check will trip, and it will do so for an utterly bogus
 reason.  At least, that describes my case.

 Is there any reason at all not to remove this check
 entirely? (at  libguile/throw.c line 695.)

I think the idea behind the check sounds good -- it is incorrect to
throw from within a critical section, and the check detects this.

But the check is incorrect as you noticed, it should be checking if the
current thread is in a critical section.

So we have two options, to remove the check or to fix the critical
section counter (possibly to be thread-local). My instinct would be
that removal is the correct solution in 1.8, and that in master the
decision depends on whether we merge the BDW GC or not, and what the
impact that GC has on the idea of critical sections.

More input appreciated.

Cheers,

Andy
-- 
http://wingolog.org/




[PATCH] fix for Re: crash in gc with upside-down stack

2008-11-13 Thread Linas Vepstas
Patch below; I'm also attaching the same patch, in case
gmail is scrambling this thing :-/  Also, I've long had a
generic assignment on file with the FSF.

--linas

The patch below fixes a crash during garbage collection, where, during
the mark-stack phase, the top and bottom of the stack are found to be
in backwards order, typically because scm_with_guile() was called when
the stack is much shorter than when a thread was first guilified. That
is, the stack base pointer is stale, and can be inverted from the stack
top. If GC runs due to activity in some other thread, the stale base
pointer leads to the crash (as base-top is approximately 2^32 or 2^64).

A typical symptom of this bug, on a 32-bit system, is:

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0xf5333b90 (LWP 20587)]
0xf7711ce3 in scm_mark_locations (x=0xf5333110, n=4294966782) at gc-mark.c:435
435   SCM obj = * (SCM *) x[m];
Current language:  auto; currently c
(gdb) bt
#0  0xf7711ce3 in scm_mark_locations (x=0xf5333110, n=4294966782) at
gc-mark.c:435
#1  0xf7766a12 in scm_threads_mark_stacks () at threads.c:1375

Notice that 4294966782 == 0xfffffdfe == -0x202 (i.e. -514 as a signed 32-bit value)

Please apply in time for guile-1.8.6!

Signed-off-by: Linas Vepstas [EMAIL PROTECTED]

---
 libguile/threads.c |   19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

Index: guile-1.8.5/libguile/threads.c
===
--- guile-1.8.5.orig/libguile/threads.c 2008-11-13 15:17:12.0 -0600
+++ guile-1.8.5/libguile/threads.c  2008-11-13 15:32:07.0 -0600
@@ -577,9 +577,24 @@ scm_i_init_thread_for_guile (SCM_STACKIT
   /* This thread is already guilified but not in guile mode, just
 resume it.

-XXX - base might be lower than when this thread was first
-guilified.
+ A user call to scm_with_guile() will lead us to here. This
+ could happen anywhere on the stack, and in particular, the
+ stack can be *much* shorter than what it was when this thread
+ was first guilified. This will typically happen in
+ on_thread_exit(), where the stack is *always* shorter than
+ when the thread was first guilified. If the GC happens to
+ get triggered due to some other thread, we'd end up with
+ t->top upside-down w.r.t. t->base, which will result in
+ chaos in scm_threads_mark_stacks() when top-base=2^32 or 2^64.
+ Thus, reset the base, if needed.
*/
+#if SCM_STACK_GROWS_UP
+  if (base < t->base)
+ t->base = base;
+#else
+  if (base > t->base)
+ t->base = base;
+#endif
   scm_enter_guile ((scm_t_guile_ticket) t);
   return 1;
 }
The patch below fixes a crash during garbage collection, where, during
the mark-stack phase, the top and bottom of the stack are found to be 
in backwards order, typically because scm_with_guile() was called when
the stack is much shorter than when a thread was first guilified. That
is, the stack base pointer is stale, and can be inverted from the stack
top. If GC runs due to activity in some other thread, the stale base
pointer leads to the crash (as base-top is approximately 2^32 or 2^64).

A typical symptom of this bug, on a 32-bit system, is:

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0xf5333b90 (LWP 20587)]
0xf7711ce3 in scm_mark_locations (x=0xf5333110, n=4294966782) at gc-mark.c:435
435   SCM obj = * (SCM *) x[m];
Current language:  auto; currently c
(gdb) bt
#0  0xf7711ce3 in scm_mark_locations (x=0xf5333110, n=4294966782) at gc-mark.c:435
#1  0xf7766a12 in scm_threads_mark_stacks () at threads.c:1375

Notice that 4294966782 == 0xfffffdfe == -0x202 (i.e. -514 as a signed 32-bit value)

Please apply in time for guile-1.8.6!

Signed-off-by: Linas Vepstas [EMAIL PROTECTED]

---
 libguile/threads.c |   19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

Index: guile-1.8.5/libguile/threads.c
===
--- guile-1.8.5.orig/libguile/threads.c	2008-11-13 15:17:12.0 -0600
+++ guile-1.8.5/libguile/threads.c	2008-11-13 15:32:07.0 -0600
@@ -577,9 +577,24 @@ scm_i_init_thread_for_guile (SCM_STACKIT
   /* This thread is already guilified but not in guile mode, just
 	 resume it.
 	 
-	 XXX - base might be lower than when this thread was first
-	 guilified.
+ A user call to scm_with_guile() will lead us to here. This
+ could happen anywhere on the stack, and in particular, the
+ stack can be *much* shorter than what it was when this thread
+ was first guilified. This will typically happen in
+ on_thread_exit(), where the stack is *always* shorter than
+ when the thread was first guilified. If the GC happens to
+ get triggered due to some other thread, we'd end up with
+ t->top upside-down w.r.t. t->base, which will result in
+ chaos in 

Re: Does anyone actually use threads with guile?

2008-11-13 Thread Linas Vepstas
2008/11/13 Andy Wingo [EMAIL PROTECTED]:
 For my part I apologize for not having the cycles

Fine, I'm hacking around it for now, but would like to see
something for 1.8.6.

 On Thu 13 Nov 2008 05:56, Linas Vepstas [EMAIL PROTECTED] writes:

 Basically, at any given time, some thread might be
 in a critical section. Some other thread may be
 throwing an error for some utterly unrelated reason.
 Yet, when the error is thrown, this critical section
 check will trip, and it will do so for an utterly bogus
 reason.  At least, that describes my case.

 Is there any reason at all not to remove this check
 entirely? (at  libguile/throw.c line 695.)

 I think the idea behind the check sounds good -- it is incorrect to
 throw from within a critical section, and the check detects this.

 But the check is incorrect as you noticed, it should be checking if the
 current thread is in a critical section.

The patch below does this.

I do not understand how 'async' fits into the grand scheme
of things. From what I can tell, though, there won't be any
cases where the thrower will be in a different thread than
where scm_ithrow() will run.  So the patch should be good.

--linas

---
 libguile/throw.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: guile-1.8.5/libguile/throw.c
===
--- guile-1.8.5.orig/libguile/throw.c  2008-11-13 16:02:26.0 -0600
+++ guile-1.8.5/libguile/throw.c 2008-11-13 16:29:46.0 -0600
@@ -689,7 +689,7 @@ scm_ithrow (SCM key, SCM args, int noret
   SCM dynpair = SCM_UNDEFINED;
   SCM winds;

-  if (scm_i_critical_section_level)
+  if (SCM_I_CURRENT_THREAD-block_asyncs)
 {
   fprintf (stderr, throw from within critical section.\n);
   abort ();