wingo pushed a commit to branch wip-whippet
in repository guile.

commit 0939aff5f69b668bb44e8b987b812c6ab27061c4
Author: Andy Wingo <wi...@pobox.com>
AuthorDate: Wed Jul 16 09:09:07 2025 +0200

    Rework mutex/cond waiting to not allocate
    
    Fixes some deadlocks when trying to come to a safepoint.
    
    * libguile/threads-internal.h (struct scm_wait_queue): New type.  Embed
    it in threads, mutexes, and conds.
    * libguile/threads.c (enqueue, remqueue_inner, remqueue, dequeue):
    Rework to use embedded wait queue links instead of allocating.
    (block_self, unblock_from_queue, guilify_self_1)
    (scm_make_mutex_with_kind, lock_mutex, unlock_mutex, scm_mutex_owner)
    (scm_mutex_locked_p, scm_make_condition_variable, timed_wait)
    (scm_signal_condition_variable, scm_broadcast_condition_variable): Adapt
    to wait queue change.  Also, pin mutexes and conds.
---
 libguile/threads-internal.h |  37 +++++++-
 libguile/threads.c          | 209 +++++++++++++++++++-------------------------
 2 files changed, 122 insertions(+), 124 deletions(-)

diff --git a/libguile/threads-internal.h b/libguile/threads-internal.h
index 344e22f7b..84af674a9 100644
--- a/libguile/threads-internal.h
+++ b/libguile/threads-internal.h
@@ -43,9 +43,35 @@ struct scm_thread_wake_data
   scm_i_pthread_cond_t *cond;
 };
 
+/* A scm_wait_queue is a link in a circular linked-list.  A
+   scm_wait_queue can either be "attached", in which case its next and
+   prev links are non-null, or "detached", in which case they are both
+   null.  Condition variables and mutexes embed a wait queue as a list
+   head, and their queues are always attached.  An empty queue is one
+   whose next and prev links point to itself.  Threads embed a wait
+   queue as a link, which may be detached if the thread is not waiting
+   on anything, or attached to the wait queue of the cond or mutex that
+   it is waiting on.
+
+   GC-wise, if a thread is waiting on a cond or mutex, the thread will
+   be reachable because it is on the global thread list.  If a thread
+   owns a reachable mutex, it is reachable because of the mutex's owner
+   field.  Any thread waiting for a mutex/cond has an active stack and
+   that mutex/cond is on the stack, so the mutex/cond will be reachable.
+
+   Because of the interior pointers in wait queues, we pin conds and
+   mutexes when they are allocated, even if there is no one waiting for
+   them.  (scm_thread is always pinned as well, for other reasons.)  */
+struct scm_wait_queue {
+  scm_thread *thread;
+  struct scm_wait_queue *next;
+  struct scm_wait_queue *prev;
+};
+
 struct scm_thread {
   scm_t_bits tag;
 
+  /* Link for global set of threads.  */
   struct scm_thread *next_thread;
 
   /* VM state for this thread.  */
@@ -71,6 +97,9 @@ struct scm_thread {
      this thread exits.  */
   int needs_unregister;
 
+  /* Queue links for when a thread is waiting on a mutex or a cond.  */
+  struct scm_wait_queue queue;
+
   struct scm_thread_wake_data wake_data;
   scm_i_pthread_cond_t sleep_cond;
   int sleep_pipe[2];
@@ -161,10 +190,10 @@ enum scm_mutex_kind {
 
 struct scm_mutex {
   scm_t_bits tag_and_flags;
-  /* The thread that owns this mutex, or #f if the mutex is unlocked.  */
-  SCM owner;
+  /* The thread that owns this mutex, or NULL if the mutex is unlocked.  */
+  scm_thread *owner;
   /* Queue of threads waiting for this mutex.  */
-  SCM waiting;
+  struct scm_wait_queue waiting;
   /* For SCM_MUTEX_RECURSIVE (and only SCM_MUTEX_RECURSIVE), the
      recursive lock count.  The first lock does not count.  */
   int level;
@@ -194,7 +223,7 @@ scm_mutex_kind (struct scm_mutex *m)
 
 struct scm_cond {
   scm_t_bits tag;
-  SCM waiting;               /* the threads waiting for this condition. */
+  struct scm_wait_queue waiting;  /* the threads waiting for this condition. */
   /* FIXME: Using one cond with multiple mutexes may race on the waiting
      list.  */
 };
diff --git a/libguile/threads.c b/libguile/threads.c
index 1bb6e4559..7aefa09bc 100644
--- a/libguile/threads.c
+++ b/libguile/threads.c
@@ -136,94 +136,67 @@ to_timespec (SCM t, scm_t_timespec *waittime)
 
 /*** Queues */
 
-/* Note: We annotate with "GC-robust" assignments whose purpose is to avoid
-   the risk of false references leading to unbounded retained space as
-   described in "Bounding Space Usage of Conservative Garbage Collectors",
-   H.J. Boehm, 2001.  */
-
-/* Make an empty queue data structure.
- */
-static SCM
-make_queue ()
-{
-  return scm_cons (SCM_EOL, SCM_EOL);
-}
-
 static scm_i_pthread_mutex_t queue_lock = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 
-/* Put T at the back of Q and return a handle that can be used with
-   remqueue to remove T from Q again.
- */
-static SCM
-enqueue (SCM q, SCM t)
+static void
+enqueue (struct scm_wait_queue *q, scm_thread *t)
 {
-  SCM c = scm_cons (t, SCM_EOL);
+  struct scm_wait_queue *p = &t->queue;
+  if (p->next)
+    abort ();
+
   scm_i_pthread_mutex_lock (&queue_lock);
-  if (scm_is_null (SCM_CDR (q)))
-    SCM_SETCDR (q, c);
-  else
-    SCM_SETCDR (SCM_CAR (q), c);
-  SCM_SETCAR (q, c);
+  p->next = q->next;
+  p->prev = q;
+  q->next->prev = p;
+  q->next = p;
   scm_i_pthread_mutex_unlock (&queue_lock);
-  return c;
 }
 
-/* Remove the element that the handle C refers to from the queue Q.  C
-   must have been returned from a call to enqueue.  The return value
-   is zero when the element referred to by C has already been removed.
-   Otherwise, 1 is returned.
-*/
 static int
-remqueue (SCM q, SCM c)
+remqueue_inner (struct scm_wait_queue *p)
 {
-  SCM p, prev = q;
-  scm_i_pthread_mutex_lock (&queue_lock);
-  for (p = SCM_CDR (q); !scm_is_null (p); p = SCM_CDR (p))
-    {
-      if (scm_is_eq (p, c))
-       {
-         if (scm_is_eq (c, SCM_CAR (q)))
-            SCM_SETCAR (q, scm_is_eq (prev, q) ? SCM_EOL : prev);
-         SCM_SETCDR (prev, SCM_CDR (c));
+  if (!p->next)
+    return 0;
 
-         /* GC-robust */
-         SCM_SETCDR (c, SCM_EOL);
+  if (p->next == p)
+    abort ();
 
-          scm_i_pthread_mutex_unlock (&queue_lock);
-         return 1;
-       }
-      prev = p;
-    }
-  scm_i_pthread_mutex_unlock (&queue_lock);
-  return 0;
+  p->next->prev = p->prev;
+  p->prev->next = p->next;
+
+  p->next = p->prev = NULL;
+
+  return 1;
 }
 
-/* Remove the front-most element from the queue Q and return it.
-   Return SCM_BOOL_F when Q is empty.
-*/
-static SCM
-dequeue (SCM q)
+static int
+remqueue (scm_thread *t)
 {
-  SCM c;
   scm_i_pthread_mutex_lock (&queue_lock);
-  c = SCM_CDR (q);
-  if (scm_is_null (c))
+  int ret = remqueue_inner (&t->queue);
+  scm_i_pthread_mutex_unlock (&queue_lock);
+
+  return ret;
+}
+
+static scm_thread *
+dequeue (struct scm_wait_queue *q)
+{
+  scm_i_pthread_mutex_lock (&queue_lock);
+  struct scm_wait_queue *last = q->prev;
+
+  if (last == q)
     {
       scm_i_pthread_mutex_unlock (&queue_lock);
-      return SCM_BOOL_F;
+      return NULL;
     }
-  else
-    {
-      SCM_SETCDR (q, SCM_CDR (c));
-      if (scm_is_null (SCM_CDR (q)))
-       SCM_SETCAR (q, SCM_EOL);
-      scm_i_pthread_mutex_unlock (&queue_lock);
 
-      /* GC-robust */
-      SCM_SETCDR (c, SCM_EOL);
+  if (last->thread == NULL || !remqueue_inner (last))
+    abort ();
+  scm_i_pthread_mutex_unlock (&queue_lock);
 
-      return SCM_CAR (c);
-    }
+  return last->thread;
 }
 
 int
@@ -287,18 +260,17 @@ scm_i_print_thread (SCM exp, SCM port, scm_print_state *pstate SCM_UNUSED)
    The system asyncs themselves are not executed by block_self.
 */
 static int
-block_self (SCM queue, scm_i_pthread_mutex_t *mutex,
+block_self (struct scm_wait_queue *queue, scm_i_pthread_mutex_t *mutex,
            const scm_t_timespec *waittime)
 {
   scm_thread *t = SCM_I_CURRENT_THREAD;
-  SCM q_handle;
   int err;
 
   if (scm_i_prepare_to_wait_on_cond (t, mutex, &t->sleep_cond))
     return EINTR;
 
   t->block_asyncs++;
-  q_handle = enqueue (queue, scm_thread_handle (t));
+  enqueue (queue, t);
   gc_deactivate (t->mutator);
   if (waittime == NULL)
     err = scm_i_scm_pthread_cond_wait (&t->sleep_cond, mutex);
@@ -310,7 +282,7 @@ block_self (SCM queue, scm_i_pthread_mutex_t *mutex,
      report this only when no other error (such as a timeout) has
      happened above.
   */
-  if (remqueue (queue, q_handle) && err == 0)
+  if (remqueue (t) && err == 0)
     err = EINTR;
   t->block_asyncs--;
   scm_i_wait_finished (t);
@@ -319,14 +291,14 @@ block_self (SCM queue, scm_i_pthread_mutex_t *mutex,
 }
 
 /* Wake up the first thread on QUEUE, if any.  The awoken thread is
-   returned, or #f if the queue was empty.
+   returned, or NULL if the queue was empty.
  */
-static SCM
-unblock_from_queue (SCM queue)
+static scm_thread *
+unblock_from_queue (struct scm_wait_queue *queue)
 {
-  SCM thread = dequeue (queue);
-  if (scm_is_true (thread))
-    scm_i_pthread_cond_signal (&SCM_I_THREAD_DATA(thread)->sleep_cond);
+  scm_thread *thread = dequeue (queue);
+  if (thread)
+    scm_i_pthread_cond_signal (&thread->sleep_cond);
   return thread;
 }
 
@@ -380,6 +352,8 @@ guilify_self_1 (struct gc_mutator *mut, struct gc_stack_addr base,
   t->pending_asyncs = SCM_EOL;
   t->block_asyncs = 1;
   t->mutator = mut;
+  t->queue.thread = t;
+  t->queue.next = t->queue.prev = NULL;
   t->base = (SCM_STACKITEM *) gc_stack_addr_as_pointer (base);
   t->continuation_root = SCM_EOL;
   t->continuation_base = t->base;
@@ -939,15 +913,18 @@ SCM_DEFINE (scm_make_mutex_with_kind, "make-mutex", 0, 1, 0,
        SCM_MISC_ERROR ("unsupported mutex kind: ~a", scm_list_1 (kind));
     }
 
-  m = scm_allocate_tagged (SCM_I_CURRENT_THREAD, sizeof (struct scm_mutex));
+  scm_thread *thr = SCM_I_CURRENT_THREAD;
+  m = scm_allocate_tagged (thr, sizeof (struct scm_mutex));
   m->tag_and_flags = scm_tc16_mutex | (mkind << 16);
-  m->owner = SCM_BOOL_F;
-  m->waiting = make_queue ();
+  m->owner = NULL;
+  m->waiting.prev = m->waiting.next = &m->waiting;
   m->level = 0;
   /* Because PTHREAD_MUTEX_INITIALIZER is static, it's plain old data,
      and so we can just copy it.  */
   memcpy (&m->lock, &lock, sizeof (m->lock));
 
+  scm_gc_pin_object (thr, scm_from_mutex (m));
+
   return scm_from_mutex (m);
 }
 #undef FUNC_NAME
@@ -980,21 +957,19 @@ lock_mutex (enum scm_mutex_kind kind, struct scm_mutex *m,
 {
   scm_i_scm_pthread_mutex_lock (&m->lock);
 
-  if (scm_is_eq (m->owner, SCM_BOOL_F))
+  if (!m->owner)
     {
-      m->owner = scm_thread_handle (current_thread);
+      m->owner = current_thread;
       scm_i_pthread_mutex_unlock (&m->lock);
       return SCM_BOOL_T;
     }
-  else if (kind == SCM_MUTEX_RECURSIVE &&
-           scm_is_eq (m->owner, scm_thread_handle (current_thread)))
+  else if (kind == SCM_MUTEX_RECURSIVE && m->owner == current_thread)
     {
       m->level++;
       scm_i_pthread_mutex_unlock (&m->lock);
       return SCM_BOOL_T;
     }
-  else if (kind == SCM_MUTEX_STANDARD &&
-           scm_is_eq (m->owner, scm_thread_handle (current_thread)))
+  else if (kind == SCM_MUTEX_STANDARD && m->owner == current_thread)
     {
       scm_i_pthread_mutex_unlock (&m->lock);
       SCM_MISC_ERROR ("mutex already locked by thread", SCM_EOL);
@@ -1002,7 +977,7 @@ lock_mutex (enum scm_mutex_kind kind, struct scm_mutex *m,
   else
     while (1)
       {
-        int err = block_self (m->waiting, &m->lock, waittime);
+        int err = block_self (&m->waiting, &m->lock, waittime);
 
         if (err == 0)
           {
@@ -1029,9 +1004,9 @@ lock_mutex (enum scm_mutex_kind kind, struct scm_mutex *m,
           }
 
       maybe_acquire:
-        if (scm_is_eq (m->owner, SCM_BOOL_F))
+        if (!m->owner)
           {
-            m->owner = scm_thread_handle (current_thread);
+            m->owner = current_thread;
             scm_i_pthread_mutex_unlock (&m->lock);
             return SCM_BOOL_T;
           }
@@ -1116,9 +1091,9 @@ unlock_mutex (enum scm_mutex_kind kind, struct scm_mutex *m,
 {
   scm_i_scm_pthread_mutex_lock (&m->lock);
 
-  if (!scm_is_eq (m->owner, scm_thread_handle (current_thread)))
+  if (m->owner != current_thread)
     {
-      if (scm_is_eq (m->owner, SCM_BOOL_F))
+      if (!m->owner)
         {
           scm_i_pthread_mutex_unlock (&m->lock);
           SCM_MISC_ERROR ("mutex not locked", SCM_EOL);
@@ -1135,9 +1110,9 @@ unlock_mutex (enum scm_mutex_kind kind, struct scm_mutex *m,
     m->level--;
   else
     {
-      m->owner = SCM_BOOL_F;
+      m->owner = NULL;
       /* Wake up one waiter.  */
-      unblock_from_queue (m->waiting);
+      unblock_from_queue (&m->waiting);
     }
 
   scm_i_pthread_mutex_unlock (&m->lock);
@@ -1193,16 +1168,13 @@ SCM_DEFINE (scm_mutex_owner, "mutex-owner", 1, 0, 0,
            "Return the thread owning @var{mx}, or @code{#f}.")
 #define FUNC_NAME s_scm_mutex_owner
 {
-  SCM owner;
-  struct scm_mutex *m = NULL;
-
   SCM_VALIDATE_MUTEX (1, mx);
-  m = scm_to_mutex (mx);
+  struct scm_mutex *m = scm_to_mutex (mx);
   scm_i_pthread_mutex_lock (&m->lock);
-  owner = m->owner;
+  scm_thread *owner = m->owner;
   scm_i_pthread_mutex_unlock (&m->lock);
 
-  return owner;
+  return owner ? scm_thread_handle (owner) : SCM_BOOL_F;
 }
 #undef FUNC_NAME
 
@@ -1215,7 +1187,7 @@ SCM_DEFINE (scm_mutex_level, "mutex-level", 1, 0, 0,
   struct scm_mutex *m = scm_to_mutex (mx);
   if (scm_mutex_kind (m) == SCM_MUTEX_RECURSIVE)
     return scm_from_int (m->level + 1);
-  else if (scm_is_eq (m->owner, SCM_BOOL_F))
+  else if (!m->owner)
     return SCM_INUM0;
   else
     return SCM_INUM1;
@@ -1228,10 +1200,7 @@ SCM_DEFINE (scm_mutex_locked_p, "mutex-locked?", 1, 0, 0,
 #define FUNC_NAME s_scm_mutex_locked_p
 {
   SCM_VALIDATE_MUTEX (1, mx);
-  if (scm_is_eq (scm_to_mutex (mx)->owner, SCM_BOOL_F))
-    return SCM_BOOL_F;
-  else
-    return SCM_BOOL_T;
+  return scm_from_bool (scm_to_mutex (mx)->owner != NULL);
 }
 #undef FUNC_NAME
 
@@ -1254,10 +1223,11 @@ SCM_DEFINE (scm_make_condition_variable, "make-condition-variable", 0, 0, 0,
            "Make a new condition variable.")
 #define FUNC_NAME s_scm_make_condition_variable
 {
-  struct scm_cond *c =
-    scm_allocate_tagged (SCM_I_CURRENT_THREAD, sizeof (struct scm_cond));
+  scm_thread *thr = SCM_I_CURRENT_THREAD;
+  struct scm_cond *c = scm_allocate_tagged (thr, sizeof (struct scm_cond));
   c->tag = scm_tc16_condition_variable;
-  c->waiting = make_queue ();
+  c->waiting.prev = c->waiting.next = &c->waiting;
+  scm_gc_pin_object (thr, scm_from_condvar (c));
   return scm_from_condvar (c);
 }
 #undef FUNC_NAME
@@ -1269,9 +1239,9 @@ timed_wait (enum scm_mutex_kind kind, struct scm_mutex *m, struct scm_cond *c,
 {
   scm_i_scm_pthread_mutex_lock (&m->lock);
 
-  if (!scm_is_eq (m->owner, scm_thread_handle (current_thread)))
+  if (m->owner != current_thread)
     {
-      if (scm_is_eq (m->owner, SCM_BOOL_F))
+      if (!m->owner)
         {
           scm_i_pthread_mutex_unlock (&m->lock);
           SCM_MISC_ERROR ("mutex not locked", SCM_EOL);
@@ -1293,14 +1263,14 @@ timed_wait (enum scm_mutex_kind kind, struct scm_mutex *m, struct scm_cond *c,
         m->level--;
       else
         {
-          m->owner = SCM_BOOL_F;
+          m->owner = NULL;
           /* Wake up one waiter.  */
-          unblock_from_queue (m->waiting);
+          unblock_from_queue (&m->waiting);
         }
 
       /* Wait for someone to signal the cond, a timeout, or an
          interrupt.  */
-      err = block_self (c->waiting, &m->lock, waittime);
+      err = block_self (&c->waiting, &m->lock, waittime);
 
       /* We woke up for some reason.  Reacquire the mutex before doing
          anything else.
@@ -1320,8 +1290,7 @@ timed_wait (enum scm_mutex_kind kind, struct scm_mutex *m, struct scm_cond *c,
          not great.  Maybe it's necessary but for now we just disable
          interrupts while reaquiring a mutex after a wait.  */
       current_thread->block_asyncs++;
-      if (kind == SCM_MUTEX_RECURSIVE &&
-          scm_is_eq (m->owner, scm_thread_handle (current_thread)))
+      if (kind == SCM_MUTEX_RECURSIVE && m->owner == current_thread)
        {
           m->level++;
           scm_i_pthread_mutex_unlock (&m->lock);
@@ -1329,13 +1298,13 @@ timed_wait (enum scm_mutex_kind kind, struct scm_mutex *m, struct scm_cond *c,
       else
         while (1)
           {
-            if (scm_is_eq (m->owner, SCM_BOOL_F))
+            if (!m->owner)
               {
-                m->owner = scm_thread_handle (current_thread);
+                m->owner = current_thread;
                 scm_i_pthread_mutex_unlock (&m->lock);
                 break;
               }
-            block_self (m->waiting, &m->lock, waittime);
+            block_self (&m->waiting, &m->lock, waittime);
           }
       current_thread->block_asyncs--;
 
@@ -1418,7 +1387,7 @@ SCM_DEFINE (scm_signal_condition_variable, "signal-condition-variable", 1, 0, 0,
   struct scm_cond *c;
   SCM_VALIDATE_CONDVAR (1, cv);
   c = scm_to_condvar (cv);
-  unblock_from_queue (c->waiting);
+  unblock_from_queue (&c->waiting);
   return SCM_BOOL_T;
 }
 #undef FUNC_NAME
@@ -1431,7 +1400,7 @@ SCM_DEFINE (scm_broadcast_condition_variable, "broadcast-condition-variable", 1,
   struct scm_cond *c;
   SCM_VALIDATE_CONDVAR (1, cv);
   c = scm_to_condvar (cv);
-  while (scm_is_true (unblock_from_queue (c->waiting)))
+  while (unblock_from_queue (&c->waiting))
     ;
   return SCM_BOOL_T;
 }
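
For readers following the patch, here is a minimal standalone sketch of the
intrusive circular wait queue it introduces.  This is illustrative only, not
Guile code: the names (wait_link, waiter, queue_init) are invented, and the
real implementation additionally serializes every operation with the global
queue_lock, signals the dequeued thread's sleep_cond on wakeup, and pins the
objects that embed the links so the interior pointers stay valid under GC.

/* Sketch of the intrusive circular wait queue used by the patch.
   Names are invented; there is no locking, no condition variable,
   and no GC pinning here.  */

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct waiter;

struct wait_link
{
  struct waiter *waiter;       /* NULL for a list head */
  struct wait_link *next;      /* next and prev both NULL when detached */
  struct wait_link *prev;
};

struct waiter
{
  const char *name;
  struct wait_link link;       /* embedded link: waiting allocates nothing */
};

/* A head is always attached; an empty queue points at itself.  */
static void
queue_init (struct wait_link *head)
{
  head->waiter = NULL;
  head->next = head->prev = head;
}

/* Splice W's embedded link in just after HEAD, so the oldest waiter
   ends up at head->prev and wakeups are FIFO.  */
static void
enqueue (struct wait_link *head, struct waiter *w)
{
  struct wait_link *p = &w->link;
  assert (p->next == NULL);    /* must be detached */
  p->next = head->next;
  p->prev = head;
  head->next->prev = p;
  head->next = p;
}

/* Unlink P if it is still attached; return 1 if this call removed it.  */
static int
remqueue (struct wait_link *p)
{
  if (!p->next)
    return 0;                  /* already removed, e.g. by a waker */
  p->next->prev = p->prev;
  p->prev->next = p->next;
  p->next = p->prev = NULL;
  return 1;
}

/* Pop the oldest waiter, or return NULL if the queue is empty.  */
static struct waiter *
dequeue (struct wait_link *head)
{
  struct wait_link *last = head->prev;
  if (last == head)
    return NULL;
  remqueue (last);
  return last->waiter;
}

int
main (void)
{
  struct wait_link q;
  struct waiter a = { "a", { &a, NULL, NULL } };
  struct waiter b = { "b", { &b, NULL, NULL } };

  queue_init (&q);
  enqueue (&q, &a);
  enqueue (&q, &b);
  printf ("%s\n", dequeue (&q)->name);   /* a: first in, first out */
  printf ("%s\n", dequeue (&q)->name);   /* b */
  assert (dequeue (&q) == NULL);
  return 0;
}

Because the link lives inside struct scm_thread, blocking on a mutex or cond
no longer has to cons a queue cell, which is what lets a blocking thread
reach a safepoint without allocating.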
