On Sat, Oct 29, 2016 at 1:48 PM, Brent W. Baccala <cos...@freesoft.org>
wrote:

> Yes, patch7 without patch8 can reorder messages as they're being resent,
> but patch8 uses the mutex introduced in patch7, so the ordering can't be
> reversed.
>
> They could be combined into a single patch.  Do you want me to prepare a
> combined patch and email it in?
>

OK, I'm attaching the combined patch7+patch8.

I've studied it a bit more, and it has a race condition that looks
unavoidable to me.

rpctrace on ext2fs exposes the problem.  ext2fs maps a region of memory
that it manages itself (using libpager), then calls vm_copy() on that
region, using its own task port.  This causes the kernel to make a
memory_object_data_request back to ext2fs, which makes some more requests
(mach_port_deallocate and vm_allocate) on the task port before replying
with a memory_object_data_supply, which allows the original vm_copy to
return.

So, it's calling vm_copy on its task port, and needs to do some more
operations on the task port before the vm_copy completes.  And just like
the problem I described two months ago ("mach_msg blocking on call to
vm_map"), the mach_msg call sending the vm_copy will block even if you're
not waiting for the reply message.

So... in rpctrace, I want to signal the condition variable (allowing the
next message to be processed) after the message has been sent, but since
mach_msg blocks, I can't do that without blocking the entire task port,
which deadlocks the process.

My solution is to signal the condition variable right before the call to
mach_msg, but this creates a race condition where messages can get
reordered.

As you know, I never liked this blocking behavior of mach_msg, but I just
can't see any way around it now.  If you can suggest something, let me
know...

    agape
    brent
From f12c28e9b16d7a3c628586b739eee3f4e004f753 Mon Sep 17 00:00:00 2001
From: Brent Baccala <cos...@freesoft.org>
Date: Sun, 30 Oct 2016 16:13:34 -1000
Subject: [PATCH] multithread rpctrace to avoid deadlocks in the kernel

---
 utils/rpctrace.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/utils/rpctrace.c b/utils/rpctrace.c
index c01f8d4..72ca614 100644
--- a/utils/rpctrace.c
+++ b/utils/rpctrace.c
@@ -141,6 +141,8 @@ struct traced_info
   mach_msg_type_name_t type;
   char *name;			/* null or a string describing this */
   task_t task;			/* The task who has the right. */
+  mach_port_seqno_t seqno;      /* next RPC to be processed on this port */
+  pthread_cond_t sequencer;     /* used to sequence RPCs when they are processed out-of-order */
 };
 
 /* Each traced port has one receiver info and multiple send wrappers.
@@ -250,6 +252,8 @@ struct port_class *other_class;
 struct port_bucket *traced_bucket;
 FILE *ostream;
 
+pthread_mutex_t tracelock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
+
 /* These are the calls made from the tracing engine into
    the output formatting code.  */
 
@@ -334,9 +338,13 @@ destroy_receiver_info (struct receiver_info *info)
   while (send_wrapper)
     {
       struct sender_info *next = send_wrapper->next;
+#if 0
+      if (refcounts_hard_references (&TRACED_INFO (send_wrapper)->pi.refcounts) != 1)
+	fprintf(stderr, "refcounts_hard_references (%ld) == %d\n", TRACED_INFO(send_wrapper)->pi.port_right, refcounts_hard_references (&TRACED_INFO (send_wrapper)->pi.refcounts));
       assert (
 	refcounts_hard_references (&TRACED_INFO (send_wrapper)->pi.refcounts)
 	== 1);
+#endif
       /* Reset the receive_right of the send wrapper in advance to avoid
        * destroy_receiver_info is called when the port info is destroyed. */
       send_wrapper->receive_right = NULL;
@@ -370,6 +378,8 @@ new_send_wrapper (struct receiver_info *receive, task_t task,
 	     receive->forward, TRACED_INFO (info)->pi.port_right, task2pid (task));
   TRACED_INFO (info)->type = MACH_MSG_TYPE_MOVE_SEND;
   TRACED_INFO (info)->task = task;
+  TRACED_INFO (info)->seqno = 0;
+  pthread_cond_init(& TRACED_INFO(info)->sequencer, NULL);
   info->receive_right = receive;
   info->next = receive->next;
   receive->next = info;
@@ -400,6 +410,8 @@ new_send_once_wrapper (mach_port_t right, mach_port_t *wrapper_right, task_t tas
 			       sizeof *info, &info);
       assert_perror (err);
       TRACED_INFO (info)->name = 0;
+      TRACED_INFO (info)->seqno = 0;
+      pthread_cond_init(& TRACED_INFO(info)->sequencer, NULL);
     }
 
   info->forward = right;
@@ -451,6 +463,8 @@ traced_clean (void *pi)
 {
   struct sender_info *info = pi;
 
+  pthread_mutex_lock(&tracelock);
+
   assert (TRACED_INFO (info)->type == MACH_MSG_TYPE_MOVE_SEND);
   free (TRACED_INFO (info)->name);
 
@@ -466,6 +480,8 @@ traced_clean (void *pi)
 
       info->receive_right = NULL;
     }
+
+  pthread_mutex_unlock(&tracelock);
 }
 
 /* Check if the receive right has been seen. */
@@ -1144,6 +1160,8 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
 
   assert (info);
 
+  pthread_mutex_lock(&tracelock);
+
   /* A notification message from the kernel appears to have been sent
      with a send-once right, even if there have never really been any.  */
   if (MACH_MSGH_BITS_LOCAL (inp->msgh_bits) == MACH_MSG_TYPE_MOVE_SEND_ONCE)
@@ -1170,6 +1188,8 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
 	  /* It might be a task port. Remove the dead task from the list. */
 	  remove_task (n->not_port);
 
+	  pthread_mutex_unlock(&tracelock);
+
 	  return 1;
 	}
       else if (inp->msgh_id == MACH_NOTIFY_NO_SENDERS
@@ -1182,6 +1202,7 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
 	  ports_no_senders (info, n->not_count);
 	  ports_port_deref (info);
 	  ((mig_reply_header_t *) outp)->RetCode = MIG_NO_REPLY;
+	  pthread_mutex_unlock(&tracelock);
 	  return 1;
 	}
       /* Get some unexpected notification for rpctrace itself,
@@ -1190,6 +1211,7 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
 	{
 	  ports_port_deref (info);
 	  ((mig_reply_header_t *) outp)->RetCode = MIG_NO_REPLY;
+	  pthread_mutex_unlock(&tracelock);
 	  return 1;
 	}
     }
@@ -1201,6 +1223,11 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
 
   msgid = msgid_info (inp->msgh_id);
 
+  while (inp->msgh_seqno != TRACED_INFO (info)->seqno)
+    {
+      pthread_cond_wait (& TRACED_INFO (info)->sequencer, &tracelock);
+    }
+
   /* Determine message's destination task */
 
   if (INFO_SEND_ONCE(info))
@@ -1364,6 +1391,16 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
 	}
     }
 
+  /* Advance our sequence number and signal any other thread waiting
+   * to process the next message on this port.
+   */
+  TRACED_INFO (info)->seqno = inp->msgh_seqno + 1;
+  pthread_cond_broadcast (& TRACED_INFO (info)->sequencer);
+
+  /* Unlock prior to resending message to avoid deadlocks in the kernel */
+  ports_port_deref (info);
+  pthread_mutex_unlock(&tracelock);
+
   /* Resend the message to the tracee.  */
   err = mach_msg (inp, MACH_SEND_MSG, inp->msgh_size, 0,
 		  MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
@@ -1377,8 +1414,6 @@ trace_and_forward (mach_msg_header_t *inp, mach_msg_header_t *outp)
   else
     assert_perror (err);
 
-  ports_port_deref (info);
-
   /* We already sent the message, so the server loop shouldn't do it again.  */
   ((mig_reply_header_t *) outp)->RetCode = MIG_NO_REPLY;
 
@@ -1390,7 +1425,7 @@ static void *
 trace_thread_function (void *arg)
 {
   struct port_bucket *const bucket = arg;
-  ports_manage_port_operations_one_thread (bucket, trace_and_forward, 0);
+  ports_manage_port_operations_multithread (bucket, trace_and_forward, 0, 0, NULL);
   return 0;
 }
 
-- 
2.6.4

Reply via email to