Module Name:    src
Committed By:   ad
Date:           Wed Jun  3 22:10:24 UTC 2020

Modified Files:
        src/lib/libpthread: pthread.c pthread_cond.c pthread_mutex.c

Log Message:
Deal with a couple of problems with threads being awoken early due to
timeouts or cancellation where:

- The restarting thread calls _lwp_exit() before another thread gets around
  to waking it with _lwp_unpark(), leading to ESRCH (observed by joerg@).
  (I may have removed a similar check mistakenly over the weekend.)

- The restarting thread considers itself gone off the sleep queue but
  at the same time another thread is part way through waking it, and hasn't
  fully completed that operation yet by setting thread->pt_mutexwait = 0.
  I think that could have potentially lead to the list of waiters getting
  messed up given the right circumstances.


To generate a diff of this commit:
cvs rdiff -u -r1.172 -r1.173 src/lib/libpthread/pthread.c
cvs rdiff -u -r1.70 -r1.71 src/lib/libpthread/pthread_cond.c
cvs rdiff -u -r1.78 -r1.79 src/lib/libpthread/pthread_mutex.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libpthread/pthread.c
diff -u src/lib/libpthread/pthread.c:1.172 src/lib/libpthread/pthread.c:1.173
--- src/lib/libpthread/pthread.c:1.172	Tue Jun  2 00:29:53 2020
+++ src/lib/libpthread/pthread.c	Wed Jun  3 22:10:24 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pthread.c,v 1.172 2020/06/02 00:29:53 joerg Exp $	*/
+/*	$NetBSD: pthread.c,v 1.173 2020/06/03 22:10:24 ad Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2002, 2003, 2006, 2007, 2008, 2020
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: pthread.c,v 1.172 2020/06/02 00:29:53 joerg Exp $");
+__RCSID("$NetBSD: pthread.c,v 1.173 2020/06/03 22:10:24 ad Exp $");
 
 #define	__EXPOSE_STACK	1
 
@@ -599,9 +599,12 @@ pthread_resume_np(pthread_t thread)
 }
 
 /*
- * In case the thread is exiting at an inopportune time leaving waiters not
- * awoken (because cancelled, for instance) make sure we have no waiters
- * left.
+ * Wake all deferred waiters hanging off self.
+ *
+ * It's possible for threads to have called _lwp_exit() before we wake them,
+ * because of cancellation and timeout, so ESRCH is tolerated here.  If a
+ * thread exits and its LID is reused, and the a thread receives an wakeup
+ * meant for the previous incarnation of the LID, no harm will be done.
  */
 void
 pthread__clear_waiters(pthread_t self)
@@ -620,7 +623,7 @@ pthread__clear_waiters(pthread_t self)
 		rv = _lwp_unpark(self->pt_waiters[0], NULL);
 		self->pt_waiters[0] = 0;
 		self->pt_nwaiters = 0;
-		if (rv != 0) {
+		if (rv != 0 && errno != ESRCH) {
 			pthread__errorfunc(__FILE__, __LINE__, __func__,
 			    "_lwp_unpark failed: %d", errno);
 		}
@@ -629,7 +632,7 @@ pthread__clear_waiters(pthread_t self)
 		rv = _lwp_unpark_all(self->pt_waiters, self->pt_nwaiters, NULL);
 		self->pt_waiters[0] = 0;
 		self->pt_nwaiters = 0;
-		if (rv != 0) {
+		if (rv != 0 && errno != ESRCH) {
 			pthread__errorfunc(__FILE__, __LINE__, __func__,
 			    "_lwp_unpark_all failed: %d", errno);
 		}
@@ -1195,6 +1198,7 @@ pthread__park(pthread_t self, pthread_mu
 			switch (rv = errno) {
 			case EINTR:
 			case EALREADY:
+			case ESRCH:
 				rv = 0;
 				break;
 			case ETIMEDOUT:

Index: src/lib/libpthread/pthread_cond.c
diff -u src/lib/libpthread/pthread_cond.c:1.70 src/lib/libpthread/pthread_cond.c:1.71
--- src/lib/libpthread/pthread_cond.c:1.70	Mon Jun  1 11:44:59 2020
+++ src/lib/libpthread/pthread_cond.c	Wed Jun  3 22:10:24 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pthread_cond.c,v 1.70 2020/06/01 11:44:59 ad Exp $	*/
+/*	$NetBSD: pthread_cond.c,v 1.71 2020/06/03 22:10:24 ad Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: pthread_cond.c,v 1.70 2020/06/01 11:44:59 ad Exp $");
+__RCSID("$NetBSD: pthread_cond.c,v 1.71 2020/06/03 22:10:24 ad Exp $");
 
 #include <stdlib.h>
 #include <errno.h>
@@ -112,7 +112,7 @@ pthread_cond_timedwait(pthread_cond_t *c
 		       const struct timespec *abstime)
 {
 	pthread_t self, next, waiters;
-	int retval;
+	int retval, cancel;
 	clockid_t clkid = pthread_cond_getclock(cond);
 
 	if (__predict_false(__uselibcstub))
@@ -126,6 +126,7 @@ pthread_cond_timedwait(pthread_cond_t *c
 	    mutex->ptm_owner != NULL);
 
 	self = pthread__self();
+	pthread__assert(!self->pt_condwait);
 
 	if (__predict_false(self->pt_cancel)) {
 		pthread__cancelled();
@@ -165,24 +166,42 @@ pthread_cond_timedwait(pthread_cond_t *c
 				retval = errno;
 			}
 		}
-	} while (self->pt_condwait && !self->pt_cancel && !retval);
+		cancel = self->pt_cancel;
+	} while (self->pt_condwait && !cancel && !retval);
 
 	/*
-	 * If we have cancelled then exit.  POSIX dictates that
-	 * the mutex must be held when we action the cancellation.
+	 * If this thread absorbed a wakeup from pthread_cond_signal() and
+	 * cannot take the wakeup, we must ensure that another thread does.
 	 *
-	 * If we absorbed a pthread_cond_signal() and cannot take
-	 * the wakeup, we must ensure that another thread does.
+	 * And if awoken early, we may still be on the waiter list and must
+	 * remove self.
 	 *
-	 * If awoke early, we may still be on the waiter list and
-	 * must remove ourself.
+	 * In all cases do the wakeup without the mutex held otherwise:
+	 *
+	 * - wakeup could be deferred until mutex release
+	 * - it would be mixing up two sets of waitpoints
 	 */
-	pthread_mutex_lock(mutex);
-	if (__predict_false(self->pt_condwait | self->pt_cancel | retval)) {
+	if (__predict_false(self->pt_condwait | cancel | retval)) {
 		pthread_cond_broadcast(cond);
-		if (self->pt_cancel) {
-			pthread__cancelled();
-		}
+
+		/*
+		 * Might have raced with another thread to do the wakeup. 
+		 * In any case there will be a wakeup for sure.  Eat it and
+		 * wait for pt_condwait to clear.
+		 */
+		do {
+			(void)_lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, NULL,
+			    0, NULL, NULL);
+		} while (self->pt_condwait);
+	}
+
+	/*
+	 * If cancelled then exit.  POSIX dictates that the mutex must be
+	 * held if this happens.
+	 */
+	pthread_mutex_lock(mutex);
+	if (cancel) {
+		pthread__cancelled();
 	}
 
 	return retval;

Index: src/lib/libpthread/pthread_mutex.c
diff -u src/lib/libpthread/pthread_mutex.c:1.78 src/lib/libpthread/pthread_mutex.c:1.79
--- src/lib/libpthread/pthread_mutex.c:1.78	Mon Jun  1 11:44:59 2020
+++ src/lib/libpthread/pthread_mutex.c	Wed Jun  3 22:10:24 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pthread_mutex.c,v 1.78 2020/06/01 11:44:59 ad Exp $	*/
+/*	$NetBSD: pthread_mutex.c,v 1.79 2020/06/03 22:10:24 ad Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2003, 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
@@ -47,7 +47,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: pthread_mutex.c,v 1.78 2020/06/01 11:44:59 ad Exp $");
+__RCSID("$NetBSD: pthread_mutex.c,v 1.79 2020/06/03 22:10:24 ad Exp $");
 
 #include <sys/types.h>
 #include <sys/lwpctl.h>
@@ -278,6 +278,7 @@ pthread__mutex_lock_slow(pthread_mutex_t
 	serrno = errno;
 
 	pthread__assert(!self->pt_willpark);
+	pthread__assert(!self->pt_mutexwait);
 
 	/* Recursive or errorcheck? */
 	if (MUTEX_OWNER(owner) == (uintptr_t)self) {
@@ -369,6 +370,18 @@ pthread__mutex_lock_slow(pthread_mutex_t
 			if (error < 0 && errno == ETIMEDOUT) {
 				/* Remove self from waiters list */
 				pthread__mutex_wakeup(self, ptm);
+
+				/*
+				 * Might have raced with another thread to
+				 * do the wakeup.  In any case there will be
+				 * a wakeup for sure.  Eat it and wait for
+				 * pt_mutexwait to clear.
+				 */
+				do {
+					(void)_lwp_park(CLOCK_REALTIME,
+					   TIMER_ABSTIME, NULL, 0, NULL, NULL);
+				} while (self->pt_mutexwait);
+
 				/* Priority protect */
 				if (MUTEX_PROTECT(owner))
 					(void)_sched_protect(-1);

Reply via email to