Module Name:    src
Committed By:   jdolecek
Date:           Wed Jan 20 21:39:09 UTC 2021

Modified Files:
        src/sys/kern: kern_event.c

Log Message:
fix a race in kqueue_scan() - when multiple threads check the same
kqueue, it could happen other thread seen empty kqueue while kevent
was being checked for re-firing and re-queued

make sure to keep retrying if there are outstanding kevents even
if no kevent is found on first pass through the queue, and only
drop the KN_QUEUED flag and kq_count when actually completely done
with the kevent

change is inspired by the FreeBSD in-flux handling, but without
introducing the reference counting

PR kern/50094 by Christof Meerwald


To generate a diff of this commit:
cvs rdiff -u -r1.110 -r1.111 src/sys/kern/kern_event.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/kern_event.c
diff -u src/sys/kern/kern_event.c:1.110 src/sys/kern/kern_event.c:1.111
--- src/sys/kern/kern_event.c:1.110	Sun Dec 27 12:45:33 2020
+++ src/sys/kern/kern_event.c	Wed Jan 20 21:39:09 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_event.c,v 1.110 2020/12/27 12:45:33 jdolecek Exp $	*/
+/*	$NetBSD: kern_event.c,v 1.111 2021/01/20 21:39:09 jdolecek Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
@@ -59,7 +59,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.110 2020/12/27 12:45:33 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.111 2021/01/20 21:39:09 jdolecek Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -179,6 +179,8 @@ static int	kq_calloutmax = (4 * 1024);
 
 extern const struct filterops sig_filtops;
 
+#define KQ_FLUX_WAKEUP(kq)	cv_broadcast(&kq->kq_cv)
+
 /*
  * Table for for all system-defined filters.
  * These should be listed in the numeric order of the EVFILT_* defines.
@@ -1396,7 +1398,7 @@ kqueue_scan(file_t *fp, size_t maxevents
 	struct timespec	ats, sleepts;
 	struct knote	*kn, *marker, morker;
 	size_t		count, nkev, nevents;
-	int		timeout, error, touch, rv;
+	int		timeout, error, touch, rv, influx;
 	filedesc_t	*fdp;
 
 	fdp = curlwp->l_fd;
@@ -1450,39 +1452,48 @@ kqueue_scan(file_t *fp, size_t maxevents
 
 	/* mark end of knote list */
 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
+	influx = 0;
 
 	/*
 	 * Acquire the fdp->fd_lock interlock to avoid races with
 	 * file creation/destruction from other threads.
 	 */
+relock:
 	mutex_spin_exit(&kq->kq_lock);
 	mutex_enter(&fdp->fd_lock);
 	mutex_spin_enter(&kq->kq_lock);
 
 	while (count != 0) {
 		kn = TAILQ_FIRST(&kq->kq_head);	/* get next knote */
-		while ((kn->kn_status & KN_MARKER) != 0) {
-			if (kn == marker) {
-				/* it's our marker, stop */
-				TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
-				if (count < maxevents || (tsp != NULL &&
-				    (timeout = gettimeleft(&ats,
-				    &sleepts)) <= 0))
-					goto queue_processed;
+
+		if ((kn->kn_status & KN_MARKER) != 0 && kn != marker) {
+			if (influx) {
+				influx = 0;
+				KQ_FLUX_WAKEUP(kq);
+			}
+			mutex_exit(&fdp->fd_lock);
+			(void)cv_wait_sig(&kq->kq_cv, &kq->kq_lock);
+			goto relock;
+		}
+
+		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+		if (kn == marker) {
+			/* it's our marker, stop */
+			KQ_FLUX_WAKEUP(kq);
+			if (count == maxevents) {
 				mutex_exit(&fdp->fd_lock);
 				goto retry;
 			}
-			/* someone else's marker. */
-			kn = TAILQ_NEXT(kn, kn_tqe);
+			break;
 		}
+		KASSERT((kn->kn_status & KN_BUSY) == 0);
+
 		kq_check(kq);
-		kq->kq_count--;
-		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
-		kn->kn_status &= ~KN_QUEUED;
 		kn->kn_status |= KN_BUSY;
 		kq_check(kq);
 		if (kn->kn_status & KN_DISABLED) {
-			kn->kn_status &= ~KN_BUSY;
+			kq->kq_count--;
+			kn->kn_status &= ~(KN_QUEUED|KN_BUSY);
 			/* don't want disabled events */
 			continue;
 		}
@@ -1495,17 +1506,14 @@ kqueue_scan(file_t *fp, size_t maxevents
 			rv = (*kn->kn_fop->f_event)(kn, 0);
 			KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
 			mutex_spin_enter(&kq->kq_lock);
-			/* Re-poll if note was re-enqueued. */
-			if ((kn->kn_status & KN_QUEUED) != 0) {
-				kn->kn_status &= ~KN_BUSY;
-				continue;
-			}
 			if (rv == 0) {
 				/*
 				 * non-ONESHOT event that hasn't
 				 * triggered again, so de-queue.
 				 */
-				kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
+				kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
+				kq->kq_count--;
+				influx = 1;
 				continue;
 			}
 		}
@@ -1524,9 +1532,11 @@ kqueue_scan(file_t *fp, size_t maxevents
 		}
 		kevp++;
 		nkev++;
+		influx = 1;
 		if (kn->kn_flags & EV_ONESHOT) {
 			/* delete ONESHOT events after retrieval */
-			kn->kn_status &= ~KN_BUSY;
+			kn->kn_status &= ~(KN_QUEUED|KN_BUSY);
+			kq->kq_count--;
 			mutex_spin_exit(&kq->kq_lock);
 			knote_detach(kn, fdp, true);
 			mutex_enter(&fdp->fd_lock);
@@ -1544,20 +1554,23 @@ kqueue_scan(file_t *fp, size_t maxevents
 				kn->kn_fflags = 0;
 			}
 			kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
+			kq->kq_count--;
 		} else if (kn->kn_flags & EV_DISPATCH) {
 			kn->kn_status |= KN_DISABLED;
 			kn->kn_status &= ~(KN_QUEUED|KN_ACTIVE|KN_BUSY);
+			kq->kq_count--;
 		} else {
 			/* add event back on list */
 			kq_check(kq);
-			kn->kn_status |= KN_QUEUED;
 			kn->kn_status &= ~KN_BUSY;
 			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
-			kq->kq_count++;
 			kq_check(kq);
 		}
+
 		if (nkev == kevcnt) {
 			/* do copyouts in kevcnt chunks */
+			influx = 0;
+			KQ_FLUX_WAKEUP(kq);
 			mutex_spin_exit(&kq->kq_lock);
 			mutex_exit(&fdp->fd_lock);
 			error = (*keops->keo_put_events)
@@ -1576,7 +1589,7 @@ kqueue_scan(file_t *fp, size_t maxevents
 			break;
 		}
 	}
-queue_processed:
+	KQ_FLUX_WAKEUP(kq);
 	mutex_spin_exit(&kq->kq_lock);
 	mutex_exit(&fdp->fd_lock);
 

Reply via email to