On Thu, May 20, 2021 at 01:56:24PM +0000, Visa Hankala wrote:
> This patch adds a mutex that serializes access to a kqueue. As a result,
> most of kqueue's internals should become safe to run without the kernel
> lock. In principle, the patch should allow unlocking kevent(2).
> 
> Some notes:
> 
> * The existing uses of splhigh() outline where the mutex should be held.
> 
> * The code is a true entanglement of lock operations. There are many
>   spots where lock usage is far from optimal. The patch does not attempt
>   to fix them, so as to keep the changeset relatively small.
> 
> * As msleep() with PCATCH requires the kernel lock, kqueue_scan() locks
>   the kernel for the section that might sleep. The lock is released
>   before the actual scan of events. An opportunistic implementation
>   could do a precheck to determine if the scan could be started right
>   away, but this is not part of the diff.
> 
> * knote_acquire() has a gap where it might miss a wakeup. This is an
>   unlikely situation that may arise with klist_invalidate(). It should
>   not happen during normal operation, and the code should recover thanks
>   to the one-second timeout. The loss of wakeup could be avoided with
>   serial numbering, for example.
> 
> * The timeout in knote_acquire() makes the function try-lock-like, which
>   is essential in klist_invalidate(). The normal sequence of action is
>   that knote_acquire() comes before klist_lock(). klist_invalidate() has
>   to violate this, and the timeout and retrying prevent the system from
>   deadlocking.
> 
> * At the moment, all event sources still require the kernel lock.
>   kqueue will lock the kernel when it invokes the filterops callbacks
>   if FILTEROP_MPSAFE is not set.
Here is an updated patch. The changes are:

* Lock the kqueue mutex in filt_kqueue() when accessing kq_count, to
  avoid an unlocked read. There is still an unlocked read of kq_count
  in kqueue_stat(). However, I think it is good enough as is because
  the value returned to userspace is best-effort anyhow (FreeBSD omits
  it altogether).

* Make the callers of knote_activate() lock the mutex. This reduces
  the number of lock operations.

* Adjust the mutex assertion in knote_release() to avoid an unused
  variable when compiling without DIAGNOSTIC.

OK?

(A userspace sketch of the unlock/allocate/relock/recheck idiom used
by kqueue_expand_list() and kqueue_expand_hash() is appended after
the diff.)

Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.164
diff -u -p -r1.164 kern_event.c
--- kern/kern_event.c	2 Jun 2021 13:56:28 -0000	1.164
+++ kern/kern_event.c	3 Jun 2021 14:06:54 -0000
@@ -123,7 +123,8 @@ void	knote_dequeue(struct knote *kn);
 int	knote_acquire(struct knote *kn, struct klist *, int);
 void	knote_release(struct knote *kn);
 void	knote_activate(struct knote *kn);
-void	knote_remove(struct proc *p, struct knlist *list, int purge);
+void	knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list,
+	    int purge);
 
 void	filt_kqdetach(struct knote *kn);
 int	filt_kqueue(struct knote *kn, long hint);
@@ -270,7 +271,9 @@ filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
+	mtx_enter(&kq->kq_lock);
 	kn->kn_data = kq->kq_count;
+	mtx_leave(&kq->kq_lock);
 
 	return (kn->kn_data > 0);
 }
@@ -416,9 +419,12 @@ void
 filt_timerexpire(void *knx)
 {
 	struct knote *kn = knx;
+	struct kqueue *kq = kn->kn_kq;
 
 	kn->kn_data++;
+	mtx_enter(&kq->kq_lock);
 	knote_activate(kn);
+	mtx_leave(&kq->kq_lock);
 
 	if ((kn->kn_flags & EV_ONESHOT) == 0)
 		filt_timer_timeout_add(kn);
@@ -744,28 +750,31 @@ kqpoll_dequeue(struct proc *p)
 {
 	struct knote *kn;
 	struct kqueue *kq = p->p_kq;
-	int s;
 
-	s = splhigh();
+	mtx_enter(&kq->kq_lock);
 	while ((kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
 		/* This kqueue should not be scanned by other threads. */
 		KASSERT(kn->kn_filter != EVFILT_MARKER);
 
-		if (!knote_acquire(kn, NULL, 0))
+		if (!knote_acquire(kn, NULL, 0)) {
+			/* knote_acquire() has released kq_lock. */
+			mtx_enter(&kq->kq_lock);
 			continue;
+		}
 
 		kqueue_check(kq);
 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 		kn->kn_status &= ~KN_QUEUED;
 		kq->kq_count--;
+		mtx_leave(&kq->kq_lock);
 
-		splx(s);
-		kn->kn_fop->f_detach(kn);
+		filter_detach(kn);
 		knote_drop(kn, p);
-		s = splhigh();
+
+		mtx_enter(&kq->kq_lock);
 		kqueue_check(kq);
 	}
-	splx(s);
+	mtx_leave(&kq->kq_lock);
 }
 
 struct kqueue *
@@ -777,6 +786,7 @@ kqueue_alloc(struct filedesc *fdp)
 	kq->kq_refs = 1;
 	kq->kq_fdp = fdp;
 	TAILQ_INIT(&kq->kq_head);
+	mtx_init(&kq->kq_lock, IPL_HIGH);
 	task_set(&kq->kq_task, kqueue_task, kq);
 
 	return (kq);
@@ -938,8 +948,7 @@ kqueue_do_check(struct kqueue *kq, const
 	struct knote *kn;
 	int count = 0, nmarker = 0;
 
-	KERNEL_ASSERT_LOCKED();
-	splassert(IPL_HIGH);
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
 	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
 		if (kn->kn_filter == EVFILT_MARKER) {
@@ -978,7 +987,7 @@ kqueue_register(struct kqueue *kq, struc
 	struct file *fp = NULL;
 	struct knote *kn = NULL, *newkn = NULL;
 	struct knlist *list = NULL;
-	int s, error = 0;
+	int active, error = 0;
 
 	if (kev->filter < 0) {
 		if (kev->filter + EVFILT_SYSCOUNT < 0)
@@ -1010,11 +1019,13 @@ again:
 			error = EBADF;
 			goto done;
 		}
+		mtx_enter(&kq->kq_lock);
 		if (kev->flags & EV_ADD)
 			kqueue_expand_list(kq, kev->ident);
 		if (kev->ident < kq->kq_knlistsize)
 			list = &kq->kq_knlist[kev->ident];
 	} else {
+		mtx_enter(&kq->kq_lock);
 		if (kev->flags & EV_ADD)
 			kqueue_expand_hash(kq);
 		if (kq->kq_knhashmask != 0) {
@@ -1026,16 +1037,15 @@ again:
 		SLIST_FOREACH(kn, list, kn_link) {
 			if (kev->filter == kn->kn_filter &&
 			    kev->ident == kn->kn_id) {
-				s = splhigh();
 				if (!knote_acquire(kn, NULL, 0)) {
-					splx(s);
+					/* knote_acquire() has released
+					 * kq_lock. */
 					if (fp != NULL) {
 						FRELE(fp, p);
 						fp = NULL;
 					}
 					goto again;
 				}
-				splx(s);
 				break;
 			}
 		}
@@ -1043,14 +1053,13 @@ again:
 	KASSERT(kn == NULL || (kn->kn_status & KN_PROCESSING) != 0);
 
 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+		mtx_leave(&kq->kq_lock);
 		error = ENOENT;
 		goto done;
 	}
 
 	/*
 	 * kn now contains the matching knote, or NULL if no match.
-	 * If adding a new knote, sleeping is not allowed until the knote
-	 * has been inserted.
 	 */
 	if (kev->flags & EV_ADD) {
 		if (kn == NULL) {
@@ -1074,6 +1083,8 @@ again:
 			kn->kn_kevent = *kev;
 
 			knote_attach(kn);
+			mtx_leave(&kq->kq_lock);
+
 			error = filter_attach(kn);
 			if (error != 0) {
 				knote_drop(kn, p);
@@ -1100,7 +1111,9 @@ again:
 			}
 
 			/* Check if there is a pending event. */
-			if (filter_process(kn, NULL))
+			active = filter_process(kn, NULL);
+			mtx_enter(&kq->kq_lock);
+			if (active)
 				knote_activate(kn);
 		} else {
 			/*
@@ -1108,7 +1121,10 @@ again:
 			 * initial EV_ADD, but doing so will not reset any
 			 * filters which have already been triggered.
 			 */
-			if (filter_modify(kev, kn))
+			mtx_leave(&kq->kq_lock);
+			active = filter_modify(kev, kn);
+			mtx_enter(&kq->kq_lock);
+			if (active)
 				knote_activate(kn);
 			if (kev->flags & EV_ERROR) {
 				error = kev->data;
@@ -1116,31 +1132,28 @@ again:
 			}
 		}
 	} else if (kev->flags & EV_DELETE) {
+		mtx_leave(&kq->kq_lock);
 		filter_detach(kn);
 		knote_drop(kn, p);
 		goto done;
 	}
 
-	if ((kev->flags & EV_DISABLE) &&
-	    ((kn->kn_status & KN_DISABLED) == 0)) {
-		s = splhigh();
+	if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0))
 		kn->kn_status |= KN_DISABLED;
-		splx(s);
-	}
 
 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
-		s = splhigh();
 		kn->kn_status &= ~KN_DISABLED;
-		splx(s);
+		mtx_leave(&kq->kq_lock);
 
 		/* Check if there is a pending event. */
-		if (filter_process(kn, NULL))
+		active = filter_process(kn, NULL);
+		mtx_enter(&kq->kq_lock);
+		if (active)
 			knote_activate(kn);
 	}
 
release:
-	s = splhigh();
 	knote_release(kn);
-	splx(s);
+	mtx_leave(&kq->kq_lock);
done:
 	if (fp != NULL)
 		FRELE(fp, p);
@@ -1156,14 +1169,15 @@ kqueue_sleep(struct kqueue *kq, struct t
 	uint64_t nsecs;
 	int error;
 
-	splassert(IPL_HIGH);
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
 	if (tsp != NULL) {
 		getnanouptime(&start);
 		nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
 	} else
 		nsecs = INFSLP;
-	error = tsleep_nsec(kq, PSOCK | PCATCH, "kqread", nsecs);
+	error = msleep_nsec(kq, &kq->kq_lock, PSOCK | PCATCH | PNORELOCK,
+	    "kqread", nsecs);
 	if (tsp != NULL) {
 		getnanouptime(&stop);
 		timespecsub(&stop, &start, &elapsed);
@@ -1186,7 +1200,7 @@ kqueue_scan(struct kqueue_scan_state *sc
 {
 	struct kqueue *kq = scan->kqs_kq;
 	struct knote *kn;
-	int s, error = 0, nkev = 0;
+	int error = 0, nkev = 0;
 
 	if (maxevents == 0)
 		goto done;
@@ -1195,12 +1209,18 @@ retry:
 	error = 0;
 
+	/* msleep() with PCATCH requires kernel lock. */
+	KERNEL_LOCK();
+
+	mtx_enter(&kq->kq_lock);
+
 	if (kq->kq_state & KQ_DYING) {
+		mtx_leave(&kq->kq_lock);
+		KERNEL_UNLOCK();
 		error = EBADF;
 		goto done;
 	}
 
-	s = splhigh();
 	if (kq->kq_count == 0) {
 		/*
 		 * Successive loops are only necessary if there are more
@@ -1208,13 +1228,15 @@ retry:
 		 */
 		if ((tsp != NULL && !timespecisset(tsp)) ||
 		    scan->kqs_nevent != 0) {
-			splx(s);
+			mtx_leave(&kq->kq_lock);
+			KERNEL_UNLOCK();
 			error = 0;
 			goto done;
 		}
 		kq->kq_state |= KQ_SLEEP;
 		error = kqueue_sleep(kq, tsp);
-		splx(s);
+		/* kqueue_sleep() has released kq_lock. */
+		KERNEL_UNLOCK();
 		if (error == 0 || error == EWOULDBLOCK)
 			goto retry;
 		/* don't restart after signals... */
@@ -1223,6 +1245,9 @@ retry:
 		goto done;
 	}
 
+	/* The actual scan does not sleep on kq, so unlock the kernel. */
+	KERNEL_UNLOCK();
+
 	/*
 	 * Put the end marker in the queue to limit the scan to the events
	 * that are currently active. This prevents events from being
@@ -1254,8 +1279,11 @@ retry:
 			continue;
 		}
 
-		if (!knote_acquire(kn, NULL, 0))
+		if (!knote_acquire(kn, NULL, 0)) {
+			/* knote_acquire() has released kq_lock. */
+			mtx_enter(&kq->kq_lock);
 			continue;
+		}
 
 		kqueue_check(kq);
 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
@@ -1268,11 +1296,11 @@ retry:
 			continue;
 		}
 
-		splx(s);
+		mtx_leave(&kq->kq_lock);
 
 		memset(kevp, 0, sizeof(*kevp));
 		if (filter_process(kn, kevp) == 0) {
-			s = splhigh();
+			mtx_enter(&kq->kq_lock);
 			if ((kn->kn_status & KN_QUEUED) == 0)
 				kn->kn_status &= ~KN_ACTIVE;
 			knote_release(kn);
@@ -1286,9 +1314,9 @@ retry:
 		if (kevp->flags & EV_ONESHOT) {
 			filter_detach(kn);
 			knote_drop(kn, p);
-			s = splhigh();
+			mtx_enter(&kq->kq_lock);
 		} else if (kevp->flags & (EV_CLEAR | EV_DISPATCH)) {
-			s = splhigh();
+			mtx_enter(&kq->kq_lock);
 			if (kevp->flags & EV_DISPATCH)
 				kn->kn_status |= KN_DISABLED;
 			if ((kn->kn_status & KN_QUEUED) == 0)
@@ -1296,7 +1324,7 @@ retry:
 			KASSERT(kn->kn_status & KN_ATTACHED);
 			knote_release(kn);
 		} else {
-			s = splhigh();
+			mtx_enter(&kq->kq_lock);
 			if ((kn->kn_status & KN_QUEUED) == 0) {
 				kqueue_check(kq);
 				kq->kq_count++;
@@ -1313,7 +1341,7 @@ retry:
 		scan->kqs_nevent++;
 	}
 	TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
-	splx(s);
+	mtx_leave(&kq->kq_lock);
 	if (scan->kqs_nevent == 0)
 		goto retry;
done:
@@ -1338,7 +1366,6 @@ void
 kqueue_scan_finish(struct kqueue_scan_state *scan)
 {
 	struct kqueue *kq = scan->kqs_kq;
-	int s;
 
 	KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
 	KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
@@ -1347,9 +1374,9 @@ kqueue_scan_finish(struct kqueue_scan_st
 
 	if (scan->kqs_queued) {
 		scan->kqs_queued = 0;
-		s = splhigh();
+		mtx_enter(&kq->kq_lock);
 		TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
-		splx(s);
+		mtx_leave(&kq->kq_lock);
 	}
 	KQRELE(kq);
 }
@@ -1381,17 +1408,17 @@ kqueue_poll(struct file *fp, int events,
 {
 	struct kqueue *kq = (struct kqueue *)fp->f_data;
 	int revents = 0;
-	int s = splhigh();
 
 	if (events & (POLLIN | POLLRDNORM)) {
+		mtx_enter(&kq->kq_lock);
 		if (kq->kq_count) {
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			selrecord(p, &kq->kq_sel);
 			kq->kq_state |= KQ_SEL;
 		}
+		mtx_leave(&kq->kq_lock);
 	}
-	splx(s);
 	return (revents);
 }
@@ -1401,7 +1428,7 @@ kqueue_stat(struct file *fp, struct stat
 	struct kqueue *kq = fp->f_data;
 
 	memset(st, 0, sizeof(*st));
-	st->st_size = kq->kq_count;
+	st->st_size = kq->kq_count;	/* unlocked read */
 	st->st_blksize = sizeof(struct kevent);
 	st->st_mode = S_IFIFO;
 	return (0);
@@ -1412,14 +1439,14 @@ kqueue_purge(struct proc *p, struct kque
 {
 	int i;
 
-	KERNEL_ASSERT_LOCKED();
-
+	mtx_enter(&kq->kq_lock);
 	for (i = 0; i < kq->kq_knlistsize; i++)
-		knote_remove(p, &kq->kq_knlist[i], 1);
+		knote_remove(p, kq, &kq->kq_knlist[i], 1);
 	if (kq->kq_knhashmask != 0) {
 		for (i = 0; i < kq->kq_knhashmask + 1; i++)
-			knote_remove(p, &kq->kq_knhash[i], 1);
+			knote_remove(p, kq, &kq->kq_knhash[i], 1);
 	}
+	mtx_leave(&kq->kq_lock);
 }
 
 void
@@ -1427,6 +1454,8 @@ kqueue_terminate(struct proc *p, struct
 {
 	struct knote *kn;
 
+	mtx_enter(&kq->kq_lock);
+
 	/*
 	 * Any remaining entries should be scan markers.
 	 * They are removed when the ongoing scans finish.
@@ -1437,6 +1466,7 @@ kqueue_terminate(struct proc *p, struct
 
 	kq->kq_state |= KQ_DYING;
 	kqueue_wakeup(kq);
+	mtx_leave(&kq->kq_lock);
 
 	KASSERT(klist_empty(&kq->kq_sel.si_note));
 	task_del(systq, &kq->kq_task);
@@ -1448,15 +1478,13 @@ kqueue_close(struct file *fp, struct pro
 {
 	struct kqueue *kq = fp->f_data;
 
-	KERNEL_LOCK();
+	fp->f_data = NULL;
+
 	kqueue_purge(p, kq);
 	kqueue_terminate(p, kq);
-	fp->f_data = NULL;
 	KQRELE(kq);
 
-	KERNEL_UNLOCK();
-
 	return (0);
 }
@@ -1465,10 +1493,16 @@ kqueue_task(void *arg)
 {
 	struct kqueue *kq = arg;
 
+	/* Kernel lock is needed inside selwakeup(). */
+	KERNEL_ASSERT_LOCKED();
+
+	mtx_enter(&kq->kq_lock);
 	if (kq->kq_state & KQ_SEL) {
 		kq->kq_state &= ~KQ_SEL;
+		mtx_leave(&kq->kq_lock);
 		selwakeup(&kq->kq_sel);
 	} else {
+		mtx_leave(&kq->kq_lock);
 		KNOTE(&kq->kq_sel.si_note, 0);
 	}
 	KQRELE(kq);
@@ -1477,6 +1511,7 @@ kqueue_task(void *arg)
 void
 kqueue_wakeup(struct kqueue *kq)
 {
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
 	if (kq->kq_state & KQ_SLEEP) {
 		kq->kq_state &= ~KQ_SLEEP;
@@ -1496,14 +1531,20 @@ kqueue_expand_hash(struct kqueue *kq)
 	struct knlist *hash;
 	u_long hashmask;
 
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
+
 	if (kq->kq_knhashmask == 0) {
+		mtx_leave(&kq->kq_lock);
 		hash = hashinit(KN_HASHSIZE, M_KEVENT, M_WAITOK, &hashmask);
+		mtx_enter(&kq->kq_lock);
 		if (kq->kq_knhashmask == 0) {
 			kq->kq_knhash = hash;
 			kq->kq_knhashmask = hashmask;
 		} else {
 			/* Another thread has allocated the hash. */
+			mtx_leave(&kq->kq_lock);
 			hashfree(hash, KN_HASHSIZE, M_KEVENT);
+			mtx_enter(&kq->kq_lock);
 		}
 	}
 }
@@ -1511,26 +1552,35 @@ kqueue_expand_hash(struct kqueue *kq)
 static void
 kqueue_expand_list(struct kqueue *kq, int fd)
 {
-	struct knlist *list;
-	int size;
+	struct knlist *list, *olist;
+	int size, osize;
+
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
 	if (kq->kq_knlistsize <= fd) {
 		size = kq->kq_knlistsize;
+		mtx_leave(&kq->kq_lock);
 		while (size <= fd)
 			size += KQEXTENT;
 		list = mallocarray(size, sizeof(*list), M_KEVENT, M_WAITOK);
+		mtx_enter(&kq->kq_lock);
 		if (kq->kq_knlistsize <= fd) {
 			memcpy(list, kq->kq_knlist,
 			    kq->kq_knlistsize * sizeof(*list));
 			memset(&list[kq->kq_knlistsize], 0,
 			    (size - kq->kq_knlistsize) * sizeof(*list));
-			free(kq->kq_knlist, M_KEVENT,
-			    kq->kq_knlistsize * sizeof(*list));
+			olist = kq->kq_knlist;
+			osize = kq->kq_knlistsize;
 			kq->kq_knlist = list;
 			kq->kq_knlistsize = size;
+			mtx_leave(&kq->kq_lock);
+			free(olist, M_KEVENT, osize * sizeof(*list));
+			mtx_enter(&kq->kq_lock);
 		} else {
 			/* Another thread has expanded the list. */
+			mtx_leave(&kq->kq_lock);
 			free(list, M_KEVENT, size * sizeof(*list));
+			mtx_enter(&kq->kq_lock);
 		}
 	}
 }
@@ -1548,14 +1598,22 @@ kqueue_expand_list(struct kqueue *kq, in
 int
 knote_acquire(struct knote *kn, struct klist *klist, int ls)
 {
-	splassert(IPL_HIGH);
+	struct kqueue *kq = kn->kn_kq;
+
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 	KASSERT(kn->kn_filter != EVFILT_MARKER);
 
 	if (kn->kn_status & KN_PROCESSING) {
 		kn->kn_status |= KN_WAITING;
-		if (klist != NULL)
+		if (klist != NULL) {
+			mtx_leave(&kq->kq_lock);
 			klist_unlock(klist, ls);
-		tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+			/* XXX Timeout resolves potential loss of wakeup. */
+			tsleep_nsec(kn, 0, "kqepts", SEC_TO_NSEC(1));
+		} else {
+			msleep_nsec(kn, &kq->kq_lock, PNORELOCK, "kqepts",
+			    SEC_TO_NSEC(1));
+		}
 		/* knote may be stale now */
 		return (0);
 	}
@@ -1569,7 +1627,7 @@ knote_acquire(struct knote *kn, struct k
 void
 knote_release(struct knote *kn)
 {
-	splassert(IPL_HIGH);
+	MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
 	KASSERT(kn->kn_filter != EVFILT_MARKER);
 	KASSERT(kn->kn_status & KN_PROCESSING);
@@ -1587,13 +1645,11 @@ knote_release(struct knote *kn)
 void
 knote_activate(struct knote *kn)
 {
-	int s;
+	MUTEX_ASSERT_LOCKED(&kn->kn_kq->kq_lock);
 
-	s = splhigh();
 	kn->kn_status |= KN_ACTIVE;
 	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)
 		knote_enqueue(kn);
-	splx(s);
 }
 
 /*
@@ -1603,30 +1659,38 @@ void
 knote(struct klist *list, long hint)
 {
 	struct knote *kn, *kn0;
+	struct kqueue *kq;
 
 	KLIST_ASSERT_LOCKED(list);
 
-	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0)
-		if (filter_event(kn, hint))
+	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, kn0) {
+		if (filter_event(kn, hint)) {
+			kq = kn->kn_kq;
+			mtx_enter(&kq->kq_lock);
 			knote_activate(kn);
+			mtx_leave(&kq->kq_lock);
+		}
+	}
 }
 
 /*
  * remove all knotes from a specified knlist
  */
 void
-knote_remove(struct proc *p, struct knlist *list, int purge)
+knote_remove(struct proc *p, struct kqueue *kq, struct knlist *list, int purge)
 {
 	struct knote *kn;
-	int s;
+
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 
 	while ((kn = SLIST_FIRST(list)) != NULL) {
-		s = splhigh();
+		KASSERT(kn->kn_kq == kq);
 		if (!knote_acquire(kn, NULL, 0)) {
-			splx(s);
+			/* knote_acquire() has released kq_lock. */
+			mtx_enter(&kq->kq_lock);
 			continue;
 		}
-		splx(s);
+		mtx_leave(&kq->kq_lock);
 		filter_detach(kn);
 
 		/*
@@ -1641,20 +1705,22 @@ knote_remove(struct proc *p, struct knli
 		 */
 		if (!purge && (kn->kn_flags & __EV_POLL) != 0) {
 			KASSERT(kn->kn_fop->f_flags & FILTEROP_ISFD);
+			mtx_enter(&kq->kq_lock);
 			knote_detach(kn);
+			mtx_leave(&kq->kq_lock);
 			FRELE(kn->kn_fp, p);
 			kn->kn_fp = NULL;
 
 			kn->kn_fop = &badfd_filtops;
 			filter_event(kn, 0);
+			mtx_enter(&kq->kq_lock);
 			knote_activate(kn);
-			s = splhigh();
 			knote_release(kn);
-			splx(s);
 			continue;
 		}
 
 		knote_drop(kn, p);
+		mtx_enter(&kq->kq_lock);
 	}
 }
@@ -1666,7 +1732,6 @@ knote_fdclose(struct proc *p, int fd)
 {
 	struct filedesc *fdp = p->p_p->ps_fd;
 	struct kqueue *kq;
-	struct knlist *list;
 
 	/*
	 * fdplock can be ignored if the file descriptor table is being freed
@@ -1675,18 +1740,12 @@ knote_fdclose(struct proc *p, int fd)
 	if (fdp->fd_refcnt != 0)
 		fdpassertlocked(fdp);
 
-	if (LIST_EMPTY(&fdp->fd_kqlist))
-		return;
-
-	KERNEL_LOCK();
 	LIST_FOREACH(kq, &fdp->fd_kqlist, kq_next) {
-		if (fd >= kq->kq_knlistsize)
-			continue;
-
-		list = &kq->kq_knlist[fd];
-		knote_remove(p, list, 0);
+		mtx_enter(&kq->kq_lock);
+		if (fd < kq->kq_knlistsize)
+			knote_remove(p, kq, &kq->kq_knlist[fd], 0);
+		mtx_leave(&kq->kq_lock);
 	}
-	KERNEL_UNLOCK();
 }
@@ -1698,6 +1757,7 @@ knote_processexit(struct proc *p)
 {
 	struct process *pr = p->p_p;
 
+	KERNEL_ASSERT_LOCKED();
 	KASSERT(p == curproc);
 
 	KNOTE(&pr->ps_klist, NOTE_EXIT);
@@ -1711,15 +1771,12 @@ knote_attach(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_kq;
 	struct knlist *list;
-	int s;
 
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 	KASSERT(kn->kn_status & KN_PROCESSING);
 	KASSERT((kn->kn_status & KN_ATTACHED) == 0);
 
-	s = splhigh();
 	kn->kn_status |= KN_ATTACHED;
-	splx(s);
-
 	if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
 		KASSERT(kq->kq_knlistsize > kn->kn_id);
 		list = &kq->kq_knlist[kn->kn_id];
@@ -1735,8 +1792,8 @@ knote_detach(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_kq;
 	struct knlist *list;
-	int s;
 
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 	KASSERT(kn->kn_status & KN_PROCESSING);
 
 	if ((kn->kn_status & KN_ATTACHED) == 0)
@@ -1747,10 +1804,7 @@ knote_detach(struct knote *kn)
 	else
 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 	SLIST_REMOVE(list, kn, knote, kn_link);
-
-	s = splhigh();
 	kn->kn_status &= ~KN_ATTACHED;
-	splx(s);
 }
@@ -1760,20 +1814,20 @@ knote_detach(struct knote *kn)
 void
 knote_drop(struct knote *kn, struct proc *p)
 {
-	int s;
+	struct kqueue *kq = kn->kn_kq;
 
 	KASSERT(kn->kn_filter != EVFILT_MARKER);
 
+	mtx_enter(&kq->kq_lock);
 	knote_detach(kn);
-
-	s = splhigh();
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
 	if (kn->kn_status & KN_WAITING) {
 		kn->kn_status &= ~KN_WAITING;
 		wakeup(kn);
 	}
-	splx(s);
+	mtx_leave(&kq->kq_lock);
+
 	if ((kn->kn_fop->f_flags & FILTEROP_ISFD) && kn->kn_fp != NULL)
 		FRELE(kn->kn_fp, p);
 	pool_put(&knote_pool, kn);
@@ -1785,7 +1839,7 @@ knote_enqueue(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_kq;
 
-	splassert(IPL_HIGH);
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 	KASSERT(kn->kn_filter != EVFILT_MARKER);
 	KASSERT((kn->kn_status & KN_QUEUED) == 0);
@@ -1802,7 +1856,7 @@ knote_dequeue(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_kq;
 
-	splassert(IPL_HIGH);
+	MUTEX_ASSERT_LOCKED(&kq->kq_lock);
 	KASSERT(kn->kn_filter != EVFILT_MARKER);
 	KASSERT(kn->kn_status & KN_QUEUED);
@@ -1910,36 +1964,38 @@ void
 klist_invalidate(struct klist *list)
 {
 	struct knote *kn;
+	struct kqueue *kq;
 	struct proc *p = curproc;
-	int ls, s;
+	int ls;
 
 	NET_ASSERT_UNLOCKED();
 
-	s = splhigh();
 	ls = klist_lock(list);
 	while ((kn = SLIST_FIRST(&list->kl_list)) != NULL) {
+		kq = kn->kn_kq;
+		mtx_enter(&kq->kq_lock);
 		if (!knote_acquire(kn, list, ls)) {
-			/* knote_acquire() has unlocked list. */
+			/* knote_acquire() has released kq_lock
+			 * and klist lock. */
 			ls = klist_lock(list);
 			continue;
 		}
+		mtx_leave(&kq->kq_lock);
 		klist_unlock(list, ls);
-		splx(s);
 
 		filter_detach(kn);
 
 		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
 			kn->kn_fop = &dead_filtops;
 			filter_event(kn, 0);
+			mtx_enter(&kq->kq_lock);
 			knote_activate(kn);
-			s = splhigh();
 			knote_release(kn);
+			mtx_leave(&kq->kq_lock);
 		} else {
 			knote_drop(kn, p);
-			s = splhigh();
 		}
 
 		ls = klist_lock(list);
 	}
 	klist_unlock(list, ls);
-	splx(s);
 }
 
 static int
Index: sys/eventvar.h
===================================================================
RCS file: src/sys/sys/eventvar.h,v
retrieving revision 1.11
diff -u -p -r1.11 eventvar.h
--- sys/eventvar.h	17 Jan 2021 05:56:32 -0000	1.11
+++ sys/eventvar.h	3 Jun 2021 14:06:54 -0000
@@ -31,6 +31,7 @@
 #ifndef _SYS_EVENTVAR_H_
 #define _SYS_EVENTVAR_H_
 
+#include <sys/mutex.h>
 #include <sys/task.h>
 
 #define KQ_NEVENTS	8		/* minimize copy{in,out} calls */
@@ -38,24 +39,29 @@
 
 /*
  * Locking:
+ *	I	immutable after creation
  *	a	atomic operations
+ *	q	kq_lock
  */
 struct kqueue {
-	TAILQ_HEAD(, knote) kq_head;	/* list of pending event */
-	int	kq_count;		/* number of pending events */
-	u_int	kq_refs;		/* [a] number of references */
+	struct	mutex kq_lock;		/* lock for queue access */
+	TAILQ_HEAD(, knote) kq_head;	/* [q] list of pending event */
+	int	kq_count;		/* [q] # of pending events */
+	u_int	kq_refs;		/* [a] # of references */
 	struct	selinfo kq_sel;
-	struct	filedesc *kq_fdp;
+	struct	filedesc *kq_fdp;	/* [I] fd table of this kq */
 
 	LIST_ENTRY(kqueue) kq_next;
-	int	kq_knlistsize;		/* size of kq_knlist */
-	struct	knlist *kq_knlist;	/* list of attached knotes */
-	u_long	kq_knhashmask;		/* size of kq_knhash */
-	struct	knlist *kq_knhash;	/* hash table for attached knotes */
+	int	kq_knlistsize;		/* [q] size of kq_knlist */
+	struct	knlist *kq_knlist;	/* [q] list of
+					 * attached knotes */
+	u_long	kq_knhashmask;		/* [q] size of kq_knhash */
+	struct	knlist *kq_knhash;	/* [q] hash table for
+					 * attached knotes */
 
 	struct	task kq_task;		/* deferring of activation */
 
-	int	kq_state;
+	int	kq_state;		/* [q] */
 #define KQ_SEL		0x01
 #define KQ_SLEEP	0x02
 #define KQ_DYING	0x04
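
As promised above, here is the unlock/allocate/relock/recheck idiom from
kqueue_expand_list() and kqueue_expand_hash() in isolation. This is not
part of the diff, just a minimal userspace sketch using pthreads; struct
table and table_expand() are made-up names, and plain free() stands in
for the kernel's free(9), which takes a size argument (the reason the
patch keeps osize around).

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct table {
	pthread_mutex_t	 t_lock;
	char		*t_buf;		/* [t_lock] backing storage */
	size_t		 t_size;	/* [t_lock] size of t_buf */
};

int
table_expand(struct table *t, size_t size)
{
	char *buf, *obuf;

	pthread_mutex_lock(&t->t_lock);
	if (t->t_size >= size) {
		/* Nothing to do. */
		pthread_mutex_unlock(&t->t_lock);
		return (0);
	}

	/* The allocation may sleep, so never hold the lock across it. */
	pthread_mutex_unlock(&t->t_lock);
	buf = malloc(size);
	if (buf == NULL)
		return (-1);

	pthread_mutex_lock(&t->t_lock);
	if (t->t_size < size) {
		/* Still too small: copy the old contents and install. */
		if (t->t_buf != NULL)
			memcpy(buf, t->t_buf, t->t_size);
		memset(buf + t->t_size, 0, size - t->t_size);
		obuf = t->t_buf;
		t->t_buf = buf;
		t->t_size = size;
		/* As with olist/osize: free the old buffer unlocked. */
		pthread_mutex_unlock(&t->t_lock);
		free(obuf);
	} else {
		/* Another thread expanded the table while we slept. */
		pthread_mutex_unlock(&t->t_lock);
		free(buf);
	}
	return (0);
}

The recheck after reacquiring the lock is the crucial part: the state may
have changed while the lock was dropped, so whichever thread loses the
race throws its allocation away instead of clobbering the winner's.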