Background: kqueue events are represented by a “knote” that is kept on a list 
of knotes (a klist) by its respective driver (usually indirectly in the 
“selinfo” structure).  A similar situation exists for vnodes.  When a knote is 
activated, it is queued onto its kqueue’s active note list.  The driver gets 
called to attach the knote in a generic fashion, but then provides its own ops 
vector for subsequent operations, including “detach”.  When a file descriptor 
for a given knote is closed, this driver-supplied “detach” function is called 
to remove it from whatever driver-specific klist the knote is on.
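
To make the background concrete, here is a sketch of what the driver side of 
this typically looks like.  The “foo” driver, its softc layout, and the 
function names are all made up for illustration; the klist_insert() / 
klist_remove() calls are the helpers from sys/event.h, and kn_hook is the 
usual spot a driver stashes its softc pointer:

/*
 * Hypothetical "foo" driver -- illustration only.
 */
struct foo_softc {
	kmutex_t	sc_lock;
	struct klist	sc_rklist;	/* knotes for EVFILT_READ */
	size_t		sc_bytes_avail;
};

static void
filt_foordetach(struct knote *kn)
{
	struct foo_softc *sc = kn->kn_hook;

	/* Unlink the knote from the driver's klist. */
	mutex_enter(&sc->sc_lock);
	klist_remove(&sc->sc_rklist, kn);
	mutex_exit(&sc->sc_lock);
}

static int
filt_fooread(struct knote *kn, long hint)
{
	struct foo_softc *sc = kn->kn_hook;

	/* Locking of sc_bytes_avail elided for brevity. */
	kn->kn_data = sc->sc_bytes_avail;
	return kn->kn_data > 0;
}

static const struct filterops foo_read_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = filt_foordetach,
	.f_event = filt_fooread,
};

static int
foo_kqfilter(dev_t dev, struct knote *kn)
{
	struct foo_softc *sc = device_lookup_private(&foo_cd, minor(dev));

	if (kn->kn_filter != EVFILT_READ)
		return EINVAL;

	/* Hook the knote up to this driver's filterops and klist. */
	kn->kn_fop = &foo_read_filtops;
	kn->kn_hook = sc;

	mutex_enter(&sc->sc_lock);
	klist_insert(&sc->sc_rklist, kn);
	mutex_exit(&sc->sc_lock);
	return 0;
}

It is filt_foordetach() above that gets called when a descriptor with a 
registered event is closed — which is exactly the call that becomes unsafe 
once the softc is gone.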

When a driver detaches, it frees its softc structure, which includes the 
selinfo / klist.  This list might still have notes on it… but that isn’t 
inherently problematic, because the driver owns the list and no one will ever 
traverse it again — the generic kqueue code doesn’t care about it.

But it is a problem when an outstanding file descriptor with a registered 
kqueue event (e.g. EVFILT_READ) for that device was still open when the 
device was detached.  In this case, when the file descriptor is closed, the 
knote, which still points to the now-detached device’s filterops, will call the 
“detach” operation.  For a driver that is still loaded into the kernel, the 
code we call is still valid, but a use-after-free occurs: the now-freed softc 
structure is accessed to unlink the knote from the klist.  
The situation is worse for modular drivers that may have been unloaded after 
the device detaches — jumping through function pointers to code that no longer 
exists is bad, m’kay?

The following patch rectifies this situation by having klist_fini() traverse a 
list of knotes and substitute their filterops with no-op stubs.  This requires 
synchronizing with any calls into the filterops themselves.  We have convenient 
funnel-points for this in kern_event.c already, so it’s not particularly 
difficult… the main issue is that the filterops themselves are allowed to 
block.  My solution to this was to have a single global rwlock, 
knote_filterops_lock, that’s acquired for reading when a call through a knote’s 
filterops is made, and acquired for writing in klist_fini() for the short 
duration needed to stub out the filterops.  This lock should **very rarely** be 
contended, but it will be *hot* (lots of acquire-for-read), so it gets its own 
cache line.
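
On the driver side, the only new obligation is to run klist_fini() over the 
list before the softc is freed.  Continuing the made-up “foo” driver from 
above, the detach path would look something like this:

static int
foo_detach(device_t self, int flags)
{
	struct foo_softc *sc = device_private(self);

	/*
	 * No new knotes can be attached at this point.  Neuter any
	 * that remain so that a later close() of a stale descriptor
	 * hits the no-op stub filterops instead of chasing into freed
	 * memory.  (sc_lock must not be held here, per the lock
	 * ordering rule described above.)
	 */
	klist_fini(&sc->sc_rklist);

	/* ... tear down and free the rest of the softc ... */
	return 0;
}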

If someone has ideas about another synchronization mechanism, I’m all ears… 
again, the main issue is that the filterops calls themselves are allowed to 
block, so I’m not sure that any of our passive serialization mechanisms would 
work here.

Index: kern/kern_event.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_event.c,v
retrieving revision 1.141
diff -u -p -r1.141 kern_event.c
--- kern/kern_event.c   24 May 2022 20:50:19 -0000      1.141
+++ kern/kern_event.c   11 Jul 2022 20:53:40 -0000
@@ -133,6 +133,31 @@ static const struct fileops kqueueops = 
        .fo_restart = kqueue_restart,
 };
 
+static void
+filt_nopdetach(struct knote *kn __unused)
+{
+}
+
+static int
+filt_nopevent(struct knote *kn __unused, long hint __unused)
+{
+       return 0;
+}
+
+static const struct filterops nop_fd_filtops = {
+       .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+       .f_attach = NULL,
+       .f_detach = filt_nopdetach,
+       .f_event = filt_nopevent,
+};
+
+static const struct filterops nop_filtops = {
+       .f_flags = FILTEROP_MPSAFE,
+       .f_attach = NULL,
+       .f_detach = filt_nopdetach,
+       .f_event = filt_nopevent,
+};
+
 static const struct filterops kqread_filtops = {
        .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
        .f_attach = NULL,
@@ -232,6 +257,9 @@ static size_t               user_kfiltersz;         /* size 
  *     -> object lock (e.g., device driver lock, &c.)
  *     -> kn_kq->kq_lock
  *
+ *     knote_filterops_lock (rw)
+ *     -> whatever will be acquired in filter_*()
+ *
  * Locking rules:
  *
  *     f_attach: fdp->fd_lock, KERNEL_LOCK
@@ -279,6 +307,24 @@ static size_t              user_kfiltersz;         /* size 
  */
 static krwlock_t       kqueue_filter_lock;     /* lock on filter lists */
 
+/*
+ * Calls into the filterops need to be resilient against things which
+ * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
+ * chasing garbage pointers (to data, or even potentially code in a
+ * module about to be unloaded).  To that end, we acquire the
+ * knote_filterops_lock as a reader whenever calling into the filter
+ * ops (in the filter_*() functions).  When a driver (or anything else)
+ * is tearing down its klist, klist_fini() acquires knote_filterops_lock
+ * as a writer, and replaces all of the filterops in each knote with a
+ * stub, allowing knote detach (when descriptors are closed) to safely
+ * proceed.
+ *
+ * This lock should be very rarely *contended* (klist tear-down is fairly
+ * rare), but it will be *hot* (acquired as a reader for all filter calls),
+ * so give it its own cache line.
+ */
+static krwlock_t       knote_filterops_lock __cacheline_aligned;
+
 #define        KQ_FLUX_WAIT(kq)        (void)cv_wait(&kq->kq_cv, &kq->kq_lock)
 #define        KQ_FLUX_WAKEUP(kq)      cv_broadcast(&kq->kq_cv)
 
@@ -428,6 +474,13 @@ filter_attach(struct knote *kn)
 {
        int rv;
 
+       /*
+        * No need to acquire knote_filterops_lock here; if the
+        * device is detaching, the driver already guarantees that
+        * no new knote references can be created (same rule as
+        * selinfo).
+        */
+
        KASSERT(kn->kn_fop != NULL);
        KASSERT(kn->kn_fop->f_attach != NULL);
 
@@ -449,6 +502,9 @@ filter_attach(struct knote *kn)
 static void
 filter_detach(struct knote *kn)
 {
+
+       rw_enter(&knote_filterops_lock, RW_READER);
+
        KASSERT(kn->kn_fop != NULL);
        KASSERT(kn->kn_fop->f_detach != NULL);
 
@@ -459,6 +515,8 @@ filter_detach(struct knote *kn)
                kn->kn_fop->f_detach(kn);
                KERNEL_UNLOCK_ONE(NULL);
        }
+
+       rw_exit(&knote_filterops_lock);
 }
 
 static int
@@ -466,6 +524,8 @@ filter_event(struct knote *kn, long hint
 {
        int rv;
 
+       rw_enter(&knote_filterops_lock, RW_READER);
+
        KASSERT(kn->kn_fop != NULL);
        KASSERT(kn->kn_fop->f_event != NULL);
 
@@ -477,13 +537,36 @@ filter_event(struct knote *kn, long hint
                KERNEL_UNLOCK_ONE(NULL);
        }
 
+       rw_exit(&knote_filterops_lock);
+
        return rv;
 }
 
 static int
 filter_touch(struct knote *kn, struct kevent *kev, long type)
 {
-       return kn->kn_fop->f_touch(kn, kev, type);
+       int rv;
+
+       /*
+        * This one is a little tricky; we are called conditionally
+        * (only if f_touch is non-NULL), but that test is done without
+        * synchronizing with klist_fini(), so we need to check again
+        * after acquiring knote_filterops_lock.
+        */
+
+       rw_enter(&knote_filterops_lock, RW_READER);
+
+       KASSERT(kn->kn_fop != NULL);
+
+       if (__predict_true(kn->kn_fop->f_touch != NULL)) {
+               rv = kn->kn_fop->f_touch(kn, kev, type);
+       } else {
+               rv = 0;
+       }
+
+       rw_exit(&knote_filterops_lock);
+
+       return rv;
 }
 
 static kauth_listener_t        kqueue_listener;
@@ -518,6 +601,7 @@ kqueue_init(void)
 {
 
        rw_init(&kqueue_filter_lock);
+       rw_init(&knote_filterops_lock);
 
        kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
            kqueue_listener_cb, NULL);
@@ -2797,3 +2881,59 @@ knote_clear_eof(struct knote *kn)
        kn->kn_flags &= ~EV_EOF;
        mutex_spin_exit(&kq->kq_lock);
 }
+
+/*
+ * Initialize a klist.
+ */
+void
+klist_init(struct klist *list)
+{
+       SLIST_INIT(list);
+}
+
+/*
+ * Neuter all existing knotes on the klist because the list is
+ * being destroyed.  The caller has guaranteed that no additional
+ * knotes will be added to the list, that the backing object's
+ * locks are not held (otherwise there is a locking order issue
+ * with acquiring knote_filterops_lock), and that we can traverse
+ * the list safely in this state.
+ */
+void
+klist_fini(struct klist *list)
+{
+       struct knote *kn;
+
+       rw_enter(&knote_filterops_lock, RW_WRITER);
+
+       SLIST_FOREACH(kn, list, kn_selnext) {
+               KASSERT(kn->kn_fop != NULL);
+               if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
+                       kn->kn_fop = &nop_fd_filtops;
+               } else {
+                       kn->kn_fop = &nop_filtops;
+               }
+       }
+
+       rw_exit(&knote_filterops_lock);
+}
+
+/*
+ * Insert a knote into a klist.
+ */
+void
+klist_insert(struct klist *list, struct knote *kn)
+{
+       SLIST_INSERT_HEAD(list, kn, kn_selnext);
+}
+
+/*
+ * Remove a knote from a klist.  Returns true if the last
+ * knote was removed and the list is now empty.
+ */
+bool
+klist_remove(struct klist *list, struct knote *kn)
+{
+       SLIST_REMOVE(list, kn, knote, kn_selnext);
+       return SLIST_EMPTY(list);
+}
Index: sys/event.h
===================================================================
RCS file: /cvsroot/src/sys/sys/event.h,v
retrieving revision 1.52
diff -u -p -r1.52 event.h
--- sys/event.h 12 Feb 2022 15:51:29 -0000      1.52
+++ sys/event.h 11 Jul 2022 20:53:40 -0000
@@ -334,30 +334,10 @@ int       kfilter_unregister(const char *);
 int    filt_seltrue(struct knote *, long);
 extern const struct filterops seltrue_filtops;
 
-static inline void
-klist_init(struct klist *list)
-{
-       SLIST_INIT(list);
-}
-
-static inline void
-klist_fini(struct klist *list)
-{
-       /* Nothing, for now. */
-}
-
-static inline void
-klist_insert(struct klist *list, struct knote *kn)
-{
-       SLIST_INSERT_HEAD(list, kn, kn_selnext);
-}
-
-static inline bool
-klist_remove(struct klist *list, struct knote *kn)
-{
-       SLIST_REMOVE(list, kn, knote, kn_selnext);
-       return SLIST_EMPTY(list);
-}
+void   klist_init(struct klist *);
+void   klist_fini(struct klist *);
+void   klist_insert(struct klist *, struct knote *);
+bool   klist_remove(struct klist *, struct knote *);
 
 #else  /* !_KERNEL */
 

-- thorpej

