On Fri, Nov 05, 2021 at 10:04:50AM +0100, Martin Pieuchot wrote:
> New poll/select(2) implementation convert 'struct pollfd' and 'fdset' to
> knotes (kqueue event descriptors) then pass them to the kqueue subsystem.
> A knote is allocated, with kqueue_register(), for every read, write and
> except condition watched on a given FD.  That means at most 3 allocations
> might be necessary per FD.
> 
> The diff below reduces the overhead of per-syscall allocation/free of those
> descriptors by leaving those which didn't trigger on the kqueue across
> syscall.  Leaving knotes on the kqueue allows kqueue_register() to re-use
> an existing descriptor instead of re-allocating a new one.
> 
> With this, knotes are now lazily removed.  The mechanism uses a serial
> number which is incremented for every syscall and indicates whether a knote
> sitting in the kqueue is still valid or should be freed.
> 
> Note that performance improvements might not be visible with this diff
> alone because kqueue_register() still pre-allocates a descriptor then drops
> it.
> 
> visa@ already pointed out that the lazy removal logic could be integrated
> in kqueue_scan() which would reduce the complexity of those two syscalls.
> I'm arguing for doing this in a next step in-tree.

I think it would make more sense to add the removal logic to the scan
function first as doing so would keep the code modifications more
logical and simpler. This would also avoid the need to go through
a temporary removal approach.

Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.170
diff -u -p -r1.170 kern_event.c
--- kern/kern_event.c   6 Nov 2021 05:48:47 -0000       1.170
+++ kern/kern_event.c   6 Nov 2021 15:31:04 -0000
@@ -73,6 +73,7 @@ void  kqueue_terminate(struct proc *p, st
 void   KQREF(struct kqueue *);
 void   KQRELE(struct kqueue *);
 
+void   kqueue_purge(struct proc *, struct kqueue *);
 int    kqueue_sleep(struct kqueue *, struct timespec *);
 
 int    kqueue_read(struct file *, struct uio *, int);
@@ -806,6 +807,22 @@ kqpoll_exit(void)
 }
 
 void
+kqpoll_done(unsigned int num)
+{
+       struct proc *p = curproc;
+
+       KASSERT(p->p_kq != NULL);
+
+       if (p->p_kq_serial + num >= p->p_kq_serial) {
+               p->p_kq_serial += num;
+       } else {
+               /* Clear all knotes after serial wraparound. */
+               kqueue_purge(p, p->p_kq);
+               p->p_kq_serial = 1;
+       }
+}
+
+void
 kqpoll_dequeue(struct proc *p, int all)
 {
        struct knote marker;
@@ -1383,6 +1400,15 @@ retry:
 
                mtx_leave(&kq->kq_lock);
 
+               /* Drop spurious events. */
+               if (p->p_kq == kq &&
+                   p->p_kq_serial > (unsigned long)kn->kn_udata) {
+                       filter_detach(kn);
+                       knote_drop(kn, p);
+                       mtx_enter(&kq->kq_lock);
+                       continue;
+               }
+
                memset(kevp, 0, sizeof(*kevp));
                if (filter_process(kn, kevp) == 0) {
                        mtx_enter(&kq->kq_lock);
Index: kern/sys_generic.c
===================================================================
RCS file: src/sys/kern/sys_generic.c,v
retrieving revision 1.139
diff -u -p -r1.139 sys_generic.c
--- kern/sys_generic.c  29 Oct 2021 15:52:44 -0000      1.139
+++ kern/sys_generic.c  6 Nov 2021 15:31:04 -0000
@@ -730,8 +730,7 @@ done:
        if (pibits[0] != (fd_set *)&bits[0])
                free(pibits[0], M_TEMP, 6 * ni);
 
-       kqueue_purge(p, p->p_kq);
-       p->p_kq_serial += nd;
+       kqpoll_done(nd);
 
        return (error);
 }
@@ -1230,8 +1229,7 @@ bad:
        if (pl != pfds)
                free(pl, M_TEMP, sz);
 
-       kqueue_purge(p, p->p_kq);
-       p->p_kq_serial += nfds;
+       kqpoll_done(nfds);
 
        return (error);
 }
@@ -1251,8 +1249,7 @@ ppollcollect(struct proc *p, struct keve
        /*
         * Lazily delete spurious events.
         *
-        * This should not happen as long as kqueue_purge() is called
-        * at the end of every syscall.  It migh be interesting to do
+        * This should not happen.  It might be interesting to do
         * like DragonFlyBSD and not always allocated a new knote in
         * kqueue_register() with that lazy removal makes sense.
         */
Index: sys/event.h
===================================================================
RCS file: src/sys/sys/event.h,v
retrieving revision 1.57
diff -u -p -r1.57 event.h
--- sys/event.h 24 Oct 2021 07:02:47 -0000      1.57
+++ sys/event.h 6 Nov 2021 15:31:04 -0000
@@ -289,6 +289,7 @@ extern const struct klistops socket_klis
 
 extern void    kqpoll_init(void);
 extern void    kqpoll_exit(void);
+extern void    kqpoll_done(unsigned int);
 extern void    knote(struct klist *list, long hint);
 extern void    knote_fdclose(struct proc *p, int fd);
 extern void    knote_processexit(struct proc *);
@@ -302,7 +303,6 @@ extern int  kqueue_scan(struct kqueue_sca
                    struct timespec *, struct proc *, int *);
 extern void    kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *);
 extern void    kqueue_scan_finish(struct kqueue_scan_state *);
-extern void    kqueue_purge(struct proc *, struct kqueue *);
 extern int     filt_seltrue(struct knote *kn, long hint);
 extern int     seltrue_kqfilter(dev_t, struct knote *);
 extern void    klist_init(struct klist *, const struct klistops *, void *);

Reply via email to