The diff below is the counterpart of the select(2) one I just committed: it makes poll(2) and ppoll(2) use kqueue internally.
They use the same logic as select(2): convert the pollfd structures into
kqueue events with EV_SET(2), then wait in kqueue_scan().

To make this implementation compatible with the existing poll(2)
semantics, I added a new FIFO-specific kqueue filter to handle the case
where POLLOUT is specified on a read-only file descriptor.  Thanks to
millert@ for the idea.  The sys/fifofs regress tests pass with this.

As with the select(2) diff, I'm currently interested in hearing about
any incompatibility with the current behavior that you find.  Two small
userland programs are appended after the diff to help with testing.

Thanks for testing,
Martin

Index: kern/sys_generic.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.136
diff -u -p -r1.136 sys_generic.c
--- kern/sys_generic.c	14 Oct 2021 08:46:01 -0000	1.136
+++ kern/sys_generic.c	14 Oct 2021 09:00:22 -0000
@@ -81,6 +81,8 @@ int kqpoll_debug = 0;
 
 int pselregister(struct proc *, fd_set *[], int, int *);
 int pselcollect(struct proc *, struct kevent *, fd_set *[], int *);
+int ppollregister(struct proc *, struct pollfd *, int, int *);
+int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
 
 int pollout(struct pollfd *, struct pollfd *, u_int);
 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
@@ -769,6 +771,7 @@ pselregister(struct proc *p, fd_set *pib
 			/* FALLTHROUGH */
 		case EOPNOTSUPP:/* No underlying kqfilter */
 		case EINVAL:	/* Unimplemented filter */
+		case EPERM:	/* Specific to FIFO */
 			error = 0;
 			break;
 		case ENXIO:	/* Device has been detached */
@@ -899,31 +902,132 @@ doselwakeup(struct selinfo *sip)
 	}
 }
 
-void
-pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
+int
+ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
+    struct pollfd *pl)
 {
-	struct filedesc *fdp = p->p_fd;
-	struct file *fp;
-	u_int i;
-	int n = 0;
+	int i, error, nevents = 0;
 
-	for (i = 0; i < nfd; i++, pl++) {
-		/* Check the file descriptor. */
-		if (pl->fd < 0) {
-			pl->revents = 0;
-			continue;
+	KASSERT(pl->revents == 0);
+
+#ifdef KTRACE
+	if (KTRPOINT(p, KTR_STRUCT))
+		ktrevent(p, kevp, nkev);
+#endif
+	for (i = 0; i < nkev; i++, kevp++) {
+again:
+		error = kqueue_register(p->p_kq, kevp, p);
+		switch (error) {
+		case 0:
+			nevents++;
+			break;
+		case EOPNOTSUPP:/* No underlying kqfilter */
+		case EINVAL:	/* Unimplemented filter */
+			break;
+		case EBADF:	/* Bad file descriptor */
+			pl->revents |= POLLNVAL;
+			break;
+		case EPERM:	/* Specific to FIFO */
+			KASSERT(kevp->filter == EVFILT_WRITE);
+			if (nkev == 1) {
+				/*
+				 * If this is the only filter make sure
+				 * POLLHUP is passed to userland.
+				 */
+				kevp->filter = EVFILT_EXCEPT;
+				goto again;
+			}
+			break;
+		case EPIPE:	/* Specific to pipes */
+			KASSERT(kevp->filter == EVFILT_WRITE);
+			pl->revents |= POLLHUP;
+			break;
+		default:
+#ifdef DIAGNOSTIC
+			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
+			    " %lu filt %d ERROR=%d\n",
+			    ((unsigned long)kevp->udata - p->p_kq_serial),
+			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
+			    error);
+#endif
+			/* FALLTHROUGH */
+		case ENXIO:	/* Device has been detached */
+			pl->revents |= POLLERR;
+			break;
 		}
-		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
-			pl->revents = POLLNVAL;
-			n++;
+	}
+
+	return (nevents);
+}
+
+/*
+ * Convert pollfd into kqueue events and register them on the
+ * per-thread queue.
+ *
+ * Return the number of pollfd that triggered at least one error and aren't
+ * completely monitored. These pollfd should have the corresponding error bit
+ * set in `revents'.
+ *
+ * At most 3 events can correspond to a single pollfd.
+ */
+int
+ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered)
+{
+	int i, nkev, nevt, errcount = 0, forcehup = 0;
+	struct kevent kev[3], *kevp;
+
+	for (i = 0; i < nfds; i++) {
+		pl[i].events &= ~POLL_NOHUP;
+		pl[i].revents = 0;
+
+		if (pl[i].fd < 0) continue;
+
+		/* With no event to monitor only check for hang-up. */
+		forcehup = (pl[i].events == 0);
+
+		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
+		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);
+
+		nevt = 0;
+		nkev = 0;
+		kevp = kev;
+		if (pl[i].events & (POLLIN | POLLRDNORM)) {
+			EV_SET(kevp, pl[i].fd, EVFILT_READ,
+			    EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+			    (void *)(p->p_kq_serial + i));
+			nkev++;
+			kevp++;
+		}
+		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
+			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
+			    EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+			    (void *)(p->p_kq_serial + i));
+			nkev++;
+			kevp++;
 		}
-		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
-		FRELE(fp, p);
-		if (pl->revents != 0)
-			n++;
+		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
+			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
+			    EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+			    (void *)(p->p_kq_serial + i));
+			nkev++;
+			kevp++;
+		}
+
+		if (nkev == 0)
+			continue;
+
+		nevt = ppollregister_evts(p, kev, nkev, &pl[i]);
+		if (nevt == 0 && !forcehup)
+			errcount++;
+		*nregistered += nevt;
 	}
-	*retval = n;
+
+#ifdef DIAGNOSTIC
+	DPRINTFN(1, "poll registered = %d, errors = %d\n", *nregistered,
+	    errcount);
+#endif
+	return (errcount);
 }
 
 /*
@@ -1013,11 +1117,10 @@ int
 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
     struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
 {
-	size_t sz;
+	struct kqueue_scan_state scan;
 	struct pollfd pfds[4], *pl = pfds;
-	struct timespec elapsed, start, stop;
-	uint64_t nsecs;
-	int ncoll, i, s, error;
+	int error, nevents = 0;
+	size_t sz;
 
 	/* Standards say no more than MAX_OPEN; this is possibly better. */
 	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
@@ -1031,58 +1134,75 @@ doppoll(struct proc *p, struct pollfd *f
 		return (EINVAL);
 	}
 
+	kqpoll_init();
+
 	sz = nfds * sizeof(*pl);
 
 	if ((error = copyin(fds, pl, sz)) != 0)
 		goto bad;
 
-	for (i = 0; i < nfds; i++) {
-		pl[i].events &= ~POLL_NOHUP;
-		pl[i].revents = 0;
-	}
-
 	if (sigmask)
 		dosigsuspend(p, *sigmask &~ sigcantmask);
 
-retry:
-	ncoll = nselcoll;
-	atomic_setbits_int(&p->p_flag, P_SELECT);
-	pollscan(p, pl, nfds, retval);
-	if (*retval)
-		goto done;
-	if (timeout == NULL || timespecisset(timeout)) {
-		if (timeout != NULL) {
-			getnanouptime(&start);
-			nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
-		} else
-			nsecs = INFSLP;
-		s = splhigh();
-		if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
-			splx(s);
-			goto retry;
-		}
-		atomic_clearbits_int(&p->p_flag, P_SELECT);
-		error = tsleep_nsec(&selwait, PSOCK | PCATCH, "poll", nsecs);
-		splx(s);
+	/* Register kqueue events */
+	*retval = ppollregister(p, pl, nfds, &nevents);
+
+	/*
+	 * The poll/select family of syscalls has been designed to
+	 * block when file descriptors are not available, even if
+	 * there's nothing to wait for.
+	 */
+	if (nevents == 0) {
+		uint64_t nsecs = INFSLP;
+
 		if (timeout != NULL) {
-			getnanouptime(&stop);
-			timespecsub(&stop, &start, &elapsed);
-			timespecsub(timeout, &elapsed, timeout);
-			if (timeout->tv_sec < 0)
-				timespecclear(timeout);
+			if (!timespecisset(timeout))
+				goto done;
+			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
 		}
-		if (error == 0 || error == EWOULDBLOCK)
-			goto retry;
+
+		error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqpoll", nsecs);
+		if (error == ERESTART)
+			error = EINTR;
+		if (error == EWOULDBLOCK)
+			error = 0;
+		goto done;
 	}
 
+	/* Collect at most `nevents' events possibly waiting in kqueue_scan() */
+	kqueue_scan_setup(&scan, p->p_kq);
+	while (nevents > 0) {
+		struct kevent kev[KQ_NEVENTS];
+		int i, ready, count;
+
+		/* Maximum number of events per iteration */
+		count = MIN(nitems(kev), nevents);
+		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+#ifdef KTRACE
+		if (KTRPOINT(p, KTR_STRUCT))
+			ktrevent(p, kev, ready);
+#endif
+		/* Convert back events that are ready. */
+		for (i = 0; i < ready; i++)
+			*retval += ppollcollect(p, &kev[i], pl, nfds);
+
+		/*
+		 * Stop if there was an error or if we had enough
+		 * room to collect all events that were ready.
+		 */
+		if (error || ready < count)
+			break;
+
+		nevents -= ready;
+	}
+	kqueue_scan_finish(&scan);
 done:
-	atomic_clearbits_int(&p->p_flag, P_SELECT);
 	/*
 	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
 	 * ignored (since the whole point is to see what would block).
 	 */
 	switch (error) {
-	case ERESTART:
+	case EINTR:
 		error = pollout(pl, fds, nfds);
 		if (error == 0)
 			error = EINTR;
@@ -1099,7 +1219,95 @@ done:
 bad:
 	if (pl != pfds)
 		free(pl, M_TEMP, sz);
+
+	kqueue_purge(p, p->p_kq);
+	p->p_kq_serial += nfds;
+
 	return (error);
+}
+
+/*
+ * Convert given kqueue event into the corresponding poll(2) revents bit.
+ */
+int
+ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
+{
+	int already_seen;
+	unsigned long i;
+
+	/* Extract poll array index */
+	i = (unsigned long)kevp->udata - p->p_kq_serial;
+
+#ifdef DIAGNOSTIC
+	/*
+	 * Lazily delete spurious events.
+	 *
+	 * This should not happen as long as kqueue_purge() is called
+	 * at the end of every syscall. It might be interesting to do
+	 * as DragonFlyBSD does and not always allocate a new knote in
+	 * kqueue_register(); with that, this lazy removal makes sense.
+	 */
+	if (i >= nfds) {
+		DPRINTFN(0, "poll get out of range udata %lu vs serial %lu\n",
+		    (unsigned long)kevp->udata, p->p_kq_serial);
+		kevp->flags = EV_DISABLE|EV_DELETE;
+		kqueue_register(p->p_kq, kevp, p);
+		return (0);
+	}
+	if ((int)kevp->ident != pl[i].fd) {
+		DPRINTFN(0, "poll get %lu/%d mismatch fd %u!=%d serial %lu\n",
+		    i+1, nfds, (int)kevp->ident, pl[i].fd, p->p_kq_serial);
+		return (0);
+	}
+#endif
+
+	/*
+	 * A given descriptor may already have generated an error
+	 * against another filter during kqueue_register().
+	 *
+	 * Make sure to set the appropriate flags but do not
+	 * increment `*retval' more than once.
+	 */
+	already_seen = (pl[i].revents != 0);
+
+	switch (kevp->filter) {
+	case EVFILT_READ:
+		if (kevp->flags & __EV_HUP)
+			pl[i].revents |= POLLHUP;
+		if (pl[i].events & (POLLIN | POLLRDNORM))
+			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
+		break;
+	case EVFILT_WRITE:
+		/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
+		if (kevp->flags & __EV_HUP) {
+			pl[i].revents |= POLLHUP;
+		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
+			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
+		}
+		break;
+	case EVFILT_EXCEPT:
+		if (kevp->flags & __EV_HUP) {
+#if 1
+			if (pl[i].events != 0 && pl[i].events != POLLOUT)
+				DPRINTFN(0, "weird events %x\n", pl[i].events);
+#endif
+			pl[i].revents |= POLLHUP;
+			break;
+		}
+		if (pl[i].events & (POLLPRI | POLLRDBAND))
+			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
+		break;
+	default:
+		KASSERT(0);
+	}
+
+	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
+	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
+	    kevp->filter);
+	if (!already_seen && (pl[i].revents != 0))
+		return (1);
+
+	return (0);
 }
 
 /*
Index: miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.81
diff -u -p -r1.81 fifo_vnops.c
--- miscfs/fifofs/fifo_vnops.c	2 Oct 2021 08:51:41 -0000	1.81
+++ miscfs/fifofs/fifo_vnops.c	14 Oct 2021 09:00:22 -0000
@@ -112,6 +112,10 @@ int	filt_fifowrite(struct knote *kn, lon
 int	filt_fifowritemodify(struct kevent *kev, struct knote *kn);
 int	filt_fifowriteprocess(struct knote *kn, struct kevent *kev);
 int	filt_fifowrite_common(struct knote *kn, struct socket *so);
+int	filt_fiforhup(struct knote *kn, long hint);
+int	filt_fiforhupmodify(struct kevent *kev, struct knote *kn);
+int	filt_fiforhupprocess(struct knote *kn, struct kevent *kev);
+int	filt_fiforhup_common(struct knote *kn, struct socket *so);
 
 const struct filterops fiforead_filtops = {
 	.f_flags	= FILTEROP_ISFD,
@@ -131,6 +135,15 @@ const struct filterops fifowrite_filtops
 	.f_process	= filt_fifowriteprocess,
 };
 
+const struct filterops fifohup_filtops = {
+	.f_flags	= FILTEROP_ISFD,
+	.f_attach	= NULL,
+	.f_detach	= filt_fifordetach,
+	.f_event	= filt_fiforhup,
+	.f_modify	= filt_fiforhupmodify,
+	.f_process	= filt_fiforhupprocess,
+};
+
 /*
  * Open called to set up a new instance of a fifo or
 * to find an active instance of a fifo.
@@ -516,12 +529,27 @@ fifo_kqfilter(void *v)
 		sb = &so->so_rcv;
 		break;
 	case EVFILT_WRITE:
-		if (!(ap->a_fflag & FWRITE))
+		if (!(ap->a_fflag & FWRITE)) {
+			/* Tell upper layer to ask for POLLHUP only */
+			if (ap->a_kn->kn_flags & __EV_POLL)
+				return (EPERM);
 			return (EINVAL);
+		}
 		ap->a_kn->kn_fop = &fifowrite_filtops;
 		so = fip->fi_writesock;
 		sb = &so->so_snd;
 		break;
+	case EVFILT_EXCEPT:
+		/*
+		 * Filter used to set POLLHUP when no poll(2) flags are
+		 * specified or if POLLOUT is passed on a read-only fd.
+ */ + if (!(ap->a_kn->kn_flags & __EV_POLL)) + return (EINVAL); + ap->a_kn->kn_fop = &fifohup_filtops; + so = fip->fi_readsock; + sb = &so->so_rcv; + break; default: return (EINVAL); } @@ -670,3 +698,60 @@ filt_fifowriteprocess(struct knote *kn, return (rv); } + +int +filt_fiforhup_common(struct knote *kn, struct socket *so) +{ + int rv = 0; + + soassertlocked(so); + KASSERT(kn->kn_flags & __EV_POLL); + + if (so->so_state & SS_ISDISCONNECTED) { + kn->kn_flags |= __EV_HUP; + rv = 1; + } + + return (rv); +} + +int +filt_fiforhup(struct knote *kn, long hint) +{ + struct socket *so = kn->kn_hook; + + return (filt_fiforhup_common(kn, so)); +} + +int +filt_fiforhupmodify(struct kevent *kev, struct knote *kn) +{ + struct socket *so = kn->kn_hook; + int rv, s; + + s = solock(so); + knote_modify(kev, kn); + rv = filt_fiforhup_common(kn, so); + sounlock(so, s); + + return (rv); +} + +int +filt_fiforhupprocess(struct knote *kn, struct kevent *kev) +{ + struct socket *so = kn->kn_hook; + int rv, s; + + s = solock(so); + if (kev != NULL && (kn->kn_flags & EV_ONESHOT)) + rv = 1; + else + rv = filt_fiforhup_common(kn, so); + if (rv != 0) + knote_submit(kn, kev); + sounlock(so, s); + + return (rv); +} +
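
For reviewers who want to play with the pollfd -> kevent mapping from
userland, below is a rough analogue of what ppollregister() and
kqueue_scan() now do in the kernel.  It is only an illustration, not
part of the diff: __EV_POLL and the per-thread kqueue are
kernel-internal, so a regular kqueue(2) descriptor and plain kevent(2)
flags stand in for them, and stdin is used as an arbitrary fd.

/*
 * Illustrative sketch: map one pollfd (events = POLLIN|POLLOUT) to
 * one kevent per event class, register them and wait once, mimicking
 * in userland what the kernel now does for poll(2).
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#include <err.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev[2], res[2];
	struct timespec ts = { 1, 0 };	/* like a 1000ms poll() timeout */
	short events = POLLIN | POLLOUT;
	int kq, fd = STDIN_FILENO, nkev = 0, i, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* One kevent per event class, as ppollregister() does. */
	if (events & (POLLIN | POLLRDNORM))
		EV_SET(&kev[nkev++], fd, EVFILT_READ,
		    EV_ADD | EV_ENABLE | EV_ONESHOT, 0, 0, NULL);
	if (events & (POLLOUT | POLLWRNORM))
		EV_SET(&kev[nkev++], fd, EVFILT_WRITE,
		    EV_ADD | EV_ENABLE | EV_ONESHOT, 0, 0, NULL);

	/* Register and collect in a single kevent(2) call. */
	if ((n = kevent(kq, kev, nkev, res, nkev, &ts)) == -1)
		err(1, "kevent");

	for (i = 0; i < n; i++)
		printf("fd %lu filter %d ready\n",
		    (unsigned long)res[i].ident, res[i].filter);

	close(kq);
	return (0);
}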
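
And here is a minimal sketch to exercise the FIFO corner case the new
EVFILT_EXCEPT filter is there for: POLLOUT on a descriptor opened
read-only.  The expectation is that poll(2) keeps reporting POLLHUP
once the write side is gone, as it did with the old code.  The fifo
path is a throwaway placeholder; reports of diverging behavior are
welcome.

#include <sys/stat.h>

#include <err.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "fifo.test";	/* placeholder path */
	struct pollfd pfd;
	int rfd, wfd, n;

	(void)unlink(path);
	if (mkfifo(path, 0600) == -1)
		err(1, "mkfifo");

	/* O_NONBLOCK so the read-only open doesn't wait for a writer. */
	if ((rfd = open(path, O_RDONLY | O_NONBLOCK)) == -1)
		err(1, "open ro");
	if ((wfd = open(path, O_WRONLY)) == -1)
		err(1, "open wo");
	close(wfd);		/* hang up the write side */

	pfd.fd = rfd;
	pfd.events = POLLOUT;	/* write event on a read-only fd */
	if ((n = poll(&pfd, 1, 1000)) == -1)
		err(1, "poll");
	printf("poll: %d revents: %#x (POLLHUP is %#x)\n",
	    n, pfd.revents, POLLHUP);

	close(rfd);
	(void)unlink(path);
	return (0);
}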