On Fri, Dec 11, 2020 at 09:35:59AM -0300, Martin Pieuchot wrote:
> On 10/12/20(Thu) 09:59, Martin Pieuchot wrote:
> > All previous kqueue refactorings have been committed; here's a final diff
> > that modifies the internal implementation of {p,}select(2) to query the
> > kqfilter handlers instead of the poll ones.
> > 
> > {p,}poll(2) are left untouched to ease the transition.
> > 
> > Here's what I said in the original mail back in May [0]:
> > 
> > > The main argument for this proposal is to reduce the amount of code
> > > executed to notify userland when an event occurs.  The outcome of this
> > > diff is that only a single notification subsystem needs to be taken out
> > > of the KERNEL_LOCK().  This simplifies a lot of the existing locking
> > > tentacles.
> > > 
> > > Using kqueue internally means collisions are avoided and there's no need
> > > to query handlers for fds that aren't ready.  This comes at the cost of
> > > allocating event descriptors, a space vs. time trade-off.  Note that this
> > > cost can be reduced by lazily removing event descriptors so that they
> > > can be re-used.
> > 
> > The logic is as follows:
> > 
> > - With this change every thread uses a "private" kqueue, usable only by
> >   the kernel, to register events for select(2) and later poll(2).
> > 
> > - Events specified via FD_SET(2) are converted to their kqueue equivalents.
> >   ktrace(1) now also outputs the converted events to ease debugging.
> > 
> > - kqueue_scan() might be called multiple times per syscall, just like with
> >   the last version of kevent(2). 
> > 
> > - At the end of every {p,}select(2) syscall the private kqueue is purged.
> > 
> > [0] https://marc.info/?l=openbsd-tech&m=158979921322191&w=2
> 
> Updated diff that adds a FALLTHROUGH lost in the refactoring and no longer
> blocks if the timeout is cleared and no events are requested, as pointed
> out by cheloha@:
> 
> Index: kern/sys_generic.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sys_generic.c,v
> retrieving revision 1.132
> diff -u -p -r1.132 sys_generic.c
> --- kern/sys_generic.c        2 Oct 2020 15:45:22 -0000       1.132
> +++ kern/sys_generic.c        11 Dec 2020 12:28:10 -0000
> @@ -55,6 +55,7 @@
>  #include <sys/time.h>
>  #include <sys/malloc.h>
>  #include <sys/poll.h>
> +#include <sys/eventvar.h>
>  #ifdef KTRACE
>  #include <sys/ktrace.h>
>  #endif
> @@ -66,8 +67,21 @@
>  
>  #include <uvm/uvm_extern.h>
>  
> -int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
> -void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
> +/*
> + * Debug values:
> + *  1 - print implementation errors, things that should not happen.
> + *  2 - print ppoll(2) information, somewhat verbose
> + *  3 - print pselect(2) and ppoll(2) information, very verbose
> + */
> +int kqpoll_debug = 0;
> +#define DPRINTFN(v, x...) if (kqpoll_debug > v) do {         \
> +    printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);       \
> +    printf(x);                                                       \
> +} while (0)

"do" and "while" are not needed with the "if", indentation with tabs.

> +
> +int pselregister(struct proc *, fd_set *, int, int, int *);
> +int pselcollect(struct proc *, struct kevent *, fd_set *[]);
> +
>  int pollout(struct pollfd *, struct pollfd *, u_int);
>  int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
>      struct timespec *, const sigset_t *, register_t *);
> @@ -584,11 +598,10 @@ int
>  dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
>      struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
>  {
> +     struct kqueue_scan_state scan;
>       fd_mask bits[6];
>       fd_set *pibits[3], *pobits[3];
> -     struct timespec elapsed, start, stop;
> -     uint64_t nsecs;
> -     int s, ncoll, error = 0;
> +     int error, nevents = 0;
>       u_int ni;
>  
>       if (nd < 0)
> @@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
>               pobits[2] = (fd_set *)&bits[5];
>       }
>  
> +     kqpoll_init();
> +
>  #define      getbits(name, x) \
>       if (name && (error = copyin(name, pibits[x], ni))) \
>               goto done;
> @@ -636,43 +651,59 @@ dopselect(struct proc *p, int nd, fd_set
>       if (sigmask)
>               dosigsuspend(p, *sigmask &~ sigcantmask);
>  
> -retry:
> -     ncoll = nselcoll;
> -     atomic_setbits_int(&p->p_flag, P_SELECT);
> -     error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
> -     if (error || *retval)
> +     /* Register kqueue events */
> +     if ((error = pselregister(p, pibits[0], nd, ni, &nevents)) != 0)
>               goto done;
> -     if (timeout == NULL || timespecisset(timeout)) {
> -             if (timeout != NULL) {
> -                     getnanouptime(&start);
> -                     nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
> -             } else
> -                     nsecs = INFSLP;
> -             s = splhigh();
> -             if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
> -                     splx(s);
> -                     goto retry;
> -             }
> -             atomic_clearbits_int(&p->p_flag, P_SELECT);
> -             error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
> -             splx(s);
> +
> +     /*
> +      * The poll/select family of syscalls has been designed to
> +      * block when file descriptors are not available, even if
> +      * there's nothing to wait for.
> +      */
> +     if (nevents == 0) {
> +             uint64_t nsecs = INFSLP;
> +
>               if (timeout != NULL) {
> -                     getnanouptime(&stop);
> -                     timespecsub(&stop, &start, &elapsed);
> -                     timespecsub(timeout, &elapsed, timeout);
> -                     if (timeout->tv_sec < 0)
> -                             timespecclear(timeout);
> +                     if (!timespecisset(timeout))
> +                             goto done;
> +                     nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
>               }
> -             if (error == 0 || error == EWOULDBLOCK)
> -                     goto retry;
> +             error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
> +             /* select is not restarted after signals... */
> +             if (error == ERESTART)
> +                     error = EINTR;
> +             if (error == EWOULDBLOCK)
> +                     error = 0;
> +             goto done;
> +     }
> +
> +     /* Collect at most `nevents' possibly waiting in kqueue_scan() */
> +     kqueue_scan_setup(&scan, p->p_kq);
> +     while (nevents > 0) {
> +             struct kevent kev[KQ_NEVENTS];
> +             int i, ready, count;
> +
> +             /* Maximum number of events per iteration */
> +             count = MIN(nitems(kev), nevents);
> +             ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
> +#ifdef KTRACE
> +             if (KTRPOINT(p, KTR_STRUCT))
> +                     ktrevent(p, kev, ready);
> +#endif
> +             /* Convert back events that are ready. */
> +             for (i = 0; i < ready; i++)
> +                     *retval += pselcollect(p, &kev[i], pobits);
> +             /*
> +              * Stop if there was an error or if we had enough
> +              * space to collect all events that were ready.
> +              */
> +             if (error || ready < count)
> +                     break;
> +
> +             nevents -= ready;
>       }
> -done:
> -     atomic_clearbits_int(&p->p_flag, P_SELECT);
> -     /* select is not restarted after signals... */
> -     if (error == ERESTART)
> -             error = EINTR;
> -     if (error == EWOULDBLOCK)
> -             error = 0;
> +     kqueue_scan_finish(&scan);
> + done:
>  #define      putbits(name, x) \
>       if (name && (error2 = copyout(pobits[x], name, ni))) \
>               error = error2;
> @@ -694,41 +725,102 @@ done:
>  
>       if (pibits[0] != (fd_set *)&bits[0])
>               free(pibits[0], M_TEMP, 6 * ni);
> +
> +     kqueue_purge(p, p->p_kq);
> +     p->p_kq_serial += nd;
> +
>       return (error);
>  }
>  
> +/*
> + * Convert fd_set into kqueue events and register them on the
> + * per-thread queue.
> + */
>  int
> -selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
> -    register_t *retval)
> +pselregister(struct proc *p, fd_set *ibits, int nfd, int ni, int *nregistered)

I think pselregister() should take the bit vector array similarly to
pselcollect(). Then the function could also skip some of the pointer
arithmetic.

int pselregister(struct proc *p, fd_set *pibits[3], int nfd, int *nregistered);
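
With that, the cibits pointer arithmetic and the ni argument could go away
and the loops would index the sets directly, roughly like this (untested,
"fds" is just a made-up local name):

	for (msk = 0; msk < 3; msk++) {
		fd_set *fds = pibits[msk];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = fds->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);
				/* EV_SET() + kqueue_register() as below */
			}
		}
	}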

>  {
> -     caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
> -     struct filedesc *fdp = p->p_fd;
> -     int msk, i, j, fd;
> +     int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
> +     int evff[] = { 0, 0, NOTE_OOB };

evf and evff should be static const.
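i.e. something like:

	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };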

> +     caddr_t cibits = (caddr_t)ibits;
> +     int msk, i, j, fd, nevents = 0, error = 0;
> +     struct kevent kev;
>       fd_mask bits;
> -     struct file *fp;
> -     int n = 0;
> -     static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
>  
>       for (msk = 0; msk < 3; msk++) {
>               fd_set *pibits = (fd_set *)&cibits[msk*ni];
> -             fd_set *pobits = (fd_set *)&cobits[msk*ni];
>  
>               for (i = 0; i < nfd; i += NFDBITS) {
>                       bits = pibits->fds_bits[i/NFDBITS];
>                       while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
>                               bits &= ~(1 << j);
> -                             if ((fp = fd_getfile(fdp, fd)) == NULL)
> -                                     return (EBADF);
> -                             if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
> -                                     FD_SET(fd, pobits);
> -                                     n++;
> +
> +                             DPRINTFN(2, "select fd %d mask %d serial %lu\n",
> +                                 fd, msk, p->p_kq_serial);
> +                             EV_SET(&kev, fd, evf[msk],
> +                                 EV_ADD|EV_ENABLE|__EV_POLL, evff[msk], 0,
> +                                 (void *)(p->p_kq_serial));

As the implementation does not preserve registrations across
invocations at the moment, shouldn't the event be registered with
EV_ONESHOT? This would save some work in kqueue_purge() with knotes
that got activated during the scan.

In addition, EV_ONESHOT might be a somewhat more cautious start for
the backend conversion because of possible semantic subtleties with
edge-triggered events (however, I think such issues, if there are any,
suggest bugs in f_event code).
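
That is, registering each fd with something like:

	EV_SET(&kev, fd, evf[msk],
	    EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, evff[msk], 0,
	    (void *)(p->p_kq_serial));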

> +#ifdef KTRACE
> +                             if (KTRPOINT(p, KTR_STRUCT))
> +                                     ktrevent(p, &kev, 1);
> +#endif
> +                             error = kqueue_register(p->p_kq, &kev, p);
> +                             switch (error) {
> +                             case 0:
> +                                     nevents++;
> +                             /* FALLTHROUGH */
> +                             case EOPNOTSUPP:/* No underlying kqfilter */
> +                             case EINVAL:    /* Unimplemented filter */
> +                                     error = 0;
> +                                     break;
> +                             case ENXIO:     /* Device has been detached */
> +                             default:
> +                                     goto bad;
>                               }
> -                             FRELE(fp, p);
>                       }
>               }
>       }
> -     *retval = n;
> +
> +     *nregistered = nevents;
>       return (0);
> +bad:
> +     DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
> +         kev.filter, error);
> +     return (error);
> +}
> +
> +/*
> + * Convert given kqueue event into corresponding select(2) bit.
> + */
> +int
> +pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
> +{
> +#ifdef DIAGNOSTIC
> +     /* Filter out and lazily delete spurious events */
> +     if ((unsigned long)kevp->udata != p->p_kq_serial) {
> +             DPRINTFN(0, "select fd %u mismatched serial %lu\n",
> +                 (int)kevp->ident, p->p_kq_serial);
> +             kevp->flags = EV_DISABLE|EV_DELETE;
> +             kqueue_register(p->p_kq, kevp, p);
> +             return (0);
> +     }
> +#endif

Shouldn't skipping of spurious events be taken into account in the
main scan loop? Otherwise they may cause artifacts, such as premature
timeout expiry when a wakeup is caused solely by a spurious event, or
return of an incomplete result when spurious events eat capacity from
nevents.

However, I think this can be addressed in a later patch because lazy
removal is not in use.
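
For later reference, one possible shape would be to have the scan loop in
dopselect() subtract only the events that pselcollect() actually converted,
so that spurious ones do not eat capacity from nevents.  A hypothetical
sketch, with a made-up "ncollected" local:

	int ncollected = 0;

	/* Convert back events that are ready. */
	for (i = 0; i < ready; i++)
		ncollected += pselcollect(p, &kev[i], pobits);
	*retval += ncollected;

	if (error || ready < count)
		break;

	nevents -= ncollected;

The wakeup-caused-only-by-spurious-events case would additionally need the
loop to go back into kqueue_scan() instead of breaking out.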

> +
> +     switch (kevp->filter) {
> +     case EVFILT_READ:
> +             FD_SET(kevp->ident, pobits[0]);
> +             break;
> +     case EVFILT_WRITE:
> +             FD_SET(kevp->ident, pobits[1]);
> +             break;
> +     case EVFILT_EXCEPT:
> +             FD_SET(kevp->ident, pobits[2]);
> +             break;
> +     default:
> +             KASSERT(0);
> +     }
> +
> +     DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
> +     return (1);
>  }
>  
>  int
> 
