On Fri, Oct 02, 2020 at 12:19:35PM +0200, Martin Pieuchot wrote:
>
> [...]
>
> I'd like to get this in early in this release cycle, so please test and
> report back :o)
You removed the resleep logic that accounts for tsleep_nsec(9) returning
early, so select(2) and pselect(2) can now return before their timeout
has elapsed.  I've left questions below at the spots that look off to me.
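For reference, this is the pattern being removed (condensed from the
dopselect() hunk below): the elapsed time is charged back to the timeout
before retrying, so an early wakeup from tsleep_nsec(9) just sleeps
again for the remainder:

	struct timespec elapsed, start, stop;
	uint64_t nsecs;

retry:
	...
	if (timeout != NULL) {
		getnanouptime(&start);
		nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
	} else
		nsecs = INFSLP;
	error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
	if (timeout != NULL) {
		getnanouptime(&stop);
		timespecsub(&stop, &start, &elapsed);
		timespecsub(timeout, &elapsed, timeout);
		if (timeout->tv_sec < 0)
			timespecclear(timeout);
	}
	if (error == 0 || error == EWOULDBLOCK)
		goto retry;	/* sleep again for whatever is left */

Note that the single sleep is capped at MAXTSLP, so the retry also
covers long timeouts, not only early wakeups.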
> Index: kern/kern_event.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_event.c,v
> retrieving revision 1.142
> diff -u -p -r1.142 kern_event.c
> --- kern/kern_event.c 12 Aug 2020 13:49:24 -0000 1.142
> +++ kern/kern_event.c 1 Oct 2020 12:53:54 -0000
> @@ -64,9 +64,6 @@ void KQREF(struct kqueue *);
> void KQRELE(struct kqueue *);
>
> int kqueue_sleep(struct kqueue *, struct timespec *);
> -int kqueue_scan(struct kqueue *kq, int maxevents,
> - struct kevent *ulistp, struct timespec *timeout,
> - struct kevent *kev, struct proc *p, int *retval);
>
> int kqueue_read(struct file *, struct uio *, int);
> int kqueue_write(struct file *, struct uio *, int);
> @@ -521,6 +518,14 @@ kqueue_alloc(struct filedesc *fdp)
> return (kq);
> }
>
> +void
> +kqueue_exit(struct proc *p)
> +{
> + kqueue_terminate(p, p->p_kq);
> + kqueue_free(p->p_kq);
> + p->p_kq = NULL;
> +}
> +
> int
> sys_kqueue(struct proc *p, void *v, register_t *retval)
> {
> @@ -554,6 +559,7 @@ out:
> int
> sys_kevent(struct proc *p, void *v, register_t *retval)
> {
> + struct kqueue_scan_state scan;
> struct filedesc* fdp = p->p_fd;
> struct sys_kevent_args /* {
> syscallarg(int) fd;
> @@ -569,6 +575,7 @@ sys_kevent(struct proc *p, void *v, regi
> struct timespec ts;
> struct timespec *tsp = NULL;
> int i, n, nerrors, error;
> + int ready, total;
> struct kevent kev[KQ_NEVENTS];
>
> if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
> @@ -597,9 +604,9 @@ sys_kevent(struct proc *p, void *v, regi
> kq = fp->f_data;
> nerrors = 0;
>
> - while (SCARG(uap, nchanges) > 0) {
> - n = SCARG(uap, nchanges) > KQ_NEVENTS ?
> - KQ_NEVENTS : SCARG(uap, nchanges);
> + while ((n = SCARG(uap, nchanges)) > 0) {
> + if (n > nitems(kev))
> + n = nitems(kev);
> error = copyin(SCARG(uap, changelist), kev,
> n * sizeof(struct kevent));
> if (error)
> @@ -635,12 +642,39 @@ sys_kevent(struct proc *p, void *v, regi
> goto done;
> }
>
> +
> KQREF(kq);
> FRELE(fp, p);
> - error = kqueue_scan(kq, SCARG(uap, nevents), SCARG(uap, eventlist),
> - tsp, kev, p, &n);
> + /*
> + * Collect as many events as we can. The timeout on successive
> + * loops is disabled (kqueue_scan() becomes non-blocking).
> + */
> + total = 0;
> + error = 0;
> + kqueue_scan_setup(&scan, kq);
> + while ((n = SCARG(uap, nevents) - total) > 0) {
> + if (n > nitems(kev))
> + n = nitems(kev);
> + ready = kqueue_scan(&scan, n, kev, tsp, p, &error);
> + if (ready == 0)
> + break;
> + error = copyout(kev, SCARG(uap, eventlist) + total,
> + sizeof(struct kevent) * ready);
> +#ifdef KTRACE
> + if (KTRPOINT(p, KTR_STRUCT))
> + ktrevent(p, kev, ready);
> +#endif
> + total += ready;
> + if (error || ready < n)
> + break;
> + tsp = &ts; /* successive loops non-blocking */
> + timespecclear(tsp);
Here, for instance.  Why force the loop to go non-blocking on the
second iteration?
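If the point is only to avoid oversleeping past the deadline, the old
accounting could be kept instead: subtract the elapsed time and stay
blocking for the remainder.  Untested sketch, assuming a `start'
timestamp taken with getnanouptime(9) before the first kqueue_scan()
call, plus `stop'/`elapsed' locals:

	if (tsp != NULL) {
		getnanouptime(&stop);
		timespecsub(&stop, &start, &elapsed);
		timespecsub(tsp, &elapsed, &ts);
		if (ts.tv_sec < 0)
			timespecclear(&ts);
		tsp = &ts;	/* block only for what is left */
	}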
> + }
> + kqueue_scan_finish(&scan);
> KQRELE(kq);
> - *retval = n;
> + if (error == EWOULDBLOCK)
> + error = 0;
> + *retval = total;
> return (error);
>
> done:
> @@ -894,24 +928,22 @@ kqueue_sleep(struct kqueue *kq, struct t
> return (error);
> }
>
> +/*
> + * Scan the kqueue, blocking if necessary until the target time is reached.
> + * If tsp is NULL we block indefinitely.  If tsp->tv_sec/tv_nsec are both
> + * 0 we do not block at all.
> + */
> int
> -kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
> - struct timespec *tsp, struct kevent *kev, struct proc *p, int *retval)
> +kqueue_scan(struct kqueue_scan_state *scan, int maxevents,
> + struct kevent *kevp, struct timespec *tsp, struct proc *p, int *errorp)
> {
> - struct kevent *kevp;
> - struct knote mend, mstart, *kn;
> - int s, count, nkev, error = 0;
> -
> - nkev = 0;
> - kevp = kev;
> + struct knote *kn;
> + struct kqueue *kq = scan->kqs_kq;
> + int s, count, nkev = 0, error = 0;
>
> count = maxevents;
> if (count == 0)
> goto done;
> -
> - memset(&mstart, 0, sizeof(mstart));
> - memset(&mend, 0, sizeof(mend));
> -
> retry:
> KASSERT(count == maxevents);
> KASSERT(nkev == 0);
> @@ -923,7 +955,8 @@ retry:
>
> s = splhigh();
> if (kq->kq_count == 0) {
> - if (tsp != NULL && !timespecisset(tsp)) {
> + if ((tsp != NULL && !timespecisset(tsp)) ||
> + scan->kqs_nevent != 0) {
> splx(s);
> error = 0;
> goto done;
> @@ -931,7 +964,7 @@ retry:
> kq->kq_state |= KQ_SLEEP;
> error = kqueue_sleep(kq, tsp);
> splx(s);
> - if (error == 0 || error == EWOULDBLOCK)
> + if (error == 0)
> goto retry;
Why wouldn't we want to retry in the EWOULDBLOCK case?  You already
have the

	tsp != NULL && !timespecisset(tsp)

check above to catch the case where the timeout has expired.
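If I read kqueue_sleep() right, it already charges the elapsed time to
*tsp, so retrying after EWOULDBLOCK would terminate through that check
instead of spinning.  Sketch of the flow (the zeroing of *tsp is from
memory, so treat that part as an assumption):

	error = kqueue_sleep(kq, tsp);	/* on timeout, *tsp is zeroed */
	splx(s);
	if (error == 0 || error == EWOULDBLOCK)
		goto retry;
	/*
	 * On the retry, kq_count is still 0 and !timespecisset(tsp),
	 * so the check above returns 0 as a clean timeout.
	 */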
> /* don't restart after signals... */
> if (error == ERESTART)
> @@ -939,27 +972,40 @@ retry:
> goto done;
> }
>
> - mstart.kn_filter = EVFILT_MARKER;
> - mstart.kn_status = KN_PROCESSING;
> - TAILQ_INSERT_HEAD(&kq->kq_head, &mstart, kn_tqe);
> - mend.kn_filter = EVFILT_MARKER;
> - mend.kn_status = KN_PROCESSING;
> - TAILQ_INSERT_TAIL(&kq->kq_head, &mend, kn_tqe);
> + /*
> + * Put the end marker in the queue to limit the scan to the events
> + * that are currently active. This prevents events from being
> + * recollected if they reactivate during scan.
> + *
> + * If a partial scan has been performed already but no events have
> + * been collected, reposition the end marker to make any new events
> + * reachable.
> + */
> + if (!scan->kqs_queued) {
> + TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
> + scan->kqs_queued = 1;
> + } else if (scan->kqs_nevent == 0) {
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
> + TAILQ_INSERT_TAIL(&kq->kq_head, &scan->kqs_end, kn_tqe);
> + }
> +
> + TAILQ_INSERT_HEAD(&kq->kq_head, &scan->kqs_start, kn_tqe);
> while (count) {
> - kn = TAILQ_NEXT(&mstart, kn_tqe);
> + kn = TAILQ_NEXT(&scan->kqs_start, kn_tqe);
> if (kn->kn_filter == EVFILT_MARKER) {
> - if (kn == &mend) {
> - TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> - TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> + if (kn == &scan->kqs_end) {
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start,
> + kn_tqe);
> splx(s);
> - if (count == maxevents)
> + if (scan->kqs_nevent == 0)
> goto retry;
> goto done;
> }
>
> /* Move start marker past another thread's marker. */
> - TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> - TAILQ_INSERT_AFTER(&kq->kq_head, kn, &mstart, kn_tqe);
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
> + TAILQ_INSERT_AFTER(&kq->kq_head, kn, &scan->kqs_start,
> + kn_tqe);
> continue;
> }
>
> @@ -987,6 +1033,12 @@ retry:
> *kevp = kn->kn_kevent;
> kevp++;
> nkev++;
> + count--;
> + scan->kqs_nevent++;
> +
> + /*
> + * Post-event action on the note
> + */
> if (kn->kn_flags & EV_ONESHOT) {
> splx(s);
> kn->kn_fop->f_detach(kn);
> @@ -1012,37 +1064,44 @@ retry:
> knote_release(kn);
> }
> kqueue_check(kq);
> - count--;
> - if (nkev == KQ_NEVENTS) {
> - splx(s);
> -#ifdef KTRACE
> - if (KTRPOINT(p, KTR_STRUCT))
> - ktrevent(p, kev, nkev);
> -#endif
> - error = copyout(kev, ulistp,
> - sizeof(struct kevent) * nkev);
> - ulistp += nkev;
> - nkev = 0;
> - kevp = kev;
> - s = splhigh();
> - if (error)
> - break;
> - }
> }
> - TAILQ_REMOVE(&kq->kq_head, &mend, kn_tqe);
> - TAILQ_REMOVE(&kq->kq_head, &mstart, kn_tqe);
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_start, kn_tqe);
> splx(s);
> + if (scan->kqs_nevent == 0)
> + goto retry;
> done:
> - if (nkev != 0) {
> -#ifdef KTRACE
> - if (KTRPOINT(p, KTR_STRUCT))
> - ktrevent(p, kev, nkev);
> -#endif
> - error = copyout(kev, ulistp,
> - sizeof(struct kevent) * nkev);
> + *errorp = error;
> + return (nkev);
> +}
> +
> +void
> +kqueue_scan_setup(struct kqueue_scan_state *scan, struct kqueue *kq)
> +{
> + memset(scan, 0, sizeof(*scan));
> + scan->kqs_kq = kq;
> + scan->kqs_start.kn_filter = EVFILT_MARKER;
> + scan->kqs_start.kn_status = KN_PROCESSING;
> + scan->kqs_end.kn_filter = EVFILT_MARKER;
> + scan->kqs_end.kn_status = KN_PROCESSING;
> +}
> +
> +void
> +kqueue_scan_finish(struct kqueue_scan_state *scan)
> +{
> + struct kqueue *kq = scan->kqs_kq;
> + int s;
> +
> + KASSERT(scan->kqs_start.kn_filter == EVFILT_MARKER);
> + KASSERT(scan->kqs_start.kn_status == KN_PROCESSING);
> + KASSERT(scan->kqs_end.kn_filter == EVFILT_MARKER);
> + KASSERT(scan->kqs_end.kn_status == KN_PROCESSING);
> +
> + if (scan->kqs_queued) {
> + scan->kqs_queued = 0;
> + s = splhigh();
> + TAILQ_REMOVE(&kq->kq_head, &scan->kqs_end, kn_tqe);
> + splx(s);
> }
> - *retval = maxevents - count;
> - return (error);
> }
>
> /*
> @@ -1099,7 +1158,7 @@ kqueue_stat(struct file *fp, struct stat
> }
>
> void
> -kqueue_terminate(struct proc *p, struct kqueue *kq)
> +kqueue_purge(struct proc *p, struct kqueue *kq)
> {
> int i;
>
> @@ -1111,6 +1170,12 @@ kqueue_terminate(struct proc *p, struct
> for (i = 0; i < kq->kq_knhashmask + 1; i++)
> knote_remove(p, &kq->kq_knhash[i]);
> }
> +}
> +
> +void
> +kqueue_terminate(struct proc *p, struct kqueue *kq)
> +{
> + kqueue_purge(p, kq);
> kq->kq_state |= KQ_DYING;
> kqueue_wakeup(kq);
>
> Index: kern/kern_exit.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_exit.c,v
> retrieving revision 1.188
> diff -u -p -r1.188 kern_exit.c
> --- kern/kern_exit.c 18 Mar 2020 15:48:21 -0000 1.188
> +++ kern/kern_exit.c 1 Oct 2020 12:53:54 -0000
> @@ -184,6 +184,8 @@ exit1(struct proc *p, int xexit, int xsi
> if ((p->p_flag & P_THREAD) == 0)
> pr->ps_siglist = 0;
>
> + kqueue_exit(p);
> +
> #if NKCOV > 0
> kcov_exit(p);
> #endif
> Index: kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> retrieving revision 1.225
> diff -u -p -r1.225 kern_fork.c
> --- kern/kern_fork.c 20 Mar 2020 08:14:07 -0000 1.225
> +++ kern/kern_fork.c 1 Oct 2020 13:04:36 -0000
> @@ -422,6 +422,8 @@ fork1(struct proc *curp, int flags, void
> newptstat = malloc(sizeof(*newptstat), M_SUBPROC, M_WAITOK);
>
> p->p_tid = alloctid();
> + p->p_kq = kqueue_alloc(p->p_fd);
> + p->p_kq_serial = arc4random();
>
> LIST_INSERT_HEAD(&allproc, p, p_list);
> LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
> @@ -553,6 +555,8 @@ thread_fork(struct proc *curp, void *sta
> cpu_fork(curp, p, stack, tcb, child_return, p);
>
> p->p_tid = alloctid();
> + p->p_kq = kqueue_alloc(p->p_fd);
> + p->p_kq_serial = arc4random();
>
> LIST_INSERT_HEAD(&allproc, p, p_list);
> LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
> Index: kern/sys_generic.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sys_generic.c,v
> retrieving revision 1.131
> diff -u -p -r1.131 sys_generic.c
> --- kern/sys_generic.c 20 Mar 2020 04:11:05 -0000 1.131
> +++ kern/sys_generic.c 2 Oct 2020 09:20:59 -0000
> @@ -55,6 +55,7 @@
> #include <sys/time.h>
> #include <sys/malloc.h>
> #include <sys/poll.h>
> +#include <sys/eventvar.h>
> #ifdef KTRACE
> #include <sys/ktrace.h>
> #endif
> @@ -66,8 +67,20 @@
>
> #include <uvm/uvm_extern.h>
>
> -int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
> -void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
> +/*
> + * Debug values:
> + * 1 - print implementation errors, things that should not happen.
> + * 2 - print ppoll(2) information, somewhat verbose
> + * 3 - print pselect(2) and ppoll(2) information, very verbose
> + */
> +int kqpoll_debug = 0;
> +#define DPRINTFN(v, x...) if (kqpoll_debug > v) { \
> + printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
> + printf(x); \
> +}
> +int pselregister(struct proc *, fd_set *, int, int, int *);
> +int pselcollect(struct proc *, struct kevent *, fd_set *[]);
> +
> int pollout(struct pollfd *, struct pollfd *, u_int);
> int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
> struct timespec *, const sigset_t *, register_t *);
> @@ -584,11 +597,11 @@ int
> dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
> struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
> {
> + struct kqueue_scan_state scan;
> fd_mask bits[6];
> fd_set *pibits[3], *pobits[3];
> - struct timespec elapsed, start, stop;
> - uint64_t nsecs;
> - int s, ncoll, error = 0;
> + struct timespec ts;
> + int error, nevents = 0;
> u_int ni;
>
> if (nd < 0)
> @@ -636,43 +649,61 @@ dopselect(struct proc *p, int nd, fd_set
> if (sigmask)
> dosigsuspend(p, *sigmask &~ sigcantmask);
>
> -retry:
> - ncoll = nselcoll;
> - atomic_setbits_int(&p->p_flag, P_SELECT);
> - error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
> - if (error || *retval)
> + /* Register kqueue events */
> + if ((error = pselregister(p, pibits[0], nd, ni, &nevents)) != 0)
> goto done;
> - if (timeout == NULL || timespecisset(timeout)) {
> - if (timeout != NULL) {
> - getnanouptime(&start);
> - nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
> - } else
> - nsecs = INFSLP;
> - s = splhigh();
> - if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
> - splx(s);
> - goto retry;
> - }
> - atomic_clearbits_int(&p->p_flag, P_SELECT);
> - error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
> - splx(s);
> - if (timeout != NULL) {
> - getnanouptime(&stop);
> - timespecsub(&stop, &start, &elapsed);
> - timespecsub(timeout, &elapsed, timeout);
> - if (timeout->tv_sec < 0)
> - timespecclear(timeout);
> - }
> - if (error == 0 || error == EWOULDBLOCK)
> - goto retry;
> +
> + /*
> + * The poll/select family of syscalls has been designed to
> + * block when file descriptors are not available, even if
> + * there's nothing to wait for.
> + */
> + if (nevents == 0) {
> + uint64_t nsecs = INFSLP;
> +
> + if (timeout != NULL)
> + nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
> +
> + error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
> + /* select is not restarted after signals... */
> + if (error == ERESTART)
> + error = EINTR;
> + }
Aside: couldn't the new logic (below) handle the nevents == 0 case as
well?  That is, what happens if we enter kqueue_scan() with count == 0?
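For reference, count == 0 short-circuits in the new kqueue_scan()
above, before the end marker is queued or any sleep can happen:

	count = maxevents;
	if (count == 0)
		goto done;	/* returns nkev == 0 without blocking */

So as written the scan itself can't block for this case.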
> +
> + /* Collect at most `nevents' possibly waiting in kqueue_scan() */
> + kqueue_scan_setup(&scan, p->p_kq);
> + while (nevents > 0) {
> + struct kevent kev[KQ_NEVENTS];
> + int i, ready, count;
> +
> + /* Maximum number of events per iteration */
> + count = MIN(nitems(kev), nevents);
> + ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
> +#ifdef KTRACE
> + if (KTRPOINT(p, KTR_STRUCT))
> + ktrevent(p, kev, ready);
> +#endif
> + /* Convert back events that are ready. */
> + for (i = 0; i < ready; i++)
> + *retval += pselcollect(p, &kev[i], pobits);
> +
> + /*
> + * Stop if there was an error or if we had enough
> + * room to collect all the events that were ready.
> + */
> + if (error || ready < count)
> + break;
> +
> + timeout = &ts; /* successive loops non-blocking */
> + timespecclear(timeout);
... and again, it looks like we now truncate the timeout instead of
resleeping for the remainder.
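Same remedy as in my sys_kevent() sketch above: recompute the remainder
so later iterations can still block.  Untested, assumes a `start'
timestamp taken before the collection loop and `stop'/`elapsed' locals;
the `ts' local already exists in dopselect():

	if (timeout != NULL) {
		getnanouptime(&stop);
		timespecsub(&stop, &start, &elapsed);
		timespecsub(timeout, &elapsed, &ts);
		if (ts.tv_sec < 0)
			timespecclear(&ts);
		timeout = &ts;	/* block only for what is left */
	}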