On 12/12/20(Sat) 11:29, Visa Hankala wrote:
> On Fri, Dec 11, 2020 at 09:35:59AM -0300, Martin Pieuchot wrote:
> > On 10/12/20(Thu) 09:59, Martin Pieuchot wrote:
> > > All previous kqueue refactoring have been committed, here's a final diff
> > > to modify the internal implementation of {p,}select(2) to query kqfilter
> > > handlers instead of poll ones.
> > >
> > > {p,}poll(2) are left untouched to ease the transition.
> > >
> > > Here's what I said in the original mail back in May [0]:
> > >
> > > > The main argument for this proposal is to reduce the amount of code
> > > > executed to notify userland when an event occur. The outcome of this
> > > > diff is that a single notification subsystem needs to be taken out of
> > > > the KERNEL_LOCK(). This simplifies a lot existing locking tentacles.
> > > >
> > > > Using kqueue internally means collision is avoided and there's no need
> > > > to query handlers for fds that aren't ready. This comes at the cost of
> > > > allocating descriptors. A space vs time trade-off. Note that this cost
> > > > can be diminished by doing lazy removal of event descriptors to be able
> > > > to re-use them.
> > >
> > > The logic is as follow:
> > >
> > > - With this change every thread use a "private" kqueue, usable only by
> > > the kernel, to register events for select(2) and later poll(2).
> > >
> > > - Events specified via FD_SET(2) are converted to their kqueue equivalent.
> > > ktrace(1) now also outputs converted events to ease debugging.
> > >
> > > - kqueue_scan() might be called multiple times per syscall, just like with
> > > the last version of kevent(2).
> > >
> > > - At the end of every {p,}select(2) syscall the private kqueue is purged.
> > >
> > > [0] https://marc.info/?l=openbsd-tech&m=158979921322191&w=2
> >
> > Updated diff that adds a FALLTHOUGHT lost in refactoring and do not
> > block in if the timeout is cleared and no events are requested, pointed
> > by cheloha@:
Updated diff below.
> > +/*
> > + * Convert fd_set into kqueue events and register them on the
> > + * per-thread queue.
> > + */
> > int
> > -selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
> > - register_t *retval)
> > +pselregister(struct proc *p, fd_set *ibits, int nfd, int ni, int
> > *nregistered)
>
> I think pselregister() should take the bitvector similarly to
> pselcollect(). Then the function could skip some pointer arithmetics
> as well.
>
> int pselregister(struct proc *p, fd_set *pibits[3], int nfd, int
> *nregistered);
I don't mind but did not include it in the diff, this seem like an extra
refactoring to me and I'm trying to change few lines as possible.
> > +/*
> > + * Convert given kqueue event into corresponding select(2) bit.
> > + */
> > +int
> > +pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
> > +{
> > +#ifdef DIAGNOSTIC
> > + /* Filter out and lazily delete spurious events */
> > + if ((unsigned long)kevp->udata != p->p_kq_serial) {
> > + DPRINTFN(0, "select fd %u mismatched serial %lu\n",
> > + (int)kevp->ident, p->p_kq_serial);
> > + kevp->flags = EV_DISABLE|EV_DELETE;
> > + kqueue_register(p->p_kq, kevp, p);
> > + return (0);
> > + }
> > +#endif
>
> Shouldn't skipping of spurious events be taken into account in the
> main scan loop? Otherwise they may cause artifacts, such as premature
> timeout expiry when a wakeup is caused solely by a spurious event, or
> return of an incomplete result when spurious events eat capacity from
> nevents.
This sounds like an later improvement to me and I'd like to discuss it
separately.
Index: kern/sys_generic.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.132
diff -u -p -r1.132 sys_generic.c
--- kern/sys_generic.c 2 Oct 2020 15:45:22 -0000 1.132
+++ kern/sys_generic.c 15 Dec 2020 10:36:28 -0000
@@ -55,6 +55,7 @@
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
+#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
@@ -66,8 +67,21 @@
#include <uvm/uvm_extern.h>
-int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
-void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
+/*
+ * Debug values:
+ * 1 - print implementation errors, things that should not happen.
+ * 2 - print ppoll(2) information, somewhat verbose
+ * 3 - print pselect(2) and ppoll(2) information, very verbose
+ */
+int kqpoll_debug = 0;
+#define DPRINTFN(v, x...) if (kqpoll_debug > v) { \
+ printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
+ printf(x); \
+}
+
+int pselregister(struct proc *, fd_set *, int, int, int *);
+int pselcollect(struct proc *, struct kevent *, fd_set *[]);
+
int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
struct timespec *, const sigset_t *, register_t *);
@@ -584,11 +598,10 @@ int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
+ struct kqueue_scan_state scan;
fd_mask bits[6];
fd_set *pibits[3], *pobits[3];
- struct timespec elapsed, start, stop;
- uint64_t nsecs;
- int s, ncoll, error = 0;
+ int error, nevents = 0;
u_int ni;
if (nd < 0)
@@ -618,6 +631,8 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)&bits[5];
}
+ kqpoll_init();
+
#define getbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
goto done;
@@ -636,43 +651,59 @@ dopselect(struct proc *p, int nd, fd_set
if (sigmask)
dosigsuspend(p, *sigmask &~ sigcantmask);
-retry:
- ncoll = nselcoll;
- atomic_setbits_int(&p->p_flag, P_SELECT);
- error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
- if (error || *retval)
+ /* Register kqueue events */
+ if ((error = pselregister(p, pibits[0], nd, ni, &nevents) != 0))
goto done;
- if (timeout == NULL || timespecisset(timeout)) {
- if (timeout != NULL) {
- getnanouptime(&start);
- nsecs = MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP);
- } else
- nsecs = INFSLP;
- s = splhigh();
- if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
- splx(s);
- goto retry;
- }
- atomic_clearbits_int(&p->p_flag, P_SELECT);
- error = tsleep_nsec(&selwait, PSOCK | PCATCH, "select", nsecs);
- splx(s);
+
+ /*
+ * The poll/select family of syscalls has been designed to
+ * block when file descriptors are not available, even if
+ * there's nothing to wait for.
+ */
+ if (nevents == 0) {
+ uint64_t nsecs = INFSLP;
+
if (timeout != NULL) {
- getnanouptime(&stop);
- timespecsub(&stop, &start, &elapsed);
- timespecsub(timeout, &elapsed, timeout);
- if (timeout->tv_sec < 0)
- timespecclear(timeout);
+ if (!timespecisset(timeout))
+ goto done;
+ nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
}
- if (error == 0 || error == EWOULDBLOCK)
- goto retry;
+ error = tsleep_nsec(&p->p_kq, PSOCK | PCATCH, "kqsel", nsecs);
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ goto done;
+ }
+
+ /* Collect at most `nevents' possibly waiting in kqueue_scan() */
+ kqueue_scan_setup(&scan, p->p_kq);
+ while (nevents > 0) {
+ struct kevent kev[KQ_NEVENTS];
+ int i, ready, count;
+
+ /* Maxium number of events per iteration */
+ count = MIN(nitems(kev), nevents);
+ ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, kev, ready);
+#endif
+ /* Convert back events that are ready. */
+ for (i = 0; i < ready; i++)
+ *retval += pselcollect(p, &kev[i], pobits);
+ /*
+ * Stop if there was an error or if we had enough
+ * space to collect all events that were ready.
+ */
+ if (error || ready < count)
+ break;
+
+ nevents -= ready;
}
-done:
- atomic_clearbits_int(&p->p_flag, P_SELECT);
- /* select is not restarted after signals... */
- if (error == ERESTART)
- error = EINTR;
- if (error == EWOULDBLOCK)
- error = 0;
+ kqueue_scan_finish(&scan);
+ done:
#define putbits(name, x) \
if (name && (error2 = copyout(pobits[x], name, ni))) \
error = error2;
@@ -694,41 +725,102 @@ done:
if (pibits[0] != (fd_set *)&bits[0])
free(pibits[0], M_TEMP, 6 * ni);
+
+ kqueue_purge(p, p->p_kq);
+ p->p_kq_serial += nd;
+
return (error);
}
+/*
+ * Convert fd_set into kqueue events and register them on the
+ * per-thread queue.
+ */
int
-selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
- register_t *retval)
+pselregister(struct proc *p, fd_set *ibits, int nfd, int ni, int *nregistered)
{
- caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
- struct filedesc *fdp = p->p_fd;
- int msk, i, j, fd;
+ static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
+ static const int evff[] = { 0, 0, NOTE_OOB };
+ caddr_t cibits = (caddr_t)ibits;
+ int msk, i, j, fd, nevents = 0, error = 0;
+ struct kevent kev;
fd_mask bits;
- struct file *fp;
- int n = 0;
- static const int flag[3] = { POLLIN, POLLOUT|POLL_NOHUP, POLLPRI };
for (msk = 0; msk < 3; msk++) {
fd_set *pibits = (fd_set *)&cibits[msk*ni];
- fd_set *pobits = (fd_set *)&cobits[msk*ni];
for (i = 0; i < nfd; i += NFDBITS) {
bits = pibits->fds_bits[i/NFDBITS];
while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
bits &= ~(1 << j);
- if ((fp = fd_getfile(fdp, fd)) == NULL)
- return (EBADF);
- if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
- FD_SET(fd, pobits);
- n++;
+
+ DPRINTFN(2, "select fd %d mask %d serial %lu\n",
+ fd, msk, p->p_kq_serial);
+ EV_SET(&kev, fd, evf[msk],
+ EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL,
+ evff[msk], 0, (void *)(p->p_kq_serial));
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_STRUCT))
+ ktrevent(p, &kev, 1);
+#endif
+ error = kqueue_register(p->p_kq, &kev, p);
+ switch (error) {
+ case 0:
+ nevents++;
+ /* FALLTHROUGH */
+ case EOPNOTSUPP:/* No underlying kqfilter */
+ case EINVAL: /* Unimplemented filter */
+ error = 0;
+ break;
+ case ENXIO: /* Device has been detached */
+ default:
+ goto bad;
}
- FRELE(fp, p);
}
}
}
- *retval = n;
+
+ *nregistered = nevents;
return (0);
+bad:
+ DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
+ kev.filter, error);
+ return (error);
+}
+
+/*
+ * Convert given kqueue event into corresponding select(2) bit.
+ */
+int
+pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3])
+{
+#ifdef DIAGNOSTIC
+ /* Filter out and lazily delete spurious events */
+ if ((unsigned long)kevp->udata != p->p_kq_serial) {
+ DPRINTFN(0, "select fd %u mismatched serial %lu\n",
+ (int)kevp->ident, p->p_kq_serial);
+ kevp->flags = EV_DISABLE|EV_DELETE;
+ kqueue_register(p->p_kq, kevp, p);
+ return (0);
+ }
+#endif
+
+ switch (kevp->filter) {
+ case EVFILT_READ:
+ FD_SET(kevp->ident, pobits[0]);
+ break;
+ case EVFILT_WRITE:
+ FD_SET(kevp->ident, pobits[1]);
+ break;
+ case EVFILT_EXCEPT:
+ FD_SET(kevp->ident, pobits[2]);
+ break;
+ default:
+ KASSERT(0);
+ }
+
+ DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
+ return (1);
}
int