This diff adds f_modify and f_process callbacks to socket event filters.
As a result, socket events are handled using the non-legacy paths in
filter_modify() and filter_process() of kern_event.c This a step toward
MP-safety. However, everything still runs under the kernel lock.

The change has three intended effects:

* Socket events are handled without raising the system priority level.
  This makes the activity observable with btrace(8).

* kqueue itself no longer calls f_event of socket filterops, which
  allows replacing the conditional, NOTE_SUBMIT-based locking with
  a fixed call pattern.

* The state of a socket event is now always rechecked before delivery
  to user. Before, the recheck was skipped if the event was registered
  with EV_ONESHOT.

However, the change of behaviour with EV_ONESHOT is questionable.
When an activated event is being processed, the code will acquire the
socket lock anyway. Skipping the state check would only be a minor
optimization. In addition, I think the behaviour becomes more
consistent as now a delivered EV_ONESHOT event really was active at
the time of delivery.

Consider the following program. It creates a socket pair, writes a byte
to the socket, registers an EV_ONESHOT event, and reads the byte from
the socket. Next it checks how kevent(2) behaves.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct kevent kev[1];
        struct timespec ts = {};
        int fds[2], flags, kq, n;
        char b;

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1)
                err(1, "socketpair");
        flags = fcntl(fds[0], F_GETFL, 0);
        fcntl(fds[0], F_SETFL, flags | O_NONBLOCK);

        printf("write 1\n");
        write(fds[1], "x", 1);

        kq = kqueue();
        if (kq == -1)
                err(1, "kqueue");
        EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
        if (kevent(kq, kev, 1, NULL, 0, NULL) == -1)
                err(1, "kevent");

        n = read(fds[0], &b, 1);
        printf("read %d\n", n);
        n = read(fds[0], &b, 1);
        printf("read %d\n", n);

        n = kevent(kq, NULL, 0, kev, 1, &ts);
        printf("kevent %d\n", n);
        n = read(fds[0], &b, 1);
        printf("read %d\n", n);

        n = kevent(kq, NULL, 0, kev, 1, &ts);
        printf("kevent %d\n", n);

        printf("write 1\n");
        write(fds[1], "x", 1);

        n = kevent(kq, NULL, 0, kev, 1, &ts);
        printf("kevent %d\n", n);
        n = read(fds[0], &b, 1);
        printf("read %d\n", n);

        return 0;
}

With an unpatched kernel, the EV_ONESHOT event gets activated by the
pending byte when the event is registered. The event remains active
until delivery, and the delivery happens even though it is clear that
reading from the socket will fail. The program prints:

write 1
read 1
read -1
kevent 1
read -1
kevent 0
write 1
kevent 0
read 1

With the patch applied, the event gets delivered only if the socket
has bytes pending.

write 1
read 1
read -1
kevent 0
read -1
kevent 0
write 1
kevent 1
read 1

So, is this EV_ONESHOT change reasonable, or should the implementation
stick with the old way? FreeBSD appears to follow the old way. MacOS
might perform differently, though I am not sure about that.

It is not essential to change EV_ONESHOT, however.

Feedback and tests are welcome.

Index: kern/uipc_socket.c
===================================================================
RCS file: src/sys/kern/uipc_socket.c,v
retrieving revision 1.261
diff -u -p -r1.261 uipc_socket.c
--- kern/uipc_socket.c  13 May 2021 19:43:11 -0000      1.261
+++ kern/uipc_socket.c  18 May 2021 12:56:24 -0000
@@ -70,15 +70,26 @@ void        sorflush(struct socket *);
 
 void   filt_sordetach(struct knote *kn);
 int    filt_soread(struct knote *kn, long hint);
+int    filt_soreadmodify(struct kevent *kev, struct knote *kn);
+int    filt_soreadprocess(struct knote *kn, struct kevent *kev);
+int    filt_soread_common(struct knote *kn, struct socket *so);
 void   filt_sowdetach(struct knote *kn);
 int    filt_sowrite(struct knote *kn, long hint);
+int    filt_sowritemodify(struct kevent *kev, struct knote *kn);
+int    filt_sowriteprocess(struct knote *kn, struct kevent *kev);
+int    filt_sowrite_common(struct knote *kn, struct socket *so);
 int    filt_solisten(struct knote *kn, long hint);
+int    filt_solistenmodify(struct kevent *kev, struct knote *kn);
+int    filt_solistenprocess(struct knote *kn, struct kevent *kev);
+int    filt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
        .f_flags        = FILTEROP_ISFD,
        .f_attach       = NULL,
        .f_detach       = filt_sordetach,
        .f_event        = filt_solisten,
+       .f_modify       = filt_solistenmodify,
+       .f_process      = filt_solistenprocess,
 };
 
 const struct filterops soread_filtops = {
@@ -86,6 +97,8 @@ const struct filterops soread_filtops = 
        .f_attach       = NULL,
        .f_detach       = filt_sordetach,
        .f_event        = filt_soread,
+       .f_modify       = filt_soreadmodify,
+       .f_process      = filt_soreadprocess,
 };
 
 const struct filterops sowrite_filtops = {
@@ -93,6 +106,8 @@ const struct filterops sowrite_filtops =
        .f_attach       = NULL,
        .f_detach       = filt_sowdetach,
        .f_event        = filt_sowrite,
+       .f_modify       = filt_sowritemodify,
+       .f_process      = filt_sowriteprocess,
 };
 
 const struct filterops soexcept_filtops = {
@@ -100,6 +115,8 @@ const struct filterops soexcept_filtops 
        .f_attach       = NULL,
        .f_detach       = filt_sordetach,
        .f_event        = filt_soread,
+       .f_modify       = filt_soreadmodify,
+       .f_process      = filt_soreadprocess,
 };
 
 #ifndef SOMINCONN
@@ -2056,13 +2073,12 @@ filt_sordetach(struct knote *kn)
 }
 
 int
-filt_soread(struct knote *kn, long hint)
+filt_soread_common(struct knote *kn, struct socket *so)
 {
-       struct socket *so = kn->kn_fp->f_data;
-       int s, rv = 0;
+       int rv = 0;
+
+       soassertlocked(so);
 
-       if ((hint & NOTE_SUBMIT) == 0)
-               s = solock(so);
        kn->kn_data = so->so_rcv.sb_cc;
 #ifdef SOCKET_SPLICE
        if (isspliced(so)) {
@@ -2090,12 +2106,47 @@ filt_soread(struct knote *kn, long hint)
        } else {
                rv = (kn->kn_data >= so->so_rcv.sb_lowat);
        }
-       if ((hint & NOTE_SUBMIT) == 0)
-               sounlock(so, s);
 
        return rv;
 }
 
+int
+filt_soread(struct knote *kn, long hint)
+{
+       struct socket *so = kn->kn_fp->f_data;
+
+       return (filt_soread_common(kn, so));
+}
+
+int
+filt_soreadmodify(struct kevent *kev, struct knote *kn)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int rv, s;
+
+       s = solock(so);
+       knote_modify(kev, kn);
+       rv = filt_soread_common(kn, so);
+       sounlock(so, s);
+
+       return (rv);
+}
+
+int
+filt_soreadprocess(struct knote *kn, struct kevent *kev)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int rv, s;
+
+       s = solock(so);
+       rv = filt_soread_common(kn, so);
+       if (rv != 0)
+               knote_submit(kn, kev);
+       sounlock(so, s);
+
+       return (rv);
+}
+
 void
 filt_sowdetach(struct knote *kn)
 {
@@ -2107,13 +2158,12 @@ filt_sowdetach(struct knote *kn)
 }
 
 int
-filt_sowrite(struct knote *kn, long hint)
+filt_sowrite_common(struct knote *kn, struct socket *so)
 {
-       struct socket *so = kn->kn_fp->f_data;
-       int s, rv;
+       int rv;
+
+       soassertlocked(so);
 
-       if ((hint & NOTE_SUBMIT) == 0)
-               s = solock(so);
        kn->kn_data = sbspace(so, &so->so_snd);
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
@@ -2133,27 +2183,94 @@ filt_sowrite(struct knote *kn, long hint
        } else {
                rv = (kn->kn_data >= so->so_snd.sb_lowat);
        }
-       if ((hint & NOTE_SUBMIT) == 0)
-               sounlock(so, s);
 
        return (rv);
 }
 
 int
-filt_solisten(struct knote *kn, long hint)
+filt_sowrite(struct knote *kn, long hint)
 {
        struct socket *so = kn->kn_fp->f_data;
-       int s;
 
-       if ((hint & NOTE_SUBMIT) == 0)
-               s = solock(so);
+       return (filt_sowrite_common(kn, so));
+}
+
+int
+filt_sowritemodify(struct kevent *kev, struct knote *kn)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int rv, s;
+
+       s = solock(so);
+       knote_modify(kev, kn);
+       rv = filt_sowrite_common(kn, so);
+       sounlock(so, s);
+
+       return (rv);
+}
+
+int
+filt_sowriteprocess(struct knote *kn, struct kevent *kev)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int rv, s;
+
+       s = solock(so);
+       rv = filt_sowrite_common(kn, so);
+       if (rv != 0)
+               knote_submit(kn, kev);
+       sounlock(so, s);
+
+       return (rv);
+}
+
+int
+filt_solisten_common(struct knote *kn, struct socket *so)
+{
+       soassertlocked(so);
+
        kn->kn_data = so->so_qlen;
-       if ((hint & NOTE_SUBMIT) == 0)
-               sounlock(so, s);
 
        return (kn->kn_data != 0);
 }
 
+int
+filt_solisten(struct knote *kn, long hint)
+{
+       struct socket *so = kn->kn_fp->f_data;
+
+       return (filt_solisten_common(kn, so));
+}
+
+int
+filt_solistenmodify(struct kevent *kev, struct knote *kn)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int rv, s;
+
+       s = solock(so);
+       knote_modify(kev, kn);
+       rv = filt_solisten_common(kn, so);
+       sounlock(so, s);
+
+       return (rv);
+}
+
+int
+filt_solistenprocess(struct knote *kn, struct kevent *kev)
+{
+       struct socket *so = kn->kn_fp->f_data;
+       int rv, s;
+
+       s = solock(so);
+       rv = filt_solisten_common(kn, so);
+       if (rv != 0)
+               knote_submit(kn, kev);
+       sounlock(so, s);
+
+       return (rv);
+}
+
 #ifdef DDB
 void
 sobuf_print(struct sockbuf *,
Index: kern/uipc_syscalls.c
===================================================================
RCS file: src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.190
diff -u -p -r1.190 uipc_syscalls.c
--- kern/uipc_syscalls.c        13 May 2021 17:31:59 -0000      1.190
+++ kern/uipc_syscalls.c        18 May 2021 12:56:24 -0000
@@ -308,7 +308,7 @@ doaccept(struct proc *p, int sock, struc
            : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
 
        /* connection has been removed from the listen queue */
-       KNOTE(&head->so_rcv.sb_sel.si_note, NOTE_SUBMIT);
+       KNOTE(&head->so_rcv.sb_sel.si_note, 0);
 
        fp->f_type = DTYPE_SOCKET;
        fp->f_flag = FREAD | FWRITE | nflag;

Reply via email to