This diff adds f_modify and f_process callbacks to socket event filters. As a result, socket events are handled using the non-legacy paths in filter_modify() and filter_process() of kern_event.c This a step toward MP-safety. However, everything still runs under the kernel lock.
The change has three intended effects: * Socket events are handled without raising the system priority level. This makes the activity observable with btrace(8). * kqueue itself no longer calls f_event of socket filterops, which allows replacing the conditional, NOTE_SUBMIT-based locking with a fixed call pattern. * The state of a socket event is now always rechecked before delivery to user. Before, the recheck was skipped if the event was registered with EV_ONESHOT. However, the change of behaviour with EV_ONESHOT is questionable. When an activated event is being processed, the code will acquire the socket lock anyway. Skipping the state check would only be a minor optimization. In addition, I think the behaviour becomes more consistent as now a delivered EV_ONESHOT event really was active at the time of delivery. Consider the following program. It creates a socket pair, writes a byte to the socket, registers an EV_ONESHOT event, and reads the byte from the socket. Next it checks how kevent(2) behaves. #include <sys/types.h> #include <sys/event.h> #include <sys/socket.h> #include <sys/time.h> #include <err.h> #include <fcntl.h> #include <stdio.h> #include <unistd.h> int main(void) { struct kevent kev[1]; struct timespec ts = {}; int fds[2], flags, kq, n; char b; if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1) err(1, "socketpair"); flags = fcntl(fds[0], F_GETFL, 0); fcntl(fds[0], F_SETFL, flags | O_NONBLOCK); printf("write 1\n"); write(fds[1], "x", 1); kq = kqueue(); if (kq == -1) err(1, "kqueue"); EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL); if (kevent(kq, kev, 1, NULL, 0, NULL) == -1) err(1, "kevent"); n = read(fds[0], &b, 1); printf("read %d\n", n); n = read(fds[0], &b, 1); printf("read %d\n", n); n = kevent(kq, NULL, 0, kev, 1, &ts); printf("kevent %d\n", n); n = read(fds[0], &b, 1); printf("read %d\n", n); n = kevent(kq, NULL, 0, kev, 1, &ts); printf("kevent %d\n", n); printf("write 1\n"); write(fds[1], "x", 1); n = kevent(kq, NULL, 0, kev, 1, &ts); printf("kevent %d\n", n); n = read(fds[0], &b, 1); printf("read %d\n", n); return 0; } With an unpatched kernel, the EV_ONESHOT event gets activated by the pending byte when the event is registered. The event remains active until delivery, and the delivery happens even though it is clear that reading from the socket will fail. The program prints: write 1 read 1 read -1 kevent 1 read -1 kevent 0 write 1 kevent 0 read 1 With the patch applied, the event gets delivered only if the socket has bytes pending. write 1 read 1 read -1 kevent 0 read -1 kevent 0 write 1 kevent 1 read 1 So, is this EV_ONESHOT change reasonable, or should the implementation stick with the old way? FreeBSD appears to follow the old way. MacOS might perform differently, though I am not sure about that. It is not essential to change EV_ONESHOT, however. Feedback and tests are welcome. Index: kern/uipc_socket.c =================================================================== RCS file: src/sys/kern/uipc_socket.c,v retrieving revision 1.261 diff -u -p -r1.261 uipc_socket.c --- kern/uipc_socket.c 13 May 2021 19:43:11 -0000 1.261 +++ kern/uipc_socket.c 18 May 2021 12:56:24 -0000 @@ -70,15 +70,26 @@ void sorflush(struct socket *); void filt_sordetach(struct knote *kn); int filt_soread(struct knote *kn, long hint); +int filt_soreadmodify(struct kevent *kev, struct knote *kn); +int filt_soreadprocess(struct knote *kn, struct kevent *kev); +int filt_soread_common(struct knote *kn, struct socket *so); void filt_sowdetach(struct knote *kn); int filt_sowrite(struct knote *kn, long hint); +int filt_sowritemodify(struct kevent *kev, struct knote *kn); +int filt_sowriteprocess(struct knote *kn, struct kevent *kev); +int filt_sowrite_common(struct knote *kn, struct socket *so); int filt_solisten(struct knote *kn, long hint); +int filt_solistenmodify(struct kevent *kev, struct knote *kn); +int filt_solistenprocess(struct knote *kn, struct kevent *kev); +int filt_solisten_common(struct knote *kn, struct socket *so); const struct filterops solisten_filtops = { .f_flags = FILTEROP_ISFD, .f_attach = NULL, .f_detach = filt_sordetach, .f_event = filt_solisten, + .f_modify = filt_solistenmodify, + .f_process = filt_solistenprocess, }; const struct filterops soread_filtops = { @@ -86,6 +97,8 @@ const struct filterops soread_filtops = .f_attach = NULL, .f_detach = filt_sordetach, .f_event = filt_soread, + .f_modify = filt_soreadmodify, + .f_process = filt_soreadprocess, }; const struct filterops sowrite_filtops = { @@ -93,6 +106,8 @@ const struct filterops sowrite_filtops = .f_attach = NULL, .f_detach = filt_sowdetach, .f_event = filt_sowrite, + .f_modify = filt_sowritemodify, + .f_process = filt_sowriteprocess, }; const struct filterops soexcept_filtops = { @@ -100,6 +115,8 @@ const struct filterops soexcept_filtops .f_attach = NULL, .f_detach = filt_sordetach, .f_event = filt_soread, + .f_modify = filt_soreadmodify, + .f_process = filt_soreadprocess, }; #ifndef SOMINCONN @@ -2056,13 +2073,12 @@ filt_sordetach(struct knote *kn) } int -filt_soread(struct knote *kn, long hint) +filt_soread_common(struct knote *kn, struct socket *so) { - struct socket *so = kn->kn_fp->f_data; - int s, rv = 0; + int rv = 0; + + soassertlocked(so); - if ((hint & NOTE_SUBMIT) == 0) - s = solock(so); kn->kn_data = so->so_rcv.sb_cc; #ifdef SOCKET_SPLICE if (isspliced(so)) { @@ -2090,12 +2106,47 @@ filt_soread(struct knote *kn, long hint) } else { rv = (kn->kn_data >= so->so_rcv.sb_lowat); } - if ((hint & NOTE_SUBMIT) == 0) - sounlock(so, s); return rv; } +int +filt_soread(struct knote *kn, long hint) +{ + struct socket *so = kn->kn_fp->f_data; + + return (filt_soread_common(kn, so)); +} + +int +filt_soreadmodify(struct kevent *kev, struct knote *kn) +{ + struct socket *so = kn->kn_fp->f_data; + int rv, s; + + s = solock(so); + knote_modify(kev, kn); + rv = filt_soread_common(kn, so); + sounlock(so, s); + + return (rv); +} + +int +filt_soreadprocess(struct knote *kn, struct kevent *kev) +{ + struct socket *so = kn->kn_fp->f_data; + int rv, s; + + s = solock(so); + rv = filt_soread_common(kn, so); + if (rv != 0) + knote_submit(kn, kev); + sounlock(so, s); + + return (rv); +} + void filt_sowdetach(struct knote *kn) { @@ -2107,13 +2158,12 @@ filt_sowdetach(struct knote *kn) } int -filt_sowrite(struct knote *kn, long hint) +filt_sowrite_common(struct knote *kn, struct socket *so) { - struct socket *so = kn->kn_fp->f_data; - int s, rv; + int rv; + + soassertlocked(so); - if ((hint & NOTE_SUBMIT) == 0) - s = solock(so); kn->kn_data = sbspace(so, &so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; @@ -2133,27 +2183,94 @@ filt_sowrite(struct knote *kn, long hint } else { rv = (kn->kn_data >= so->so_snd.sb_lowat); } - if ((hint & NOTE_SUBMIT) == 0) - sounlock(so, s); return (rv); } int -filt_solisten(struct knote *kn, long hint) +filt_sowrite(struct knote *kn, long hint) { struct socket *so = kn->kn_fp->f_data; - int s; - if ((hint & NOTE_SUBMIT) == 0) - s = solock(so); + return (filt_sowrite_common(kn, so)); +} + +int +filt_sowritemodify(struct kevent *kev, struct knote *kn) +{ + struct socket *so = kn->kn_fp->f_data; + int rv, s; + + s = solock(so); + knote_modify(kev, kn); + rv = filt_sowrite_common(kn, so); + sounlock(so, s); + + return (rv); +} + +int +filt_sowriteprocess(struct knote *kn, struct kevent *kev) +{ + struct socket *so = kn->kn_fp->f_data; + int rv, s; + + s = solock(so); + rv = filt_sowrite_common(kn, so); + if (rv != 0) + knote_submit(kn, kev); + sounlock(so, s); + + return (rv); +} + +int +filt_solisten_common(struct knote *kn, struct socket *so) +{ + soassertlocked(so); + kn->kn_data = so->so_qlen; - if ((hint & NOTE_SUBMIT) == 0) - sounlock(so, s); return (kn->kn_data != 0); } +int +filt_solisten(struct knote *kn, long hint) +{ + struct socket *so = kn->kn_fp->f_data; + + return (filt_solisten_common(kn, so)); +} + +int +filt_solistenmodify(struct kevent *kev, struct knote *kn) +{ + struct socket *so = kn->kn_fp->f_data; + int rv, s; + + s = solock(so); + knote_modify(kev, kn); + rv = filt_solisten_common(kn, so); + sounlock(so, s); + + return (rv); +} + +int +filt_solistenprocess(struct knote *kn, struct kevent *kev) +{ + struct socket *so = kn->kn_fp->f_data; + int rv, s; + + s = solock(so); + rv = filt_solisten_common(kn, so); + if (rv != 0) + knote_submit(kn, kev); + sounlock(so, s); + + return (rv); +} + #ifdef DDB void sobuf_print(struct sockbuf *, Index: kern/uipc_syscalls.c =================================================================== RCS file: src/sys/kern/uipc_syscalls.c,v retrieving revision 1.190 diff -u -p -r1.190 uipc_syscalls.c --- kern/uipc_syscalls.c 13 May 2021 17:31:59 -0000 1.190 +++ kern/uipc_syscalls.c 18 May 2021 12:56:24 -0000 @@ -308,7 +308,7 @@ doaccept(struct proc *p, int sock, struc : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0); /* connection has been removed from the listen queue */ - KNOTE(&head->so_rcv.sb_sel.si_note, NOTE_SUBMIT); + KNOTE(&head->so_rcv.sb_sel.si_note, 0); fp->f_type = DTYPE_SOCKET; fp->f_flag = FREAD | FWRITE | nflag;