On 22.8.2022. 15:07, Alexander Bluhm wrote:
> On Sun, Aug 21, 2022 at 07:07:29PM +0200, Alexander Bluhm wrote:
>> On Fri, Aug 19, 2022 at 10:54:42PM +0200, Alexander Bluhm wrote:
>>> This diff allows to run udp_input() in parallel.
>
> Diff rebased to -current.
Hi,
is this diff still active? I have been running it in production with
wireguard, remote syslog, a dhcp server, and an ntp client for 2 months,
and all seems good.
>
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.284
> diff -u -p -r1.284 uipc_socket.c
> --- kern/uipc_socket.c 21 Aug 2022 16:22:17 -0000 1.284
> +++ kern/uipc_socket.c 22 Aug 2022 12:01:58 -0000
> @@ -822,10 +822,10 @@ bad:
> if (mp)
> *mp = NULL;
>
> - solock(so);
> + solock_shared(so);
> restart:
> if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
> - sounlock(so);
> + sounlock_shared(so);
> return (error);
> }
>
> @@ -893,7 +893,7 @@ restart:
> sbunlock(so, &so->so_rcv);
> error = sbwait(so, &so->so_rcv);
> if (error) {
> - sounlock(so);
> + sounlock_shared(so);
> return (error);
> }
> goto restart;
> @@ -962,11 +962,11 @@ dontblock:
> sbsync(&so->so_rcv, nextrecord);
> if (controlp) {
> if (pr->pr_domain->dom_externalize) {
> - sounlock(so);
> + sounlock_shared(so);
> error =
> (*pr->pr_domain->dom_externalize)
> (cm, controllen, flags);
> - solock(so);
> + solock_shared(so);
> }
> *controlp = cm;
> } else {
> @@ -1040,9 +1040,9 @@ dontblock:
> SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
> SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
> resid = uio->uio_resid;
> - sounlock(so);
> + sounlock_shared(so);
> uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
> - solock(so);
> + solock_shared(so);
> if (uio_error)
> uio->uio_resid = resid - len;
> } else
> @@ -1126,7 +1126,7 @@ dontblock:
> error = sbwait(so, &so->so_rcv);
> if (error) {
> sbunlock(so, &so->so_rcv);
> - sounlock(so);
> + sounlock_shared(so);
> return (0);
> }
> if ((m = so->so_rcv.sb_mb) != NULL)
> @@ -1171,7 +1171,7 @@ dontblock:
> *flagsp |= flags;
> release:
> sbunlock(so, &so->so_rcv);
> - sounlock(so);
> + sounlock_shared(so);
> return (error);
> }
>
> Index: kern/uipc_socket2.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket2.c,v
> retrieving revision 1.127
> diff -u -p -r1.127 uipc_socket2.c
> --- kern/uipc_socket2.c 13 Aug 2022 21:01:46 -0000 1.127
> +++ kern/uipc_socket2.c 22 Aug 2022 12:01:58 -0000
> @@ -360,6 +360,24 @@ solock(struct socket *so)
> }
> }
>
> +void
> +solock_shared(struct socket *so)
> +{
> + switch (so->so_proto->pr_domain->dom_family) {
> + case PF_INET:
> + case PF_INET6:
> + if (so->so_proto->pr_usrreqs->pru_lock != NULL) {
> + NET_LOCK_SHARED();
> + pru_lock(so);
> + } else
> + NET_LOCK();
> + break;
> + default:
> + rw_enter_write(&so->so_lock);
> + break;
> + }
> +}
> +
> int
> solock_persocket(struct socket *so)
> {
> @@ -403,6 +421,24 @@ sounlock(struct socket *so)
> }
>
> void
> +sounlock_shared(struct socket *so)
> +{
> + switch (so->so_proto->pr_domain->dom_family) {
> + case PF_INET:
> + case PF_INET6:
> + if (so->so_proto->pr_usrreqs->pru_unlock != NULL) {
> + pru_unlock(so);
> + NET_UNLOCK_SHARED();
> + } else
> + NET_UNLOCK();
> + break;
> + default:
> + rw_exit_write(&so->so_lock);
> + break;
> + }
> +}
> +
> +void
> soassertlocked(struct socket *so)
> {
> switch (so->so_proto->pr_domain->dom_family) {
> @@ -425,7 +461,15 @@ sosleep_nsec(struct socket *so, void *id
> switch (so->so_proto->pr_domain->dom_family) {
> case PF_INET:
> case PF_INET6:
> + if (so->so_proto->pr_usrreqs->pru_unlock != NULL &&
> + rw_status(&netlock) == RW_READ) {
> + pru_unlock(so);
> + }
> ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
> + if (so->so_proto->pr_usrreqs->pru_lock != NULL &&
> + rw_status(&netlock) == RW_READ) {
> + pru_lock(so);
> + }
> break;
> default:
> ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
> Index: net/if_bridge.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_bridge.c,v
> retrieving revision 1.364
> diff -u -p -r1.364 if_bridge.c
> --- net/if_bridge.c 7 Aug 2022 00:57:43 -0000 1.364
> +++ net/if_bridge.c 22 Aug 2022 12:01:58 -0000
> @@ -1590,7 +1590,7 @@ bridge_ipsec(struct ifnet *ifp, struct e
> off);
> tdb_unref(tdb);
> if (prot != IPPROTO_DONE)
> - ip_deliver(&m, &hlen, prot, af);
> + ip_deliver(&m, &hlen, prot, af, 0);
> return (1);
> } else {
> tdb_unref(tdb);
> Index: netinet/in_proto.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_proto.c,v
> retrieving revision 1.99
> diff -u -p -r1.99 in_proto.c
> --- netinet/in_proto.c 15 Aug 2022 09:11:38 -0000 1.99
> +++ netinet/in_proto.c 22 Aug 2022 12:01:58 -0000
> @@ -185,7 +185,7 @@ const struct protosw inetsw[] = {
> .pr_type = SOCK_DGRAM,
> .pr_domain = &inetdomain,
> .pr_protocol = IPPROTO_UDP,
> - .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE,
> + .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE|PR_MPSAFE,
> .pr_input = udp_input,
> .pr_ctlinput = udp_ctlinput,
> .pr_ctloutput = ip_ctloutput,
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.380
> diff -u -p -r1.380 ip_input.c
> --- netinet/ip_input.c 21 Aug 2022 14:15:55 -0000 1.380
> +++ netinet/ip_input.c 22 Aug 2022 12:01:58 -0000
> @@ -230,6 +230,11 @@ ip_init(void)
> #endif
> }
>
> +struct ip_offnxt {
> + int ion_off;
> + int ion_nxt;
> +};
> +
> /*
> * Enqueue packet for local delivery. Queuing is used as a boundary
> * between the network layer (input/forward path) running with
> @@ -246,6 +251,30 @@ ip_ours(struct mbuf **mp, int *offp, int
> if (af != AF_UNSPEC)
> return nxt;
>
> + nxt = ip_deliver(mp, offp, nxt, AF_INET, 1);
> + if (nxt == IPPROTO_DONE)
> + return IPPROTO_DONE;
> +
> + /* save values for later, use after dequeue */
> + if (*offp != sizeof(struct ip)) {
> + struct m_tag *mtag;
> + struct ip_offnxt *ion;
> +
> + /* mbuf tags are expensive, but only used for header options */
> + mtag = m_tag_get(PACKET_TAG_IP_OFFNXT, sizeof(*ion),
> + M_NOWAIT);
> + if (mtag == NULL) {
> + ipstat_inc(ips_idropped);
> + m_freemp(mp);
> + return IPPROTO_DONE;
> + }
> + ion = (struct ip_offnxt *)(mtag + 1);
> + ion->ion_off = *offp;
> + ion->ion_nxt = nxt;
> +
> + m_tag_prepend(*mp, mtag);
> + }
> +
> niq_enqueue(&ipintrq, *mp);
> *mp = NULL;
> return IPPROTO_DONE;
> @@ -261,18 +290,31 @@ ipintr(void)
> struct mbuf *m;
>
> while ((m = niq_dequeue(&ipintrq)) != NULL) {
> - struct ip *ip;
> + struct m_tag *mtag;
> int off, nxt;
>
> #ifdef DIAGNOSTIC
> if ((m->m_flags & M_PKTHDR) == 0)
> panic("ipintr no HDR");
> #endif
> - ip = mtod(m, struct ip *);
> - off = ip->ip_hl << 2;
> - nxt = ip->ip_p;
> + mtag = m_tag_find(m, PACKET_TAG_IP_OFFNXT, NULL);
> + if (mtag != NULL) {
> + struct ip_offnxt *ion;
> +
> + ion = (struct ip_offnxt *)(mtag + 1);
> + off = ion->ion_off;
> + nxt = ion->ion_nxt;
> +
> + m_tag_delete(m, mtag);
> + } else {
> + struct ip *ip;
>
> - nxt = ip_deliver(&m, &off, nxt, AF_INET);
> + ip = mtod(m, struct ip *);
> + off = ip->ip_hl << 2;
> + nxt = ip->ip_p;
> + }
> +
> + nxt = ip_deliver(&m, &off, nxt, AF_INET, 0);
> KASSERT(nxt == IPPROTO_DONE);
> }
> }
> @@ -673,7 +715,7 @@ ip_fragcheck(struct mbuf **mp, int *offp
> #endif
>
> int
> -ip_deliver(struct mbuf **mp, int *offp, int nxt, int af)
> +ip_deliver(struct mbuf **mp, int *offp, int nxt, int af, int shared)
> {
> const struct protosw *psw;
> int naf = af;
> @@ -681,26 +723,24 @@ ip_deliver(struct mbuf **mp, int *offp,
> int nest = 0;
> #endif /* INET6 */
>
> - NET_ASSERT_LOCKED_EXCLUSIVE();
> -
> - /* pf might have modified stuff, might have to chksum */
> - switch (af) {
> - case AF_INET:
> - in_proto_cksum_out(*mp, NULL);
> - break;
> -#ifdef INET6
> - case AF_INET6:
> - in6_proto_cksum_out(*mp, NULL);
> - break;
> -#endif /* INET6 */
> - }
> -
> /*
> * Tell launch routine the next header
> */
> IPSTAT_INC(delivered);
>
> while (nxt != IPPROTO_DONE) {
> + switch (af) {
> + case AF_INET:
> + psw = &inetsw[ip_protox[nxt]];
> + break;
> +#ifdef INET6
> + case AF_INET6:
> + psw = &inet6sw[ip6_protox[nxt]];
> + break;
> +#endif /* INET6 */
> + }
> + if (shared && !ISSET(psw->pr_flags, PR_MPSAFE))
> + break;
> #ifdef INET6
> if (af == AF_INET6 &&
> ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
> @@ -737,16 +777,6 @@ ip_deliver(struct mbuf **mp, int *offp,
> case IPPROTO_IPV6:
> naf = AF_INET6;
> ip6stat_inc(ip6s_delivered);
> - break;
> -#endif /* INET6 */
> - }
> - switch (af) {
> - case AF_INET:
> - psw = &inetsw[ip_protox[nxt]];
> - break;
> -#ifdef INET6
> - case AF_INET6:
> - psw = &inet6sw[ip6_protox[nxt]];
> break;
> #endif /* INET6 */
> }
> Index: netinet/ip_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> retrieving revision 1.99
> diff -u -p -r1.99 ip_var.h
> --- netinet/ip_var.h 21 Aug 2022 22:45:55 -0000 1.99
> +++ netinet/ip_var.h 22 Aug 2022 12:01:58 -0000
> @@ -249,7 +249,7 @@ int ip_sysctl(int *, u_int, void *, siz
> void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
> struct mbuf *);
> int ip_input_if(struct mbuf **, int *, int, int, struct ifnet *);
> -int ip_deliver(struct mbuf **, int *, int, int);
> +int ip_deliver(struct mbuf **, int *, int, int, int);
> void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int);
> int rip_ctloutput(int, struct socket *, int, int, struct mbuf *);
> void rip_init(void);
> Index: netinet/udp_usrreq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/udp_usrreq.c,v
> retrieving revision 1.287
> diff -u -p -r1.287 udp_usrreq.c
> --- netinet/udp_usrreq.c 22 Aug 2022 10:37:27 -0000 1.287
> +++ netinet/udp_usrreq.c 22 Aug 2022 12:01:58 -0000
> @@ -122,10 +122,15 @@ u_int udp_sendspace = 9216; /* really m
> u_int udp_recvspace = 40 * (1024 + sizeof(struct sockaddr_in));
> /* 40 1K datagrams */
>
> +void udp_lock(struct socket *);
> +void udp_unlock(struct socket *);
> +
> const struct pr_usrreqs udp_usrreqs = {
> .pru_usrreq = udp_usrreq,
> .pru_attach = udp_attach,
> .pru_detach = udp_detach,
> + .pru_lock = udp_lock,
> + .pru_unlock = udp_unlock,
> .pru_bind = udp_bind,
> .pru_connect = udp_connect,
> };
> @@ -653,12 +658,17 @@ udp_sbappend(struct inpcb *inp, struct m
> }
> #endif
> m_adj(m, hlen);
> +
> + mtx_enter(&inp->inp_mtx);
> if (sbappendaddr(so, &so->so_rcv, srcaddr, m, opts) == 0) {
> + mtx_leave(&inp->inp_mtx);
> udpstat_inc(udps_fullsock);
> m_freem(m);
> m_freem(opts);
> return;
> }
> + mtx_leave(&inp->inp_mtx);
> +
> sorwakeup(so);
> }
>
> @@ -1245,6 +1255,24 @@ udp_detach(struct socket *so)
>
> in_pcbdetach(inp);
> return (0);
> +}
> +
> +void
> +udp_lock(struct socket *so)
> +{
> + struct inpcb *inp = sotoinpcb(so);
> +
> + NET_ASSERT_LOCKED();
> + mtx_enter(&inp->inp_mtx);
> +}
> +
> +void
> +udp_unlock(struct socket *so)
> +{
> + struct inpcb *inp = sotoinpcb(so);
> +
> + NET_ASSERT_LOCKED();
> + mtx_leave(&inp->inp_mtx);
> }
>
> int
> Index: netinet6/in6_proto.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_proto.c,v
> retrieving revision 1.110
> diff -u -p -r1.110 in6_proto.c
> --- netinet6/in6_proto.c 15 Aug 2022 09:11:39 -0000 1.110
> +++ netinet6/in6_proto.c 22 Aug 2022 12:01:58 -0000
> @@ -136,7 +136,7 @@ const struct protosw inet6sw[] = {
> .pr_type = SOCK_DGRAM,
> .pr_domain = &inet6domain,
> .pr_protocol = IPPROTO_UDP,
> - .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE,
> + .pr_flags = PR_ATOMIC|PR_ADDR|PR_SPLICE|PR_MPSAFE,
> .pr_input = udp_input,
> .pr_ctlinput = udp6_ctlinput,
> .pr_ctloutput = ip6_ctloutput,
> Index: netinet6/ip6_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.254
> diff -u -p -r1.254 ip6_input.c
> --- netinet6/ip6_input.c 21 Aug 2022 14:15:55 -0000 1.254
> +++ netinet6/ip6_input.c 22 Aug 2022 12:01:58 -0000
> @@ -190,6 +190,10 @@ ip6_ours(struct mbuf **mp, int *offp, in
> if (af != AF_UNSPEC)
> return nxt;
>
> + nxt = ip_deliver(mp, offp, nxt, AF_INET6, 1);
> + if (nxt == IPPROTO_DONE)
> + return IPPROTO_DONE;
> +
> /* save values for later, use after dequeue */
> if (*offp != sizeof(struct ip6_hdr)) {
> struct m_tag *mtag;
> @@ -248,7 +252,7 @@ ip6intr(void)
> off = sizeof(struct ip6_hdr);
> nxt = ip6->ip6_nxt;
> }
> - nxt = ip_deliver(&m, &off, nxt, AF_INET6);
> + nxt = ip_deliver(&m, &off, nxt, AF_INET6, 0);
> KASSERT(nxt == IPPROTO_DONE);
> }
> }
> Index: sys/mbuf.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
> retrieving revision 1.255
> diff -u -p -r1.255 mbuf.h
> --- sys/mbuf.h 15 Aug 2022 16:15:37 -0000 1.255
> +++ sys/mbuf.h 22 Aug 2022 12:01:58 -0000
> @@ -471,6 +471,8 @@ struct m_tag *m_tag_next(struct mbuf *,
> #define PACKET_TAG_IPSEC_IN_DONE 0x0001 /* IPsec applied, in */
> #define PACKET_TAG_IPSEC_OUT_DONE 0x0002 /* IPsec applied, out */
> #define PACKET_TAG_IPSEC_FLOWINFO 0x0004 /* IPsec flowinfo */
> +#define PACKET_TAG_IP_OFFNXT 0x0010 /* IPv4 offset and next proto */
> +#define PACKET_TAG_IP6_OFFNXT 0x0020 /* IPv6 offset and next proto */
> #define PACKET_TAG_WIREGUARD 0x0040 /* WireGuard data */
> #define PACKET_TAG_GRE 0x0080 /* GRE processing done */
> #define PACKET_TAG_DLT 0x0100 /* data link layer type */
> @@ -479,7 +481,6 @@ struct m_tag *m_tag_next(struct mbuf *,
> #define PACKET_TAG_SRCROUTE 0x1000 /* IPv4 source routing options */
> #define PACKET_TAG_TUNNEL 0x2000 /* Tunnel endpoint address */
> #define PACKET_TAG_CARP_BAL_IP 0x4000 /* carp(4) ip balanced marker */
> -#define PACKET_TAG_IP6_OFFNXT 0x8000 /* IPv6 offset and next proto */
>
> #define MTAG_BITS \
> ("\20\1IPSEC_IN_DONE\2IPSEC_OUT_DONE\3IPSEC_FLOWINFO" \
> Index: sys/protosw.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/protosw.h,v
> retrieving revision 1.41
> diff -u -p -r1.41 protosw.h
> --- sys/protosw.h 22 Aug 2022 08:08:47 -0000 1.41
> +++ sys/protosw.h 22 Aug 2022 12:01:58 -0000
> @@ -66,6 +66,8 @@ struct pr_usrreqs {
>
> int (*pru_attach)(struct socket *, int);
> int (*pru_detach)(struct socket *);
> + void (*pru_lock)(struct socket *);
> + void (*pru_unlock)(struct socket *);
> int (*pru_bind)(struct socket *, struct mbuf *, struct proc *);
> int (*pru_listen)(struct socket *);
> int (*pru_connect)(struct socket *, struct mbuf *);
> @@ -116,6 +118,7 @@ struct protosw {
> #define PR_ABRTACPTDIS 0x20 /* abort on accept(2) to disconnected socket */
> #define PR_SPLICE 0x40 /* socket splicing is possible */
> +#define PR_MPSAFE 0x80 /* input runs with shared netlock */
>
> /*
> * The arguments to usrreq are:
> @@ -263,6 +266,18 @@ static inline int
> pru_detach(struct socket *so)
> {
> return (*so->so_proto->pr_usrreqs->pru_detach)(so);
> +}
> +
> +static inline void
> +pru_lock(struct socket *so)
> +{
> + (*so->so_proto->pr_usrreqs->pru_lock)(so);
> +}
> +
> +static inline void
> +pru_unlock(struct socket *so)
> +{
> + (*so->so_proto->pr_usrreqs->pru_unlock)(so);
> }
>
> static inline int
> Index: sys/socketvar.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.108
> diff -u -p -r1.108 socketvar.h
> --- sys/socketvar.h 21 Aug 2022 16:22:18 -0000 1.108
> +++ sys/socketvar.h 22 Aug 2022 12:01:58 -0000
> @@ -349,9 +349,11 @@ int sockargs(struct mbuf **, const void
>
> int sosleep_nsec(struct socket *, void *, int, const char *, uint64_t);
> void solock(struct socket *);
> +void solock_shared(struct socket *);
> int solock_persocket(struct socket *);
> void solock_pair(struct socket *, struct socket *);
> void sounlock(struct socket *);
> +void sounlock_shared(struct socket *);
>
> int sendit(struct proc *, int, struct msghdr *, int, register_t *);
> int recvit(struct proc *, int, struct msghdr *, caddr_t, register_t *);
>