On Tue, May 09, 2023 at 09:56:36AM +0200, Alexander Bluhm wrote: > On Sun, May 07, 2023 at 09:00:31PM +0200, Alexander Bluhm wrote: > > Not sure if I addressed all corner cases already. I think IPsec > > is missing. > > Updated diff: > - parts have been commited > - works with IPsec now
Thanks for this solution. Looks much better to me than an IPSec lookup in tcp_output(), as it is done in FreeBSD. > - some bugs fixed > - sysctl net.inet.tcp.tso > - netstat TSO counter > > If you test this, recompile sysctl and netstat with new kernel > headers. Then you can see, whether the diff has an effect on your > setup. > > # netstat -s -p tcp | grep TSO > 79 output TSO packets software chopped > 0 output TSO packets hardware processed > 840 output TSO packets generated > 0 output TSO packets dropped Good idea. > If you run into problems, disable the feature, and report if the > problem goes away. This helps to locate the bug. > > # sysctl net.inet.tcp.tso=0 > net.inet.tcp.tso: 1 -> 0 > > I would like to keep the sysctl for now. It makes performance > comparison easier. When we add hardware TSO it can be a quick > workaround for driver problems. > > When this has been tested a bit, I think it is ready for commit. > Remaining issues can be handled in tree. My tests pass, I am not > aware of TCP problems. I also did some testing in my setups. Everything works. > ok? Diff looks fine to me, too. ok jan@ > bluhm > > Index: sys/net/pf.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v > retrieving revision 1.1177 > diff -u -p -r1.1177 pf.c > --- sys/net/pf.c 8 May 2023 13:22:13 -0000 1.1177 > +++ sys/net/pf.c 8 May 2023 22:37:04 -0000 > @@ -6561,6 +6561,16 @@ pf_route(struct pf_pdesc *pd, struct pf_ > goto done; > } > > + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && > + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { > + if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) || > + if_output_ml(ifp, &ml, sintosa(dst), rt)) > + goto done; > + tcpstat_inc(tcps_outswtso); > + goto done; > + } > + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); > + > /* > * Too large for interface; fragment if possible. > * Must be able to put at least 8 bytes per fragment. 
> @@ -6594,6 +6604,7 @@ void > pf_route6(struct pf_pdesc *pd, struct pf_state *st) > { > struct mbuf *m0; > + struct mbuf_list ml; > struct sockaddr_in6 *dst, sin6; > struct rtentry *rt = NULL; > struct ip6_hdr *ip6; > @@ -6685,11 +6696,21 @@ pf_route6(struct pf_pdesc *pd, struct pf > goto done; > } > > - if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { > + if (m0->m_pkthdr.len <= ifp->if_mtu) { > in6_proto_cksum_out(m0, ifp); > ifp->if_output(ifp, m0, sin6tosa(dst), rt); > goto done; > } > + > + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && > + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { > + if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) || > + if_output_ml(ifp, &ml, sin6tosa(dst), rt)) > + goto done; > + tcpstat_inc(tcps_outswtso); > + goto done; > + } > + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); > > ip6stat_inc(ip6s_cantfrag); > if (st->rt != PF_DUPTO) > Index: sys/netinet/in.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.h,v > retrieving revision 1.142 > diff -u -p -r1.142 in.h > --- sys/netinet/in.h 11 Apr 2023 00:45:09 -0000 1.142 > +++ sys/netinet/in.h 8 May 2023 13:47:48 -0000 > @@ -780,6 +780,7 @@ int in_canforward(struct in_addr); > int in_cksum(struct mbuf *, int); > int in4_cksum(struct mbuf *, u_int8_t, int, int); > void in_proto_cksum_out(struct mbuf *, struct ifnet *); > +int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); > void in_ifdetach(struct ifnet *); > int in_mask2len(struct in_addr *); > void in_len2mask(struct in_addr *, int); > Index: sys/netinet/ip_output.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v > retrieving revision 1.384 > diff -u -p -r1.384 ip_output.c > --- sys/netinet/ip_output.c 8 May 2023 13:22:13 -0000 1.384 > +++ sys/netinet/ip_output.c 8 May 2023 22:37:04 -0000 > @@ -84,7 +84,6 @@ void ip_mloopback(struct ifnet *, struct > static __inline u_int16_t 
__attribute__((__unused__)) > in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); > void in_delayed_cksum(struct mbuf *); > -int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); > > int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp, > struct tdb **, int ipsecflowinfo); > @@ -468,6 +467,16 @@ sendit: > goto done; > } > > + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && > + m->m_pkthdr.ph_mss <= mtu) { > + if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) || > + (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt))) > + goto done; > + tcpstat_inc(tcps_outswtso); > + goto done; > + } > + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); > + > /* > * Too large for interface; fragment if possible. > * Must be able to put at least 8 bytes per fragment. > @@ -597,12 +606,12 @@ ip_output_ipsec_pmtu_update(struct tdb * > int > ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int > fwd) > { > -#if NPF > 0 > - struct ifnet *encif; > -#endif > + struct mbuf_list ml; > + struct ifnet *encif = NULL; > struct ip *ip; > struct in_addr dst; > - int error, rtableid; > + u_int len; > + int error, rtableid, tso = 0; > > #if NPF > 0 > /* > @@ -622,16 +631,22 @@ ip_output_ipsec_send(struct tdb *tdb, st > * Until now the change was not reconsidered. > * What's the behaviour? 
> */ > - in_proto_cksum_out(m, encif); > #endif > > - /* Check if we are allowed to fragment */ > + /* Check if we can chop the TCP packet */ > ip = mtod(m, struct ip *); > + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && > + m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { > + tso = 1; > + len = m->m_pkthdr.ph_mss; > + } else > + len = ntohs(ip->ip_len); > + > + /* Check if we are allowed to fragment */ > dst = ip->ip_dst; > rtableid = m->m_pkthdr.ph_rtableid; > if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && > - ntohs(ip->ip_len) > tdb->tdb_mtu && > - tdb->tdb_mtutimeout > gettime()) { > + len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { > int transportmode; > > transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && > @@ -652,14 +667,33 @@ ip_output_ipsec_send(struct tdb *tdb, st > */ > m->m_flags &= ~(M_MCAST | M_BCAST); > > - /* Callee frees mbuf */ > + if (tso) { > + error = tcp_chopper(m, &ml, encif, len); > + if (error) > + goto done; > + } else { > + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); > + in_proto_cksum_out(m, encif); > + ml_init(&ml); > + ml_enqueue(&ml, m); > + } > + > KERNEL_LOCK(); > - error = ipsp_process_packet(m, tdb, AF_INET, 0); > + while ((m = ml_dequeue(&ml)) != NULL) { > + /* Callee frees mbuf */ > + error = ipsp_process_packet(m, tdb, AF_INET, 0); > + if (error) > + break; > + } > KERNEL_UNLOCK(); > + done: > if (error) { > + ml_purge(&ml); > ipsecstat_inc(ipsec_odrops); > tdbstat_inc(tdb, tdb_odrops); > } > + if (!error && tso) > + tcpstat_inc(tcps_outswtso); > if (ip_mtudisc && error == EMSGSIZE) > ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0); > return error; > Index: sys/netinet/tcp_output.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v > retrieving revision 1.135 > diff -u -p -r1.135 tcp_output.c > --- sys/netinet/tcp_output.c 25 Apr 2023 22:56:28 -0000 1.135 > +++ sys/netinet/tcp_output.c 8 May 2023 22:37:04 -0000 > 
@@ -210,6 +210,7 @@ tcp_output(struct tcpcb *tp) > #ifdef TCP_ECN > int needect; > #endif > + int tso; > > if (tp->t_flags & TF_BLOCKOUTPUT) { > tp->t_flags |= TF_NEEDOUTPUT; > @@ -279,6 +280,7 @@ again: > } > > sendalot = 0; > + tso = 0; > /* > * If in persist timeout with window of 0, send 1 byte. > * Otherwise, if window is small but nonzero > @@ -346,8 +348,25 @@ again: > txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg); > > if (len > txmaxseg) { > - len = txmaxseg; > - sendalot = 1; > + if (tcp_do_tso && > + tp->t_inpcb->inp_options == NULL && > + tp->t_inpcb->inp_outputopts6 == NULL && > +#ifdef TCP_SIGNATURE > + ((tp->t_flags & TF_SIGNATURE) == 0) && > +#endif > + len >= 2 * tp->t_maxseg && > + tp->rcv_numsacks == 0 && sack_rxmit == 0 && > + !(flags & (TH_SYN|TH_RST|TH_FIN))) { > + tso = 1; > + /* avoid small chopped packets */ > + if (len > (len / tp->t_maxseg) * tp->t_maxseg) { > + len = (len / tp->t_maxseg) * tp->t_maxseg; > + sendalot = 1; > + } > + } else { > + len = txmaxseg; > + sendalot = 1; > + } > } > if (off + len < so->so_snd.sb_cc) > flags &= ~TH_FIN; > @@ -365,7 +384,7 @@ again: > * to send into a small window), then must resend. > */ > if (len) { > - if (len == txmaxseg) > + if (len >= txmaxseg) > goto send; > if ((idle || (tp->t_flags & TF_NODELAY)) && > len + off >= so->so_snd.sb_cc && !soissending(so) && > @@ -616,10 +635,19 @@ send: > /* > * Adjust data length if insertion of options will > * bump the packet length beyond the t_maxopd length. > + * Clear the FIN bit because we cut off the tail of > + * the segment. 
> */ > if (len > tp->t_maxopd - optlen) { > - len = tp->t_maxopd - optlen; > - sendalot = 1; > + if (tso) { > + if (len + hdrlen + max_linkhdr > MAXMCLBYTES) { > + len = MAXMCLBYTES - hdrlen - max_linkhdr; > + sendalot = 1; > + } > + } else { > + len = tp->t_maxopd - optlen; > + sendalot = 1; > + } > flags &= ~TH_FIN; > } > > @@ -723,6 +751,12 @@ send: > m->m_pkthdr.ph_ifidx = 0; > m->m_pkthdr.len = hdrlen + len; > > + /* Enable TSO and specify the size of the resulting segments. */ > + if (tso) { > + m->m_pkthdr.csum_flags |= M_TCP_TSO; > + m->m_pkthdr.ph_mss = tp->t_maxseg; > + } > + > if (!tp->t_template) > panic("tcp_output"); > #ifdef DIAGNOSTIC > @@ -1152,4 +1186,177 @@ tcp_setpersist(struct tcpcb *tp) > TCP_TIMER_ARM(tp, TCPT_PERSIST, msec); > if (tp->t_rxtshift < TCP_MAXRXTSHIFT) > tp->t_rxtshift++; > +} > + > +int > +tcp_chopper(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp, > + u_int mss) > +{ > + struct ip *ip = NULL; > +#ifdef INET6 > + struct ip6_hdr *ip6 = NULL; > +#endif > + struct tcphdr *th; > + int firstlen, iphlen, hlen, tlen, off; > + int error; > + > + ml_init(ml); > + ml_enqueue(ml, m0); > + > + ip = mtod(m0, struct ip *); > + switch (ip->ip_v) { > + case 4: > + iphlen = ip->ip_hl << 2; > + if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF)) || > + iphlen != sizeof(struct ip) || ip->ip_p != IPPROTO_TCP) { > + /* only TCP without fragment or IP option supported */ > + error = EPROTOTYPE; > + goto bad; > + } > + break; > +#ifdef INET6 > + case 6: > + ip = NULL; > + ip6 = mtod(m0, struct ip6_hdr *); > + iphlen = sizeof(struct ip6_hdr); > + if (ip6->ip6_nxt != IPPROTO_TCP) { > + /* only TCP without IPv6 header chain supported */ > + error = EPROTOTYPE; > + goto bad; > + } > + break; > +#endif > + default: > + panic("%s: unknown ip version %d", __func__, ip->ip_v); > + } > + > + tlen = m0->m_pkthdr.len; > + if (tlen < iphlen + sizeof(struct tcphdr)) { > + error = ENOPROTOOPT; > + goto bad; > + } > + /* IP and TCP header should be 
contiguous, this check is paranoia */ > + if (m0->m_len < iphlen + sizeof(*th)) { > + ml_dequeue(ml); > + if ((m0 = m_pullup(m0, iphlen + sizeof(*th))) == NULL) { > + error = ENOBUFS; > + goto bad; > + } > + ml_enqueue(ml, m0); > + } > + th = (struct tcphdr *)(mtod(m0, caddr_t) + iphlen); > + hlen = iphlen + (th->th_off << 2); > + if (tlen < hlen) { > + error = ENOPROTOOPT; > + goto bad; > + } > + firstlen = MIN(tlen - hlen, mss); > + > + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); > + > + /* > + * Loop through length of payload after first segment, > + * make new header and copy data of each part and link onto chain. > + */ > + for (off = hlen + firstlen; off < tlen; off += mss) { > + struct mbuf *m; > + struct tcphdr *mhth; > + int len; > + > + len = MIN(tlen - off, mss); > + > + MGETHDR(m, M_DONTWAIT, MT_HEADER); > + if (m == NULL) { > + error = ENOBUFS; > + goto bad; > + } > + ml_enqueue(ml, m); > + if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) > + goto bad; > + > + /* IP and TCP header to the end, space for link layer header */ > + m->m_len = hlen; > + m_align(m, hlen); > + > + /* copy and adjust TCP header */ > + mhth = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); > + memcpy(mhth, th, hlen - iphlen); > + mhth->th_seq = htonl(ntohl(th->th_seq) + (off - hlen)); > + if (off + len < tlen) > + CLR(mhth->th_flags, TH_PUSH|TH_FIN); > + > + /* add mbuf chain with payload */ > + m->m_pkthdr.len = hlen + len; > + if ((m->m_next = m_copym(m0, off, len, M_DONTWAIT)) == NULL) { > + error = ENOBUFS; > + goto bad; > + } > + > + /* copy and adjust IP header, calculate checksum */ > + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); > + mhth->th_sum = 0; > + if (ip) { > + struct ip *mhip; > + > + mhip = mtod(m, struct ip *); > + *mhip = *ip; > + mhip->ip_len = htons(hlen + len); > + mhip->ip_id = htons(ip_randomid()); > + mhip->ip_sum = 0; > + if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { > + m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; > + } else { > + 
ipstat_inc(ips_outswcsum); > + mhip->ip_sum = in_cksum(m, iphlen); > + } > + in_proto_cksum_out(m, ifp); > + } > +#ifdef INET6 > + if (ip6) { > + struct ip6_hdr *mhip6; > + > + mhip6 = mtod(m, struct ip6_hdr *); > + *mhip6 = *ip6; > + mhip6->ip6_plen = htons(hlen - iphlen + len); > + in6_proto_cksum_out(m, ifp); > + } > +#endif > + } > + > + /* > + * Update first segment by trimming what's been copied out > + * and updating header, then send each segment (in order). > + */ > + if (hlen + firstlen < tlen) { > + m_adj(m0, hlen + firstlen - tlen); > + CLR(th->th_flags, TH_PUSH|TH_FIN); > + } > + /* adjust IP header, calculate checksum */ > + SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); > + th->th_sum = 0; > + if (ip) { > + ip->ip_len = htons(m0->m_pkthdr.len); > + ip->ip_sum = 0; > + if (ifp && in_ifcap_cksum(m0, ifp, IFCAP_CSUM_IPv4)) { > + m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; > + } else { > + ipstat_inc(ips_outswcsum); > + ip->ip_sum = in_cksum(m0, iphlen); > + } > + in_proto_cksum_out(m0, ifp); > + } > +#ifdef INET6 > + if (ip6) { > + ip6->ip6_plen = htons(m0->m_pkthdr.len - iphlen); > + in6_proto_cksum_out(m0, ifp); > + } > +#endif > + > + tcpstat_add(tcps_outpkttso, ml_len(ml)); > + return 0; > + > + bad: > + tcpstat_inc(tcps_outbadtso); > + ml_purge(ml); > + return error; > } > Index: sys/netinet/tcp_subr.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v > retrieving revision 1.190 > diff -u -p -r1.190 tcp_subr.c > --- sys/netinet/tcp_subr.c 7 Nov 2022 11:22:55 -0000 1.190 > +++ sys/netinet/tcp_subr.c 8 May 2023 22:37:04 -0000 > @@ -119,6 +119,7 @@ int tcp_ack_on_push = 0; /* set to enabl > int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? 
*/ > #endif > int tcp_do_rfc3390 = 2; /* Increase TCP's Initial Window to 10*mss */ > +int tcp_do_tso = 1; /* TCP segmentation offload for output */ > > #ifndef TCB_INITIAL_HASH_SIZE > #define TCB_INITIAL_HASH_SIZE 128 > Index: sys/netinet/tcp_usrreq.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v > retrieving revision 1.217 > diff -u -p -r1.217 tcp_usrreq.c > --- sys/netinet/tcp_usrreq.c 14 Mar 2023 00:24:05 -0000 1.217 > +++ sys/netinet/tcp_usrreq.c 8 May 2023 22:37:04 -0000 > @@ -166,6 +166,7 @@ const struct sysctl_bounded_args tcpctl_ > { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, > { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, > { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, > + { TCPCTL_TSO, &tcp_do_tso, 0, 1 }, > }; > > struct inpcbtable tcbtable; > @@ -1335,6 +1336,10 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o > ASSIGN(tcps_sack_rcv_opts); > ASSIGN(tcps_sack_snd_opts); > ASSIGN(tcps_sack_drop_opts); > + ASSIGN(tcps_outswtso); > + ASSIGN(tcps_outhwtso); > + ASSIGN(tcps_outpkttso); > + ASSIGN(tcps_outbadtso); > > #undef ASSIGN > > @@ -1494,8 +1499,8 @@ tcp_sysctl(int *name, u_int namelen, voi > > default: > NET_LOCK(); > - error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), > name, > - namelen, oldp, oldlenp, newp, newlen); > + error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), > + name, namelen, oldp, oldlenp, newp, newlen); > NET_UNLOCK(); > return (error); > } > Index: sys/netinet/tcp_var.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v > retrieving revision 1.163 > diff -u -p -r1.163 tcp_var.h > --- sys/netinet/tcp_var.h 14 Mar 2023 00:24:05 -0000 1.163 > +++ sys/netinet/tcp_var.h 8 May 2023 22:37:04 -0000 > @@ -442,6 +442,11 @@ struct tcpstat { > u_int64_t tcps_sack_rcv_opts; /* SACK options received */ > u_int64_t 
tcps_sack_snd_opts; /* SACK options sent */ > u_int64_t tcps_sack_drop_opts; /* SACK options dropped */ > + > + u_int32_t tcps_outswtso; /* output tso chopped in software */ > + u_int32_t tcps_outhwtso; /* output tso processed by hardware */ > + u_int32_t tcps_outpkttso; /* packets generated by tso */ > + u_int32_t tcps_outbadtso; /* output tso failed, packet dropped */ > }; > > /* > @@ -473,7 +478,8 @@ struct tcpstat { > #define TCPCTL_SYN_USE_LIMIT 23 /* number of uses before reseeding > hash */ > #define TCPCTL_ROOTONLY 24 /* return root only port bitmap */ > #define TCPCTL_SYN_HASH_SIZE 25 /* number of buckets in the hash */ > -#define TCPCTL_MAXID 26 > +#define TCPCTL_TSO 26 /* enable TCP segmentation offload */ > +#define TCPCTL_MAXID 27 > > #define TCPCTL_NAMES { \ > { 0, 0 }, \ > @@ -500,8 +506,9 @@ struct tcpstat { > { "stats", CTLTYPE_STRUCT }, \ > { "always_keepalive", CTLTYPE_INT }, \ > { "synuselimit", CTLTYPE_INT }, \ > - { "rootonly", CTLTYPE_STRUCT }, \ > + { "rootonly", CTLTYPE_STRUCT }, \ > { "synhashsize", CTLTYPE_INT }, \ > + { "tso", CTLTYPE_INT }, \ > } > > struct tcp_ident_mapping { > @@ -614,6 +621,10 @@ enum tcpstat_counters { > tcps_sack_rcv_opts, > tcps_sack_snd_opts, > tcps_sack_drop_opts, > + tcps_outswtso, > + tcps_outhwtso, > + tcps_outpkttso, > + tcps_outbadtso, > tcps_ncounters, > }; > > @@ -665,6 +676,7 @@ extern struct pool sackhl_pool; > extern int tcp_sackhole_limit; /* max entries for tcp sack queues */ > extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? 
*/ > extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial > Window */ > +extern int tcp_do_tso; /* enable TSO for TCP output packets */ > > extern struct pool tcpqe_pool; > extern int tcp_reass_limit; /* max entries for tcp reass queues */ > @@ -706,6 +718,7 @@ struct tcpcb * > tcp_newtcpcb(struct inpcb *, int); > void tcp_notify(struct inpcb *, int); > int tcp_output(struct tcpcb *); > +int tcp_chopper(struct mbuf *, struct mbuf_list *, struct ifnet *, u_int); > void tcp_pulloutofband(struct socket *, u_int, struct mbuf *, int); > int tcp_reass(struct tcpcb *, struct tcphdr *, struct mbuf *, int *); > void tcp_rscale(struct tcpcb *, u_long); > Index: sys/netinet6/ip6_output.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v > retrieving revision 1.274 > diff -u -p -r1.274 ip6_output.c > --- sys/netinet6/ip6_output.c 8 May 2023 13:22:13 -0000 1.274 > +++ sys/netinet6/ip6_output.c 8 May 2023 22:37:04 -0000 > @@ -686,7 +686,9 @@ reroute: > dontfrag = 1; > else > dontfrag = 0; > - if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */ > + if (dontfrag && /* case 2-b */ > + (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ? 
> + m->m_pkthdr.csum_flags : tlen) > ifp->if_mtu) { > #ifdef IPSEC > if (ip_mtudisc) > ipsec_adjust_mtu(m, mtu); > @@ -698,12 +700,22 @@ reroute: > /* > * transmit packet without fragmentation > */ > - if (dontfrag || (tlen <= mtu)) { /* case 1-a and 2-a */ > + if (dontfrag || tlen <= mtu) { /* case 1-a and 2-a */ > in6_proto_cksum_out(m, ifp); > error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt); > goto done; > } > > + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && > + m->m_pkthdr.ph_mss <= mtu) { > + if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) || > + (error = if_output_ml(ifp, &ml, sin6tosa(dst), ro->ro_rt))) > + goto done; > + tcpstat_inc(tcps_outswtso); > + goto done; > + } > + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); > + > /* > * try to fragment the packet. case 1-b > */ > @@ -2829,12 +2841,12 @@ int > ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro, > int tunalready, int fwd) > { > -#if NPF > 0 > - struct ifnet *encif; > -#endif > + struct mbuf_list ml; > + struct ifnet *encif = NULL; > struct ip6_hdr *ip6; > struct in6_addr dst; > - int error, ifidx, rtableid; > + u_int len; > + int error, ifidx, rtableid, tso = 0; > > #if NPF > 0 > /* > @@ -2854,17 +2866,23 @@ ip6_output_ipsec_send(struct tdb *tdb, s > * Until now the change was not reconsidered. > * What's the behaviour? 
> */ > - in6_proto_cksum_out(m, encif); > #endif > > - /* Check if we are allowed to fragment */ > + /* Check if we can chop the TCP packet */ > ip6 = mtod(m, struct ip6_hdr *); > + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && > + m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { > + tso = 1; > + len = m->m_pkthdr.ph_mss; > + } else > + len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); > + > + /* Check if we are allowed to fragment */ > dst = ip6->ip6_dst; > ifidx = m->m_pkthdr.ph_ifidx; > rtableid = m->m_pkthdr.ph_rtableid; > if (ip_mtudisc && tdb->tdb_mtu && > - sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) > tdb->tdb_mtu && > - tdb->tdb_mtutimeout > gettime()) { > + len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { > int transportmode; > > transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET6) && > @@ -2891,14 +2909,33 @@ ip6_output_ipsec_send(struct tdb *tdb, s > */ > m->m_flags &= ~(M_BCAST | M_MCAST); > > - /* Callee frees mbuf */ > + if (tso) { > + error = tcp_chopper(m, &ml, encif, len); > + if (error) > + goto done; > + } else { > + CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); > + in6_proto_cksum_out(m, encif); > + ml_init(&ml); > + ml_enqueue(&ml, m); > + } > + > KERNEL_LOCK(); > - error = ipsp_process_packet(m, tdb, AF_INET6, tunalready); > + while ((m = ml_dequeue(&ml)) != NULL) { > + /* Callee frees mbuf */ > + error = ipsp_process_packet(m, tdb, AF_INET6, tunalready); > + if (error) > + break; > + } > KERNEL_UNLOCK(); > + done: > if (error) { > + ml_purge(&ml); > ipsecstat_inc(ipsec_odrops); > tdbstat_inc(tdb, tdb_odrops); > } > + if (!error && tso) > + tcpstat_inc(tcps_outswtso); > if (ip_mtudisc && error == EMSGSIZE) > ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx, rtableid, 0); > return error; > Index: sys/sys/mbuf.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v > retrieving revision 1.256 > diff -u -p -r1.256 mbuf.h > --- sys/sys/mbuf.h 5 May 2023 01:19:51 -0000 
1.256 > +++ sys/sys/mbuf.h 8 May 2023 13:47:48 -0000 > @@ -129,12 +129,13 @@ struct pkthdr { > SLIST_HEAD(, m_tag) ph_tags; /* list of packet tags */ > int64_t ph_timestamp; /* packet timestamp */ > int len; /* total packet length */ > + u_int ph_rtableid; /* routing table id */ > + u_int ph_ifidx; /* rcv interface index */ > u_int16_t ph_tagsset; /* mtags attached */ > u_int16_t ph_flowid; /* pseudo unique flow id */ > u_int16_t csum_flags; /* checksum flags */ > u_int16_t ether_vtag; /* Ethernet 802.1p+Q vlan tag */ > - u_int ph_rtableid; /* routing table id */ > - u_int ph_ifidx; /* rcv interface index */ > + u_int16_t ph_mss; /* TCP max segment size */ > u_int8_t ph_loopcnt; /* mbuf is looping in kernel */ > u_int8_t ph_family; /* af, used when queueing */ > struct pkthdr_pf pf; > @@ -226,6 +227,7 @@ struct mbuf { > #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 > */ > #define M_TIMESTAMP 0x2000 /* ph_timestamp is set */ > #define M_FLOWID 0x4000 /* ph_flowid is set */ > +#define M_TCP_TSO 0x8000 /* TCP Segmentation Offload > needed */ > > #ifdef _KERNEL > #define MCS_BITS \ > Index: usr.bin/netstat/inet.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v > retrieving revision 1.174 > diff -u -p -r1.174 inet.c > --- usr.bin/netstat/inet.c 12 Aug 2022 14:49:15 -0000 1.174 > +++ usr.bin/netstat/inet.c 8 May 2023 14:01:00 -0000 > @@ -408,6 +408,10 @@ tcp_stats(char *name) > p(tcps_sndwinup, "\t\t%u window update packet%s\n"); > p(tcps_sndctrl, "\t\t%u control packet%s\n"); > p(tcps_outswcsum, "\t\t%u packet%s software-checksummed\n"); > + p(tcps_outswtso, "\t\t%u output TSO packet%s software chopped\n"); > + p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n"); > + p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n"); > + p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n"); > p(tcps_rcvtotal, "\t%u packet%s received\n"); > 
p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu > byte%s)\n"); > p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n"); >