On Sun, May 07, 2023 at 09:00:31PM +0200, Alexander Bluhm wrote:
> Not sure if I addressed all corner cases already. I think IPsec
> is missing.
Updated diff:
- parts have been committed
- works with IPsec now
- some bugs fixed
- sysctl net.inet.tcp.tso
- netstat TSO counter
If you test this, recompile sysctl and netstat with the new kernel
headers. Then you can see whether the diff has an effect on your
setup.
# netstat -s -p tcp | grep TSO
79 output TSO packets software chopped
0 output TSO packets hardware processed
840 output TSO packets generated
0 output TSO packets dropped
If you run into problems, disable the feature, and report if the
problem goes away. This helps to locate the bug.
# sysctl net.inet.tcp.tso=0
net.inet.tcp.tso: 1 -> 0
I would like to keep the sysctl for now. It makes performance
comparison easier. When we add hardware TSO it can be a quick
workaround for driver problems.
When this has been tested a bit, I think it is ready for commit.
Remaining issues can be handled in tree. My tests pass, I am not
aware of TCP problems.
ok?
bluhm
Index: sys/net/pf.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1177
diff -u -p -r1.1177 pf.c
--- sys/net/pf.c 8 May 2023 13:22:13 -0000 1.1177
+++ sys/net/pf.c 8 May 2023 22:37:04 -0000
@@ -6561,6 +6561,16 @@ pf_route(struct pf_pdesc *pd, struct pf_
goto done;
}
+ if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) &&
+ m0->m_pkthdr.ph_mss <= ifp->if_mtu) {
+ if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) ||
+ if_output_ml(ifp, &ml, sintosa(dst), rt))
+ goto done;
+ tcpstat_inc(tcps_outswtso);
+ goto done;
+ }
+ CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
+
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
@@ -6594,6 +6604,7 @@ void
pf_route6(struct pf_pdesc *pd, struct pf_state *st)
{
struct mbuf *m0;
+ struct mbuf_list ml;
struct sockaddr_in6 *dst, sin6;
struct rtentry *rt = NULL;
struct ip6_hdr *ip6;
@@ -6685,11 +6696,21 @@ pf_route6(struct pf_pdesc *pd, struct pf
goto done;
}
- if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
+ if (m0->m_pkthdr.len <= ifp->if_mtu) {
in6_proto_cksum_out(m0, ifp);
ifp->if_output(ifp, m0, sin6tosa(dst), rt);
goto done;
}
+
+ if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) &&
+ m0->m_pkthdr.ph_mss <= ifp->if_mtu) {
+ if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) ||
+ if_output_ml(ifp, &ml, sin6tosa(dst), rt))
+ goto done;
+ tcpstat_inc(tcps_outswtso);
+ goto done;
+ }
+ CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
ip6stat_inc(ip6s_cantfrag);
if (st->rt != PF_DUPTO)
Index: sys/netinet/in.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.h,v
retrieving revision 1.142
diff -u -p -r1.142 in.h
--- sys/netinet/in.h 11 Apr 2023 00:45:09 -0000 1.142
+++ sys/netinet/in.h 8 May 2023 13:47:48 -0000
@@ -780,6 +780,7 @@ int in_canforward(struct in_addr);
int in_cksum(struct mbuf *, int);
int in4_cksum(struct mbuf *, u_int8_t, int, int);
void in_proto_cksum_out(struct mbuf *, struct ifnet *);
+int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
void in_ifdetach(struct ifnet *);
int in_mask2len(struct in_addr *);
void in_len2mask(struct in_addr *, int);
Index: sys/netinet/ip_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.384
diff -u -p -r1.384 ip_output.c
--- sys/netinet/ip_output.c 8 May 2023 13:22:13 -0000 1.384
+++ sys/netinet/ip_output.c 8 May 2023 22:37:04 -0000
@@ -84,7 +84,6 @@ void ip_mloopback(struct ifnet *, struct
static __inline u_int16_t __attribute__((__unused__))
in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t);
void in_delayed_cksum(struct mbuf *);
-int in_ifcap_cksum(struct mbuf *, struct ifnet *, int);
int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp,
struct tdb **, int ipsecflowinfo);
@@ -468,6 +467,16 @@ sendit:
goto done;
}
+ if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
+ m->m_pkthdr.ph_mss <= mtu) {
+ if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) ||
+ (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt)))
+ goto done;
+ tcpstat_inc(tcps_outswtso);
+ goto done;
+ }
+ CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
+
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
@@ -597,12 +606,12 @@ ip_output_ipsec_pmtu_update(struct tdb *
int
ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int
fwd)
{
-#if NPF > 0
- struct ifnet *encif;
-#endif
+ struct mbuf_list ml;
+ struct ifnet *encif = NULL;
struct ip *ip;
struct in_addr dst;
- int error, rtableid;
+ u_int len;
+ int error, rtableid, tso = 0;
#if NPF > 0
/*
@@ -622,16 +631,22 @@ ip_output_ipsec_send(struct tdb *tdb, st
* Until now the change was not reconsidered.
* What's the behaviour?
*/
- in_proto_cksum_out(m, encif);
#endif
- /* Check if we are allowed to fragment */
+ /* Check if we can chop the TCP packet */
ip = mtod(m, struct ip *);
+ if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
+ m->m_pkthdr.ph_mss <= tdb->tdb_mtu) {
+ tso = 1;
+ len = m->m_pkthdr.ph_mss;
+ } else
+ len = ntohs(ip->ip_len);
+
+ /* Check if we are allowed to fragment */
dst = ip->ip_dst;
rtableid = m->m_pkthdr.ph_rtableid;
if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu &&
- ntohs(ip->ip_len) > tdb->tdb_mtu &&
- tdb->tdb_mtutimeout > gettime()) {
+ len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) {
int transportmode;
transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) &&
@@ -652,14 +667,33 @@ ip_output_ipsec_send(struct tdb *tdb, st
*/
m->m_flags &= ~(M_MCAST | M_BCAST);
- /* Callee frees mbuf */
+ if (tso) {
+ error = tcp_chopper(m, &ml, encif, len);
+ if (error)
+ goto done;
+ } else {
+ CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
+ in_proto_cksum_out(m, encif);
+ ml_init(&ml);
+ ml_enqueue(&ml, m);
+ }
+
KERNEL_LOCK();
- error = ipsp_process_packet(m, tdb, AF_INET, 0);
+ while ((m = ml_dequeue(&ml)) != NULL) {
+ /* Callee frees mbuf */
+ error = ipsp_process_packet(m, tdb, AF_INET, 0);
+ if (error)
+ break;
+ }
KERNEL_UNLOCK();
+ done:
if (error) {
+ ml_purge(&ml);
ipsecstat_inc(ipsec_odrops);
tdbstat_inc(tdb, tdb_odrops);
}
+ if (!error && tso)
+ tcpstat_inc(tcps_outswtso);
if (ip_mtudisc && error == EMSGSIZE)
ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0);
return error;
Index: sys/netinet/tcp_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.135
diff -u -p -r1.135 tcp_output.c
--- sys/netinet/tcp_output.c 25 Apr 2023 22:56:28 -0000 1.135
+++ sys/netinet/tcp_output.c 8 May 2023 22:37:04 -0000
@@ -210,6 +210,7 @@ tcp_output(struct tcpcb *tp)
#ifdef TCP_ECN
int needect;
#endif
+ int tso;
if (tp->t_flags & TF_BLOCKOUTPUT) {
tp->t_flags |= TF_NEEDOUTPUT;
@@ -279,6 +280,7 @@ again:
}
sendalot = 0;
+ tso = 0;
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
@@ -346,8 +348,25 @@ again:
txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
if (len > txmaxseg) {
- len = txmaxseg;
- sendalot = 1;
+ if (tcp_do_tso &&
+ tp->t_inpcb->inp_options == NULL &&
+ tp->t_inpcb->inp_outputopts6 == NULL &&
+#ifdef TCP_SIGNATURE
+ ((tp->t_flags & TF_SIGNATURE) == 0) &&
+#endif
+ len >= 2 * tp->t_maxseg &&
+ tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
+ !(flags & (TH_SYN|TH_RST|TH_FIN))) {
+ tso = 1;
+ /* avoid small chopped packets */
+ if (len > (len / tp->t_maxseg) * tp->t_maxseg) {
+ len = (len / tp->t_maxseg) * tp->t_maxseg;
+ sendalot = 1;
+ }
+ } else {
+ len = txmaxseg;
+ sendalot = 1;
+ }
}
if (off + len < so->so_snd.sb_cc)
flags &= ~TH_FIN;
@@ -365,7 +384,7 @@ again:
* to send into a small window), then must resend.
*/
if (len) {
- if (len == txmaxseg)
+ if (len >= txmaxseg)
goto send;
if ((idle || (tp->t_flags & TF_NODELAY)) &&
len + off >= so->so_snd.sb_cc && !soissending(so) &&
@@ -616,10 +635,19 @@ send:
/*
* Adjust data length if insertion of options will
* bump the packet length beyond the t_maxopd length.
+ * Clear the FIN bit because we cut off the tail of
+ * the segment.
*/
if (len > tp->t_maxopd - optlen) {
- len = tp->t_maxopd - optlen;
- sendalot = 1;
+ if (tso) {
+ if (len + hdrlen + max_linkhdr > MAXMCLBYTES) {
+ len = MAXMCLBYTES - hdrlen - max_linkhdr;
+ sendalot = 1;
+ }
+ } else {
+ len = tp->t_maxopd - optlen;
+ sendalot = 1;
+ }
flags &= ~TH_FIN;
}
@@ -723,6 +751,12 @@ send:
m->m_pkthdr.ph_ifidx = 0;
m->m_pkthdr.len = hdrlen + len;
+ /* Enable TSO and specify the size of the resulting segments. */
+ if (tso) {
+ m->m_pkthdr.csum_flags |= M_TCP_TSO;
+ m->m_pkthdr.ph_mss = tp->t_maxseg;
+ }
+
if (!tp->t_template)
panic("tcp_output");
#ifdef DIAGNOSTIC
@@ -1152,4 +1186,177 @@ tcp_setpersist(struct tcpcb *tp)
TCP_TIMER_ARM(tp, TCPT_PERSIST, msec);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
+}
+
+int
+tcp_chopper(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp,
+ u_int mss)
+{
+ struct ip *ip = NULL;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+#endif
+ struct tcphdr *th;
+ int firstlen, iphlen, hlen, tlen, off;
+ int error;
+
+ ml_init(ml);
+ ml_enqueue(ml, m0);
+
+ ip = mtod(m0, struct ip *);
+ switch (ip->ip_v) {
+ case 4:
+ iphlen = ip->ip_hl << 2;
+ if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF)) ||
+ iphlen != sizeof(struct ip) || ip->ip_p != IPPROTO_TCP) {
+ /* only TCP without fragment or IP option supported */
+ error = EPROTOTYPE;
+ goto bad;
+ }
+ break;
+#ifdef INET6
+ case 6:
+ ip = NULL;
+ ip6 = mtod(m0, struct ip6_hdr *);
+ iphlen = sizeof(struct ip6_hdr);
+ if (ip6->ip6_nxt != IPPROTO_TCP) {
+ /* only TCP without IPv6 header chain supported */
+ error = EPROTOTYPE;
+ goto bad;
+ }
+ break;
+#endif
+ default:
+ panic("%s: unknown ip version %d", __func__, ip->ip_v);
+ }
+
+ tlen = m0->m_pkthdr.len;
+ if (tlen < iphlen + sizeof(struct tcphdr)) {
+ error = ENOPROTOOPT;
+ goto bad;
+ }
+ /* IP and TCP header should be contiguous, this check is paranoia */
+ if (m0->m_len < iphlen + sizeof(*th)) {
+ ml_dequeue(ml);
+ if ((m0 = m_pullup(m0, iphlen + sizeof(*th))) == NULL) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ ml_enqueue(ml, m0);
+ }
+ th = (struct tcphdr *)(mtod(m0, caddr_t) + iphlen);
+ hlen = iphlen + (th->th_off << 2);
+ if (tlen < hlen) {
+ error = ENOPROTOOPT;
+ goto bad;
+ }
+ firstlen = MIN(tlen - hlen, mss);
+
+ CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);
+
+ /*
+ * Loop through length of payload after first segment,
+ * make new header and copy data of each part and link onto chain.
+ */
+ for (off = hlen + firstlen; off < tlen; off += mss) {
+ struct mbuf *m;
+ struct tcphdr *mhth;
+ int len;
+
+ len = MIN(tlen - off, mss);
+
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ ml_enqueue(ml, m);
+ if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0)
+ goto bad;
+
+ /* IP and TCP header to the end, space for link layer header */
+ m->m_len = hlen;
+ m_align(m, hlen);
+
+ /* copy and adjust TCP header */
+ mhth = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);
+ memcpy(mhth, th, hlen - iphlen);
+ mhth->th_seq = htonl(ntohl(th->th_seq) + (off - hlen));
+ if (off + len < tlen)
+ CLR(mhth->th_flags, TH_PUSH|TH_FIN);
+
+ /* add mbuf chain with payload */
+ m->m_pkthdr.len = hlen + len;
+ if ((m->m_next = m_copym(m0, off, len, M_DONTWAIT)) == NULL) {
+ error = ENOBUFS;
+ goto bad;
+ }
+
+ /* copy and adjust IP header, calculate checksum */
+ SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+ mhth->th_sum = 0;
+ if (ip) {
+ struct ip *mhip;
+
+ mhip = mtod(m, struct ip *);
+ *mhip = *ip;
+ mhip->ip_len = htons(hlen + len);
+ mhip->ip_id = htons(ip_randomid());
+ mhip->ip_sum = 0;
+ if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) {
+ m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
+ } else {
+ ipstat_inc(ips_outswcsum);
+ mhip->ip_sum = in_cksum(m, iphlen);
+ }
+ in_proto_cksum_out(m, ifp);
+ }
+#ifdef INET6
+ if (ip6) {
+ struct ip6_hdr *mhip6;
+
+ mhip6 = mtod(m, struct ip6_hdr *);
+ *mhip6 = *ip6;
+ mhip6->ip6_plen = htons(hlen - iphlen + len);
+ in6_proto_cksum_out(m, ifp);
+ }
+#endif
+ }
+
+ /*
+ * Update first segment by trimming what's been copied out
+ * and updating header. The caller sends the list (in order).
+ */
+ if (hlen + firstlen < tlen) {
+ m_adj(m0, hlen + firstlen - tlen);
+ CLR(th->th_flags, TH_PUSH|TH_FIN);
+ }
+ /* adjust IP header, calculate checksum */
+ SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
+ th->th_sum = 0;
+ if (ip) {
+ ip->ip_len = htons(m0->m_pkthdr.len);
+ ip->ip_sum = 0;
+ if (ifp && in_ifcap_cksum(m0, ifp, IFCAP_CSUM_IPv4)) {
+ m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
+ } else {
+ ipstat_inc(ips_outswcsum);
+ ip->ip_sum = in_cksum(m0, iphlen);
+ }
+ in_proto_cksum_out(m0, ifp);
+ }
+#ifdef INET6
+ if (ip6) {
+ ip6->ip6_plen = htons(m0->m_pkthdr.len - iphlen);
+ in6_proto_cksum_out(m0, ifp);
+ }
+#endif
+
+ tcpstat_add(tcps_outpkttso, ml_len(ml));
+ return 0;
+
+ bad:
+ tcpstat_inc(tcps_outbadtso);
+ ml_purge(ml);
+ return error;
}
Index: sys/netinet/tcp_subr.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.190
diff -u -p -r1.190 tcp_subr.c
--- sys/netinet/tcp_subr.c 7 Nov 2022 11:22:55 -0000 1.190
+++ sys/netinet/tcp_subr.c 8 May 2023 22:37:04 -0000
@@ -119,6 +119,7 @@ int tcp_ack_on_push = 0; /* set to enabl
int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? */
#endif
int tcp_do_rfc3390 = 2; /* Increase TCP's Initial Window to 10*mss */
+int tcp_do_tso = 1; /* TCP segmentation offload for output */
#ifndef TCB_INITIAL_HASH_SIZE
#define TCB_INITIAL_HASH_SIZE 128
Index: sys/netinet/tcp_usrreq.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.217
diff -u -p -r1.217 tcp_usrreq.c
--- sys/netinet/tcp_usrreq.c 14 Mar 2023 00:24:05 -0000 1.217
+++ sys/netinet/tcp_usrreq.c 8 May 2023 22:37:04 -0000
@@ -166,6 +166,7 @@ const struct sysctl_bounded_args tcpctl_
{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
+ { TCPCTL_TSO, &tcp_do_tso, 0, 1 },
};
struct inpcbtable tcbtable;
@@ -1335,6 +1336,10 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o
ASSIGN(tcps_sack_rcv_opts);
ASSIGN(tcps_sack_snd_opts);
ASSIGN(tcps_sack_drop_opts);
+ ASSIGN(tcps_outswtso);
+ ASSIGN(tcps_outhwtso);
+ ASSIGN(tcps_outpkttso);
+ ASSIGN(tcps_outbadtso);
#undef ASSIGN
@@ -1494,8 +1499,8 @@ tcp_sysctl(int *name, u_int namelen, voi
default:
NET_LOCK();
- error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
name,
- namelen, oldp, oldlenp, newp, newlen);
+ error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
+ name, namelen, oldp, oldlenp, newp, newlen);
NET_UNLOCK();
return (error);
}
Index: sys/netinet/tcp_var.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.163
diff -u -p -r1.163 tcp_var.h
--- sys/netinet/tcp_var.h 14 Mar 2023 00:24:05 -0000 1.163
+++ sys/netinet/tcp_var.h 8 May 2023 22:37:04 -0000
@@ -442,6 +442,11 @@ struct tcpstat {
u_int64_t tcps_sack_rcv_opts; /* SACK options received */
u_int64_t tcps_sack_snd_opts; /* SACK options sent */
u_int64_t tcps_sack_drop_opts; /* SACK options dropped */
+
+ u_int32_t tcps_outswtso; /* output tso chopped in software */
+ u_int32_t tcps_outhwtso; /* output tso processed by hardware */
+ u_int32_t tcps_outpkttso; /* packets generated by tso */
+ u_int32_t tcps_outbadtso; /* output tso failed, packet dropped */
};
/*
@@ -473,7 +478,8 @@ struct tcpstat {
#define TCPCTL_SYN_USE_LIMIT 23 /* number of uses before reseeding
hash */
#define TCPCTL_ROOTONLY 24 /* return root only port bitmap */
#define TCPCTL_SYN_HASH_SIZE 25 /* number of buckets in the hash */
-#define TCPCTL_MAXID 26
+#define TCPCTL_TSO 26 /* enable TCP segmentation offload */
+#define TCPCTL_MAXID 27
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@@ -500,8 +506,9 @@ struct tcpstat {
{ "stats", CTLTYPE_STRUCT }, \
{ "always_keepalive", CTLTYPE_INT }, \
{ "synuselimit", CTLTYPE_INT }, \
- { "rootonly", CTLTYPE_STRUCT }, \
+ { "rootonly", CTLTYPE_STRUCT }, \
{ "synhashsize", CTLTYPE_INT }, \
+ { "tso", CTLTYPE_INT }, \
}
struct tcp_ident_mapping {
@@ -614,6 +621,10 @@ enum tcpstat_counters {
tcps_sack_rcv_opts,
tcps_sack_snd_opts,
tcps_sack_drop_opts,
+ tcps_outswtso,
+ tcps_outhwtso,
+ tcps_outpkttso,
+ tcps_outbadtso,
tcps_ncounters,
};
@@ -665,6 +676,7 @@ extern struct pool sackhl_pool;
extern int tcp_sackhole_limit; /* max entries for tcp sack queues */
extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */
extern int tcp_do_rfc3390; /* RFC3390 Increasing TCP's Initial Window */
+extern int tcp_do_tso; /* enable TSO for TCP output packets */
extern struct pool tcpqe_pool;
extern int tcp_reass_limit; /* max entries for tcp reass queues */
@@ -706,6 +718,7 @@ struct tcpcb *
tcp_newtcpcb(struct inpcb *, int);
void tcp_notify(struct inpcb *, int);
int tcp_output(struct tcpcb *);
+int tcp_chopper(struct mbuf *, struct mbuf_list *, struct ifnet *, u_int);
void tcp_pulloutofband(struct socket *, u_int, struct mbuf *, int);
int tcp_reass(struct tcpcb *, struct tcphdr *, struct mbuf *, int *);
void tcp_rscale(struct tcpcb *, u_long);
Index: sys/netinet6/ip6_output.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v
retrieving revision 1.274
diff -u -p -r1.274 ip6_output.c
--- sys/netinet6/ip6_output.c 8 May 2023 13:22:13 -0000 1.274
+++ sys/netinet6/ip6_output.c 8 May 2023 22:37:04 -0000
@@ -686,7 +686,9 @@ reroute:
dontfrag = 1;
else
dontfrag = 0;
- if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */
+ if (dontfrag && /* case 2-b */
+ (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) ?
+ m->m_pkthdr.ph_mss : tlen) > ifp->if_mtu) {
#ifdef IPSEC
if (ip_mtudisc)
ipsec_adjust_mtu(m, mtu);
@@ -698,12 +700,22 @@ reroute:
/*
* transmit packet without fragmentation
*/
- if (dontfrag || (tlen <= mtu)) { /* case 1-a and 2-a */
+ if (dontfrag || tlen <= mtu) { /* case 1-a and 2-a */
in6_proto_cksum_out(m, ifp);
error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt);
goto done;
}
+ if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
+ m->m_pkthdr.ph_mss <= mtu) {
+ if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) ||
+ (error = if_output_ml(ifp, &ml, sin6tosa(dst), ro->ro_rt)))
+ goto done;
+ tcpstat_inc(tcps_outswtso);
+ goto done;
+ }
+ CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
+
/*
* try to fragment the packet. case 1-b
*/
@@ -2829,12 +2841,12 @@ int
ip6_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route_in6 *ro,
int tunalready, int fwd)
{
-#if NPF > 0
- struct ifnet *encif;
-#endif
+ struct mbuf_list ml;
+ struct ifnet *encif = NULL;
struct ip6_hdr *ip6;
struct in6_addr dst;
- int error, ifidx, rtableid;
+ u_int len;
+ int error, ifidx, rtableid, tso = 0;
#if NPF > 0
/*
@@ -2854,17 +2866,23 @@ ip6_output_ipsec_send(struct tdb *tdb, s
* Until now the change was not reconsidered.
* What's the behaviour?
*/
- in6_proto_cksum_out(m, encif);
#endif
- /* Check if we are allowed to fragment */
+ /* Check if we can chop the TCP packet */
ip6 = mtod(m, struct ip6_hdr *);
+ if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) &&
+ m->m_pkthdr.ph_mss <= tdb->tdb_mtu) {
+ tso = 1;
+ len = m->m_pkthdr.ph_mss;
+ } else
+ len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
+
+ /* Check if we are allowed to fragment */
dst = ip6->ip6_dst;
ifidx = m->m_pkthdr.ph_ifidx;
rtableid = m->m_pkthdr.ph_rtableid;
if (ip_mtudisc && tdb->tdb_mtu &&
- sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) > tdb->tdb_mtu &&
- tdb->tdb_mtutimeout > gettime()) {
+ len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) {
int transportmode;
transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET6) &&
@@ -2891,14 +2909,33 @@ ip6_output_ipsec_send(struct tdb *tdb, s
*/
m->m_flags &= ~(M_BCAST | M_MCAST);
- /* Callee frees mbuf */
+ if (tso) {
+ error = tcp_chopper(m, &ml, encif, len);
+ if (error)
+ goto done;
+ } else {
+ CLR(m->m_pkthdr.csum_flags, M_TCP_TSO);
+ in6_proto_cksum_out(m, encif);
+ ml_init(&ml);
+ ml_enqueue(&ml, m);
+ }
+
KERNEL_LOCK();
- error = ipsp_process_packet(m, tdb, AF_INET6, tunalready);
+ while ((m = ml_dequeue(&ml)) != NULL) {
+ /* Callee frees mbuf */
+ error = ipsp_process_packet(m, tdb, AF_INET6, tunalready);
+ if (error)
+ break;
+ }
KERNEL_UNLOCK();
+ done:
if (error) {
+ ml_purge(&ml);
ipsecstat_inc(ipsec_odrops);
tdbstat_inc(tdb, tdb_odrops);
}
+ if (!error && tso)
+ tcpstat_inc(tcps_outswtso);
if (ip_mtudisc && error == EMSGSIZE)
ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx, rtableid, 0);
return error;
Index: sys/sys/mbuf.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.256
diff -u -p -r1.256 mbuf.h
--- sys/sys/mbuf.h 5 May 2023 01:19:51 -0000 1.256
+++ sys/sys/mbuf.h 8 May 2023 13:47:48 -0000
@@ -129,12 +129,13 @@ struct pkthdr {
SLIST_HEAD(, m_tag) ph_tags; /* list of packet tags */
int64_t ph_timestamp; /* packet timestamp */
int len; /* total packet length */
+ u_int ph_rtableid; /* routing table id */
+ u_int ph_ifidx; /* rcv interface index */
u_int16_t ph_tagsset; /* mtags attached */
u_int16_t ph_flowid; /* pseudo unique flow id */
u_int16_t csum_flags; /* checksum flags */
u_int16_t ether_vtag; /* Ethernet 802.1p+Q vlan tag */
- u_int ph_rtableid; /* routing table id */
- u_int ph_ifidx; /* rcv interface index */
+ u_int16_t ph_mss; /* TCP max segment size */
u_int8_t ph_loopcnt; /* mbuf is looping in kernel */
u_int8_t ph_family; /* af, used when queueing */
struct pkthdr_pf pf;
@@ -226,6 +227,7 @@ struct mbuf {
#define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6
*/
#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */
#define M_FLOWID 0x4000 /* ph_flowid is set */
+#define M_TCP_TSO 0x8000 /* TCP Segmentation Offload
needed */
#ifdef _KERNEL
#define MCS_BITS \
Index: usr.bin/netstat/inet.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
retrieving revision 1.174
diff -u -p -r1.174 inet.c
--- usr.bin/netstat/inet.c 12 Aug 2022 14:49:15 -0000 1.174
+++ usr.bin/netstat/inet.c 8 May 2023 14:01:00 -0000
@@ -408,6 +408,10 @@ tcp_stats(char *name)
p(tcps_sndwinup, "\t\t%u window update packet%s\n");
p(tcps_sndctrl, "\t\t%u control packet%s\n");
p(tcps_outswcsum, "\t\t%u packet%s software-checksummed\n");
+ p(tcps_outswtso, "\t\t%u output TSO packet%s software chopped\n");
+ p(tcps_outhwtso, "\t\t%u output TSO packet%s hardware processed\n");
+ p(tcps_outpkttso, "\t\t%u output TSO packet%s generated\n");
+ p(tcps_outbadtso, "\t\t%u output TSO packet%s dropped\n");
p(tcps_rcvtotal, "\t%u packet%s received\n");
p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t%u ack%s (for %llu
byte%s)\n");
p(tcps_rcvdupack, "\t\t%u duplicate ack%s\n");