Hi,

This diff implements TCP Segmentation Offload (TSO) for ixl(4). I tested
it successfully on amd64 and sparc64 with an Intel X710. It should bring
TCP bulk performance up to 10 Gbit/s. On sparc64 I saw an increase from
600 Mbit/s to 2 Gbit/s.
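A quick way to check and exercise it, for anyone who wants to reproduce
the numbers (the sysctl knob below is an assumption on my part and may
be named differently depending on how current your tree is):

	# TSOv4/TSOv6 should show up as hardware features
	ifconfig ixl0 hwfeatures

	# make sure the stack-wide TSO switch is enabled
	sysctl net.inet.tcp.tso=1

	# run "tcpbench -s" on the peer, then bulk-send to it
	tcpbench <peer>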
Further testing is welcome.

bye,
Jan

Index: dev/pci/if_ixl.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.89
diff -u -p -r1.89 if_ixl.c
--- dev/pci/if_ixl.c	29 Sep 2023 19:44:47 -0000	1.89
+++ dev/pci/if_ixl.c	18 Oct 2023 15:15:30 -0000
@@ -71,6 +71,7 @@
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/route.h>
 #include <net/toeplitz.h>
 
 #if NBPFILTER > 0
@@ -85,6 +86,8 @@
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/if_ether.h>
 
@@ -827,6 +830,10 @@ struct ixl_tx_desc {
 #define IXL_TX_DESC_BSIZE_MASK		\
 	(IXL_TX_DESC_BSIZE_MAX << IXL_TX_DESC_BSIZE_SHIFT)
 
+#define IXL_TX_CTX_DESC_CMD_TSO		0x10
+#define IXL_TX_CTX_DESC_TLEN_SHIFT	30
+#define IXL_TX_CTX_DESC_MSS_SHIFT	50
+
 #define IXL_TX_DESC_L2TAG1_SHIFT	48
 } __packed __aligned(16);
 
@@ -893,11 +900,19 @@ struct ixl_rx_wb_desc_32 {
 	uint64_t	qword3;
 } __packed __aligned(16);
 
-#define IXL_TX_PKT_DESCS		8
+#define IXL_TX_PKT_DESCS		32
 #define IXL_TX_QUEUE_ALIGN		128
 #define IXL_RX_QUEUE_ALIGN		128
 
 #define IXL_HARDMTU			9712 /* 9726 - ETHER_HDR_LEN */
+#define IXL_TSO_SIZE			((255 * 1024) - 1)
+#define IXL_MAX_DMA_SEG_SIZE		((16 * 1024) - 1)
+
+/*
+ * Our TCP/IP stack cannot handle packets greater than MAXMCLBYTES.
+ * This interface cannot handle packets greater than IXL_TSO_SIZE.
+ */
+CTASSERT(MAXMCLBYTES < IXL_TSO_SIZE);
 
 #define IXL_PCIREG			PCI_MAPREG_START
 
@@ -1958,6 +1973,7 @@ ixl_attach(struct device *parent, struct
 	ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
 	    IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
 	    IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+	ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 
 	ifmedia_init(&sc->sc_media, 0, ixl_media_change,
 	    ixl_media_status);
@@ -2603,7 +2619,7 @@ ixl_txr_alloc(struct ixl_softc *sc, unsi
 		txm = &maps[i];
 
 		if (bus_dmamap_create(sc->sc_dmat,
-		    IXL_HARDMTU, IXL_TX_PKT_DESCS, IXL_HARDMTU, 0,
+		    MAXMCLBYTES, IXL_TX_PKT_DESCS, IXL_MAX_DMA_SEG_SIZE, 0,
 		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
 		    &txm->txm_map) != 0)
 			goto uncreate;
@@ -2787,7 +2803,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 }
 
 static uint64_t
-ixl_tx_setup_offload(struct mbuf *m0)
+ixl_tx_setup_offload(struct mbuf *m0, struct ixl_tx_ring *txr,
+    unsigned int prod)
 {
 	struct ether_extracted ext;
 	uint64_t hlen;
@@ -2800,7 +2817,7 @@ ixl_tx_setup_offload(struct mbuf *m0)
 	}
 
 	if (!ISSET(m0->m_pkthdr.csum_flags,
-	    M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
+	    M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO))
 		return (offload);
 
 	ether_extract_headers(m0, &ext);
@@ -2833,6 +2850,28 @@ ixl_tx_setup_offload(struct mbuf *m0)
 		offload |= (sizeof(*ext.udp) >> 2) << IXL_TX_DESC_L4LEN_SHIFT;
 	}
 
+	if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) {
+		if (ext.tcp) {
+			struct ixl_tx_desc *ring, *txd;
+			uint64_t cmd = 0;
+
+			hlen += ext.tcp->th_off << 2;
+			ring = IXL_DMA_KVA(&txr->txr_mem);
+			txd = &ring[prod];
+
+			cmd |= IXL_TX_DESC_DTYPE_CONTEXT;
+			cmd |= IXL_TX_CTX_DESC_CMD_TSO;
+			cmd |= (uint64_t)(m0->m_pkthdr.len - ETHER_HDR_LEN
+			    - hlen) << IXL_TX_CTX_DESC_TLEN_SHIFT;
+			cmd |= (uint64_t)(m0->m_pkthdr.ph_mss)
+			    << IXL_TX_CTX_DESC_MSS_SHIFT;
+
+			htolem64(&txd->addr, 0);
+			htolem64(&txd->cmd, cmd);
+		} else
+			tcpstat_inc(tcps_outbadtso);
+	}
+
 	return (offload);
 }
 
@@ -2873,7 +2912,8 @@ ixl_start(struct ifqueue *ifq)
 	mask = sc->sc_tx_ring_ndescs - 1;
 
 	for (;;) {
-		if (free <= IXL_TX_PKT_DESCS) {
+		/* We need one extra descriptor for TSO packets. */
+		if (free <= (IXL_TX_PKT_DESCS + 1)) {
 			ifq_set_oactive(ifq);
 			break;
 		}
@@ -2882,10 +2922,16 @@ ixl_start(struct ifqueue *ifq)
 		if (m == NULL)
 			break;
 
-		offload = ixl_tx_setup_offload(m);
+		offload = ixl_tx_setup_offload(m, txr, prod);
 
 		txm = &txr->txr_maps[prod];
 		map = txm->txm_map;
+
+		if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
+			prod++;
+			prod &= mask;
+			free--;
+		}
 
 		if (ixl_load_mbuf(sc->sc_dmat, map, m) != 0) {
 			ifq->ifq_errors++;
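For reviewers without the datasheet at hand: the context descriptor
filled in ixl_tx_setup_offload() above packs all TSO parameters into a
single command quadword. Below is a small stand-alone sketch of that
bit layout, using the shift values from the diff; the DTYPE value 0x1
for context descriptors and the example lengths are my assumptions,
not taken from the diff:

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Values as defined in the diff above. */
	#define IXL_TX_CTX_DESC_CMD_TSO		0x10
	#define IXL_TX_CTX_DESC_TLEN_SHIFT	30
	#define IXL_TX_CTX_DESC_MSS_SHIFT	50

	/* Assumption: context descriptors use DTYPE 0x1 in the low nibble. */
	#define IXL_TX_DESC_DTYPE_CONTEXT	(0x1ULL << 0)

	int
	main(void)
	{
		uint64_t pktlen = 64240;	/* example: total frame length */
		uint64_t hlen = 14 + 20 + 20;	/* example: ether + ip + tcp */
		uint64_t mss = 1460;		/* example: payload per segment */
		uint64_t cmd = 0;

		cmd |= IXL_TX_DESC_DTYPE_CONTEXT;
		cmd |= IXL_TX_CTX_DESC_CMD_TSO;
		/* TLEN: the TCP payload, i.e. everything behind the headers. */
		cmd |= (pktlen - hlen) << IXL_TX_CTX_DESC_TLEN_SHIFT;
		/* MSS: how much payload the hardware puts into each segment. */
		cmd |= mss << IXL_TX_CTX_DESC_MSS_SHIFT;

		printf("ctx qword: 0x%016" PRIx64 "\n", cmd);
		return (0);
	}

This is also why ixl_start() now reserves one extra slot per packet: a
TSO packet consumes this context descriptor in addition to its data
descriptors.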