On Wed, Oct 18, 2023 at 05:29:41PM +0200, Jan Klemkow wrote:
> This diff implements TCP Segmentation Offloading for ixl(4). I tested
> it successfully on amd64 and sparc64 with Intel X710. It should
> increase the TCP bulk performance to 10 Gbit/s. On sparc64 I got an
> increase from 600 Mbit/s to 2 Gbit/s.
>
> Further testing is welcome.
tested on amd64
OK bluhm@
> Index: dev/pci/if_ixl.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
> retrieving revision 1.89
> diff -u -p -r1.89 if_ixl.c
> --- dev/pci/if_ixl.c 29 Sep 2023 19:44:47 -0000 1.89
> +++ dev/pci/if_ixl.c 18 Oct 2023 15:15:30 -0000
> @@ -71,6 +71,7 @@
> #include <net/if.h>
> #include <net/if_dl.h>
> #include <net/if_media.h>
> +#include <net/route.h>
> #include <net/toeplitz.h>
>
> #if NBPFILTER > 0
> @@ -85,6 +86,8 @@
> #include <netinet/ip.h>
> #include <netinet/ip6.h>
> #include <netinet/tcp.h>
> +#include <netinet/tcp_timer.h>
> +#include <netinet/tcp_var.h>
> #include <netinet/udp.h>
> #include <netinet/if_ether.h>
>
> @@ -827,6 +830,10 @@ struct ixl_tx_desc {
> #define IXL_TX_DESC_BSIZE_MASK \
> (IXL_TX_DESC_BSIZE_MAX << IXL_TX_DESC_BSIZE_SHIFT)
>
> +#define IXL_TX_CTX_DESC_CMD_TSO 0x10
> +#define IXL_TX_CTX_DESC_TLEN_SHIFT 30
> +#define IXL_TX_CTX_DESC_MSS_SHIFT 50
> +
> #define IXL_TX_DESC_L2TAG1_SHIFT 48
> } __packed __aligned(16);
>
> @@ -893,11 +900,19 @@ struct ixl_rx_wb_desc_32 {
> uint64_t qword3;
> } __packed __aligned(16);
>
> -#define IXL_TX_PKT_DESCS 8
> +#define IXL_TX_PKT_DESCS 32
> #define IXL_TX_QUEUE_ALIGN 128
> #define IXL_RX_QUEUE_ALIGN 128
>
> #define IXL_HARDMTU 9712 /* 9726 - ETHER_HDR_LEN */
> +#define IXL_TSO_SIZE ((255 * 1024) - 1)
> +#define IXL_MAX_DMA_SEG_SIZE ((16 * 1024) - 1)
> +
> +/*
> + * Our TCP/IP stack cannot handle packets larger than MAXMCLBYTES.
> + * This interface cannot handle packets larger than IXL_TSO_SIZE.
> + */
> +CTASSERT(MAXMCLBYTES < IXL_TSO_SIZE);
>
> #define IXL_PCIREG PCI_MAPREG_START
>
> @@ -1958,6 +1973,7 @@ ixl_attach(struct device *parent, struct
> ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
> IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
> IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
> + ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
>
> ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
>
> @@ -2603,7 +2619,7 @@ ixl_txr_alloc(struct ixl_softc *sc, unsi
> txm = &maps[i];
>
> if (bus_dmamap_create(sc->sc_dmat,
> - IXL_HARDMTU, IXL_TX_PKT_DESCS, IXL_HARDMTU, 0,
> + MAXMCLBYTES, IXL_TX_PKT_DESCS, IXL_MAX_DMA_SEG_SIZE, 0,
> BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
> &txm->txm_map) != 0)
> goto uncreate;
> @@ -2787,7 +2803,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
> }
>
> static uint64_t
> -ixl_tx_setup_offload(struct mbuf *m0)
> +ixl_tx_setup_offload(struct mbuf *m0, struct ixl_tx_ring *txr,
> + unsigned int prod)
> {
> struct ether_extracted ext;
> uint64_t hlen;
> @@ -2800,7 +2817,7 @@ ixl_tx_setup_offload(struct mbuf *m0)
> }
>
> if (!ISSET(m0->m_pkthdr.csum_flags,
> - M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
> + M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO))
> return (offload);
>
> ether_extract_headers(m0, &ext);
> @@ -2833,6 +2850,28 @@ ixl_tx_setup_offload(struct mbuf *m0)
> offload |= (sizeof(*ext.udp) >> 2) << IXL_TX_DESC_L4LEN_SHIFT;
> }
>
> + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) {
> + if (ext.tcp) {
> + struct ixl_tx_desc *ring, *txd;
> + uint64_t cmd = 0;
> +
> + hlen += ext.tcp->th_off << 2;
> + ring = IXL_DMA_KVA(&txr->txr_mem);
> + txd = &ring[prod];
> +
> + cmd |= IXL_TX_DESC_DTYPE_CONTEXT;
> + cmd |= IXL_TX_CTX_DESC_CMD_TSO;
> + cmd |= (uint64_t)(m0->m_pkthdr.len - ETHER_HDR_LEN
> + - hlen) << IXL_TX_CTX_DESC_TLEN_SHIFT;
> + cmd |= (uint64_t)(m0->m_pkthdr.ph_mss)
> + << IXL_TX_CTX_DESC_MSS_SHIFT;
> +
> + htolem64(&txd->addr, 0);
> + htolem64(&txd->cmd, cmd);
> + } else
> + tcpstat_inc(tcps_outbadtso);
> + }
> +
> return (offload);
> }
>
> @@ -2873,7 +2912,8 @@ ixl_start(struct ifqueue *ifq)
> mask = sc->sc_tx_ring_ndescs - 1;
>
> for (;;) {
> - if (free <= IXL_TX_PKT_DESCS) {
> + /* We need one extra descriptor for TSO packets. */
> + if (free <= (IXL_TX_PKT_DESCS + 1)) {
> ifq_set_oactive(ifq);
> break;
> }
> @@ -2882,10 +2922,16 @@ ixl_start(struct ifqueue *ifq)
> if (m == NULL)
> break;
>
> - offload = ixl_tx_setup_offload(m);
> + offload = ixl_tx_setup_offload(m, txr, prod);
>
> txm = &txr->txr_maps[prod];
> map = txm->txm_map;
> +
> + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
> + prod++;
> + prod &= mask;
> + free--;
> + }
>
> if (ixl_load_mbuf(sc->sc_dmat, map, m) != 0) {
> ifq->ifq_errors++;