Hi,

This diff implements TCP Segmentation Offloading for ixl(4).  I tested
it successfully on amd64 and sparc64 with Intel X710.  It should
increase the TCP bulk performance to 10 Gbit/s.  On sparc64 I got an
increase from 600 Mbit/s to 2 Gbit/s.

Further testing is welcome.

bye,
Jan

Index: dev/pci/if_ixl.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v
retrieving revision 1.89
diff -u -p -r1.89 if_ixl.c
--- dev/pci/if_ixl.c    29 Sep 2023 19:44:47 -0000      1.89
+++ dev/pci/if_ixl.c    18 Oct 2023 15:15:30 -0000
@@ -71,6 +71,7 @@
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/route.h>
 #include <net/toeplitz.h>
 
 #if NBPFILTER > 0
@@ -85,6 +86,8 @@
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/if_ether.h>
 
@@ -827,6 +830,10 @@ struct ixl_tx_desc {
 #define IXL_TX_DESC_BSIZE_MASK         \
        (IXL_TX_DESC_BSIZE_MAX << IXL_TX_DESC_BSIZE_SHIFT)
 
+#define IXL_TX_CTX_DESC_CMD_TSO                0x10
+#define IXL_TX_CTX_DESC_TLEN_SHIFT     30
+#define IXL_TX_CTX_DESC_MSS_SHIFT      50
+
 #define IXL_TX_DESC_L2TAG1_SHIFT       48
 } __packed __aligned(16);
 
@@ -893,11 +900,19 @@ struct ixl_rx_wb_desc_32 {
        uint64_t                qword3;
 } __packed __aligned(16);
 
-#define IXL_TX_PKT_DESCS               8
+#define IXL_TX_PKT_DESCS               32
 #define IXL_TX_QUEUE_ALIGN             128
 #define IXL_RX_QUEUE_ALIGN             128
 
 #define IXL_HARDMTU                    9712 /* 9726 - ETHER_HDR_LEN */
+#define IXL_TSO_SIZE                   ((255 * 1024) - 1)
+#define IXL_MAX_DMA_SEG_SIZE           ((16 * 1024) - 1)
+
+/*
+ * Our TCP/IP stack cannot handle packets greater than MAXMCLBYTES.
+ * This interface cannot handle packets greater than IXL_TSO_SIZE.
+ */
+CTASSERT(MAXMCLBYTES < IXL_TSO_SIZE);
 
 #define IXL_PCIREG                     PCI_MAPREG_START
 
@@ -1958,6 +1973,7 @@ ixl_attach(struct device *parent, struct
        ifp->if_capabilities |= IFCAP_CSUM_IPv4 |
            IFCAP_CSUM_TCPv4 | IFCAP_CSUM_UDPv4 |
            IFCAP_CSUM_TCPv6 | IFCAP_CSUM_UDPv6;
+       ifp->if_capabilities |= IFCAP_TSOv4 | IFCAP_TSOv6;
 
        ifmedia_init(&sc->sc_media, 0, ixl_media_change, ixl_media_status);
 
@@ -2603,7 +2619,7 @@ ixl_txr_alloc(struct ixl_softc *sc, unsi
                txm = &maps[i];
 
                if (bus_dmamap_create(sc->sc_dmat,
-                   IXL_HARDMTU, IXL_TX_PKT_DESCS, IXL_HARDMTU, 0,
+                   MAXMCLBYTES, IXL_TX_PKT_DESCS, IXL_MAX_DMA_SEG_SIZE, 0,
                    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
                    &txm->txm_map) != 0)
                        goto uncreate;
@@ -2787,7 +2803,8 @@ ixl_load_mbuf(bus_dma_tag_t dmat, bus_dm
 }
 
 static uint64_t
-ixl_tx_setup_offload(struct mbuf *m0)
+ixl_tx_setup_offload(struct mbuf *m0, struct ixl_tx_ring *txr,
+    unsigned int prod)
 {
        struct ether_extracted ext;
        uint64_t hlen;
@@ -2800,7 +2817,7 @@ ixl_tx_setup_offload(struct mbuf *m0)
        }
 
        if (!ISSET(m0->m_pkthdr.csum_flags,
-           M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT))
+           M_IPV4_CSUM_OUT|M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_TCP_TSO))
                return (offload);
 
        ether_extract_headers(m0, &ext);
@@ -2833,6 +2850,28 @@ ixl_tx_setup_offload(struct mbuf *m0)
                offload |= (sizeof(*ext.udp) >> 2) << IXL_TX_DESC_L4LEN_SHIFT;
        }
 
+       if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) {
+               if (ext.tcp) {
+                       struct ixl_tx_desc *ring, *txd;
+                       uint64_t cmd = 0;
+
+                       hlen += ext.tcp->th_off << 2;
+                       ring = IXL_DMA_KVA(&txr->txr_mem);
+                       txd = &ring[prod];
+
+                       cmd |= IXL_TX_DESC_DTYPE_CONTEXT;
+                       cmd |= IXL_TX_CTX_DESC_CMD_TSO;
+                       cmd |= (uint64_t)(m0->m_pkthdr.len - ETHER_HDR_LEN
+                           - hlen) << IXL_TX_CTX_DESC_TLEN_SHIFT;
+                       cmd |= (uint64_t)(m0->m_pkthdr.ph_mss)
+                           << IXL_TX_CTX_DESC_MSS_SHIFT;
+
+                       htolem64(&txd->addr, 0);
+                       htolem64(&txd->cmd, cmd);
+               } else
+                       tcpstat_inc(tcps_outbadtso);
+       }
+
        return (offload);
 }
 
@@ -2873,7 +2912,8 @@ ixl_start(struct ifqueue *ifq)
        mask = sc->sc_tx_ring_ndescs - 1;
 
        for (;;) {
-               if (free <= IXL_TX_PKT_DESCS) {
+               /* We need one extra descriptor for TSO packets. */
+               if (free <= (IXL_TX_PKT_DESCS + 1)) {
                        ifq_set_oactive(ifq);
                        break;
                }
@@ -2882,10 +2922,16 @@ ixl_start(struct ifqueue *ifq)
                if (m == NULL)
                        break;
 
-               offload = ixl_tx_setup_offload(m);
+               offload = ixl_tx_setup_offload(m, txr, prod);
 
                txm = &txr->txr_maps[prod];
                map = txm->txm_map;
+
+               if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) {
+                       prod++;
+                       prod &= mask;
+                       free--;
+               }
 
                if (ixl_load_mbuf(sc->sc_dmat, map, m) != 0) {
                        ifq->ifq_errors++;

Reply via email to