Author: np
Date: Fri Sep 18 03:01:47 2020
New Revision: 365871
URL: https://svnweb.freebsd.org/changeset/base/365871

Log:
  cxgbe(4): add support for stateless offloads for VXLAN traffic.
  
  Hardware assistance includes checksumming (tx and rx), TSO, and RSS on
  the inner traffic in a VXLAN tunnel.
  
  Relnotes:     Yes
  Sponsored by: Chelsio Communications

Modified:
  head/share/man/man4/cxgbe.4
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/common/common.h
  head/sys/dev/cxgbe/common/t4_hw.c
  head/sys/dev/cxgbe/firmware/t6fw_cfg.txt
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sge.c

Modified: head/share/man/man4/cxgbe.4
==============================================================================
--- head/share/man/man4/cxgbe.4 Fri Sep 18 02:37:57 2020        (r365870)
+++ head/share/man/man4/cxgbe.4 Fri Sep 18 03:01:47 2020        (r365871)
@@ -31,7 +31,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd Dec 10, 2019
+.Dd September 17, 2020
 .Dt CXGBE 4
 .Os
 .Sh NAME
@@ -61,8 +61,8 @@ driver provides support for PCI Express Ethernet adapt
 the Chelsio Terminator 4, Terminator 5, and Terminator 6 ASICs (T4, T5, and T6).
 The driver supports Jumbo Frames, Transmit/Receive checksum offload,
 TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN
-tag insertion/extraction, VLAN checksum offload, VLAN TSO, and
-Receive Side Steering (RSS).
+tag insertion/extraction, VLAN checksum offload, VLAN TSO, VXLAN checksum
+offload, VXLAN TSO, and Receive Side Steering (RSS).
 For further hardware information and questions related to hardware
 requirements, see
 .Pa http://www.chelsio.com/ .

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h        Fri Sep 18 02:37:57 2020        (r365870)
+++ head/sys/dev/cxgbe/adapter.h        Fri Sep 18 03:01:47 2020        (r365871)
@@ -119,6 +119,7 @@ enum {
        TX_SGL_SEGS = 39,
        TX_SGL_SEGS_TSO = 38,
        TX_SGL_SEGS_EO_TSO = 30,        /* XXX: lower for IPv6. */
+       TX_SGL_SEGS_VXLAN_TSO = 37,
        TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
@@ -286,6 +287,7 @@ struct port_info {
        int nvi;
        int up_vis;
        int uld_vis;
+       bool vxlan_tcam_entry;
 
        struct tx_sched_params *sched_params;
 
@@ -593,6 +595,8 @@ struct sge_txq {
        uint64_t txpkts0_pkts;  /* # of frames in type0 coalesced tx WRs */
        uint64_t txpkts1_pkts;  /* # of frames in type1 coalesced tx WRs */
        uint64_t raw_wrs;       /* # of raw work requests (alloc_wr_mbuf) */
+       uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */
+       uint64_t vxlan_txcsum;
 
        uint64_t kern_tls_records;
        uint64_t kern_tls_short;
@@ -625,6 +629,7 @@ struct sge_rxq {
 
        uint64_t rxcsum;        /* # of times hardware assisted with checksum */
        uint64_t vlan_extraction;/* # of times VLAN tag was extracted */
+       uint64_t vxlan_rxcsum;
 
        /* stats for not-that-common events */
 
@@ -847,6 +852,11 @@ struct adapter {
        struct sge sge;
        int lro_timeout;
        int sc_do_rxcopy;
+
+       int vxlan_port;
+       u_int vxlan_refcount;
+       int rawf_base;
+       int nrawf;
 
        struct taskqueue *tq[MAX_NCHAN];        /* General purpose taskqueues */
        struct task async_event_task;

Modified: head/sys/dev/cxgbe/common/common.h
==============================================================================
--- head/sys/dev/cxgbe/common/common.h  Fri Sep 18 02:37:57 2020        (r365870)
+++ head/sys/dev/cxgbe/common/common.h  Fri Sep 18 03:01:47 2020        (r365871)
@@ -249,7 +249,7 @@ struct tp_params {
        uint32_t max_rx_pdu;
        uint32_t max_tx_pdu;
        uint64_t hash_filter_mask;
-       __be16 err_vec_mask;
+       bool rx_pkt_encap;
 
        int8_t fcoe_shift;
        int8_t port_shift;

Modified: head/sys/dev/cxgbe/common/t4_hw.c
==============================================================================
--- head/sys/dev/cxgbe/common/t4_hw.c   Fri Sep 18 02:37:57 2020        (r365870)
+++ head/sys/dev/cxgbe/common/t4_hw.c   Fri Sep 18 03:01:47 2020        (r365871)
@@ -9647,19 +9647,11 @@ int t4_init_tp_params(struct adapter *adap, bool sleep
 
        read_filter_mode_and_ingress_config(adap, sleep_ok);
 
-       /*
-        * Cache a mask of the bits that represent the error vector portion of
-        * rx_pkt.err_vec.  T6+ can use a compressed error vector to make room
-        * for information about outer encapsulation (GENEVE/VXLAN/NVGRE).
-        */
-       tpp->err_vec_mask = htobe16(0xffff);
        if (chip_id(adap) > CHELSIO_T5) {
                v = t4_read_reg(adap, A_TP_OUT_CONFIG);
-               if (v & F_CRXPKTENC) {
-                       tpp->err_vec_mask =
-                           htobe16(V_T6_COMPR_RXERR_VEC(M_T6_COMPR_RXERR_VEC));
-               }
-       }
+               tpp->rx_pkt_encap = v & F_CRXPKTENC;
+       } else
+               tpp->rx_pkt_encap = false;
 
        rx_len = t4_read_reg(adap, A_TP_PMM_RX_PAGE_SIZE);
        tx_len = t4_read_reg(adap, A_TP_PMM_TX_PAGE_SIZE);

Modified: head/sys/dev/cxgbe/firmware/t6fw_cfg.txt
==============================================================================
--- head/sys/dev/cxgbe/firmware/t6fw_cfg.txt    Fri Sep 18 02:37:57 2020        (r365870)
+++ head/sys/dev/cxgbe/firmware/t6fw_cfg.txt    Fri Sep 18 03:01:47 2020        (r365871)
@@ -146,7 +146,8 @@
        nethctrl = 1024
        neq = 2048
        nqpcq = 8192
-       nexactf = 456
+       nexactf = 454
+       nrawf = 2
        cmask = all
        pmask = all
        ncrypto_lookaside = 16
@@ -272,7 +273,7 @@
 
 [fini]
        version = 0x1
-       checksum = 0x13640470
+       checksum = 0xa92352a8
 #
 # $FreeBSD$
 #

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c        Fri Sep 18 02:37:57 2020        (r365870)
+++ head/sys/dev/cxgbe/t4_main.c        Fri Sep 18 03:01:47 2020        (r365871)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
+#include <sys/eventhandler.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
@@ -1069,6 +1070,8 @@ t4_attach(device_t dev)
        TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
 #endif
 
+       refcount_init(&sc->vxlan_refcount, 0);
+
        rc = t4_map_bars_0_and_4(sc);
        if (rc != 0)
                goto done; /* error message displayed already */
@@ -1716,6 +1719,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
        struct ifnet *ifp;
        struct sbuf *sb;
        struct pfil_head_args pa;
+       struct adapter *sc = vi->adapter;
 
        vi->xact_addr_filt = -1;
        callout_init(&vi->tick, 1);
@@ -1749,28 +1753,36 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 
        ifp->if_capabilities = T4_CAP;
        ifp->if_capenable = T4_CAP_ENABLE;
+       ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
+           CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
+       if (chip_id(sc) >= CHELSIO_T6) {
+               ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+               ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+               ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
+                   CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+                   CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN;
+       }
+
 #ifdef TCP_OFFLOAD
-       if (vi->nofldrxq != 0 && (vi->adapter->flags & KERN_TLS_OK) == 0)
+       if (vi->nofldrxq != 0 && (sc->flags & KERN_TLS_OK) == 0)
                ifp->if_capabilities |= IFCAP_TOE;
 #endif
 #ifdef RATELIMIT
-       if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0) {
+       if (is_ethoffload(sc) && vi->nofldtxq != 0) {
                ifp->if_capabilities |= IFCAP_TXRTLMT;
                ifp->if_capenable |= IFCAP_TXRTLMT;
        }
 #endif
-       ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
-           CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
 
        ifp->if_hw_tsomax = IP_MAXPACKET;
        ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
 #ifdef RATELIMIT
-       if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0)
+       if (is_ethoffload(sc) && vi->nofldtxq != 0)
                ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
 #endif
        ifp->if_hw_tsomaxsegsize = 65536;
 #ifdef KERN_TLS
-       if (vi->adapter->flags & KERN_TLS_OK) {
+       if (sc->flags & KERN_TLS_OK) {
                ifp->if_capabilities |= IFCAP_TXTLS;
                ifp->if_capenable |= IFCAP_TXTLS;
        }
@@ -2100,6 +2112,17 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, cadd
                if (mask & IFCAP_TXTLS)
                        ifp->if_capenable ^= (mask & IFCAP_TXTLS);
 #endif
+               if (mask & IFCAP_VXLAN_HWCSUM) {
+                       ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM;
+                       ifp->if_hwassist ^= CSUM_INNER_IP6_UDP |
+                           CSUM_INNER_IP6_TCP | CSUM_INNER_IP |
+                           CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP;
+               }
+               if (mask & IFCAP_VXLAN_HWTSO) {
+                       ifp->if_capenable ^= IFCAP_VXLAN_HWTSO;
+                       ifp->if_hwassist ^= CSUM_INNER_IP6_TSO |
+                           CSUM_INNER_IP_TSO;
+               }
 
 #ifdef VLAN_CAPABILITIES
                VLAN_CAPABILITIES(ifp);
@@ -4411,6 +4434,19 @@ get_params__post_init(struct adapter *sc)
                        MPASS(sc->tids.hpftid_base == 0);
                        MPASS(sc->tids.tid_base == sc->tids.nhpftids);
                }
+
+               param[0] = FW_PARAM_PFVF(RAWF_START);
+               param[1] = FW_PARAM_PFVF(RAWF_END);
+               rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
+               if (rc != 0) {
+                       device_printf(sc->dev,
+                          "failed to query rawf parameters: %d.\n", rc);
+                       return (rc);
+               }
+               if ((int)val[1] > (int)val[0]) {
+                       sc->rawf_base = val[0];
+                       sc->nrawf = val[1] - val[0] + 1;
+               }
        }
 
        /*
@@ -5142,6 +5178,7 @@ update_mac_settings(struct ifnet *ifp, int flags)
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;
        int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
+       uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
 
        ASSERT_SYNCHRONIZED_OP(sc);
        KASSERT(flags, ("%s: not told what to update.", __func__));
@@ -5215,7 +5252,7 @@ update_mac_settings(struct ifnet *ifp, int flags)
                                rc = -rc;
                                for (j = 0; j < ctx.i; j++) {
                                        if_printf(ifp,
-                                           "failed to add mc address"
+                                           "failed to add mcast address"
                                            " %02x:%02x:%02x:"
                                            "%02x:%02x:%02x rc=%d\n",
                                            ctx.mcaddr[j][0], ctx.mcaddr[j][1],
@@ -5225,14 +5262,36 @@ update_mac_settings(struct ifnet *ifp, int flags)
                                }
                                return (rc);
                        }
+                       ctx.del = 0;
                } else
                        NET_EPOCH_EXIT(et);
 
                rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0);
                if (rc != 0)
-                       if_printf(ifp, "failed to set mc address hash: %d", rc);
+                       if_printf(ifp, "failed to set mcast address hash: %d\n",
+                           rc);
+               if (ctx.del == 0) {
+                       /* We clobbered the VXLAN entry if there was one. */
+                       pi->vxlan_tcam_entry = false;
+               }
        }
 
+       if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 &&
+           pi->vxlan_tcam_entry == false) {
+               rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac,
+                   match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id,
+                   true);
+               if (rc < 0) {
+                       rc = -rc;
+                       if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n",
+                           rc);
+               } else {
+                       MPASS(rc == sc->rawf_base + pi->port_id);
+                       rc = 0;
+                       pi->vxlan_tcam_entry = true;
+               }
+       }
+
        return (rc);
 }
 
@@ -10407,6 +10466,7 @@ clear_stats(struct adapter *sc, u_int port_id)
 #endif
                                rxq->rxcsum = 0;
                                rxq->vlan_extraction = 0;
+                               rxq->vxlan_rxcsum = 0;
 
                                rxq->fl.cl_allocated = 0;
                                rxq->fl.cl_recycled = 0;
@@ -10425,6 +10485,8 @@ clear_stats(struct adapter *sc, u_int port_id)
                                txq->txpkts0_pkts = 0;
                                txq->txpkts1_pkts = 0;
                                txq->raw_wrs = 0;
+                               txq->vxlan_tso_wrs = 0;
+                               txq->vxlan_txcsum = 0;
                                txq->kern_tls_records = 0;
                                txq->kern_tls_short = 0;
                                txq->kern_tls_partial = 0;
@@ -11235,6 +11297,116 @@ DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL)
 }
 #endif
 
+static eventhandler_tag vxlan_start_evtag;
+static eventhandler_tag vxlan_stop_evtag;
+
+struct vxlan_evargs {  /* arguments of a vxlan start/stop event, handed to the t4_iterate() callbacks */
+       struct ifnet *ifp;  /* ifnet supplied with the event */
+       uint16_t port;  /* port supplied with the event (u_int narrowed to 16 bits) */
+};
+
+static void  /* t4_iterate() callback run by the vxlan_start handler */
+t4_vxlan_start(struct adapter *sc, void *arg)  /* arg is a struct vxlan_evargs */
+{
+       struct vxlan_evargs *v = arg;
+       struct port_info *pi;
+       uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};  /* all zeroes; passed twice to t4_alloc_raw_mac_filt() */
+       int i, rc;
+
+       if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)  /* needs raw filters and a chip newer than T5 */
+               return;
+       if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0)
+               return;  /* could not begin the synchronized op; give up silently */
+
+       if (sc->vxlan_refcount == 0) {  /* first start seen by this adapter */
+               sc->vxlan_port = v->port;
+               sc->vxlan_refcount = 1;
+               t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE,
+                   V_VXLAN(v->port) | F_VXLAN_EN);  /* program the port and set the enable bit */
+               for_each_port(sc, i) {
+                       pi = sc->port[i];
+                       if (pi->vxlan_tcam_entry == true)  /* this port already has its entry */
+                               continue;
+                       rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid,
+                           match_all_mac, match_all_mac,
+                           sc->rawf_base + pi->port_id, 1, pi->port_id, true);  /* index is rawf_base + port_id */
+                       if (rc < 0) {  /* failure is logged; the loop moves on to the next port */
+                               rc = -rc;
+                               log(LOG_ERR,
+                                   "%s: failed to add VXLAN TCAM entry: %d.\n",
+                                   device_get_name(pi->vi[0].dev), rc);
+                       } else {
+                               MPASS(rc == sc->rawf_base + pi->port_id);  /* success returns the requested index */
+                               rc = 0;
+                               pi->vxlan_tcam_entry = true;
+                       }
+               }
+       } else if (sc->vxlan_port == v->port) {  /* same port again: only count the reference */
+               sc->vxlan_refcount++;
+       } else {  /* a different port is already configured: log and ignore */
+               log(LOG_ERR, "%s: VXLAN already configured on port  %d; "
+                   "ignoring attempt to configure it on port %d\n",
+                   device_get_nameunit(sc->dev), sc->vxlan_port, v->port);
+       }
+       end_synchronized_op(sc, 0);
+}
+
+static void  /* t4_iterate() callback run by the vxlan_stop handler */
+t4_vxlan_stop(struct adapter *sc, void *arg)  /* arg is a struct vxlan_evargs */
+{
+       struct vxlan_evargs *v = arg;
+
+       if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)  /* same precondition as t4_vxlan_start */
+               return;
+       if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0)
+               return;  /* could not begin the synchronized op; give up silently */
+
+       /*
+        * VXLANs may have been configured before the driver was loaded so we
+        * may see more stops than starts.  This is not handled cleanly but at
+        * least we keep the refcount sane.
+        */
+       if (sc->vxlan_port != v->port)  /* stop is for a port other than the recorded one */
+               goto done;
+       if (sc->vxlan_refcount == 0) {  /* extra stop: log it, never decrement below zero */
+               log(LOG_ERR,
+                   "%s: VXLAN operation on port %d was stopped earlier; "
+                   "ignoring attempt to stop it again.\n",
+                   device_get_nameunit(sc->dev), sc->vxlan_port);
+       } else if (--sc->vxlan_refcount == 0) {  /* last reference dropped */
+               t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE, F_VXLAN_EN, 0);  /* clear only the enable bit */
+       }
+done:
+       end_synchronized_op(sc, 0);
+}
+
+static void  /* handler registered for the vxlan_start event */
+t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp,
+    sa_family_t family, u_int port)
+{
+       struct vxlan_evargs v;
+
+       MPASS(family == AF_INET || family == AF_INET6);  /* family is only checked, not forwarded */
+       v.ifp = ifp;
+       v.port = port;  /* u_int stored into a uint16_t field */
+
+       t4_iterate(t4_vxlan_start, &v);  /* v lives on this stack frame for the duration of the call */
+}
+
+static void  /* handler registered for the vxlan_stop event */
+t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family,
+    u_int port)
+{
+       struct vxlan_evargs v;
+
+       MPASS(family == AF_INET || family == AF_INET6);  /* family is only checked, not forwarded */
+       v.ifp = ifp;
+       v.port = port;  /* u_int stored into a uint16_t field */
+
+       t4_iterate(t4_vxlan_stop, &v);  /* v lives on this stack frame for the duration of the call */
+}
+
+
 static struct sx mlu;  /* mod load unload */
 SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
 
@@ -11278,6 +11450,14 @@ mod_event(module_t mod, int cmd, void *arg)
 #endif
                        t4_tracer_modload();
                        tweak_tunables();
+                       vxlan_start_evtag =
+                           EVENTHANDLER_REGISTER(vxlan_start,
+                               t4_vxlan_start_handler, NULL,
+                               EVENTHANDLER_PRI_ANY);
+                       vxlan_stop_evtag =
+                           EVENTHANDLER_REGISTER(vxlan_stop,
+                               t4_vxlan_stop_handler, NULL,
+                               EVENTHANDLER_PRI_ANY);
                }
                sx_xunlock(&mlu);
                break;
@@ -11314,6 +11494,10 @@ mod_event(module_t mod, int cmd, void *arg)
                        sx_sunlock(&t4_list_lock);
 
                        if (t4_sge_extfree_refs() == 0) {
+                               EVENTHANDLER_DEREGISTER(vxlan_start,
+                                   vxlan_start_evtag);
+                               EVENTHANDLER_DEREGISTER(vxlan_stop,
+                                   vxlan_stop_evtag);
                                t4_tracer_modunload();
 #ifdef KERN_TLS
                                t6_ktls_modunload();

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c Fri Sep 18 02:37:57 2020        (r365870)
+++ head/sys/dev/cxgbe/t4_sge.c Fri Sep 18 03:01:47 2020        (r365871)
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
+#include <net/if_vxlan.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
@@ -266,8 +267,9 @@ static int find_refill_source(struct adapter *, int, b
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
-static inline u_int txpkt_len16(u_int, u_int);
-static inline u_int txpkt_vm_len16(u_int, u_int);
+static inline u_int txpkt_len16(u_int, const u_int);
+static inline u_int txpkt_vm_len16(u_int, const u_int);
+static inline void calculate_mbuf_len16(struct adapter *, struct mbuf *);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
@@ -1917,13 +1919,42 @@ eth_rx(struct adapter *sc, struct sge_rxq *rxq, const 
 #if defined(INET) || defined(INET6)
        struct lro_ctrl *lro = &rxq->lro;
 #endif
+       uint16_t err_vec, tnl_type, tnlhdr_len;
        static const int sw_hashtype[4][2] = {
                {M_HASHTYPE_NONE, M_HASHTYPE_NONE},
                {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
                {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
                {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
        };
+       static const int sw_csum_flags[2][2] = {
+               {
+                       /* IP, inner IP */
+                       CSUM_ENCAP_VXLAN |
+                           CSUM_L3_CALC | CSUM_L3_VALID |
+                           CSUM_L4_CALC | CSUM_L4_VALID |
+                           CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+                           CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
 
+                       /* IP, inner IP6 */
+                       CSUM_ENCAP_VXLAN |
+                           CSUM_L3_CALC | CSUM_L3_VALID |
+                           CSUM_L4_CALC | CSUM_L4_VALID |
+                           CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+               },
+               {
+                       /* IP6, inner IP */
+                       CSUM_ENCAP_VXLAN |
+                           CSUM_L4_CALC | CSUM_L4_VALID |
+                           CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+                           CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+
+                       /* IP6, inner IP6 */
+                       CSUM_ENCAP_VXLAN |
+                           CSUM_L4_CALC | CSUM_L4_VALID |
+                           CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+               },
+       };
+
        MPASS(plen > sc->params.sge.fl_pktshift);
        if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
            __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
@@ -1963,23 +1994,73 @@ have_mbuf:
        m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
 
        cpl = (const void *)(&d->rss + 1);
-       if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
-               if (ifp->if_capenable & IFCAP_RXCSUM &&
-                   cpl->l2info & htobe32(F_RXF_IP)) {
-                       m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
-                           CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+       if (sc->params.tp.rx_pkt_encap) {
+               const uint16_t ev = be16toh(cpl->err_vec);
+
+               err_vec = G_T6_COMPR_RXERR_VEC(ev);
+               tnl_type = G_T6_RX_TNL_TYPE(ev);
+               tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
+       } else {
+               err_vec = be16toh(cpl->err_vec);
+               tnl_type = 0;
+               tnlhdr_len = 0;
+       }
+       if (cpl->csum_calc && err_vec == 0) {
+               int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
+
+               /* checksum(s) calculated and found to be correct. */
+
+               MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
+                   (cpl->l2info & htobe32(F_RXF_IP6)));
+               m0->m_pkthdr.csum_data = be16toh(cpl->csum);
+               if (tnl_type == 0) {
+                       if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) {
+                               m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+                                   CSUM_L3_VALID | CSUM_L4_CALC |
+                                   CSUM_L4_VALID;
+                       } else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
+                               m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+                                   CSUM_L4_VALID;
+                       }
                        rxq->rxcsum++;
-               } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
-                   cpl->l2info & htobe32(F_RXF_IP6)) {
-                       m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
-                           CSUM_PSEUDO_HDR);
-                       rxq->rxcsum++;
-               }
+               } else {
+                       MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
+                       if (__predict_false(cpl->ip_frag)) {
+                               /*
+                                * csum_data is for the inner frame (which is an
+                                * IP fragment) and is not 0xffff.  There is no
+                                * way to pass the inner csum_data to the stack.
+                                * We don't want the stack to use the inner
+                                * csum_data to validate the outer frame or it
+                                * will get rejected.  So we fix csum_data here
+                                * and let sw do the checksum of inner IP
+                                * fragments.
+                                *
+                                * XXX: Need 32b for csum_data2 in an rx mbuf.
+                                * Maybe stuff it into rcv_tstmp?
+                                */
+                               m0->m_pkthdr.csum_data = 0xffff;
+                               if (ipv6) {
+                                       m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+                                           CSUM_L4_VALID;
+                               } else {
+                                       m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+                                           CSUM_L3_VALID | CSUM_L4_CALC |
+                                           CSUM_L4_VALID;
+                               }
+                       } else {
+                               int outer_ipv6;
 
-               if (__predict_false(cpl->ip_frag))
-                       m0->m_pkthdr.csum_data = be16toh(cpl->csum);
-               else
-                       m0->m_pkthdr.csum_data = 0xffff;
+                               MPASS(m0->m_pkthdr.csum_data == 0xffff);
+
+                               outer_ipv6 = tnlhdr_len >=
+                                   sizeof(struct ether_header) +
+                                   sizeof(struct ip6_hdr);
+                               m0->m_pkthdr.csum_flags =
+                                   sw_csum_flags[outer_ipv6][ipv6];
+                       }
+                       rxq->vxlan_rxcsum++;
+               }
        }
 
        if (cpl->vlan_ex) {
@@ -2007,7 +2088,7 @@ have_mbuf:
        m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
 #endif
 #if defined(INET) || defined(INET6)
-       if (rxq->iq.flags & IQ_LRO_ENABLED &&
+       if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
            (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
            M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
                if (sort_before_lro(lro)) {
@@ -2179,10 +2260,10 @@ mbuf_nsegs(struct mbuf *m)
 {
 
        M_ASSERTPKTHDR(m);
-       KASSERT(m->m_pkthdr.l5hlen > 0,
+       KASSERT(m->m_pkthdr.inner_l5hlen > 0,
            ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
-       return (m->m_pkthdr.l5hlen);
+       return (m->m_pkthdr.inner_l5hlen);
 }
 
 static inline void
@@ -2190,7 +2271,7 @@ set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
        M_ASSERTPKTHDR(m);
-       m->m_pkthdr.l5hlen = nsegs;
+       m->m_pkthdr.inner_l5hlen = nsegs;
 }
 
 static inline int
@@ -2316,63 +2397,108 @@ alloc_wr_mbuf(int len, int how)
        return (m);
 }
 
-static inline int
+static inline bool
 needs_hwcsum(struct mbuf *m)
 {
+       const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
+           CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+           CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
+           CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
+           CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
 
        M_ASSERTPKTHDR(m);
 
-       return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP |
-           CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6));
+       return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
+static inline bool
 needs_tso(struct mbuf *m)
 {
+       const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
+           CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
 
        M_ASSERTPKTHDR(m);
 
-       return (m->m_pkthdr.csum_flags & CSUM_TSO);
+       return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
+static inline bool  /* true when the mbuf's csum_flags include CSUM_ENCAP_VXLAN */
+needs_vxlan_csum(struct mbuf *m)
+{
+
+       M_ASSERTPKTHDR(m);
+
+       return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN);  /* non-zero converts to true */
+}
+
+static inline bool  /* true when an inner TSO flag (IPv4 or IPv6) is set */
+needs_vxlan_tso(struct mbuf *m)
+{
+       const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
+           CSUM_INNER_IP6_TSO;
+
+       M_ASSERTPKTHDR(m);
+
+       return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
+           (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);  /* CSUM_ENCAP_VXLAN alone does not qualify */
+}
+
+static inline bool  /* true when an inner TSO flag (IPv4 or IPv6) is set */
+needs_inner_tcp_csum(struct mbuf *m)
+{
+       const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;  /* only the TSO flags are tested here */
+
+       M_ASSERTPKTHDR(m);
+
+       return (m->m_pkthdr.csum_flags & csum_flags);
+}
+
+static inline bool
 needs_l3_csum(struct mbuf *m)
 {
+       const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
+           CSUM_INNER_IP_TSO;
 
        M_ASSERTPKTHDR(m);
 
-       return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
+       return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
-needs_tcp_csum(struct mbuf *m)
+static inline bool
+needs_outer_tcp_csum(struct mbuf *m)
 {
+       const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
+           CSUM_IP6_TSO;
 
        M_ASSERTPKTHDR(m);
-       return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
+
+       return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
 #ifdef RATELIMIT
-static inline int
-needs_l4_csum(struct mbuf *m)
+static inline bool
+needs_outer_l4_csum(struct mbuf *m)
 {
+       const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
+           CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
 
        M_ASSERTPKTHDR(m);
 
-       return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
-           CSUM_TCP_IPV6 | CSUM_TSO));
+       return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
-needs_udp_csum(struct mbuf *m)
+static inline bool
+needs_outer_udp_csum(struct mbuf *m)
 {
+       const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;
 
        M_ASSERTPKTHDR(m);
-       return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
+
+       return (m->m_pkthdr.csum_flags & csum_flags);
 }
 #endif
 
-static inline int
+static inline bool
 needs_vlan_insertion(struct mbuf *m)
 {
 
@@ -2513,6 +2639,23 @@ count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cf
 }
 
 /*
+ * The maximum number of segments that can fit in a WR.
+ */
+static int  /* returns one of the TX_SGL_SEGS* limits according to the mbuf's TSO needs */
+max_nsegs_allowed(struct mbuf *m)
+{
+
+       if (needs_tso(m)) {
+               if (needs_vxlan_tso(m))
+                       return (TX_SGL_SEGS_VXLAN_TSO);  /* smallest of the three limits */
+               else
+                       return (TX_SGL_SEGS_TSO);
+       }
+
+       return (TX_SGL_SEGS);  /* no TSO: the largest limit */
+}
+
+/*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
@@ -2570,7 +2713,7 @@ restart:
                return (0);
        }
 #endif
-       if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
+       if (nsegs > max_nsegs_allowed(m0)) {
                if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
                        rc = EFBIG;
                        goto fail;
@@ -2592,18 +2735,15 @@ restart:
        }
        set_mbuf_nsegs(m0, nsegs);
        set_mbuf_cflags(m0, cflags);
-       if (sc->flags & IS_VF)
-               set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
-       else
-               set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
+       calculate_mbuf_len16(sc, m0);
 
 #ifdef RATELIMIT
        /*
         * Ethofld is limited to TCP and UDP for now, and only when L4 hw
-        * checksumming is enabled.  needs_l4_csum happens to check for all the
-        * right things.
+        * checksumming is enabled.  needs_outer_l4_csum happens to check for
+        * all the right things.
         */
-       if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) {
+       if (__predict_false(needs_eo(cst) && !needs_outer_l4_csum(m0))) {
                m_snd_tag_rele(m0->m_pkthdr.snd_tag);
                m0->m_pkthdr.snd_tag = NULL;
                m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
@@ -2635,21 +2775,27 @@ restart:
        switch (eh_type) {
 #ifdef INET6
        case ETHERTYPE_IPV6:
-       {
-               struct ip6_hdr *ip6 = l3hdr;
-
-               MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
-
-               m0->m_pkthdr.l3hlen = sizeof(*ip6);
+               m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
                break;
-       }
 #endif
 #ifdef INET
        case ETHERTYPE_IP:
        {
                struct ip *ip = l3hdr;
 
-               m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
+               if (needs_vxlan_csum(m0)) {
+                       /* Driver will do the outer IP hdr checksum. */
+                       ip->ip_sum = 0;
+                       if (needs_vxlan_tso(m0)) {
+                               const uint16_t ipl = ip->ip_len;
+
+                               ip->ip_len = 0;
+                               ip->ip_sum = ~in_cksum_hdr(ip);
+                               ip->ip_len = ipl;
+                       } else
+                               ip->ip_sum = in_cksum_hdr(ip);
+               }
+               m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
                break;
        }
 #endif
@@ -2659,8 +2805,59 @@ restart:
                    __func__, eh_type);
        }
 
+       if (needs_vxlan_csum(m0)) {
+               m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
+               m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
+
+               /* Inner headers. */
+               eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
+                   sizeof(struct udphdr) + sizeof(struct vxlan_header));
+               eh_type = ntohs(eh->ether_type);
+               if (eh_type == ETHERTYPE_VLAN) {
+                       struct ether_vlan_header *evh = (void *)eh;
+
+                       eh_type = ntohs(evh->evl_proto);
+                       m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
+               } else
+                       m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
+               l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
+
+               switch (eh_type) {
+#ifdef INET6
+               case ETHERTYPE_IPV6:
+                       m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
+                       break;
+#endif
+#ifdef INET
+               case ETHERTYPE_IP:
+               {
+                       struct ip *ip = l3hdr;
+
+                       m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
+                       break;
+               }
+#endif
+               default:
+                       panic("%s: VXLAN hw offload requested with unknown "
+                           "ethertype 0x%04x.  if_cxgbe must be compiled"
+                           " with the same INET/INET6 options as the kernel.",
+                           __func__, eh_type);
+               }
 #if defined(INET) || defined(INET6)
-       if (needs_tcp_csum(m0)) {
+               if (needs_inner_tcp_csum(m0)) {
+                       tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
+                       m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
+               }
+#endif
+               MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+               m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
+                   CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
+                   CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
+                   CSUM_ENCAP_VXLAN;
+       }
+
+#if defined(INET) || defined(INET6)
+       if (needs_outer_tcp_csum(m0)) {
                tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
                m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
@@ -2670,7 +2867,7 @@ restart:
                            V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
                } else
                        set_mbuf_eo_tsclk_tsoff(m0, 0);
-       } else if (needs_udp_csum(m0)) {
+       } else if (needs_outer_udp_csum(m0)) {
                m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
        }
@@ -3627,6 +3824,9 @@ alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int
        SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
            CTLFLAG_RD, &rxq->vlan_extraction,
            "# of times hardware extracted 802.1Q tag");
+       SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_rxcsum",
+           CTLFLAG_RD, &rxq->vxlan_rxcsum,
+           "# of times hardware assisted with inner checksum (VXLAN) ");
 
        add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
@@ -4281,6 +4481,11 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int
            "# of frames tx'd using type1 txpkts work requests");
        SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
            &txq->raw_wrs, "# of raw work requests (non-packets)");
+       SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs",
+           CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
+       SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_txcsum",
+           CTLFLAG_RD, &txq->vxlan_txcsum,
+           "# of times hardware assisted with inner checksums (VXLAN)");
 
 #ifdef KERN_TLS
        if (sc->flags & KERN_TLS_OK) {
@@ -4570,27 +4775,25 @@ get_pkt_gl(struct mbuf *m, struct sglist *gl)
        KASSERT(gl->sg_nseg == mbuf_nsegs(m),
            ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
            mbuf_nsegs(m), gl->sg_nseg));
-       KASSERT(gl->sg_nseg > 0 &&
-           gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
+       KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m),
            ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
-               gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
+               gl->sg_nseg, max_nsegs_allowed(m)));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
-txpkt_len16(u_int nsegs, u_int tso)
+txpkt_len16(u_int nsegs, const u_int extra)
 {
        u_int n;
 
        MPASS(nsegs > 0);
 
        nsegs--; /* first segment is part of ulptx_sgl */
-       n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
+       n = extra + sizeof(struct fw_eth_tx_pkt_wr) +
+           sizeof(struct cpl_tx_pkt_core) +
            sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
-       if (tso)
-               n += sizeof(struct cpl_tx_pkt_lso_core);
 
        return (howmany(n, 16));
 }
@@ -4600,22 +4803,43 @@ txpkt_len16(u_int nsegs, u_int tso)
  * request header.
  */
 static inline u_int
-txpkt_vm_len16(u_int nsegs, u_int tso)
+txpkt_vm_len16(u_int nsegs, const u_int extra)
 {
        u_int n;
 
        MPASS(nsegs > 0);
 
        nsegs--; /* first segment is part of ulptx_sgl */
-       n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
+       n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
            sizeof(struct cpl_tx_pkt_core) +
            sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
-       if (tso)
-               n += sizeof(struct cpl_tx_pkt_lso_core);
 
        return (howmany(n, 16));
 }
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to