Author: np
Date: Tue Mar 18 20:14:13 2014
New Revision: 263317
URL: http://svnweb.freebsd.org/changeset/base/263317

Log:
  cxgbe(4): significant rx rework.
  
  - More flexible cluster size selection, including the ability to fall
    back to a safe cluster size (PAGE_SIZE from zone_jumbop by default) in
    case an allocation of a larger size fails.
  - A single get_fl_payload() function that assembles the payload into an
    mbuf chain for any kind of freelist.  This replaces two variants: one
    for freelists with buffer packing enabled and another for those without.
  - Buffer packing with any sized cluster.  It was limited to 4K clusters
    only before this change.
  - Enable buffer packing for TOE rx queues as well.
  - Statistics and tunables to go with all these changes.  The driver's
    man page will be updated separately.
  
  MFC after:    5 weeks

Modified:
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/common/t4_hw.h
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sge.c

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h        Tue Mar 18 20:05:55 2014        
(r263316)
+++ head/sys/dev/cxgbe/adapter.h        Tue Mar 18 20:14:13 2014        
(r263317)
@@ -134,10 +134,11 @@ enum {
 
        RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */
 #if MJUMPAGESIZE != MCLBYTES
-       FL_BUF_SIZES_MAX = 5,   /* cluster, jumbop, jumbo9k, jumbo16k, extra */
+       SW_ZONE_SIZES = 4,      /* cluster, jumbop, jumbo9k, jumbo16k */
 #else
-       FL_BUF_SIZES_MAX = 4,   /* cluster, jumbo9k, jumbo16k, extra */
+       SW_ZONE_SIZES = 3,      /* cluster, jumbo9k, jumbo16k */
 #endif
+       CL_METADATA_SIZE = CACHE_LINE_SIZE,
 
        CTRL_EQ_QSIZE = 128,
 
@@ -241,15 +242,28 @@ struct port_info {
        uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */
 };
 
-struct fl_sdesc {
-       bus_dmamap_t map;
-       caddr_t cl;
-       uint8_t tag_idx;        /* the fl->tag entry this map comes from */
+/* Where the cluster came from, how it has been carved up. */
+struct cluster_layout {
+       int8_t zidx;
+       int8_t hwidx;
+       uint16_t region1;       /* mbufs laid out within this region */
+                               /* region2 is the DMA region */
+       uint16_t region3;       /* cluster_metadata within this region */
+};
+
+struct cluster_metadata {
+       u_int refcount;
 #ifdef INVARIANTS
-       __be64 ba_hwtag;
+       struct fl_sdesc *sd;    /* For debug only.  Could easily be stale */
 #endif
 };
 
+struct fl_sdesc {
+       caddr_t cl;
+       uint8_t nmbuf;
+       struct cluster_layout cll;
+};
+
 struct tx_desc {
        __be64 flit[8];
 };
@@ -368,17 +382,19 @@ struct sge_eq {
        uint32_t unstalled;     /* recovered from stall */
 };
 
-struct fl_buf_info {
-       u_int size;
-       int type;
-       int hwtag:4;    /* tag in low 4 bits of the pa. */
-       uma_zone_t zone;
-};
-#define FL_BUF_SIZES(sc)       (sc->sge.fl_buf_sizes)
-#define FL_BUF_SIZE(sc, x)     (sc->sge.fl_buf_info[x].size)
-#define FL_BUF_TYPE(sc, x)     (sc->sge.fl_buf_info[x].type)
-#define FL_BUF_HWTAG(sc, x)    (sc->sge.fl_buf_info[x].hwtag)
-#define FL_BUF_ZONE(sc, x)     (sc->sge.fl_buf_info[x].zone)
+struct sw_zone_info {
+       uma_zone_t zone;        /* zone that this cluster comes from */
+       int size;               /* size of cluster: 2K, 4K, 9K, 16K, etc. */
+       int type;               /* EXT_xxx type of the cluster */
+       int8_t head_hwidx;
+       int8_t tail_hwidx;
+};
+
+struct hw_buf_info {
+       int8_t zidx;            /* backpointer to zone; -ve means unused */
+       int8_t next;            /* next hwidx for this zone; -1 means no more */
+       int size;
+};
 
 enum {
        FL_STARVING     = (1 << 0), /* on the adapter's list of starving fl's */
@@ -392,9 +408,8 @@ enum {
 struct sge_fl {
        bus_dma_tag_t desc_tag;
        bus_dmamap_t desc_map;
-       bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are
-                                               valid */
-       uint8_t tag_idx;
+       struct cluster_layout cll_def;  /* default refill zone, layout */
+       struct cluster_layout cll_alt;  /* alternate refill zone, layout */
        struct mtx fl_lock;
        char lockname[16];
        int flags;
@@ -411,9 +426,17 @@ struct sge_fl {
        uint32_t needed;        /* # of buffers needed to fill up fl. */
        uint32_t lowat;         /* # of buffers <= this means fl needs help */
        uint32_t pending;       /* # of bufs allocated since last doorbell */
-       u_int dmamap_failed;
-       struct mbuf *mstash[8];
        TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
+
+       struct mbuf *m0;
+       struct mbuf **pnext;
+       u_int remaining;
+
+       uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */
+       uint64_t mbuf_inlined;  /* # of mbuf created within clusters */
+       uint64_t cl_allocated;  /* # of clusters allocated */
+       uint64_t cl_recycled;   /* # of clusters recycled */
+       uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */
 };
 
 /* txq: SGE egress queue + what's needed for Ethernet NIC */
@@ -547,8 +570,11 @@ struct sge {
        struct sge_iq **iqmap;  /* iq->cntxt_id to iq mapping */
        struct sge_eq **eqmap;  /* eq->cntxt_id to eq mapping */
 
-       u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE);
-       struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX];
+       int pack_boundary;
+       int8_t safe_hwidx1;     /* may not have room for metadata */
+       int8_t safe_hwidx2;     /* with room for metadata and maybe more */
+       struct sw_zone_info sw_zone_info[SW_ZONE_SIZES];
+       struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES];
 };
 
 struct rss_header;

Modified: head/sys/dev/cxgbe/common/t4_hw.h
==============================================================================
--- head/sys/dev/cxgbe/common/t4_hw.h   Tue Mar 18 20:05:55 2014        
(r263316)
+++ head/sys/dev/cxgbe/common/t4_hw.h   Tue Mar 18 20:14:13 2014        
(r263317)
@@ -87,6 +87,7 @@ enum {
        SGE_NTIMERS = 6,          /* # of interrupt holdoff timer values */
        SGE_NCOUNTERS = 4,        /* # of interrupt packet counter values */
        SGE_MAX_IQ_SIZE = 65520,
+       SGE_FLBUF_SIZES = 16,
 };
 
 struct sge_qstat {                /* data written to SGE queue status entries 
*/

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c        Tue Mar 18 20:05:55 2014        
(r263316)
+++ head/sys/dev/cxgbe/t4_main.c        Tue Mar 18 20:14:13 2014        
(r263317)
@@ -494,6 +494,8 @@ CTASSERT(offsetof(struct sge_ofld_rxq, f
 CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS);
 CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES);
 
+CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
+
 static int
 t4_probe(device_t dev)
 {

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c Tue Mar 18 20:05:55 2014        (r263316)
+++ head/sys/dev/cxgbe/t4_sge.c Tue Mar 18 20:14:13 2014        (r263317)
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kdb.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
+#include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
@@ -52,6 +53,8 @@ __FBSDID("$FreeBSD$");
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <machine/md_var.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
@@ -124,6 +127,27 @@ static int t4_fl_pack;
 static int t5_fl_pack;
 TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
 
+/*
+ * Allow the driver to create mbuf(s) in a cluster allocated for rx.
+ * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
+ * 1: ok to create mbuf(s) within a cluster if there is room.
+ */
+static int allow_mbufs_in_cluster = 1;
+TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
+
+/*
+ * Largest rx cluster size that the driver is allowed to allocate.
+ */
+static int largest_rx_cluster = MJUM16BYTES;
+TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
+
+/*
+ * Size of cluster allocation that's most likely to succeed.  The driver will
+ * fall back to this size if it fails to allocate clusters larger than this.
+ */
+static int safest_rx_cluster = PAGE_SIZE;
+TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
+
 /* Used to track coalesced tx work request */
 struct txpkts {
        uint64_t *flitp;        /* ptr to flit where next pkt should start */
@@ -140,9 +164,7 @@ struct sgl {
 };
 
 static int service_iq(struct sge_iq *, int);
-static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, 
uint32_t,
-    int *);
-static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, 
uint32_t,
+static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t,
     int *);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf 
*);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
@@ -158,6 +180,8 @@ static int free_ring(struct adapter *, b
 static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
+static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
+    struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_mgmtq(struct adapter *);
@@ -191,7 +215,8 @@ static int refill_fl(struct adapter *, s
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
-static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int);
+static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
+static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
@@ -216,6 +241,7 @@ static int handle_fw_msg(struct sge_iq *
     struct mbuf *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
+static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 
 /*
  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
@@ -264,7 +290,7 @@ t4_sge_modload(void)
        /* T5's pack boundary is independent of the pad boundary. */
        if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
            !powerof2(fl_pack))
-              t5_fl_pack = max(pad, 64);
+              t5_fl_pack = max(pad, CACHE_LINE_SIZE);
        else
               t5_fl_pack = fl_pack;
 
@@ -313,14 +339,18 @@ t4_tweak_chip_settings(struct adapter *s
        int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
        int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
        uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
-       int sw_flbuf_sizes[] = {
+       static int sge_flbuf_sizes[] = {
                MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
                MJUMPAGESIZE,
+               MJUMPAGESIZE - CL_METADATA_SIZE,
+               MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
 #endif
                MJUM9BYTES,
                MJUM16BYTES,
-               MJUMPAGESIZE - MSIZE
+               MCLBYTES - MSIZE - CL_METADATA_SIZE,
+               MJUM9BYTES - CL_METADATA_SIZE,
+               MJUM16BYTES - CL_METADATA_SIZE,
        };
 
        KASSERT(sc->flags & MASTER_PF,
@@ -358,9 +388,11 @@ t4_tweak_chip_settings(struct adapter *s
            V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
        t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
-       for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) {
+       KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
+           ("%s: hw buffer size table too big", __func__));
+       for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
                t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
-                   sw_flbuf_sizes[i]);
+                   sge_flbuf_sizes[i]);
        }
 
        v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
@@ -415,6 +447,18 @@ t4_tweak_chip_settings(struct adapter *s
 }
 
 /*
+ * SGE wants the buffer to be at least 64B and then a multiple of the pad
+ * boundary or 16, whichever is greater.
+ */
+static inline int
+hwsz_ok(int hwsz)
+{
+       int mask = max(fl_pad, 16) - 1;
+
+       return (hwsz >= 64 && (hwsz & mask) == 0);
+}
+
+/*
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
@@ -424,7 +468,7 @@ t4_read_chip_settings(struct adapter *sc
        int i, j, n, rc = 0;
        uint32_t m, v, r;
        uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
-       uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = {
+       static int sw_buf_sizes[] = {   /* Sorted by size */
                MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
                MJUMPAGESIZE,
@@ -432,6 +476,8 @@ t4_read_chip_settings(struct adapter *sc
                MJUM9BYTES,
                MJUM16BYTES
        };
+       struct sw_zone_info *swz, *safe_swz;
+       struct hw_buf_info *hwb;
 
        m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
        v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
@@ -462,6 +508,7 @@ t4_read_chip_settings(struct adapter *sc
                        rc = EINVAL;
                }
        }
+       s->pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
 
        v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
            V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
@@ -477,45 +524,93 @@ t4_read_chip_settings(struct adapter *sc
                rc = EINVAL;
        }
 
-       /*
-        * Make a list of SGE FL buffer sizes programmed in the chip and tally
-        * it with the FL buffer sizes that we'd like to use.
-        */
-       n = 0;
-       for (i = 0; i < nitems(sge_flbuf_sizes); i++) {
+       /* Filter out unusable hw buffer sizes entirely (mark with -2). */
+       hwb = &s->hw_buf_info[0];
+       for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
                r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
-               sge_flbuf_sizes[i] = r;
-               if (r == MJUMPAGESIZE - MSIZE &&
-                   (sc->flags & BUF_PACKING_OK) == 0) {
-                       sc->flags |= BUF_PACKING_OK;
-                       FL_BUF_HWTAG(sc, n) = i;
-                       FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE;
-                       FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE);
-                       FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE);
-                       n++;
-               }
+               hwb->size = r;
+               hwb->zidx = hwsz_ok(r) ? -1 : -2;
+               hwb->next = -1;
        }
-       for (i = 0; i < nitems(sw_flbuf_sizes); i++) {
-               for (j = 0; j < nitems(sge_flbuf_sizes); j++) {
-                       if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j])
+
+       /*
+        * Create a sorted list in decreasing order of hw buffer sizes (and so
+        * increasing order of spare area) for each software zone.
+        */
+       n = 0;  /* no usable buffer size to begin with */
+       swz = &s->sw_zone_info[0];
+       safe_swz = NULL;
+       for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
+               int8_t head = -1, tail = -1;
+
+               swz->size = sw_buf_sizes[i];
+               swz->zone = m_getzone(swz->size);
+               swz->type = m_gettype(swz->size);
+
+               if (swz->size == safest_rx_cluster)
+                       safe_swz = swz;
+
+               hwb = &s->hw_buf_info[0];
+               for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
+                       if (hwb->zidx != -1 || hwb->size > swz->size)
                                continue;
-                       FL_BUF_HWTAG(sc, n) = j;
-                       FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i];
-                       FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]);
-                       FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]);
+                       hwb->zidx = i;
+                       if (head == -1)
+                               head = tail = j;
+                       else if (hwb->size < s->hw_buf_info[tail].size) {
+                               s->hw_buf_info[tail].next = j;
+                               tail = j;
+                       } else {
+                               int8_t *cur;
+                               struct hw_buf_info *t;
+
+                               for (cur = &head; *cur != -1; cur = &t->next) {
+                                       t = &s->hw_buf_info[*cur];
+                                       if (hwb->size == t->size) {
+                                               hwb->zidx = -2;
+                                               break;
+                                       }
+                                       if (hwb->size > t->size) {
+                                               hwb->next = *cur;
+                                               *cur = j;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               swz->head_hwidx = head;
+               swz->tail_hwidx = tail;
+
+               if (tail != -1) {
                        n++;
-                       break;
+                       if (swz->size - s->hw_buf_info[tail].size >=
+                           CL_METADATA_SIZE)
+                               sc->flags |= BUF_PACKING_OK;
                }
        }
        if (n == 0) {
                device_printf(sc->dev, "no usable SGE FL buffer size.\n");
                rc = EINVAL;
-       } else if (n == 1 && (sc->flags & BUF_PACKING_OK)) {
-               device_printf(sc->dev,
-                   "no usable SGE FL buffer size when not packing buffers.\n");
-               rc = EINVAL;
        }
-       FL_BUF_SIZES(sc) = n;
+
+       s->safe_hwidx1 = -1;
+       s->safe_hwidx2 = -1;
+       if (safe_swz != NULL) {
+               s->safe_hwidx1 = safe_swz->head_hwidx;
+               for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
+                       int spare;
+
+                       hwb = &s->hw_buf_info[i];
+                       spare = safe_swz->size - hwb->size;
+                       if (spare < CL_METADATA_SIZE)
+                               continue;
+                       if (s->safe_hwidx2 == -1 ||
+                           spare == CL_METADATA_SIZE + MSIZE)
+                               s->safe_hwidx2 = i;
+                       if (spare >= CL_METADATA_SIZE + MSIZE)
+                               break;
+               }
+       }
 
        r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
        s->counter_val[0] = G_THRESHOLD_0(r);
@@ -627,6 +722,10 @@ t4_sge_sysctls(struct adapter *sc, struc
     struct sysctl_oid_list *children)
 {
 
+       SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
+           CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
+           "freelist buffer sizes");
+
        SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
            NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)");
 
@@ -644,8 +743,7 @@ t4_sge_sysctls(struct adapter *sc, struc
            "pack multiple frames in one fl buffer");
 
        SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
-           NULL, is_t5(sc) ? t5_fl_pack : t4_fl_pack,
-           "payload pack boundary (bytes)");
+           NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)");
 }
 
 int
@@ -765,7 +863,7 @@ port_intr_iq(struct port_info *pi, int i
 #ifdef TCP_OFFLOAD
        if (sc->flags & INTR_DIRECT) {
                idx %= pi->nrxq + pi->nofldrxq;
-               
+
                if (idx >= pi->nrxq) {
                        idx -= pi->nrxq;
                        iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq;
@@ -796,29 +894,28 @@ port_intr_iq(struct port_info *pi, int i
        return (iq);
 }
 
+/* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
-mtu_to_bufsize(int mtu)
+mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
 {
-       int bufsize;
-
-       /* large enough for a frame even when VLAN extraction is disabled */
-       bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu;
-       bufsize = roundup2(bufsize + fl_pktshift, fl_pad);
-
-       return (bufsize);
-}
+       int payload;
 
 #ifdef TCP_OFFLOAD
-static inline int
-mtu_to_bufsize_toe(struct adapter *sc, int mtu)
-{
-
-       if (sc->tt.rx_coalesce)
-               return (G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)));
+       if (toe) {
+               payload = sc->tt.rx_coalesce ?
+                   G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
+       } else {
+#endif
+               /* large enough even when hw VLAN extraction is disabled */
+               payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +
+                   mtu;
+#ifdef TCP_OFFLOAD
+       }
+#endif
+       payload = roundup2(payload, fl_pad);
 
-       return (mtu);
+       return (payload);
 }
-#endif
 
 int
 t4_setup_port_queues(struct port_info *pi)
@@ -837,7 +934,7 @@ t4_setup_port_queues(struct port_info *p
        struct ifnet *ifp = pi->ifp;
        struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
        struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
-       int bufsize, pack;
+       int maxp, pack, mtu = ifp->if_mtu;
 
        oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
            NULL, "rx queues");
@@ -858,7 +955,7 @@ t4_setup_port_queues(struct port_info *p
         * a) initialize iq and fl
         * b) allocate queue iff it will take direct interrupts.
         */
-       bufsize = mtu_to_bufsize(ifp->if_mtu);
+       maxp = mtu_to_max_payload(sc, mtu, 0);
        pack = enable_buffer_packing(sc);
        for_each_rxq(pi, i, rxq) {
 
@@ -867,7 +964,7 @@ t4_setup_port_queues(struct port_info *p
 
                snprintf(name, sizeof(name), "%s rxq%d-fl",
                    device_get_nameunit(pi->dev), i);
-               init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name);
+               init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name);
 
                if (sc->flags & INTR_DIRECT
 #ifdef TCP_OFFLOAD
@@ -883,8 +980,7 @@ t4_setup_port_queues(struct port_info *p
        }
 
 #ifdef TCP_OFFLOAD
-       bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu);
-       pack = 0;       /* XXX: think about this some more */
+       maxp = mtu_to_max_payload(sc, mtu, 1);
        for_each_ofld_rxq(pi, i, ofld_rxq) {
 
                init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
@@ -892,8 +988,7 @@ t4_setup_port_queues(struct port_info *p
 
                snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
                    device_get_nameunit(pi->dev), i);
-               init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack,
-                   name);
+               init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name);
 
                if (sc->flags & INTR_DIRECT ||
                    (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
@@ -1170,10 +1265,7 @@ service_iq(struct sge_iq *iq, int budget
                                    ("%s: data for an iq (%p) with no freelist",
                                    __func__, iq));
 
-                               m0 = fl->flags & FL_BUF_PACKING ?
-                                   get_fl_payload1(sc, fl, lq, &fl_bufs_used) :
-                                   get_fl_payload2(sc, fl, lq, &fl_bufs_used);
-
+                               m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used);
                                if (__predict_false(m0 == NULL))
                                        goto process_iql;
 #ifdef T4_PKT_TIMESTAMP
@@ -1246,6 +1338,14 @@ service_iq(struct sge_iq *iq, int budget
                                break;
                        }
 
+                       if (fl_bufs_used >= 16) {
+                               FL_LOCK(fl);
+                               fl->needed += fl_bufs_used;
+                               refill_fl(sc, fl, 32);
+                               FL_UNLOCK(fl);
+                               fl_bufs_used = 0;
+                       }
+
                        iq_next(iq);
                        if (++ndescs == limit) {
                                t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
@@ -1262,14 +1362,6 @@ service_iq(struct sge_iq *iq, int budget
                                }
 #endif
 
-                               if (fl_bufs_used > 0) {
-                                       FL_LOCK(fl);
-                                       fl->needed += fl_bufs_used;
-                                       refill_fl(sc, fl, fl->cap / 8);
-                                       FL_UNLOCK(fl);
-                                       fl_bufs_used = 0;
-                               }
-
                                if (budget)
                                        return (EINPROGRESS);
                        }
@@ -1312,7 +1404,7 @@ process_iql:
 
                FL_LOCK(fl);
                fl->needed += fl_bufs_used;
-               starved = refill_fl(sc, fl, fl->cap / 4);
+               starved = refill_fl(sc, fl, 64);
                FL_UNLOCK(fl);
                if (__predict_false(starved != 0))
                        add_fl_to_sfl(sc, fl);
@@ -1321,74 +1413,28 @@ process_iql:
        return (0);
 }
 
-static int
-fill_mbuf_stash(struct sge_fl *fl)
-{
-       int i;
-
-       for (i = 0; i < nitems(fl->mstash); i++) {
-               if (fl->mstash[i] == NULL) {
-                       struct mbuf *m;
-                       if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL)
-                               return (ENOBUFS);
-                       fl->mstash[i] = m;
-               }
-       }
-       return (0);
-}
-
-static struct mbuf *
-get_mbuf_from_stash(struct sge_fl *fl)
+static inline int
+cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
-       int i;
+       int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
 
-       for (i = 0; i < nitems(fl->mstash); i++) {
-               if (fl->mstash[i] != NULL) {
-                       struct mbuf *m;
-
-                       m = fl->mstash[i];
-                       fl->mstash[i] = NULL;
-                       return (m);
-               } else
-                       fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT);
-       }
+       if (rc)
+               MPASS(cll->region3 >= CL_METADATA_SIZE);
 
-       return (m_get(M_NOWAIT, MT_NOINIT));
+       return (rc);
 }
 
-static void
-return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m)
+static inline struct cluster_metadata *
+cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
+    caddr_t cl)
 {
-       int i;
 
-       if (m == NULL)
-               return;
+       if (cl_has_metadata(fl, cll)) {
+               struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 
-       for (i = 0; i < nitems(fl->mstash); i++) {
-               if (fl->mstash[i] == NULL) {
-                       fl->mstash[i] = m;
-                       return;
-               }
+               return ((struct cluster_metadata *)(cl + swz->size) - 1);
        }
-       m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
-       m_free(m);
-}
-
-/* buf can be any address within the buffer */
-static inline u_int *
-find_buf_refcnt(caddr_t buf)
-{
-       uintptr_t ptr = (uintptr_t)buf;
-
-       return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int)));
-}
-
-static inline struct mbuf *
-find_buf_mbuf(caddr_t buf)
-{
-       uintptr_t ptr = (uintptr_t)buf;
-
-       return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1)));
+       return (NULL);
 }
 
 static int
@@ -1396,179 +1442,117 @@ rxb_free(struct mbuf *m, void *arg1, voi
 {
        uma_zone_t zone = arg1;
        caddr_t cl = arg2;
-#ifdef notyet
-       u_int refcount;
 
-       refcount = *find_buf_refcnt(cl);
-       KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__,
-           cl - MSIZE, refcount));
-#endif
-       cl -= MSIZE;
        uma_zfree(zone, cl);
 
        return (EXT_FREE_OK);
 }
 
+/*
+ * The mbuf returned by this function could be allocated from zone_mbuf or
+ * constructed in spare room in the cluster.
+ *
+ * The mbuf carries the payload in one of these ways
+ * a) frame inside the mbuf (mbuf from zone_mbuf)
+ * b) m_cljset (for clusters without metadata) zone_mbuf
+ * c) m_extaddref (cluster with metadata) inline mbuf
+ * d) m_extaddref (cluster with metadata) zone_mbuf
+ */
 static struct mbuf *
-get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
-    int *fl_bufs_used)
+get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags)
 {
-       struct mbuf *m0, *m;
+       struct mbuf *m;
        struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
-       unsigned int nbuf, len;
-       int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
+       struct cluster_layout *cll = &sd->cll;
+       struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
+       struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
+       struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
+       int len, padded_len;
+       caddr_t payload;
+
+       len = min(total, hwb->size - fl->rx_offset);
+       padded_len = roundup2(len, fl_pad);
+       payload = sd->cl + cll->region1 + fl->rx_offset;
 
-       /*
-        * No assertion for the fl lock because we don't need it.  This routine
-        * is called only from the rx interrupt handler and it only updates
-        * fl->cidx.  (Contrast that with fl->pidx/fl->needed which could be
-        * updated in the rx interrupt handler or the starvation helper routine.
-        * That's why code that manipulates fl->pidx/fl->needed needs the fl
-        * lock but this routine does not).
-        */
+       if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 
-       KASSERT(fl->flags & FL_BUF_PACKING,
-           ("%s: buffer packing disabled for fl %p", __func__, fl));
-
-       len = G_RSPD_LEN(len_newbuf);
+               /*
+                * Copy payload into a freshly allocated mbuf.
+                */
 
-       if ((len_newbuf & F_RSPD_NEWBUF) == 0) {
-               KASSERT(fl->rx_offset > 0,
-                   ("%s: packed frame but driver at offset=0", __func__));
-
-               /* A packed frame is guaranteed to fit entirely in this buf. */
-               KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len,
-                   ("%s: packing error.  bufsz=%u, offset=%u, len=%u",
-                   __func__, FL_BUF_SIZE(sc, sd->tag_idx), fl->rx_offset,
-                   len));
-
-               m0 = get_mbuf_from_stash(fl);
-               if (m0 == NULL ||
-                   m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
-                       return_mbuf_to_stash(fl, m0);
+               m = flags & M_PKTHDR ?
+                   m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
+               if (m == NULL)
                        return (NULL);
-               }
-
-               bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
-                   BUS_DMASYNC_POSTREAD);
-               if (sc->sc_do_rxcopy && (len < RX_COPY_THRESHOLD)) {
+               fl->mbuf_allocated++;
 #ifdef T4_PKT_TIMESTAMP
-                       /* Leave room for a timestamp */
-                       m0->m_data += 8;
+               /* Leave room for a timestamp */
+               m->m_data += 8;
 #endif
-                       bcopy(sd->cl + fl->rx_offset, mtod(m0, caddr_t), len);
-                       m0->m_pkthdr.len = len;
-                       m0->m_len = len;
-               } else {
-                       m0->m_pkthdr.len = len;
-                       m0->m_len = len;
-                       m_extaddref(m0, sd->cl + fl->rx_offset,
-                           roundup2(m0->m_len, fl_pad),
-                           find_buf_refcnt(sd->cl), rxb_free,
-                           FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
-               }
-               fl->rx_offset += len;
-               fl->rx_offset = roundup2(fl->rx_offset, fl_pad);
-               fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
-               if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
-                       fl->rx_offset = 0;
-                       (*fl_bufs_used) += 1;
-                       if (__predict_false(++fl->cidx == fl->cap))
-                               fl->cidx = 0;
-               }
+               /* copy data to mbuf */
+               bcopy(payload, mtod(m, caddr_t), len);
 
-               return (m0);
-       }
+       } else if (sd->nmbuf * MSIZE < cll->region1) {
 
-       KASSERT(len_newbuf & F_RSPD_NEWBUF,
-           ("%s: only new buffer handled here", __func__));
+               /*
+                * There's spare room in the cluster for an mbuf.  Create one
+                * and associate it with the payload that's in the cluster too.
+                */
 
-       nbuf = 0;
+               MPASS(clm != NULL);
+               m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
+               /* No bzero required */
+               if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA, flags | M_NOFREE))
+                       return (NULL);
+               fl->mbuf_inlined++;
+               m_extaddref(m, payload, padded_len, &clm->refcount, rxb_free,
+                   swz->zone, sd->cl);
+               sd->nmbuf++;
 
-       /*
-        * Move to the start of the next buffer if we are still in the middle of
-        * some buffer.  This is the case where there was some room left in the
-        * previous buffer but not enough to fit this frame in its entirety.
-        */
-       if (fl->rx_offset > 0) {
-               KASSERT(roundup2(len, fl_pad) > FL_BUF_SIZE(sc, sd->tag_idx) -
-                   fl->rx_offset, ("%s: frame (%u bytes) should have fit at "
-                   "cidx %u offset %u bufsize %u", __func__, len, fl->cidx,
-                   fl->rx_offset, FL_BUF_SIZE(sc, sd->tag_idx)));
-               nbuf++;
-               fl->rx_offset = 0;
-               sd++;
-               if (__predict_false(++fl->cidx == fl->cap)) {
-                       sd = fl->sdesc;
-                       fl->cidx = 0;
-               }
-       }
+       } else {
 
-       m0 = find_buf_mbuf(sd->cl);
-       if (m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE))
-               goto done;
-       bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
-       m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
-       m_extaddref(m0, sd->cl, roundup2(m0->m_len, fl_pad),
-           find_buf_refcnt(sd->cl), rxb_free, FL_BUF_ZONE(sc, sd->tag_idx),
-           sd->cl);
-       m0->m_pkthdr.len = len;
-
-       fl->rx_offset = roundup2(m0->m_len, fl_pad);
-       fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
-       if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
-               fl->rx_offset = 0;
-               nbuf++;
-               sd++;
-               if (__predict_false(++fl->cidx == fl->cap)) {
-                       sd = fl->sdesc;
-                       fl->cidx = 0;
+               /*
+                * Grab an mbuf from zone_mbuf and associate it with the
+                * payload in the cluster.
+                */
+
+               m = flags & M_PKTHDR ?
+                   m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
+               if (m == NULL)
+                       return (NULL);
+               fl->mbuf_allocated++;
+               if (clm != NULL)
+                       m_extaddref(m, payload, padded_len, &clm->refcount,
+                           rxb_free, swz->zone, sd->cl);
+               else {
+                       m_cljset(m, sd->cl, swz->type);
+                       sd->cl = NULL;  /* consumed, not a recycle candidate */
                }
        }
+       if (flags & M_PKTHDR)
+               m->m_pkthdr.len = total;
+       m->m_len = len;
 
-       m = m0;
-       len -= m->m_len;
-
-       while (len > 0) {
-               m->m_next = find_buf_mbuf(sd->cl);
-               m = m->m_next;
-
-               bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
-                   BUS_DMASYNC_POSTREAD);
+       if (fl->flags & FL_BUF_PACKING) {
+               fl->rx_offset += roundup2(padded_len, sc->sge.pack_boundary);
+               MPASS(fl->rx_offset <= hwb->size);
+               if (fl->rx_offset < hwb->size)
+                       return (m);     /* without advancing the cidx */
+       }
 
-               /* m_init for !M_PKTHDR can't fail so don't bother */
-               m_init(m, NULL, 0, M_NOWAIT, MT_DATA, M_NOFREE);
-               m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
-               m_extaddref(m, sd->cl, roundup2(m->m_len, fl_pad),
-                   find_buf_refcnt(sd->cl), rxb_free,
-                   FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
-
-               fl->rx_offset = roundup2(m->m_len, fl_pad);
-               fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
-               if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
-                       fl->rx_offset = 0;
-                       nbuf++;
-                       sd++;
-                       if (__predict_false(++fl->cidx == fl->cap)) {
-                               sd = fl->sdesc;
-                               fl->cidx = 0;
-                       }
-               }
+       if (__predict_false(++fl->cidx == fl->cap))
+               fl->cidx = 0;
+       fl->rx_offset = 0;
 
-               len -= m->m_len;
-       }
-done:
-       (*fl_bufs_used) += nbuf;
-       return (m0);
+       return (m);
 }
 
 static struct mbuf *
-get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
+get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
     int *fl_bufs_used)
 {
-       struct mbuf *m0, *m;
-       struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
-       unsigned int nbuf, len;
+       struct mbuf *m0, *m, **pnext;
+       u_int nbuf, len;
 
        /*
         * No assertion for the fl lock because we don't need it.  This routine
@@ -1579,87 +1563,54 @@ get_fl_payload2(struct adapter *sc, stru
         * lock but this routine does not).
         */
 
-       KASSERT((fl->flags & FL_BUF_PACKING) == 0,
-           ("%s: buffer packing enabled for fl %p", __func__, fl));
-       if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0))
-               panic("%s: cannot handle packed frames", __func__);
+       nbuf = 0;
        len = G_RSPD_LEN(len_newbuf);
-
-       /*
-        * We never want to run out of mbufs in between a frame when a frame
-        * spans multiple fl buffers.  If the fl's mbuf stash isn't full and
-        * can't be filled up to the brim then fail early.
-        */
-       if (len > FL_BUF_SIZE(sc, sd->tag_idx) && fill_mbuf_stash(fl) != 0)
-               return (NULL);
-
-       m0 = get_mbuf_from_stash(fl);
-       if (m0 == NULL ||
-           m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
-               return_mbuf_to_stash(fl, m0);
-               return (NULL);
+       if (__predict_false(fl->m0 != NULL)) {
+               MPASS(len == fl->m0->m_pkthdr.len);
+               MPASS(fl->remaining < len);
+
+               m0 = fl->m0;
+               pnext = fl->pnext;
+               len = fl->remaining;
+               fl->m0 = NULL;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to