Author: np
Date: Fri Aug 30 01:45:36 2013
New Revision: 255050
URL: http://svnweb.freebsd.org/changeset/base/255050

Log:
  Implement support for rx buffer packing.  Enable it by default for T5
  cards.
  
  This is a T4 and T5 chip feature that lets the chip deliver multiple
  Ethernet frames in a single buffer.  This is more efficient within the
  chip and in the driver, and it reduces wasted space in rx buffers.
  
  - Always allocate rx buffers from the jumbop zone, no matter what the
    MTU is.  Do not use the normal cluster refcounting mechanism.
  - Reserve space for an mbuf and a refcount in the cluster itself and let
    the chip DMA multiple frames in the rest.
  - Use the embedded mbuf for the first frame and allocate mbufs on the
    fly for any additional frames delivered in the cluster.  Each of these
    mbufs has a reference on the underlying cluster.
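  
  For illustration only (not part of this change): a minimal sketch of the
  packed cluster layout described by the bullets above, and of how any
  payload address maps back to the embedded refcount.  It mirrors
  find_buf_refcnt() in the diff below; the helper name and the standalone
  presentation are hypothetical.
  
        #include <sys/param.h>  /* MSIZE, MJUMPAGESIZE */
        #include <sys/mbuf.h>
  
        /*
         * A packing cluster comes from the jumbop zone and is
         * MJUMPAGESIZE-aligned:
         *
         *   0            MSIZE                            MJUMPAGESIZE
         *   | struct mbuf ... refcount | chip DMAs frames into here |
         *
         * The first MSIZE bytes hold the embedded mbuf, with a u_int
         * refcount in the last bytes of that reservation.  Only the
         * remaining MJUMPAGESIZE - MSIZE bytes are given to the chip,
         * which is why that is the extra buffer size programmed into
         * the SGE_FL_BUFFER_SIZE registers.
         */
        static inline u_int *
        packed_cl_refcnt(caddr_t buf)   /* hypothetical name */
        {
                uintptr_t p = (uintptr_t)buf;
  
                /* Mask down to the cluster start, step past the mbuf. */
                return ((u_int *)((p & ~(MJUMPAGESIZE - 1)) +
                    MSIZE - sizeof(u_int)));
        }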

Modified:
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/t4_sge.c

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h        Fri Aug 30 01:33:26 2013        (r255049)
+++ head/sys/dev/cxgbe/adapter.h        Fri Aug 30 01:45:36 2013        (r255050)
@@ -128,9 +128,9 @@ enum {
 
        RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */
 #if MJUMPAGESIZE != MCLBYTES
-       FL_BUF_SIZES = 4,       /* cluster, jumbop, jumbo9k, jumbo16k */
+       FL_BUF_SIZES_MAX = 5,   /* cluster, jumbop, jumbo9k, jumbo16k, extra */
 #else
-       FL_BUF_SIZES = 3,       /* cluster, jumbo9k, jumbo16k */
+       FL_BUF_SIZES_MAX = 4,   /* cluster, jumbo9k, jumbo16k, extra */
 #endif
 
        CTRL_EQ_QSIZE = 128,
@@ -165,6 +165,7 @@ enum {
        MASTER_PF       = (1 << 3),
        ADAP_SYSCTL_CTX = (1 << 4),
        TOM_INIT_DONE   = (1 << 5),
+       BUF_PACKING_OK  = (1 << 6),
 
        CXGBE_BUSY      = (1 << 9),
 
@@ -232,12 +233,11 @@ struct port_info {
 };
 
 struct fl_sdesc {
-       struct mbuf *m;
        bus_dmamap_t map;
        caddr_t cl;
-       uint8_t tag_idx;        /* the sc->fl_tag this map comes from */
+       uint8_t tag_idx;        /* the fl->tag entry this map comes from */
 #ifdef INVARIANTS
-       __be64 ba_tag;
+       __be64 ba_hwtag;
 #endif
 };
 
@@ -359,9 +359,22 @@ struct sge_eq {
        uint32_t unstalled;     /* recovered from stall */
 };
 
+struct fl_buf_info {
+       u_int size;
+       int type;
+       int hwtag:4;    /* tag in low 4 bits of the pa. */
+       uma_zone_t zone;
+};
+#define FL_BUF_SIZES(sc)       (sc->sge.fl_buf_sizes)
+#define FL_BUF_SIZE(sc, x)     (sc->sge.fl_buf_info[x].size)
+#define FL_BUF_TYPE(sc, x)     (sc->sge.fl_buf_info[x].type)
+#define FL_BUF_HWTAG(sc, x)    (sc->sge.fl_buf_info[x].hwtag)
+#define FL_BUF_ZONE(sc, x)     (sc->sge.fl_buf_info[x].zone)
+
 enum {
        FL_STARVING     = (1 << 0), /* on the adapter's list of starving fl's */
        FL_DOOMED       = (1 << 1), /* about to be destroyed */
+       FL_BUF_PACKING  = (1 << 2), /* buffer packing enabled */
 };
 
 #define FL_RUNNING_LOW(fl)     (fl->cap - fl->needed <= fl->lowat)
@@ -370,7 +383,8 @@ enum {
 struct sge_fl {
        bus_dma_tag_t desc_tag;
        bus_dmamap_t desc_map;
-       bus_dma_tag_t tag[FL_BUF_SIZES];
+       bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are
+                                               valid */
        uint8_t tag_idx;
        struct mtx fl_lock;
        char lockname[16];
@@ -383,11 +397,13 @@ struct sge_fl {
        uint16_t qsize;         /* size (# of entries) of the queue */
        uint16_t cntxt_id;      /* SGE context id for the freelist */
        uint32_t cidx;          /* consumer idx (buffer idx, NOT hw desc idx) */
+       uint32_t rx_offset;     /* offset in fl buf (when buffer packing) */
        uint32_t pidx;          /* producer idx (buffer idx, NOT hw desc idx) */
        uint32_t needed;        /* # of buffers needed to fill up fl. */
        uint32_t lowat;         /* # of buffers <= this means fl needs help */
        uint32_t pending;       /* # of bufs allocated since last doorbell */
-       unsigned int dmamap_failed;
+       u_int dmamap_failed;
+       struct mbuf *mstash[8];
        TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
 };
 
@@ -519,6 +535,9 @@ struct sge {
        int eq_start;
        struct sge_iq **iqmap;  /* iq->cntxt_id to iq mapping */
        struct sge_eq **eqmap;  /* eq->cntxt_id to eq mapping */
+
+       u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE);
+       struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX];
 };
 
 struct rss_header;

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c Fri Aug 30 01:33:26 2013        (r255049)
+++ head/sys/dev/cxgbe/t4_sge.c Fri Aug 30 01:45:36 2013        (r255050)
@@ -56,19 +56,6 @@ __FBSDID("$FreeBSD$");
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 
-struct fl_buf_info {
-       int size;
-       int type;
-       uma_zone_t zone;
-};
-
-/* Filled up by t4_sge_modload */
-static struct fl_buf_info fl_buf_info[FL_BUF_SIZES];
-
-#define FL_BUF_SIZE(x) (fl_buf_info[x].size)
-#define FL_BUF_TYPE(x) (fl_buf_info[x].type)
-#define FL_BUF_ZONE(x) (fl_buf_info[x].zone)
-
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
@@ -85,7 +72,8 @@ TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
- *  Any power of 2, from 32 to 4096 (both inclusive) is a valid value.
+ *  0: disable padding.
+ *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 static int fl_pad = -1;
 TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
@@ -107,6 +95,33 @@ TUNABLE_INT("hw.cxgbe.spg_len", &spg_len
 static int cong_drop = 0;
 TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
 
+/*
+ * Deliver multiple frames in the same free list buffer if they fit.
+ * -1: let the driver decide whether to enable buffer packing or not.
+ *  0: disable buffer packing.
+ *  1: enable buffer packing.
+ */
+static int buffer_packing = -1;
+TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
+
+/*
+ * Start next frame in a packed buffer at this boundary.
+ * -1: driver should figure out a good value.
+ * T4:
+ * ---
+ * if fl_pad != 0
+ *     value specified here will be overridden by fl_pad.
+ * else
+ *     power of 2 from 32 to 4096 (both inclusive) is a valid value here.
+ * T5:
+ * ---
+ * 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
+ */
+static int fl_pack = -1;
+static int t4_fl_pack;
+static int t5_fl_pack;
+TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
+
 /* Used to track coalesced tx work request */
 struct txpkts {
        uint64_t *flitp;        /* ptr to flit where next pkt should start */
@@ -123,12 +138,15 @@ struct sgl {
 };
 
 static int service_iq(struct sge_iq *, int);
-static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t,
+static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, uint32_t,
+    int *);
+static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, uint32_t,
     int *);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
     int);
-static inline void init_fl(struct sge_fl *, int, int, char *);
+static inline void init_fl(struct adapter *, struct sge_fl *, int, int, int,
+    char *);
 static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t,
     char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
@@ -170,8 +188,8 @@ static inline void ring_fl_db(struct ada
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
-static void free_fl_sdesc(struct sge_fl *);
-static void set_fl_tag_idx(struct sge_fl *, int);
+static void free_fl_sdesc(struct adapter *, struct sge_fl *);
+static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
@@ -202,27 +220,20 @@ extern u_int cpu_clflush_line_size;
 #endif
 
 /*
- * Called on MOD_LOAD.  Fills up fl_buf_info[] and validates/calculates the SGE
- * tunables.
+ * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
  */
 void
 t4_sge_modload(void)
 {
-       int i;
-       int bufsize[FL_BUF_SIZES] = {
-               MCLBYTES,
-#if MJUMPAGESIZE != MCLBYTES
-               MJUMPAGESIZE,
-#endif
-               MJUM9BYTES,
-               MJUM16BYTES
-       };
+       int pad;
 
-       for (i = 0; i < FL_BUF_SIZES; i++) {
-               FL_BUF_SIZE(i) = bufsize[i];
-               FL_BUF_TYPE(i) = m_gettype(bufsize[i]);
-               FL_BUF_ZONE(i) = m_getzone(bufsize[i]);
-       }
+       /* set pad to a reasonable power of 2 between 16 and 4096 (inclusive) */
+#if defined(__i386__) || defined(__amd64__)
+       pad = max(cpu_clflush_line_size, 16);
+#else
+       pad = max(CACHE_LINE_SIZE, 16);
+#endif
+       pad = min(pad, 4096);
 
        if (fl_pktshift < 0 || fl_pktshift > 7) {
                printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
@@ -230,23 +241,35 @@ t4_sge_modload(void)
                fl_pktshift = 2;
        }
 
-       if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) {
-               int pad;
-
-#if defined(__i386__) || defined(__amd64__)
-               pad = max(cpu_clflush_line_size, 32);
-#else
-               pad = max(CACHE_LINE_SIZE, 32);
-#endif
-               pad = min(pad, 4096);
+       if (fl_pad != 0 &&
+           (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad))) {
 
                if (fl_pad != -1) {
                        printf("Invalid hw.cxgbe.fl_pad value (%d),"
-                           " using %d instead.\n", fl_pad, pad);
+                           " using %d instead.\n", fl_pad, max(pad, 32));
                }
-               fl_pad = pad;
+               fl_pad = max(pad, 32);
        }
 
+       /*
+        * T4 has the same pad and pack boundary.  If a pad boundary is set,
+        * the pack boundary must be set to the same value.  Otherwise take
+        * the specified value or auto-calculate something reasonable.
+        */
+       if (fl_pad)
+               t4_fl_pack = fl_pad;
+       else if (fl_pack < 32 || fl_pack > 4096 || !powerof2(fl_pack))
+               t4_fl_pack = max(pad, 32);
+       else
+               t4_fl_pack = fl_pack;
+
+       /* T5's pack boundary is independent of the pad boundary. */
+       if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
+           !powerof2(fl_pack))
+              t5_fl_pack = max(pad, 64);
+       else
+              t5_fl_pack = fl_pack;
+
        if (spg_len != 64 && spg_len != 128) {
                int len;
 
@@ -292,17 +315,41 @@ t4_tweak_chip_settings(struct adapter *s
        int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
        int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
        uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
+       int sw_flbuf_sizes[] = {
+               MCLBYTES,
+#if MJUMPAGESIZE != MCLBYTES
+               MJUMPAGESIZE,
+#endif
+               MJUM9BYTES,
+               MJUM16BYTES,
+               MJUMPAGESIZE - MSIZE
+       };
 
        KASSERT(sc->flags & MASTER_PF,
            ("%s: trying to change chip settings when not master.", __func__));
 
-       m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
-           V_INGPADBOUNDARY(M_INGPADBOUNDARY) | F_EGRSTATUSPAGESIZE;
+       m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
        v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
-           V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
            V_EGRSTATUSPAGESIZE(spg_len == 128);
+       if (is_t4(sc) && (fl_pad || buffer_packing)) {
+               /* t4_fl_pack has the correct value even when fl_pad = 0 */
+               m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+               v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5);
+       } else if (is_t5(sc) && fl_pad) {
+               m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+               v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5);
+       }
        t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
+       if (is_t5(sc) && buffer_packing) {
+               m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
+               if (t5_fl_pack == 16)
+                       v = V_INGPACKBOUNDARY(0);
+               else
+                       v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5);
+               t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
+       }
+
        v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
            V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
            V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
@@ -313,9 +360,9 @@ t4_tweak_chip_settings(struct adapter *s
            V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
        t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
-       for (i = 0; i < FL_BUF_SIZES; i++) {
+       for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) {
                t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
-                   FL_BUF_SIZE(i));
+                   sw_flbuf_sizes[i]);
        }
 
        v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
@@ -376,21 +423,48 @@ int
 t4_read_chip_settings(struct adapter *sc)
 {
        struct sge *s = &sc->sge;
-       int i, rc = 0;
+       int i, j, n, rc = 0;
        uint32_t m, v, r;
        uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
+       uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = {
+               MCLBYTES,
+#if MJUMPAGESIZE != MCLBYTES
+               MJUMPAGESIZE,
+#endif
+               MJUM9BYTES,
+               MJUM16BYTES
+       };
 
-       m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE |
-           V_INGPADBOUNDARY(M_INGPADBOUNDARY) | F_EGRSTATUSPAGESIZE;
+       m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
        v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
-           V_INGPADBOUNDARY(ilog2(fl_pad) - 5) |
            V_EGRSTATUSPAGESIZE(spg_len == 128);
+       if (is_t4(sc) && (fl_pad || buffer_packing)) {
+               m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+               v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5);
+       } else if (is_t5(sc) && fl_pad) {
+               m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY);
+               v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5);
+       }
        r = t4_read_reg(sc, A_SGE_CONTROL);
        if ((r & m) != v) {
                device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
                rc = EINVAL;
        }
 
+       if (is_t5(sc) && buffer_packing) {
+               m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
+               if (t5_fl_pack == 16)
+                       v = V_INGPACKBOUNDARY(0);
+               else
+                       v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5);
+               r = t4_read_reg(sc, A_SGE_CONTROL2);
+               if ((r & m) != v) {
+                       device_printf(sc->dev,
+                           "invalid SGE_CONTROL2(0x%x)\n", r);
+                       rc = EINVAL;
+               }
+       }
+
        v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
            V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
            V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
@@ -405,14 +479,45 @@ t4_read_chip_settings(struct adapter *sc
                rc = EINVAL;
        }
 
-       for (i = 0; i < FL_BUF_SIZES; i++) {
-               v = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
-               if (v != FL_BUF_SIZE(i)) {
-                       device_printf(sc->dev,
-                           "invalid SGE_FL_BUFFER_SIZE[%d](0x%x)\n", i, v);
-                       rc = EINVAL;
+       /*
+        * Make a list of SGE FL buffer sizes programmed in the chip and tally
+        * it with the FL buffer sizes that we'd like to use.
+        */
+       n = 0;
+       for (i = 0; i < nitems(sge_flbuf_sizes); i++) {
+               r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
+               sge_flbuf_sizes[i] = r;
+               if (r == MJUMPAGESIZE - MSIZE &&
+                   (sc->flags & BUF_PACKING_OK) == 0) {
+                       sc->flags |= BUF_PACKING_OK;
+                       FL_BUF_HWTAG(sc, n) = i;
+                       FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE;
+                       FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE);
+                       FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE);
+                       n++;
+               }
+       }
+       for (i = 0; i < nitems(sw_flbuf_sizes); i++) {
+               for (j = 0; j < nitems(sge_flbuf_sizes); j++) {
+                       if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j])
+                               continue;
+                       FL_BUF_HWTAG(sc, n) = j;
+                       FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i];
+                       FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]);
+                       FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]);
+                       n++;
+                       break;
                }
        }
+       if (n == 0) {
+               device_printf(sc->dev, "no usable SGE FL buffer size.\n");
+               rc = EINVAL;
+       } else if (n == 1 && (sc->flags & BUF_PACKING_OK)) {
+               device_printf(sc->dev,
+                   "no usable SGE FL buffer size when not packing buffers.\n");
+               rc = EINVAL;
+       }
+       FL_BUF_SIZES(sc) = n;
 
        r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
        s->counter_val[0] = G_THRESHOLD_0(r);
@@ -515,6 +620,14 @@ t4_sge_sysctls(struct adapter *sc, struc
 
        SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
            NULL, cong_drop, "congestion drop setting");
+
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, "buffer_packing", CTLFLAG_RD,
+           NULL, sc->flags & BUF_PACKING_OK ? 1 : 0,
+           "pack multiple frames in one fl buffer");
+
+       SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
+           NULL, is_t5(sc) ? t5_fl_pack : t4_fl_pack,
+           "payload pack boundary (bytes)");
 }
 
 int
@@ -706,7 +819,7 @@ t4_setup_port_queues(struct port_info *p
        struct ifnet *ifp = pi->ifp;
        struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
        struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
-       int bufsize;
+       int bufsize, pack;
 
        oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
            NULL, "rx queues");
@@ -728,6 +841,12 @@ t4_setup_port_queues(struct port_info *p
         * b) allocate queue iff it will take direct interrupts.
         */
        bufsize = mtu_to_bufsize(ifp->if_mtu);
+       if (sc->flags & BUF_PACKING_OK &&
+           ((is_t5(sc) && buffer_packing) ||   /* 1 or -1 both ok for T5 */
+           (is_t4(sc) && buffer_packing == 1)))
+               pack = 1;
+       else
+               pack = 0;
        for_each_rxq(pi, i, rxq) {
 
                init_iq(&rxq->iq, sc, pi->tmr_idx, pi->pktc_idx, pi->qsize_rxq,
@@ -735,7 +854,7 @@ t4_setup_port_queues(struct port_info *p
 
                snprintf(name, sizeof(name), "%s rxq%d-fl",
                    device_get_nameunit(pi->dev), i);
-               init_fl(&rxq->fl, pi->qsize_rxq / 8, bufsize, name);
+               init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name);
 
                if (sc->flags & INTR_DIRECT
 #ifdef TCP_OFFLOAD
@@ -752,6 +871,7 @@ t4_setup_port_queues(struct port_info *p
 
 #ifdef TCP_OFFLOAD
        bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu);
+       pack = 0;       /* XXX: think about this some more */
        for_each_ofld_rxq(pi, i, ofld_rxq) {
 
                init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
@@ -759,7 +879,8 @@ t4_setup_port_queues(struct port_info *p
 
                snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
                    device_get_nameunit(pi->dev), i);
-               init_fl(&ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, name);
+               init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack,
+                   name);
 
                if (sc->flags & INTR_DIRECT ||
                    (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
@@ -1036,7 +1157,12 @@ service_iq(struct sge_iq *iq, int budget
                                    ("%s: data for an iq (%p) with no freelist",
                                    __func__, iq));
 
-                               m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used);
+                               m0 = fl->flags & FL_BUF_PACKING ?
+                                   get_fl_payload1(sc, fl, lq, &fl_bufs_used) :
+                                   get_fl_payload2(sc, fl, lq, &fl_bufs_used);
+
+                               if (__predict_false(m0 == NULL))
+                                       goto process_iql;
 #ifdef T4_PKT_TIMESTAMP
                                /*
                                 * 60 bit timestamp for the payload is
@@ -1136,6 +1262,7 @@ service_iq(struct sge_iq *iq, int budget
                        }
                }
 
+process_iql:
                if (STAILQ_EMPTY(&iql))
                        break;
 
@@ -1181,13 +1308,102 @@ service_iq(struct sge_iq *iq, int budget
        return (0);
 }
 
+static int
+fill_mbuf_stash(struct sge_fl *fl)
+{
+       int i;
+
+       for (i = 0; i < nitems(fl->mstash); i++) {
+               if (fl->mstash[i] == NULL) {
+                       struct mbuf *m;
+                       if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL)
+                               return (ENOBUFS);
+                       fl->mstash[i] = m;
+               }
+       }
+       return (0);
+}
+
+static struct mbuf *
+get_mbuf_from_stash(struct sge_fl *fl)
+{
+       int i;
+
+       for (i = 0; i < nitems(fl->mstash); i++) {
+               if (fl->mstash[i] != NULL) {
+                       struct mbuf *m;
+
+                       m = fl->mstash[i];
+                       fl->mstash[i] = NULL;
+                       return (m);
+               } else
+                       fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT);
+       }
+
+       return (m_get(M_NOWAIT, MT_NOINIT));
+}
+
+static void
+return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m)
+{
+       int i;
+
+       if (m == NULL)
+               return;
+
+       for (i = 0; i < nitems(fl->mstash); i++) {
+               if (fl->mstash[i] == NULL) {
+                       fl->mstash[i] = m;
+                       return;
+               }
+       }
+       m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
+       m_free(m);
+}
+
+/* buf can be any address within the buffer */
+static inline u_int *
+find_buf_refcnt(caddr_t buf)
+{
+       uintptr_t ptr = (uintptr_t)buf;
+
+       return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int)));
+}
+
+static inline struct mbuf *
+find_buf_mbuf(caddr_t buf)
+{
+       uintptr_t ptr = (uintptr_t)buf;
+
+       return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1)));
+}
+
+static int
+rxb_free(struct mbuf *m, void *arg1, void *arg2)
+{
+       uma_zone_t zone = arg1;
+       caddr_t cl = arg2;
+#ifdef INVARIANTS
+       u_int refcount;
+
+       refcount = *find_buf_refcnt(cl);
+       KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__,
+           cl - MSIZE, refcount));
+#endif
+       cl -= MSIZE;
+       uma_zfree(zone, cl);
+
+       return (EXT_FREE_OK);
+}
+
 static struct mbuf *
-get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
+get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
     int *fl_bufs_used)
 {
        struct mbuf *m0, *m;
        struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
        unsigned int nbuf, len;
+       int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
 
        /*
         * No assertion for the fl lock because we don't need it.  This routine
@@ -1198,29 +1414,194 @@ get_fl_payload(struct adapter *sc, struc
         * lock but this routine does not).
         */
 
+       KASSERT(fl->flags & FL_BUF_PACKING,
+           ("%s: buffer packing disabled for fl %p", __func__, fl));
+
+       len = G_RSPD_LEN(len_newbuf);
+
+       if ((len_newbuf & F_RSPD_NEWBUF) == 0) {
+               KASSERT(fl->rx_offset > 0,
+                   ("%s: packed frame but driver at offset=0", __func__));
+
+               /* A packed frame is guaranteed to fit entirely in this buf. */
+               KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len,
+                   ("%s: packing error.  bufsz=%u, offset=%u, len=%u",
+                   __func__, FL_BUF_SIZE(sc, sd->tag_idx), fl->rx_offset,
+                   len));
+
+               m0 = get_mbuf_from_stash(fl);
+               if (m0 == NULL ||
+                   m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
+                       return_mbuf_to_stash(fl, m0);
+                       return (NULL);
+               }
+
+               bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
+                   BUS_DMASYNC_POSTREAD);
+               if (len < RX_COPY_THRESHOLD) {
+#ifdef T4_PKT_TIMESTAMP
+                       /* Leave room for a timestamp */
+                       m0->m_data += 8;
+#endif
+                       bcopy(sd->cl + fl->rx_offset, mtod(m0, caddr_t), len);
+                       m0->m_pkthdr.len = len;
+                       m0->m_len = len;
+               } else {
+                       m0->m_pkthdr.len = len;
+                       m0->m_len = len;
+                       m_extaddref(m0, sd->cl + fl->rx_offset,
+                           roundup2(m0->m_len, fl_pad),
+                           find_buf_refcnt(sd->cl), rxb_free,
+                           FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
+               }
+               fl->rx_offset += len;
+               fl->rx_offset = roundup2(fl->rx_offset, fl_pad);
+               fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
+               if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
+                       fl->rx_offset = 0;
+                       (*fl_bufs_used) += 1;
+                       if (__predict_false(++fl->cidx == fl->cap))
+                               fl->cidx = 0;
+               }
+
+               return (m0);
+       }
+
+       KASSERT(len_newbuf & F_RSPD_NEWBUF,
+           ("%s: only new buffer handled here", __func__));
+
+       nbuf = 0;
+
+       /*
+        * Move to the start of the next buffer if we are still in the middle of
+        * some buffer.  This is the case where there was some room left in the
+        * previous buffer but not enough to fit this frame in its entirety.
+        */
+       if (fl->rx_offset > 0) {
+               KASSERT(roundup2(len, fl_pad) > FL_BUF_SIZE(sc, sd->tag_idx) -
+                   fl->rx_offset, ("%s: frame (%u bytes) should have fit at "
+                   "cidx %u offset %u bufsize %u", __func__, len, fl->cidx,
+                   fl->rx_offset, FL_BUF_SIZE(sc, sd->tag_idx)));
+               nbuf++;
+               fl->rx_offset = 0;
+               sd++;
+               if (__predict_false(++fl->cidx == fl->cap)) {
+                       sd = fl->sdesc;
+                       fl->cidx = 0;
+               }
+       }
+
+       m0 = find_buf_mbuf(sd->cl);
+       if (m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE))
+               goto done;
+       bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
+       m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
+       m_extaddref(m0, sd->cl, roundup2(m0->m_len, fl_pad),
+           find_buf_refcnt(sd->cl), rxb_free, FL_BUF_ZONE(sc, sd->tag_idx),
+           sd->cl);
+       m0->m_pkthdr.len = len;
+
+       fl->rx_offset = roundup2(m0->m_len, fl_pad);
+       fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
+       if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
+               fl->rx_offset = 0;
+               nbuf++;
+               sd++;
+               if (__predict_false(++fl->cidx == fl->cap)) {
+                       sd = fl->sdesc;
+                       fl->cidx = 0;
+               }
+       }
+
+       m = m0;
+       len -= m->m_len;
+
+       while (len > 0) {
+               m->m_next = find_buf_mbuf(sd->cl);
+               m = m->m_next;
+
+               bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
+                   BUS_DMASYNC_POSTREAD);
+
+               /* m_init for !M_PKTHDR can't fail so don't bother */
+               m_init(m, NULL, 0, M_NOWAIT, MT_DATA, M_NOFREE);
+               m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
+               m_extaddref(m, sd->cl, roundup2(m->m_len, fl_pad),
+                   find_buf_refcnt(sd->cl), rxb_free,
+                   FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
+
+               fl->rx_offset = roundup2(m->m_len, fl_pad);
+               fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
+               if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
+                       fl->rx_offset = 0;
+                       nbuf++;
+                       sd++;
+                       if (__predict_false(++fl->cidx == fl->cap)) {
+                               sd = fl->sdesc;
+                               fl->cidx = 0;
+                       }
+               }
+
+               len -= m->m_len;
+       }
+done:
+       (*fl_bufs_used) += nbuf;
+       return (m0);
+}
+
+static struct mbuf *
+get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
+    int *fl_bufs_used)
+{
+       struct mbuf *m0, *m;
+       struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
+       unsigned int nbuf, len;
+
+       /*
+        * No assertion for the fl lock because we don't need it.  This routine
+        * is called only from the rx interrupt handler and it only updates
+        * fl->cidx.  (Contrast that with fl->pidx/fl->needed which could be
+        * updated in the rx interrupt handler or the starvation helper routine.
+        * That's why code that manipulates fl->pidx/fl->needed needs the fl
+        * lock but this routine does not).
+        */
+
+       KASSERT((fl->flags & FL_BUF_PACKING) == 0,
+           ("%s: buffer packing enabled for fl %p", __func__, fl));
        if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0))
                panic("%s: cannot handle packed frames", __func__);
        len = G_RSPD_LEN(len_newbuf);
 
-       m0 = sd->m;
-       sd->m = NULL;   /* consumed */
+       /*
+        * We never want to run out of mbufs in the middle of a frame that
+        * spans multiple fl buffers.  If the fl's mbuf stash isn't full and
+        * can't be filled up to the brim then fail early.
+        */
+       if (len > FL_BUF_SIZE(sc, sd->tag_idx) && fill_mbuf_stash(fl) != 0)
+               return (NULL);
+
+       m0 = get_mbuf_from_stash(fl);
+       if (m0 == NULL ||
+           m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
+               return_mbuf_to_stash(fl, m0);
+               return (NULL);
+       }
 
        bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
-       m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR);
-#ifdef T4_PKT_TIMESTAMP
-       /* Leave room for a timestamp */
-       m0->m_data += 8;
-#endif
 
        if (len < RX_COPY_THRESHOLD) {
+#ifdef T4_PKT_TIMESTAMP
+               /* Leave room for a timestamp */
+               m0->m_data += 8;
+#endif
                /* copy data to mbuf, buffer will be recycled */
                bcopy(sd->cl, mtod(m0, caddr_t), len);
                m0->m_len = len;
        } else {
                bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
-               m_cljset(m0, sd->cl, FL_BUF_TYPE(sd->tag_idx));
+               m_cljset(m0, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx));
                sd->cl = NULL;  /* consumed */
-               m0->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
+               m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
        }
        m0->m_pkthdr.len = len;
 
@@ -1235,23 +1616,23 @@ get_fl_payload(struct adapter *sc, struc
        nbuf = 1;       /* # of fl buffers used */
 
        while (len > 0) {
-               m->m_next = sd->m;
-               sd->m = NULL;   /* consumed */
+               /* Can't fail, we checked earlier that the stash was full. */
+               m->m_next = get_mbuf_from_stash(fl);
                m = m->m_next;
 
                bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
                    BUS_DMASYNC_POSTREAD);
 
+               /* m_init for !M_PKTHDR can't fail so don't bother */
                m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
                if (len <= MLEN) {
                        bcopy(sd->cl, mtod(m, caddr_t), len);
                        m->m_len = len;
                } else {
-                       bus_dmamap_unload(fl->tag[sd->tag_idx],
-                           sd->map);
-                       m_cljset(m, sd->cl, FL_BUF_TYPE(sd->tag_idx));
+                       bus_dmamap_unload(fl->tag[sd->tag_idx], sd->map);
+                       m_cljset(m, sd->cl, FL_BUF_TYPE(sc, sd->tag_idx));
                        sd->cl = NULL;  /* consumed */
-                       m->m_len = min(len, FL_BUF_SIZE(sd->tag_idx));
+                       m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
                }
 
                sd++;
@@ -1616,6 +1997,7 @@ void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
        struct port_info *pi = ifp->if_softc;
+       struct adapter *sc = pi->adapter;
        struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
        struct sge_ofld_rxq *ofld_rxq;
@@ -1628,7 +2010,7 @@ t4_update_fl_bufsize(struct ifnet *ifp)
                fl = &rxq->fl;
 
                FL_LOCK(fl);
-               set_fl_tag_idx(fl, bufsize);
+               set_fl_tag_idx(sc, fl, bufsize);
                FL_UNLOCK(fl);
        }
 #ifdef TCP_OFFLOAD
@@ -1637,7 +2019,7 @@ t4_update_fl_bufsize(struct ifnet *ifp)
                fl = &ofld_rxq->fl;
 
                FL_LOCK(fl);
-               set_fl_tag_idx(fl, bufsize);
+               set_fl_tag_idx(sc, fl, bufsize);
                FL_UNLOCK(fl);
        }
 #endif
@@ -1671,11 +2053,15 @@ init_iq(struct sge_iq *iq, struct adapte
 }
 
 static inline void
-init_fl(struct sge_fl *fl, int qsize, int bufsize, char *name)
+init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int bufsize, int pack,
+    char *name)
 {
+
        fl->qsize = qsize;
        strlcpy(fl->lockname, name, sizeof(fl->lockname));
-       set_fl_tag_idx(fl, bufsize);
+       if (pack)
+               fl->flags |= FL_BUF_PACKING;
+       set_fl_tag_idx(sc, fl, bufsize);
 }
 
 static inline void
@@ -1804,7 +2190,7 @@ alloc_iq_fl(struct port_info *pi, struct
        if (fl) {
                mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
-               for (i = 0; i < FL_BUF_SIZES; i++) {
+               for (i = 0; i < FL_BUF_SIZES(sc); i++) {
 
                        /*
                         * A freelist buffer must be 16 byte aligned as the SGE
@@ -1813,8 +2199,8 @@ alloc_iq_fl(struct port_info *pi, struct
                         */
                        rc = bus_dma_tag_create(sc->dmat, 16, 0,
                            BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
-                           FL_BUF_SIZE(i), 1, FL_BUF_SIZE(i), BUS_DMA_ALLOCNOW,
-                           NULL, NULL, &fl->tag[i]);
+                           FL_BUF_SIZE(sc, i), 1, FL_BUF_SIZE(sc, i),
+                           BUS_DMA_ALLOCNOW, NULL, NULL, &fl->tag[i]);
                        if (rc != 0) {
                                device_printf(sc->dev,
                                    "failed to create fl DMA tag[%d]: %d\n",
@@ -1843,7 +2229,9 @@ alloc_iq_fl(struct port_info *pi, struct
                c.iqns_to_fl0congen |=
                    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
                        F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
-                       F_FW_IQ_CMD_FL0PADEN);
+                       (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
+                       (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
+                           0));
                if (cong >= 0) {
                        c.iqns_to_fl0congen |=
                                htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
@@ -1964,12 +2352,21 @@ free_iq_fl(struct port_info *pi, struct 
                    fl->desc);
 
                if (fl->sdesc)
-                       free_fl_sdesc(fl);
+                       free_fl_sdesc(sc, fl);
+
+               for (i = 0; i < nitems(fl->mstash); i++) {
+                       struct mbuf *m = fl->mstash[i];
+
+                       if (m != NULL) {
+                               m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
+                               m_free(m);
+                       }
+               }
 
                if (mtx_initialized(&fl->fl_lock))
                        mtx_destroy(&fl->fl_lock);
 
-               for (i = 0; i < FL_BUF_SIZES; i++) {
+               for (i = 0; i < FL_BUF_SIZES(sc); i++) {
                        if (fl->tag[i])
                                bus_dma_tag_destroy(fl->tag[i]);
                }
@@ -2130,6 +2527,10 @@ alloc_rxq(struct port_info *pi, struct s
            "SGE context id of the queue");
        SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
            &rxq->fl.cidx, 0, "consumer index");
+       if (rxq->fl.flags & FL_BUF_PACKING) {
+               SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "rx_offset",
+                   CTLFLAG_RD, &rxq->fl.rx_offset, 0, "packing rx offset");
+       }
        SYSCTL_ADD_UINT(&pi->ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
            &rxq->fl.pidx, 0, "producer index");
 
@@ -2691,6 +3092,12 @@ refill_fl(struct adapter *sc, struct sge
        int rc;
 
        FL_LOCK_ASSERT_OWNED(fl);
+#ifdef INVARIANTS
+       if (fl->flags & FL_BUF_PACKING)
+               KASSERT(sd->tag_idx == 0,
+                   ("%s: expected tag 0 but found tag %d at pidx %u instead",
+                   __func__, sd->tag_idx, fl->pidx));
+#endif
 
        if (nbufs > fl->needed)
                nbufs = fl->needed;
@@ -2699,24 +3106,34 @@ refill_fl(struct adapter *sc, struct sge
 
                if (sd->cl != NULL) {
 
-                       /*
-                        * This happens when a frame small enough to fit
-                        * entirely in an mbuf was received in cl last time.
-                        * We'd held on to cl and can reuse it now.  Note that
-                        * we reuse a cluster of the old size if fl->tag_idx is
-                        * no longer the same as sd->tag_idx.
-                        */
-
-                       KASSERT(*d == sd->ba_tag,
+                       KASSERT(*d == sd->ba_hwtag,
                            ("%s: recyling problem at pidx %d",
                            __func__, fl->pidx));
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
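
For reference, the feature is controlled by the loader tunables introduced in
this change.  An illustrative loader.conf(5) stanza (example values, not
recommendations):

        hw.cxgbe.buffer_packing=1       # -1 auto (default; on for T5), 0 off, 1 on
        hw.cxgbe.fl_pack=64             # pack boundary in bytes; -1 lets the driver choose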