Author: jhb
Date: Wed Jul  3 16:06:11 2019
New Revision: 349649
URL: https://svnweb.freebsd.org/changeset/base/349649

Log:
  Use unmapped (M_NOMAP) mbufs for zero-copy AIO writes via TOE.
  
  Previously the TOE code used its own custom unmapped mbufs via
  EXT_FLAG_VENDOR1.  The old version always wired the entire AIO request
  buffer first for the duration of the AIO operation and constructed
  multiple mbufs which used the wired buffer as an external buffer.
  
  The new version determines how much room is available in the socket
  buffer and only wires the pages needed for the available room, building
  chains of M_NOMAP mbufs.  This means that a large AIO write will now
  limit the amount of wired memory it uses to the size of the socket
  buffer.
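  
  As a rough sketch, the per-chunk wiring loop at the heart of the new
  alloc_aiotx_mbuf() (shown in full in the diff below) looks like this,
  with declarations, the ext_pgs bookkeeping, and error handling elided:
  
  	while (len > 0) {
  		/* At most MBUF_PEXT_MAX_PGS pages per unmapped mbuf. */
  		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
  
  		/* Hold only the pages backing this chunk. */
  		npages = vm_fault_quick_hold_pages(map, start, mlen,
  		    VM_PROT_WRITE, pgs, nitems(pgs));
  
  		/*
  		 * Wrap the held pages in an unmapped mbuf whose free
  		 * routine unholds the pages and drops a job reference.
  		 */
  		m = mb_alloc_ext_pgs(M_WAITOK, false, aiotx_free_pgs);
  		refcount_acquire(&job->aio_refs);
  
  		len -= mlen;
  		start += mlen;
  		pgoff = 0;	/* Later chunks start page-aligned. */
  	}
  
  Since the caller caps 'len' at the available socket buffer space (and
  at tt.sndbuf), the loop never holds more than one send buffer's worth
  of pages at a time.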
  
  Reviewed by:  gallatin, np
  Sponsored by: Chelsio Communications
  Differential Revision:        https://reviews.freebsd.org/D20839

Modified:
  head/sys/dev/cxgbe/tom/t4_cpl_io.c
  head/sys/dev/cxgbe/tom/t4_tls.c
  head/sys/dev/cxgbe/tom/t4_tom.h

Modified: head/sys/dev/cxgbe/tom/t4_cpl_io.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c  Wed Jul  3 09:51:59 2019        (r349648)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c  Wed Jul  3 16:06:11 2019        (r349649)
@@ -76,28 +76,6 @@ __FBSDID("$FreeBSD$");
 static void    t4_aiotx_cancel(struct kaiocb *job);
 static void    t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
 
-static size_t
-aiotx_mbuf_pgoff(struct mbuf *m)
-{
-       struct aiotx_buffer *ab;
-
-       MPASS(IS_AIOTX_MBUF(m));
-       ab = m->m_ext.ext_arg1;
-       return ((ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) % PAGE_SIZE);
-}
-
-static vm_page_t *
-aiotx_mbuf_pages(struct mbuf *m)
-{
-       struct aiotx_buffer *ab;
-       int npages;
-
-       MPASS(IS_AIOTX_MBUF(m));
-       ab = m->m_ext.ext_arg1;
-       npages = (ab->ps.offset + (uintptr_t)m->m_ext.ext_arg2) / PAGE_SIZE;
-       return (ab->ps.pages + npages);
-}
-
 void
 send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
 {
@@ -647,10 +625,7 @@ write_tx_sgl(void *dst, struct mbuf *start, struct mbu
 
        i = -1;
        for (m = start; m != stop; m = m->m_next) {
-               if (IS_AIOTX_MBUF(m))
-                       rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
-                           aiotx_mbuf_pgoff(m), m->m_len);
-               else if (m->m_flags & M_NOMAP)
+               if (m->m_flags & M_NOMAP)
                        rc = sglist_append_mb_ext_pgs(&sg, m);
                else
                        rc = sglist_append(&sg, mtod(m, void *), m->m_len);
@@ -713,7 +688,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep
        struct sockbuf *sb = &so->so_snd;
        int tx_credits, shove, compl, sowwakeup;
        struct ofld_tx_sdesc *txsd;
-       bool aiotx_mbuf_seen;
+       bool nomap_mbuf_seen;
 
        INP_WLOCK_ASSERT(inp);
        KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
@@ -766,14 +741,11 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep
                plen = 0;
                nsegs = 0;
                max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
-               aiotx_mbuf_seen = false;
+               nomap_mbuf_seen = false;
                for (m = sndptr; m != NULL; m = m->m_next) {
                        int n;
 
-                       if (IS_AIOTX_MBUF(m))
-                               n = sglist_count_vmpages(aiotx_mbuf_pages(m),
-                                   aiotx_mbuf_pgoff(m), m->m_len);
-                       else if (m->m_flags & M_NOMAP)
+                       if (m->m_flags & M_NOMAP)
                                n = sglist_count_mb_ext_pgs(m);
                        else
                                n = sglist_count(mtod(m, void *), m->m_len);
@@ -802,8 +774,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep
                                break;
                        }
 
-                       if (IS_AIOTX_MBUF(m))
-                               aiotx_mbuf_seen = true;
+                       if (m->m_flags & M_NOMAP)
+                               nomap_mbuf_seen = true;
                        if (max_nsegs_1mbuf < n)
                                max_nsegs_1mbuf = n;
                        sb_sndptr = m;  /* new sb->sb_sndptr if all goes well */
@@ -852,7 +824,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep
                        panic("%s: excess tx.", __func__);
 
                shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
-               if (plen <= max_imm && !aiotx_mbuf_seen) {
+               if (plen <= max_imm && !nomap_mbuf_seen) {
 
                        /* Immediate data tx */
 
@@ -1910,71 +1882,94 @@ t4_uninit_cpl_io_handlers(void)
 }
 
 /*
- * Use the 'backend3' field in AIO jobs to store the amount of data
- * sent by the AIO job so far and the 'backend4' field to hold an
- * error that should be reported when the job is completed.
+ * Use the 'backend1' field in AIO jobs to hold an error that should
+ * be reported when the job is completed, the 'backend3' field to
+ * store the amount of data sent by the AIO job so far, and the
+ * 'backend4' field to hold a reference count on the job.
+ *
+ * Each unmapped mbuf holds a reference on the job as does the queue
+ * so long as the job is queued.
  */
+#define        aio_error       backend1
 #define        aio_sent        backend3
-#define        aio_error       backend4
+#define        aio_refs        backend4
 
 #define        jobtotid(job)                                                   \
        (((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
-       
+
 static void
-free_aiotx_buffer(struct aiotx_buffer *ab)
+aiotx_free_job(struct kaiocb *job)
 {
-       struct kaiocb *job;
        long status;
        int error;
 
-       if (refcount_release(&ab->refcount) == 0)
+       if (refcount_release(&job->aio_refs) == 0)
                return;
 
-       job = ab->job;
-       error = job->aio_error;
+       error = (intptr_t)job->aio_error;
        status = job->aio_sent;
-       vm_page_unhold_pages(ab->ps.pages, ab->ps.npages);
-       free(ab, M_CXGBE);
 #ifdef VERBOSE_TRACES
        CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
            jobtotid(job), job, status, error);
 #endif
-       if (error == ECANCELED && status != 0)
+       if (error != 0 && status != 0)
                error = 0;
        if (error == ECANCELED)
                aio_cancel(job);
        else if (error)
                aio_complete(job, -1, error);
-       else
+       else {
+               job->msgsnd = 1;
                aio_complete(job, status, 0);
+       }
 }
 
 static void
-t4_aiotx_mbuf_free(struct mbuf *m)
+aiotx_free_pgs(struct mbuf *m)
 {
-       struct aiotx_buffer *ab = m->m_ext.ext_arg1;
+       struct mbuf_ext_pgs *ext_pgs;
+       struct kaiocb *job;
+       struct mtx *mtx;
+       vm_page_t pg;
 
+       MBUF_EXT_PGS_ASSERT(m);
+       ext_pgs = m->m_ext.ext_pgs;
+       job = m->m_ext.ext_arg1;
 #ifdef VERBOSE_TRACES
        CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
-           m->m_len, jobtotid(ab->job));
+           m->m_len, jobtotid(job));
 #endif
-       free_aiotx_buffer(ab);
+
+       mtx = NULL;
+       for (int i = 0; i < ext_pgs->npgs; i++) {
+               pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+               vm_page_change_lock(pg, &mtx);
+               vm_page_unhold(pg);
+       }
+       if (mtx != NULL)
+               mtx_unlock(mtx);
+
+       aiotx_free_job(job);
 }
 
 /*
- * Hold the buffer backing an AIO request and return an AIO transmit
- * buffer.
+ * Allocate a chain of unmapped mbufs describing the next 'len' bytes
+ * of an AIO job.
  */
-static int
-hold_aio(struct kaiocb *job)
+static struct mbuf *
+alloc_aiotx_mbuf(struct kaiocb *job, int len)
 {
-       struct aiotx_buffer *ab;
        struct vmspace *vm;
+       vm_page_t pgs[MBUF_PEXT_MAX_PGS];
+       struct mbuf *m, *top, *last;
+       struct mbuf_ext_pgs *ext_pgs;
        vm_map_t map;
-       vm_offset_t start, end, pgoff;
-       int n;
+       vm_offset_t start;
+       int i, mlen, npages, pgoff;
 
-       MPASS(job->backend1 == NULL);
+       KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
+           ("%s(%p, %d): request to send beyond end of buffer", __func__,
+           job, len));
 
        /*
         * The AIO subsystem will cancel and drain all requests before
@@ -1983,35 +1978,65 @@ hold_aio(struct kaiocb *job)
         */
        vm = job->userproc->p_vmspace;
        map = &vm->vm_map;
-       start = (uintptr_t)job->uaiocb.aio_buf;
+       start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
        pgoff = start & PAGE_MASK;
-       end = round_page(start + job->uaiocb.aio_nbytes);
-       start = trunc_page(start);
-       n = atop(end - start);
 
-       ab = malloc(sizeof(*ab) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
-           M_ZERO);
-       refcount_init(&ab->refcount, 1);
-       ab->ps.pages = (vm_page_t *)(ab + 1);
-       ab->ps.npages = vm_fault_quick_hold_pages(map, start, end - start,
-           VM_PROT_WRITE, ab->ps.pages, n);
-       if (ab->ps.npages < 0) {
-               free(ab, M_CXGBE);
-               return (EFAULT);
-       }
+       top = NULL;
+       last = NULL;
+       while (len > 0) {
+               mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
+               KASSERT(mlen == len || (start + mlen & PAGE_MASK) == 0,
+                   ("%s: next start (%#jx + %#x) is not page aligned",
+                   __func__, (uintmax_t)start, mlen));
 
-       KASSERT(ab->ps.npages == n,
-           ("hold_aio: page count mismatch: %d vs %d", ab->ps.npages, n));
+               npages = vm_fault_quick_hold_pages(map, start, mlen,
+                   VM_PROT_WRITE, pgs, nitems(pgs));
+               if (npages < 0)
+                       break;
 
-       ab->ps.offset = pgoff;
-       ab->ps.len = job->uaiocb.aio_nbytes;
-       ab->job = job;
-       job->backend1 = ab;
+               m = mb_alloc_ext_pgs(M_WAITOK, false, aiotx_free_pgs);
+               if (m == NULL) {
+                       vm_page_unhold_pages(pgs, npages);
+                       break;
+               }
+
+               ext_pgs = m->m_ext.ext_pgs;
+               ext_pgs->first_pg_off = pgoff;
+               ext_pgs->npgs = npages;
+               if (npages == 1) {
+                       KASSERT(mlen + pgoff <= PAGE_SIZE,
+                           ("%s: single page is too large (off %d len %d)",
+                           __func__, pgoff, mlen));
+                       ext_pgs->last_pg_len = mlen;
+               } else {
+                       ext_pgs->last_pg_len = mlen - (PAGE_SIZE - pgoff) -
+                           (npages - 2) * PAGE_SIZE;
+               }
+               for (i = 0; i < npages; i++)
+                       ext_pgs->pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
+
+               m->m_len = mlen;
+               m->m_ext.ext_size = npages * PAGE_SIZE;
+               m->m_ext.ext_arg1 = job;
+               refcount_acquire(&job->aio_refs);
+
 #ifdef VERBOSE_TRACES
-       CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
-           __func__, jobtotid(job), &ab->ps, job, ab->ps.npages);
+               CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
+                   __func__, jobtotid(job), m, job, npages);
 #endif
-       return (0);
+
+               if (top == NULL)
+                       top = m;
+               else
+                       last->m_next = m;
+               last = m;
+
+               len -= mlen;
+               start += mlen;
+               pgoff = 0;
+       }
+
+       return (top);
 }
 
 static void
@@ -2020,18 +2045,16 @@ t4_aiotx_process_job(struct toepcb *toep, struct socke
        struct adapter *sc;
        struct sockbuf *sb;
        struct file *fp;
-       struct aiotx_buffer *ab;
        struct inpcb *inp;
        struct tcpcb *tp;
        struct mbuf *m;
-       int error;
+       int error, len;
        bool moretocome, sendmore;
 
        sc = td_adapter(toep->td);
        sb = &so->so_snd;
        SOCKBUF_UNLOCK(sb);
        fp = job->fd_file;
-       ab = job->backend1;
        m = NULL;
 
 #ifdef MAC
@@ -2040,23 +2063,12 @@ t4_aiotx_process_job(struct toepcb *toep, struct socke
                goto out;
 #endif
 
-       if (ab == NULL) {
-               error = hold_aio(job);
-               if (error != 0)
-                       goto out;
-               ab = job->backend1;
-       }
-
        /* Inline sosend_generic(). */
 
-       job->msgsnd = 1;
-
        error = sblock(sb, SBL_WAIT);
        MPASS(error == 0);
 
 sendanother:
-       m = m_get(M_WAITOK, MT_DATA);
-
        SOCKBUF_LOCK(sb);
        if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                SOCKBUF_UNLOCK(sb);
@@ -2105,14 +2117,14 @@ sendanother:
         * Write as much data as the socket permits, but no more than
         * a single sndbuf at a time.
         */
-       m->m_len = sbspace(sb);
-       if (m->m_len > ab->ps.len - job->aio_sent) {
-               m->m_len = ab->ps.len - job->aio_sent;
+       len = sbspace(sb);
+       if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
+               len = job->uaiocb.aio_nbytes - job->aio_sent;
                moretocome = false;
        } else
                moretocome = true;
-       if (m->m_len > sc->tt.sndbuf) {
-               m->m_len = sc->tt.sndbuf;
+       if (len > sc->tt.sndbuf) {
+               len = sc->tt.sndbuf;
                sendmore = true;
        } else
                sendmore = false;
@@ -2120,8 +2132,15 @@ sendanother:
        if (!TAILQ_EMPTY(&toep->aiotx_jobq))
                moretocome = true;
        SOCKBUF_UNLOCK(sb);
-       MPASS(m->m_len != 0);
+       MPASS(len != 0);
 
+       m = alloc_aiotx_mbuf(job, len);
+       if (m == NULL) {
+               sbunlock(sb);
+               error = EFAULT;
+               goto out;
+       }
+
        /* Inlined tcp_usr_send(). */
 
        inp = toep->inp;
@@ -2133,12 +2152,8 @@ sendanother:
                goto out;
        }
 
-       refcount_acquire(&ab->refcount);
-       m_extadd(m, NULL, ab->ps.len, t4_aiotx_mbuf_free, ab,
-           (void *)(uintptr_t)job->aio_sent, 0, EXT_NET_DRV);
-       m->m_ext.ext_flags |= EXT_FLAG_AIOTX;
-       job->aio_sent += m->m_len;
-       
+       job->aio_sent += m_length(m, NULL);
+
        sbappendstream(sb, m, 0);
        m = NULL;
 
@@ -2160,8 +2175,8 @@ sendanother:
                goto out;
 
        /*
-        * If this is a non-blocking socket and the request has not
-        * been fully completed, requeue it until the socket is ready
+        * If this is a blocking socket and the request has not been
+        * fully completed, requeue it until the socket is ready
         * again.
         */
        if (job->aio_sent < job->uaiocb.aio_nbytes &&
@@ -2177,22 +2192,18 @@ sendanother:
        }
 
        /*
-        * If the request will not be requeued, drop a reference on
-        * the aiotx buffer.  Any mbufs in flight should still
-        * contain a reference, but this drops the reference that the
-        * job owns while it is waiting to queue mbufs to the socket.
+        * If the request will not be requeued, drop the queue's
+        * reference to the job.  Any mbufs in flight should still
+        * hold a reference, but this drops the reference that the
+        * queue owns while it is waiting to queue mbufs to the
+        * socket.
         */
-       free_aiotx_buffer(ab);
+       aiotx_free_job(job);
 
 out:
        if (error) {
-               if (ab != NULL) {
-                       job->aio_error = error;
-                       free_aiotx_buffer(ab);
-               } else {
-                       MPASS(job->aio_sent == 0);
-                       aio_complete(job, -1, error);
-               }
+               job->aio_error = (void *)(intptr_t)error;
+               aiotx_free_job(job);
        }
        if (m != NULL)
                m_free(m);
@@ -2246,7 +2257,6 @@ t4_aiotx_queue_toep(struct socket *so, struct toepcb *
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
-       struct aiotx_buffer *ab;
        struct socket *so;
        struct sockbuf *sb;
        struct tcpcb *tp;
@@ -2263,11 +2273,8 @@ t4_aiotx_cancel(struct kaiocb *job)
                TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
        SOCKBUF_UNLOCK(sb);
 
-       ab = job->backend1;
-       if (ab != NULL)
-               free_aiotx_buffer(ab);
-       else
-               aio_cancel(job);
+       job->aio_error = (void *)(intptr_t)ECANCELED;
+       aiotx_free_job(job);
 }
 
 int
@@ -2293,6 +2300,7 @@ t4_aio_queue_aiotx(struct socket *so, struct kaiocb *j
 #endif
        if (!aio_set_cancel_function(job, t4_aiotx_cancel))
                panic("new job was cancelled");
+       refcount_init(&job->aio_refs, 1);
        TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
        if (sowriteable(so))
                t4_aiotx_queue_toep(so, toep);

Modified: head/sys/dev/cxgbe/tom/t4_tls.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tls.c     Wed Jul  3 09:51:59 2019        (r349648)
+++ head/sys/dev/cxgbe/tom/t4_tls.c     Wed Jul  3 16:06:11 2019        (r349649)
@@ -1193,7 +1193,6 @@ t4_push_tls_records(struct adapter *sc, struct toepcb 
 
                /* Read the header of the next TLS record. */
                sndptr = sbsndmbuf(sb, tls_ofld->sb_off, &sndptroff);
-               MPASS(!IS_AIOTX_MBUF(sndptr));
                m_copydata(sndptr, sndptroff, sizeof(thdr), (caddr_t)&thdr);
                tls_size = htons(thdr.length);
                plen = TLS_HEADER_LENGTH + tls_size;

Modified: head/sys/dev/cxgbe/tom/t4_tom.h
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.h     Wed Jul  3 09:51:59 2019        (r349648)
+++ head/sys/dev/cxgbe/tom/t4_tom.h     Wed Jul  3 16:06:11 2019        (r349649)
@@ -127,11 +127,6 @@ TAILQ_HEAD(pagesetq, pageset);
 #define        PS_WIRED                0x0001  /* Pages wired rather than held. */
 #define        PS_PPODS_WRITTEN        0x0002  /* Page pods written to the card. */
 
-#define        EXT_FLAG_AIOTX          EXT_FLAG_VENDOR1
-
-#define        IS_AIOTX_MBUF(m)                                                \
-       ((m)->m_flags & M_EXT && (m)->m_ext.ext_flags & EXT_FLAG_AIOTX)
-
 struct ddp_buffer {
        struct pageset *ps;
 
@@ -151,12 +146,6 @@ struct ddp_pcb {
        struct task requeue_task;
        struct kaiocb *queueing;
        struct mtx lock;
-};
-
-struct aiotx_buffer {
-       struct pageset ps;
-       struct kaiocb *job;
-       int refcount;
 };
 
 struct toepcb {