Hi,

I've spent a few days investigating why USB ethernet adapters are so
horribly slow on my ARM machines.  Using dt(4) I saw that the kernel
was spending most of its time in memcpy.  But why?  As it turns out,
all USB data buffers are mapped COHERENT, which on some/most ARM SoCs
means uncached.  Using cached data buffers raises the throughput from
20 Mbit/s to 200 Mbit/s.  Quite a difference.

sys/dev/usb/usb_mem.c:
        error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
                               &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);

On x86, COHERENT is essentially a no-op.  On ARM, it depends on the SoC.
Some SoCs have cache-coherent USB controllers, some don't.  Mine does
not, so a COHERENT mapping means uncached memory and thus slow access.
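
For context, here is a minimal sketch of the sync discipline that
cacheable DMA buffers need on non-coherent hardware.  These are the
generic bus_dma(9) calls; tag, map and len are placeholders, not code
from our tree:

        /* CPU -> device: make the CPU's writes visible before the
         * device reads the buffer (DMA out). */
        bus_dmamap_sync(tag, map, 0, len, BUS_DMASYNC_PREWRITE);
        /* ... device reads the buffer ... */
        bus_dmamap_sync(tag, map, 0, len, BUS_DMASYNC_POSTWRITE);

        /* Device -> CPU: get rid of stale cache lines around the DMA,
         * so the CPU reads what the device actually wrote (DMA in). */
        bus_dmamap_sync(tag, map, 0, len, BUS_DMASYNC_PREREAD);
        /* ... device writes the buffer ... */
        bus_dmamap_sync(tag, map, 0, len, BUS_DMASYNC_POSTREAD);

With a COHERENT (uncached) mapping none of this is needed, but every
single CPU access to the buffer pays the uncached price instead.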

Why do we do that?  Well, when the code was imported in '99, it was
already there.  Since then we have gained infrastructure for DMA syncs
in the USB stack, and I think those syncs are done properly.

sys/dev/usb/usbdi.c - usbd_transfer() (before transfer)

        if (!usbd_xfer_isread(xfer)) {
                if ((xfer->flags & USBD_NO_COPY) == 0)
                        memcpy(KERNADDR(&xfer->dmabuf, 0), xfer->buffer,
                            xfer->length);
                usb_syncmem(&xfer->dmabuf, 0, xfer->length,
                    BUS_DMASYNC_PREWRITE);
        } else
                usb_syncmem(&xfer->dmabuf, 0, xfer->length,
                    BUS_DMASYNC_PREREAD);
        err = pipe->methods->transfer(xfer);

sys/dev/usb/usbdi.c - usb_transfer_complete() (after transfer)

        if (xfer->actlen != 0) {
                if (usbd_xfer_isread(xfer)) {
                        usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
                            BUS_DMASYNC_POSTREAD);
                        if (!(xfer->flags & USBD_NO_COPY))
                                memcpy(xfer->buffer, KERNADDR(&xfer->dmabuf, 0),
                                    xfer->actlen);
                } else
                        usb_syncmem(&xfer->dmabuf, 0, xfer->actlen,
                            BUS_DMASYNC_POSTWRITE);
        }

We cannot just remove COHERENT, since some drivers, like ehci(4), use
the same backend to allocate their rings, and I can't vouch that those
drivers do all the syncs they would need on cacheable memory.

As a first step, I would like to go ahead with another solution, based
on a diff from Marius Strobl, who added those syncs in the first place.
Essentially it splits the memory handling into cacheable and
non-cacheable blocks.  USB data transfers and everything that uses
usbd_alloc_buffer() then get cacheable buffers, while code like ehci(4)
still doesn't.  This is a safer approach imho, since it leaves the
controller drivers untouched but still speeds up the data buffers.
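
Concretely, the new flag decides at allocation time which kind of
mapping a block gets.  A rough sketch of the two call paths with this
diff applied (simplified; variable names and the controller-side line
are placeholders, not copied from ehci(4)):

        /* Transfer data buffers -- usbd_transfer()'s auto-allocated
         * dmabuf and usbd_alloc_buffer() -- set the flag and end up
         * with cacheable mappings that get synced around each
         * transfer: */
        xfer->dmabuf.flags |= USB_DMA_XFER_BUFFER;
        err = usb_allocmem(bus, size, 0, &xfer->dmabuf);

        /* Controller-internal memory (rings, QHs, TDs) is allocated
         * without the flag and keeps its BUS_DMA_COHERENT mapping: */
        err = usb_allocmem(&sc->sc_bus, size, align, &dma);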

Once we have verified that there are no regressions, we can adjust
ehci(4) and the like, add proper syncs, make sure they still work as
well as before, and maybe then back out this split again.
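
To illustrate what "proper syncs" would mean there, here is roughly the
bracket ehci(4) would need around every descriptor it shares with the
host controller once that memory becomes cacheable.  The variable names
below are placeholders, not actual ehci(4) code:

        /* Before handing a descriptor to the host controller: flush
         * the CPU's writes and drop stale lines, since the controller
         * will both read and update it. */
        usb_syncmem(&desc_dma, desc_off, desc_len,
            BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);

        /* ... host controller processes the descriptor ... */

        /* Before the CPU inspects the status the controller wrote
         * back: */
        usb_syncmem(&desc_dma, desc_off, desc_len,
            BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);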

Note that this is all a no-op on x86, but all the other archs will
benefit from it.

ok?

Patrick

diff --git a/sys/dev/usb/usb_mem.c b/sys/dev/usb/usb_mem.c
index c65906b43f4..95993093b5a 100644
--- a/sys/dev/usb/usb_mem.c
+++ b/sys/dev/usb/usb_mem.c
@@ -72,7 +72,7 @@ struct usb_frag_dma {
 };
 
 usbd_status    usb_block_allocmem(bus_dma_tag_t, size_t, size_t,
-                   struct usb_dma_block **);
+                   struct usb_dma_block **, int);
 void           usb_block_freemem(struct usb_dma_block *);
 
 LIST_HEAD(, usb_dma_block) usb_blk_freelist =
@@ -84,7 +84,7 @@ LIST_HEAD(, usb_frag_dma) usb_frag_freelist =
 
 usbd_status
 usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
-    struct usb_dma_block **dmap)
+    struct usb_dma_block **dmap, int cacheable)
 {
        int error;
         struct usb_dma_block *p;
@@ -96,7 +96,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
        s = splusb();
        /* First check the free list. */
        for (p = LIST_FIRST(&usb_blk_freelist); p; p = LIST_NEXT(p, next)) {
-               if (p->tag == tag && p->size >= size && p->align >= align) {
+               if (p->tag == tag && p->size >= size && p->align >= align &&
+                   p->cacheable == cacheable) {
                        LIST_REMOVE(p, next);
                        usb_blk_nfree--;
                        splx(s);
@@ -116,6 +117,7 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
        p->tag = tag;
        p->size = size;
        p->align = align;
+       p->cacheable = cacheable;
        error = bus_dmamem_alloc(tag, p->size, align, 0,
                                 p->segs, nitems(p->segs),
                                 &p->nsegs, BUS_DMA_NOWAIT);
@@ -123,7 +125,8 @@ usb_block_allocmem(bus_dma_tag_t tag, size_t size, size_t align,
                goto free0;
 
        error = bus_dmamem_map(tag, p->segs, p->nsegs, p->size,
-                              &p->kaddr, BUS_DMA_NOWAIT|BUS_DMA_COHERENT);
+                              &p->kaddr, BUS_DMA_NOWAIT | (cacheable ? 0 :
+                              BUS_DMA_COHERENT));
        if (error)
                goto free1;
 
@@ -187,14 +190,18 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
        usbd_status err;
        struct usb_frag_dma *f;
        struct usb_dma_block *b;
+       int cacheable;
        int i;
        int s;
 
+       cacheable = !!(p->flags & USB_DMA_XFER_BUFFER);
+
        /* If the request is large then just use a full block. */
        if (size > USB_MEM_SMALL || align > USB_MEM_SMALL) {
                DPRINTFN(1, ("%s: large alloc %d\n", __func__, (int)size));
                size = (size + USB_MEM_BLOCK - 1) & ~(USB_MEM_BLOCK - 1);
-               err = usb_block_allocmem(tag, size, align, &p->block);
+               err = usb_block_allocmem(tag, size, align, &p->block,
+                   cacheable);
                if (!err) {
                        p->block->frags = NULL;
                        p->offs = 0;
@@ -205,11 +212,12 @@ usb_allocmem(struct usbd_bus *bus, size_t size, size_t align, struct usb_dma *p)
        s = splusb();
        /* Check for free fragments. */
        for (f = LIST_FIRST(&usb_frag_freelist); f; f = LIST_NEXT(f, next))
-               if (f->block->tag == tag)
+               if (f->block->tag == tag && f->block->cacheable == cacheable)
                        break;
        if (f == NULL) {
                DPRINTFN(1, ("usb_allocmem: adding fragments\n"));
-               err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL,&b);
+               err = usb_block_allocmem(tag, USB_MEM_BLOCK, USB_MEM_SMALL, &b,
+                   cacheable);
                if (err) {
                        splx(s);
                        return (err);
diff --git a/sys/dev/usb/usb_mem.h b/sys/dev/usb/usb_mem.h
index 1ae933f4bd9..f4d0bb85bbd 100644
--- a/sys/dev/usb/usb_mem.h
+++ b/sys/dev/usb/usb_mem.h
@@ -40,6 +40,7 @@ struct usb_dma_block {
         caddr_t kaddr;
         bus_dma_segment_t segs[1];
         int nsegs;
+       int cacheable;
         size_t size;
         size_t align;
        struct usb_frag_dma *frags;
diff --git a/sys/dev/usb/usbdi.c b/sys/dev/usb/usbdi.c
index aba1220103f..5f750a2bbcb 100644
--- a/sys/dev/usb/usbdi.c
+++ b/sys/dev/usb/usbdi.c
@@ -305,6 +305,7 @@ usbd_transfer(struct usbd_xfer *xfer)
                if (xfer->rqflags & URQ_AUTO_DMABUF)
                        printf("usbd_transfer: has old buffer!\n");
 #endif
+               xfer->dmabuf.flags |= USB_DMA_XFER_BUFFER;
                err = usb_allocmem(bus, xfer->length, 0, &xfer->dmabuf);
                if (err)
                        return (err);
@@ -389,6 +390,7 @@ usbd_alloc_buffer(struct usbd_xfer *xfer, u_int32_t size)
        if (xfer->rqflags & (URQ_DEV_DMABUF | URQ_AUTO_DMABUF))
                printf("usbd_alloc_buffer: xfer already has a buffer\n");
 #endif
+       xfer->dmabuf.flags |= USB_DMA_XFER_BUFFER;
        err = usb_allocmem(bus, size, 0, &xfer->dmabuf);
        if (err)
                return (NULL);
diff --git a/sys/dev/usb/usbdivar.h b/sys/dev/usb/usbdivar.h
index 6600c402979..11a6d52a0f9 100644
--- a/sys/dev/usb/usbdivar.h
+++ b/sys/dev/usb/usbdivar.h
@@ -47,6 +47,8 @@ struct usb_dma_block;
 struct usb_dma {
        struct usb_dma_block    *block;
        u_int                    offs;
+       int                      flags;
+#define USB_DMA_XFER_BUFFER     0x01
 };
 
 struct usbd_xfer;
