Re: arm64 bus_dmamap_sync(9)
On Fri, May 05, 2017 at 03:19:07PM +0200, Mark Kettenis wrote: > To support sdhc(4) on the Firefly-RK3399, I actually need a working > bus_dmamap_sync(9) implementation that does proper cache flushes. So > far we've gotten away with not having one, because the USB stack tends > to set the BUS_DMA_COHERENT flag which maps memory into uncached > address space such that the cache flushes can be skipped. > > Diff below follows the approach taken by armv7. I've left out the > optimization where BUS_DMASYNC_PREREAD invalidates complete cache > lines instead of writing them back. I'm far from certain that this > optimization actually matters on your typical ARMv8 CPU. > > ok? ok patrick@ > > > Index: bus_dma.c > === > RCS file: /cvs/src/sys/arch/arm64/arm64/bus_dma.c,v > retrieving revision 1.6 > diff -u -p -r1.6 bus_dma.c > --- bus_dma.c 22 Feb 2017 22:55:27 - 1.6 > +++ bus_dma.c 5 May 2017 13:06:50 - > @@ -63,9 +63,9 @@ > > #include > > -#include > - > #include > +#include > +#include > > /* > * Common function for DMA map creation. May be called by bus-specific > @@ -351,6 +351,33 @@ _dmamap_unload(bus_dma_tag_t t, bus_dmam > map->dm_mapsize = 0; > } > > +static void > +_dmamap_sync_segment(vaddr_t va, vsize_t len, int ops) > +{ > + switch (ops) { > + case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE: > + case BUS_DMASYNC_PREREAD: > + cpu_dcache_wbinv_range(va, len); > + break; > + > + case BUS_DMASYNC_PREWRITE: > + cpu_dcache_wb_range(va, len); > + break; > + > + /* > + * Cortex CPUs can do speculative loads so we need to clean the cache > + * after a DMA read to deal with any speculatively loaded cache lines. > + * Since these can't be dirty, we can just invalidate them and don't > + * have to worry about having to write back their contents. > + */ > + case BUS_DMASYNC_POSTREAD: > + case BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE: > + membar_sync(); > + cpu_dcache_inv_range(va, len); > + break; > + } > +} > + > /* > * Common function for DMA map synchronization. May be called > * by bus-specific DMA map synchronization functions. > @@ -376,12 +403,10 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap > curseg = 0; > > while (size && nsegs) { > - paddr_t paddr; > vaddr_t vaddr; > bus_size_t ssize; > > ssize = map->dm_segs[curseg].ds_len; > - paddr = map->dm_segs[curseg]._ds_paddr; > vaddr = map->dm_segs[curseg]._ds_vaddr; > > if (addr != 0) { > @@ -390,7 +415,6 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap > ssize = 0; > } else { > vaddr += addr; > - paddr += addr; > ssize -= addr; > addr = 0; > } > @@ -399,21 +423,7 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap > ssize = size; > > if (ssize != 0) { > - /* > - * If only PREWRITE is requested, writeback. > - * PREWRITE with PREREAD writebacks > - * and invalidates (if noncoherent) *all* cache levels. > - * Otherwise, just invalidate (if noncoherent). > - */ > - if (op & BUS_DMASYNC_PREWRITE) { > - if (op & BUS_DMASYNC_PREREAD) > - ; // XXX MUST ADD CACHEFLUSHING > - else > - ; // XXX MUST ADD CACHEFLUSHING > - } else > - if (op & (BUS_DMASYNC_PREREAD | BUS_DMASYNC_POSTREAD)) { > - ; // XXX MUST ADD CACHEFLUSHING > - } > + _dmamap_sync_segment(vaddr, ssize, op); > size -= ssize; > } > curseg++; >
Re: arm64 bus_dmamap_sync(9)
On Fri, May 05, 2017 at 03:19:07PM +0200, Mark Kettenis wrote: > To support sdhc(4) on the Firefly-RK3399, I actually need a working > bus_dmamap_sync(9) implementation that does proper cache flushes. So > far we've gotten away with not having one, because the USB stack tends > to set the BUS_DMA_COHERENT flag which maps memory into uncached > address space such that the cache flushes can be skipped. > > Diff below follows the approach taken by armv7. I've left out the > optimization where BUS_DMASYNC_PREREAD invalidates complete cache > lines instead of writing them back. I'm far from certain that this > optimization actually matters on your typical ARMv8 CPU. > > ok? ok jsg@ > > > Index: bus_dma.c > === > RCS file: /cvs/src/sys/arch/arm64/arm64/bus_dma.c,v > retrieving revision 1.6 > diff -u -p -r1.6 bus_dma.c > --- bus_dma.c 22 Feb 2017 22:55:27 - 1.6 > +++ bus_dma.c 5 May 2017 13:06:50 - > @@ -63,9 +63,9 @@ > > #include > > -#include > - > #include > +#include > +#include > > /* > * Common function for DMA map creation. May be called by bus-specific > @@ -351,6 +351,33 @@ _dmamap_unload(bus_dma_tag_t t, bus_dmam > map->dm_mapsize = 0; > } > > +static void > +_dmamap_sync_segment(vaddr_t va, vsize_t len, int ops) > +{ > + switch (ops) { > + case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE: > + case BUS_DMASYNC_PREREAD: > + cpu_dcache_wbinv_range(va, len); > + break; > + > + case BUS_DMASYNC_PREWRITE: > + cpu_dcache_wb_range(va, len); > + break; > + > + /* > + * Cortex CPUs can do speculative loads so we need to clean the cache > + * after a DMA read to deal with any speculatively loaded cache lines. > + * Since these can't be dirty, we can just invalidate them and don't > + * have to worry about having to write back their contents. > + */ > + case BUS_DMASYNC_POSTREAD: > + case BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE: > + membar_sync(); > + cpu_dcache_inv_range(va, len); > + break; > + } > +} > + > /* > * Common function for DMA map synchronization. May be called > * by bus-specific DMA map synchronization functions. > @@ -376,12 +403,10 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap > curseg = 0; > > while (size && nsegs) { > - paddr_t paddr; > vaddr_t vaddr; > bus_size_t ssize; > > ssize = map->dm_segs[curseg].ds_len; > - paddr = map->dm_segs[curseg]._ds_paddr; > vaddr = map->dm_segs[curseg]._ds_vaddr; > > if (addr != 0) { > @@ -390,7 +415,6 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap > ssize = 0; > } else { > vaddr += addr; > - paddr += addr; > ssize -= addr; > addr = 0; > } > @@ -399,21 +423,7 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap > ssize = size; > > if (ssize != 0) { > - /* > - * If only PREWRITE is requested, writeback. > - * PREWRITE with PREREAD writebacks > - * and invalidates (if noncoherent) *all* cache levels. > - * Otherwise, just invalidate (if noncoherent). > - */ > - if (op & BUS_DMASYNC_PREWRITE) { > - if (op & BUS_DMASYNC_PREREAD) > - ; // XXX MUST ADD CACHEFLUSHING > - else > - ; // XXX MUST ADD CACHEFLUSHING > - } else > - if (op & (BUS_DMASYNC_PREREAD | BUS_DMASYNC_POSTREAD)) { > - ; // XXX MUST ADD CACHEFLUSHING > - } > + _dmamap_sync_segment(vaddr, ssize, op); > size -= ssize; > } > curseg++; >
arm64 bus_dmamap_sync(9)
To support sdhc(4) on the Firefly-RK3399, I actually need a working bus_dmamap_sync(9) implementation that does proper cache flushes. So far we've gotten away with not having one, because the USB stack tends to set the BUS_DMA_COHERENT flag which maps memory into uncached address space such that the cache flushes can be skipped. Diff below follows the approach taken by armv7. I've left out the optimization where BUS_DMASYNC_PREREAD invalidates complete cache lines instead of writing them back. I'm far from certain that this optimization actually matters on your typical ARMv8 CPU. ok? Index: bus_dma.c === RCS file: /cvs/src/sys/arch/arm64/arm64/bus_dma.c,v retrieving revision 1.6 diff -u -p -r1.6 bus_dma.c --- bus_dma.c 22 Feb 2017 22:55:27 - 1.6 +++ bus_dma.c 5 May 2017 13:06:50 - @@ -63,9 +63,9 @@ #include -#include - #include +#include +#include /* * Common function for DMA map creation. May be called by bus-specific @@ -351,6 +351,33 @@ _dmamap_unload(bus_dma_tag_t t, bus_dmam map->dm_mapsize = 0; } +static void +_dmamap_sync_segment(vaddr_t va, vsize_t len, int ops) +{ + switch (ops) { + case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE: + case BUS_DMASYNC_PREREAD: + cpu_dcache_wbinv_range(va, len); + break; + + case BUS_DMASYNC_PREWRITE: + cpu_dcache_wb_range(va, len); + break; + + /* +* Cortex CPUs can do speculative loads so we need to clean the cache +* after a DMA read to deal with any speculatively loaded cache lines. +* Since these can't be dirty, we can just invalidate them and don't +* have to worry about having to write back their contents. +*/ + case BUS_DMASYNC_POSTREAD: + case BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE: + membar_sync(); + cpu_dcache_inv_range(va, len); + break; + } +} + /* * Common function for DMA map synchronization. May be called * by bus-specific DMA map synchronization functions. @@ -376,12 +403,10 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap curseg = 0; while (size && nsegs) { - paddr_t paddr; vaddr_t vaddr; bus_size_t ssize; ssize = map->dm_segs[curseg].ds_len; - paddr = map->dm_segs[curseg]._ds_paddr; vaddr = map->dm_segs[curseg]._ds_vaddr; if (addr != 0) { @@ -390,7 +415,6 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap ssize = 0; } else { vaddr += addr; - paddr += addr; ssize -= addr; addr = 0; } @@ -399,21 +423,7 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap ssize = size; if (ssize != 0) { - /* -* If only PREWRITE is requested, writeback. -* PREWRITE with PREREAD writebacks -* and invalidates (if noncoherent) *all* cache levels. -* Otherwise, just invalidate (if noncoherent). -*/ - if (op & BUS_DMASYNC_PREWRITE) { - if (op & BUS_DMASYNC_PREREAD) - ; // XXX MUST ADD CACHEFLUSHING - else - ; // XXX MUST ADD CACHEFLUSHING - } else - if (op & (BUS_DMASYNC_PREREAD | BUS_DMASYNC_POSTREAD)) { - ; // XXX MUST ADD CACHEFLUSHING - } + _dmamap_sync_segment(vaddr, ssize, op); size -= ssize; } curseg++;