Re: arm64 bus_dmamap_sync(9)

2017-05-05 Thread Patrick Wildt
On Fri, May 05, 2017 at 03:19:07PM +0200, Mark Kettenis wrote:
> To support sdhc(4) on the Firefly-RK3399, I actually need a working
> bus_dmamap_sync(9) implementation that does proper cache flushes.  So
> far we've gotten away with not having one, because the USB stack tends
> to set the BUS_DMA_COHERENT flag which maps memory into uncached
> address space such that the cache flushes can be skipped.
> 
> Diff below follows the approach taken by armv7.  I've left out the
> optimization where BUS_DMASYNC_PREREAD invalidates complete cache
> lines instead of writing them back.  I'm far from certain that this
> optimization actually matters on your typical ARMv8 CPU.
> 
> ok?

ok patrick@

> 
> 
> Index: bus_dma.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/arm64/arm64/bus_dma.c,v
> retrieving revision 1.6
> diff -u -p -r1.6 bus_dma.c
> --- bus_dma.c 22 Feb 2017 22:55:27 -  1.6
> +++ bus_dma.c 5 May 2017 13:06:50 -
> @@ -63,9 +63,9 @@
>  
>  #include 
>  
> -#include 
> -
>  #include 
> +#include 
> +#include 
>  
>  /*
>   * Common function for DMA map creation.  May be called by bus-specific
> @@ -351,6 +351,33 @@ _dmamap_unload(bus_dma_tag_t t, bus_dmam
>   map->dm_mapsize = 0;
>  }
>  
> +static void
> +_dmamap_sync_segment(vaddr_t va, vsize_t len, int ops)
> +{
> + switch (ops) {
> + case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
> + case BUS_DMASYNC_PREREAD:
> + cpu_dcache_wbinv_range(va, len);
> + break;
> +
> + case BUS_DMASYNC_PREWRITE:
> + cpu_dcache_wb_range(va, len);
> + break;
> +
> + /*
> +  * Cortex CPUs can do speculative loads so we need to clean the cache
> +  * after a DMA read to deal with any speculatively loaded cache lines.
> +  * Since these can't be dirty, we can just invalidate them and don't
> +  * have to worry about having to write back their contents.
> +  */
> + case BUS_DMASYNC_POSTREAD:
> + case BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE:
> + membar_sync();
> + cpu_dcache_inv_range(va, len);
> + break;
> + }
> +}
> +
>  /*
>   * Common function for DMA map synchronization.  May be called
>   * by bus-specific DMA map synchronization functions.
> @@ -376,12 +403,10 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
>   curseg = 0;
>  
>   while (size && nsegs) {
> - paddr_t paddr;
>   vaddr_t vaddr;
>   bus_size_t ssize;
>  
>   ssize = map->dm_segs[curseg].ds_len;
> - paddr = map->dm_segs[curseg]._ds_paddr;
>   vaddr = map->dm_segs[curseg]._ds_vaddr;
>  
>   if (addr != 0) {
> @@ -390,7 +415,6 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
>   ssize = 0;
>   } else {
>   vaddr += addr;
> - paddr += addr;
>   ssize -= addr;
>   addr = 0;
>   }
> @@ -399,21 +423,7 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
>   ssize = size;
>  
>   if (ssize != 0) {
> - /*
> -  * If only PREWRITE is requested, writeback.
> -  * PREWRITE with PREREAD writebacks
> -  * and invalidates (if noncoherent) *all* cache levels.
> -  * Otherwise, just invalidate (if noncoherent).
> -  */
> - if (op & BUS_DMASYNC_PREWRITE) {
> - if (op & BUS_DMASYNC_PREREAD)
> - ; // XXX MUST ADD CACHEFLUSHING
> - else
> - ; // XXX MUST ADD CACHEFLUSHING
> - } else
> - if (op & (BUS_DMASYNC_PREREAD | BUS_DMASYNC_POSTREAD)) {
> - ; // XXX MUST ADD CACHEFLUSHING
> - }
> + _dmamap_sync_segment(vaddr, ssize, op);
>   size -= ssize;
>   }
>   curseg++;
> 



Re: arm64 bus_dmamap_sync(9)

2017-05-05 Thread Jonathan Gray
On Fri, May 05, 2017 at 03:19:07PM +0200, Mark Kettenis wrote:
> To support sdhc(4) on the Firefly-RK3399, I actually need a working
> bus_dmamap_sync(9) implementation that does proper cache flushes.  So
> far we've gotten away with not having one, because the USB stack tends
> to set the BUS_DMA_COHERENT flag which maps memory into uncached
> address space such that the cache flushes can be skipped.
> 
> Diff below follows the approach taken by armv7.  I've left out the
> optimization where BUS_DMASYNC_PREREAD invalidates complete cache
> lines instead of writing them back.  I'm far from certain that this
> optimization actually matters on your typical ARMv8 CPU.
> 
> ok?

ok jsg@

> 
> 
> Index: bus_dma.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/arm64/arm64/bus_dma.c,v
> retrieving revision 1.6
> diff -u -p -r1.6 bus_dma.c
> --- bus_dma.c 22 Feb 2017 22:55:27 -  1.6
> +++ bus_dma.c 5 May 2017 13:06:50 -
> @@ -63,9 +63,9 @@
>  
>  #include 
>  
> -#include 
> -
>  #include 
> +#include 
> +#include 
>  
>  /*
>   * Common function for DMA map creation.  May be called by bus-specific
> @@ -351,6 +351,33 @@ _dmamap_unload(bus_dma_tag_t t, bus_dmam
>   map->dm_mapsize = 0;
>  }
>  
> +static void
> +_dmamap_sync_segment(vaddr_t va, vsize_t len, int ops)
> +{
> + switch (ops) {
> + case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
> + case BUS_DMASYNC_PREREAD:
> + cpu_dcache_wbinv_range(va, len);
> + break;
> +
> + case BUS_DMASYNC_PREWRITE:
> + cpu_dcache_wb_range(va, len);
> + break;
> +
> + /*
> +  * Cortex CPUs can do speculative loads so we need to clean the cache
> +  * after a DMA read to deal with any speculatively loaded cache lines.
> +  * Since these can't be dirty, we can just invalidate them and don't
> +  * have to worry about having to write back their contents.
> +  */
> + case BUS_DMASYNC_POSTREAD:
> + case BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE:
> + membar_sync();
> + cpu_dcache_inv_range(va, len);
> + break;
> + }
> +}
> +
>  /*
>   * Common function for DMA map synchronization.  May be called
>   * by bus-specific DMA map synchronization functions.
> @@ -376,12 +403,10 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
>   curseg = 0;
>  
>   while (size && nsegs) {
> - paddr_t paddr;
>   vaddr_t vaddr;
>   bus_size_t ssize;
>  
>   ssize = map->dm_segs[curseg].ds_len;
> - paddr = map->dm_segs[curseg]._ds_paddr;
>   vaddr = map->dm_segs[curseg]._ds_vaddr;
>  
>   if (addr != 0) {
> @@ -390,7 +415,6 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
>   ssize = 0;
>   } else {
>   vaddr += addr;
> - paddr += addr;
>   ssize -= addr;
>   addr = 0;
>   }
> @@ -399,21 +423,7 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
>   ssize = size;
>  
>   if (ssize != 0) {
> - /*
> -  * If only PREWRITE is requested, writeback.
> -  * PREWRITE with PREREAD writebacks
> -  * and invalidates (if noncoherent) *all* cache levels.
> -  * Otherwise, just invalidate (if noncoherent).
> -  */
> - if (op & BUS_DMASYNC_PREWRITE) {
> - if (op & BUS_DMASYNC_PREREAD)
> - ; // XXX MUST ADD CACHEFLUSHING
> - else
> - ; // XXX MUST ADD CACHEFLUSHING
> - } else
> - if (op & (BUS_DMASYNC_PREREAD | BUS_DMASYNC_POSTREAD)) {
> - ; // XXX MUST ADD CACHEFLUSHING
> - }
> + _dmamap_sync_segment(vaddr, ssize, op);
>   size -= ssize;
>   }
>   curseg++;
> 



arm64 bus_dmamap_sync(9)

2017-05-05 Thread Mark Kettenis
To support sdhc(4) on the Firefly-RK3399, I actually need a working
bus_dmamap_sync(9) implementation that does proper cache flushes.  So
far we've gotten away with not having one, because the USB stack tends
to set the BUS_DMA_COHERENT flag which maps memory into uncached
address space such that the cache flushes can be skipped.

Diff below follows the approach taken by armv7.  I've left out the
optimization where BUS_DMASYNC_PREREAD invalidates complete cache
lines instead of writing them back.  I'm far from certain that this
optimization actually matters on your typical ARMv8 CPU.

ok?


Index: bus_dma.c
===================================================================
RCS file: /cvs/src/sys/arch/arm64/arm64/bus_dma.c,v
retrieving revision 1.6
diff -u -p -r1.6 bus_dma.c
--- bus_dma.c   22 Feb 2017 22:55:27 -  1.6
+++ bus_dma.c   5 May 2017 13:06:50 -
@@ -63,9 +63,9 @@
 
 #include 
 
-#include 
-
 #include 
+#include 
+#include 
 
 /*
  * Common function for DMA map creation.  May be called by bus-specific
@@ -351,6 +351,33 @@ _dmamap_unload(bus_dma_tag_t t, bus_dmam
map->dm_mapsize = 0;
 }
 
+static void
+_dmamap_sync_segment(vaddr_t va, vsize_t len, int ops)
+{
+   switch (ops) {
+   case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
+   case BUS_DMASYNC_PREREAD:
+   cpu_dcache_wbinv_range(va, len);
+   break;
+
+   case BUS_DMASYNC_PREWRITE:
+   cpu_dcache_wb_range(va, len);
+   break;
+
+   /*
+* Cortex CPUs can do speculative loads so we need to clean the cache
+* after a DMA read to deal with any speculatively loaded cache lines.
+* Since these can't be dirty, we can just invalidate them and don't
+* have to worry about having to write back their contents.
+*/
+   case BUS_DMASYNC_POSTREAD:
+   case BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE:
+   membar_sync();
+   cpu_dcache_inv_range(va, len);
+   break;
+   }
+}
+
 /*
  * Common function for DMA map synchronization.  May be called
  * by bus-specific DMA map synchronization functions.
@@ -376,12 +403,10 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
curseg = 0;
 
while (size && nsegs) {
-   paddr_t paddr;
vaddr_t vaddr;
bus_size_t ssize;
 
ssize = map->dm_segs[curseg].ds_len;
-   paddr = map->dm_segs[curseg]._ds_paddr;
vaddr = map->dm_segs[curseg]._ds_vaddr;
 
if (addr != 0) {
@@ -390,7 +415,6 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
ssize = 0;
} else {
vaddr += addr;
-   paddr += addr;
ssize -= addr;
addr = 0;
}
@@ -399,21 +423,7 @@ _dmamap_sync(bus_dma_tag_t t, bus_dmamap
ssize = size;
 
if (ssize != 0) {
-   /*
-* If only PREWRITE is requested, writeback.
-* PREWRITE with PREREAD writebacks
-* and invalidates (if noncoherent) *all* cache levels.
-* Otherwise, just invalidate (if noncoherent).
-*/
-   if (op & BUS_DMASYNC_PREWRITE) {
-   if (op & BUS_DMASYNC_PREREAD)
-   ; // XXX MUST ADD CACHEFLUSHING
-   else
-   ; // XXX MUST ADD CACHEFLUSHING
-   } else
-   if (op & (BUS_DMASYNC_PREREAD | BUS_DMASYNC_POSTREAD)) {
-   ; // XXX MUST ADD CACHEFLUSHING
-   }
+   _dmamap_sync_segment(vaddr, ssize, op);
size -= ssize;
}
curseg++;