Re: [PATCH v3 3/4] arm64: Add IOMMU dma_ops

2015-07-15 Thread Catalin Marinas
On Wed, Jul 15, 2015 at 05:27:22PM +0100, Robin Murphy wrote:
> On 15/07/15 10:31, Catalin Marinas wrote:
> >On Fri, Jul 10, 2015 at 08:19:34PM +0100, Robin Murphy wrote:
> >>+   if (iommu_dma_mapping_error(dev, *handle)) {
> >>+   if (coherent)
> >>+   __free_pages(page, get_order(size));
> >>+   else
> >>+   __free_from_pool(addr, size);
> >>+   addr = NULL;
> >>+   }
> >>+   }
> >>+   }
> >>+   return addr;
> >>+}
> >
> >In the second case here (!__GFP_WAIT), do we do any cache maintenance? I
> >can't see it and it's needed for the !coherent case.
> 
> In the atomic non-coherent case, we're stealing from the atomic pool, so
> addr is already a non-cacheable alias (and alloc_from_pool does memset(0)
> through that). That shouldn't need anything extra, right?

You are right, we already flushed the cache for the atomic pool when we
allocated it in atomic_pool_init().
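For reference, the reason no extra maintenance is needed is that the pool's
backing pages are cleaned to memory exactly once at init and are only ever
handed out through their non-cacheable alias afterwards. A condensed sketch of
that setup follows; it is illustrative rather than the literal arm64
atomic_pool_init() (the pool size, the gen_pool bookkeeping and the error
unwinding are assumed, not copied from the source):

	static struct gen_pool *atomic_pool;
	static size_t atomic_pool_size = SZ_256K;	/* illustrative default */

	static int __init atomic_pool_init(void)
	{
		struct page *page;
		void *addr;

		page = alloc_pages(GFP_KERNEL, get_order(atomic_pool_size));
		if (!page)
			return -ENOMEM;

		/* Flush any cached lines for the pool's pages once, up front... */
		__dma_flush_range(page_address(page),
				  page_address(page) + atomic_pool_size);

		/* ...then only ever hand the memory out via a Normal-NC remap */
		addr = dma_common_contiguous_remap(page, atomic_pool_size,
						   VM_USERMAP,
						   __pgprot(PROT_NORMAL_NC),
						   atomic_pool_init);
		if (!addr)
			goto free_pages;

		atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
		if (!atomic_pool)
			goto free_remap;
		if (gen_pool_add_virt(atomic_pool, (unsigned long)addr,
				      page_to_phys(page), atomic_pool_size, -1))
			goto destroy_pool;
		return 0;

	destroy_pool:
		gen_pool_destroy(atomic_pool);
	free_remap:
		dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
	free_pages:
		__free_pages(page, get_order(atomic_pool_size));
		return -ENOMEM;
	}

Every pointer later returned by __alloc_from_pool() points into that Normal-NC
remap, so the memset() and any subsequent CPU accesses already go through a
non-cacheable mapping.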

-- 
Catalin


Re: [PATCH v3 3/4] arm64: Add IOMMU dma_ops

2015-07-15 Thread Robin Murphy

On 15/07/15 10:31, Catalin Marinas wrote:

> On Fri, Jul 10, 2015 at 08:19:34PM +0100, Robin Murphy wrote:
>> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
>> index d16a1ce..ccadfd4 100644
>> --- a/arch/arm64/mm/dma-mapping.c
>> +++ b/arch/arm64/mm/dma-mapping.c
>> @@ -526,3 +526,426 @@ static int __init dma_debug_do_init(void)
>> return 0;
>>   }
>>   fs_initcall(dma_debug_do_init);
>> +
>> +
>> +#ifdef CONFIG_IOMMU_DMA
>> +#include 
>> +#include 
>> +#include 
>> +
>> +/* Thankfully, all cache ops are by VA so we can ignore phys here */
>> +static void flush_page(const void *virt, phys_addr_t phys)
>> +{
>> +   __dma_flush_range(virt, virt + PAGE_SIZE);
>> +}
>> +
>> +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
>> +dma_addr_t *handle, gfp_t gfp,
>> +struct dma_attrs *attrs)
>> +{
>> +   bool coherent = is_device_dma_coherent(dev);
>> +   int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
>> +   void *addr;
>> +
>> +   if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
>> +   return NULL;
>> +
>> +   if (gfp & __GFP_WAIT) {
>> +   struct page **pages;
>> +   pgprot_t pgprot = coherent ? __pgprot(PROT_NORMAL) :
>> +__pgprot(PROT_NORMAL_NC);
>> +
>> +   pgprot = __get_dma_pgprot(attrs, pgprot, coherent);
>> +   pages = iommu_dma_alloc(dev, size, gfp, ioprot, coherent,
>> +   handle, coherent ? NULL : flush_page);
>
> As I replied already on the other patch, the "coherent" argument here
> should always be true.

> BTW, why do we need to call flush_page via iommu_dma_alloc() and not
> flush the buffer directly in the arch __iommu_alloc_attrs()? We already
> have the pointer and size after remapping in the CPU address space, and
> it would keep iommu_dma_alloc() simpler.


Mostly for the sake of moving arch/arm (and possibly other users) over 
later, where highmem and ATTR_NO_KERNEL_MAPPING make flushing the pages 
at the point of allocation seem the most sensible thing to do. Since 
iommu_dma_alloc already has a temporary scatterlist we can make use of 
the sg mapping iterator there, rather than have separate code to iterate 
over the pages (possibly with open-coded kmap/kunmap) in all the callers.
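For illustration, the per-page flush inside iommu_dma_alloc() can then be as
small as the fragment below. This is a sketch of the approach being described,
not the posted code; the helper name and the callback prototype (matching
flush_page() in this patch) are assumptions, and the mapping iterator takes
care of kmapping highmem pages where that matters.

	#include <linux/scatterlist.h>

	static void flush_sg_pages(struct scatterlist *sgl, unsigned int nents,
				   void (*flush_page)(const void *virt,
						      phys_addr_t phys))
	{
		struct sg_mapping_iter miter;

		/* Walk the temporary scatterlist a page at a time, mapping
		 * each page as needed, and flush it through that mapping. */
		sg_miter_start(&miter, sgl, nents, SG_MITER_FROM_SG);
		while (sg_miter_next(&miter))
			flush_page(miter.addr, page_to_phys(miter.page));
		sg_miter_stop(&miter);
	}

Done this way, none of the arch callers need their own page-walking or
open-coded kmap logic.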



>> +   if (!pages)
>> +   return NULL;
>> +
>> +   addr = dma_common_pages_remap(pages, size, VM_USERMAP, pgprot,
>> + __builtin_return_address(0));
>> +   if (!addr)
>> +   iommu_dma_free(dev, pages, size, handle);
>> +   } else {
>> +   struct page *page;
>> +   /*
>> +* In atomic context we can't remap anything, so we'll only
>> +* get the virtually contiguous buffer we need by way of a
>> +* physically contiguous allocation.
>> +*/
>> +   if (coherent) {
>> +   page = alloc_pages(gfp, get_order(size));
>> +   addr = page ? page_address(page) : NULL;
>
> We could even use __get_free_pages(gfp & ~__GFP_HIGHMEM) since we don't
> have/need highmem on arm64.


True, but then we'd have to dig the struct page back out to pass through 
to iommu_map_page.
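As a fragment, the variant under discussion would look roughly like this
(hypothetical, reusing the variables already in scope in
__iommu_alloc_attrs()):

		addr = (void *)__get_free_pages(gfp & ~__GFP_HIGHMEM,
						get_order(size));
		/* dig the struct page back out for iommu_dma_map_page() */
		page = addr ? virt_to_page(addr) : NULL;

so it ends up no simpler than the alloc_pages() plus page_address() pair in
the patch.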



>> +   } else {
>> +   addr = __alloc_from_pool(size, &page, gfp);
>> +   }
>> +   if (addr) {
>> +   *handle = iommu_dma_map_page(dev, page, 0, size,
>> +ioprot, false);
>
> Why coherent == false?


I'm not sure I even know any more, but either way it means the wrong 
thing as discussed earlier, so it'll be going away.



>> +   if (iommu_dma_mapping_error(dev, *handle)) {
>> +   if (coherent)
>> +   __free_pages(page, get_order(size));
>> +   else
>> +   __free_from_pool(addr, size);
>> +   addr = NULL;
>> +   }
>> +   }
>> +   }
>> +   return addr;
>> +}
>
> In the second case here (!__GFP_WAIT), do we do any cache maintenance? I
> can't see it and it's needed for the !coherent case.


In the atomic non-coherent case, we're stealing from the atomic pool, so 
addr is already a non-cacheable alias (and alloc_from_pool does 
memset(0) through that). That shouldn't need anything extra, right?



>> +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
>> +  dma_addr_t handle, struct dma_attrs *attrs)
>> +{
>> +   /*
>> +* @cpu_addr will be one of 3 things depending on how it was allocated:
>> +* - A remapped array of pages from iommu_dma_alloc(), for all
>> +*   non-atomic allocations.
>> +* - A non-cacheable alias from the atomic pool, for atomic
>> +*   allocations by non-coherent devices.
>> +* - A normal lowmem address

Re: [PATCH v3 3/4] arm64: Add IOMMU dma_ops

2015-07-15 Thread Catalin Marinas
On Fri, Jul 10, 2015 at 08:19:34PM +0100, Robin Murphy wrote:
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index d16a1ce..ccadfd4 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -526,3 +526,426 @@ static int __init dma_debug_do_init(void)
>   return 0;
>  }
>  fs_initcall(dma_debug_do_init);
> +
> +
> +#ifdef CONFIG_IOMMU_DMA
> +#include 
> +#include 
> +#include 
> +
> +/* Thankfully, all cache ops are by VA so we can ignore phys here */
> +static void flush_page(const void *virt, phys_addr_t phys)
> +{
> + __dma_flush_range(virt, virt + PAGE_SIZE);
> +}
> +
> +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
> +  dma_addr_t *handle, gfp_t gfp,
> +  struct dma_attrs *attrs)
> +{
> + bool coherent = is_device_dma_coherent(dev);
> + int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
> + void *addr;
> +
> + if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
> + return NULL;
> +
> + if (gfp & __GFP_WAIT) {
> + struct page **pages;
> + pgprot_t pgprot = coherent ? __pgprot(PROT_NORMAL) :
> +  __pgprot(PROT_NORMAL_NC);
> +
> + pgprot = __get_dma_pgprot(attrs, pgprot, coherent);
> + pages = iommu_dma_alloc(dev, size, gfp, ioprot, coherent,
> + handle, coherent ? NULL : flush_page);

As I replied already on the other patch, the "coherent" argument here
should always be true.

BTW, why do we need to call flush_page via iommu_dma_alloc() and not
flush the buffer directly in the arch __iommu_alloc_attrs()? We already
have the pointer and size after remapping in the CPU address space, and
it would keep iommu_dma_alloc() simpler.
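Concretely, the alternative being suggested would amount to something like the
fragment below in __iommu_alloc_attrs(), flushing the whole buffer through the
freshly created CPU mapping (a sketch of the idea, not a tested change; the
variables are those of the patch quoted above):

		addr = dma_common_pages_remap(pages, size, VM_USERMAP, pgprot,
					      __builtin_return_address(0));
		if (addr && !coherent)
			__dma_flush_range(addr, addr + size);

The follow-up discussion keeps the callback instead, since this variant relies
on a kernel mapping of the buffer always existing, which arch/arm highmem and
ATTR_NO_KERNEL_MAPPING cannot guarantee.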

> + if (!pages)
> + return NULL;
> +
> + addr = dma_common_pages_remap(pages, size, VM_USERMAP, pgprot,
> +   __builtin_return_address(0));
> + if (!addr)
> + iommu_dma_free(dev, pages, size, handle);
> + } else {
> + struct page *page;
> + /*
> +  * In atomic context we can't remap anything, so we'll only
> +  * get the virtually contiguous buffer we need by way of a
> +  * physically contiguous allocation.
> +  */
> + if (coherent) {
> + page = alloc_pages(gfp, get_order(size));
> + addr = page ? page_address(page) : NULL;

We could even use __get_free_pages(gfp & ~__GFP_HIGHMEM) since we don't
have/need highmem on arm64.

> + } else {
> + addr = __alloc_from_pool(size, &page, gfp);
> + }
> + if (addr) {
> + *handle = iommu_dma_map_page(dev, page, 0, size,
> +  ioprot, false);

Why coherent == false?

> + if (iommu_dma_mapping_error(dev, *handle)) {
> + if (coherent)
> + __free_pages(page, get_order(size));
> + else
> + __free_from_pool(addr, size);
> + addr = NULL;
> + }
> + }
> + }
> + return addr;
> +}

In the second case here (!__GFP_WAIT), do we do any cache maintenance? I
can't see it and it's needed for the !coherent case.

> +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
> +dma_addr_t handle, struct dma_attrs *attrs)
> +{
> + /*
> +  * @cpu_addr will be one of 3 things depending on how it was allocated:
> +  * - A remapped array of pages from iommu_dma_alloc(), for all
> +  *   non-atomic allocations.
> +  * - A non-cacheable alias from the atomic pool, for atomic
> +  *   allocations by non-coherent devices.
> +  * - A normal lowmem address, for atomic allocations by
> +  *   coherent devices.
> +  * Hence how dodgy the below logic looks...
> +  */
> + if (__free_from_pool(cpu_addr, size)) {
> + iommu_dma_unmap_page(dev, handle, size, 0, NULL);
> + } else if (is_vmalloc_addr(cpu_addr)){
> + struct vm_struct *area = find_vm_area(cpu_addr);
> +
> + if (WARN_ON(!area || !area->pages))
> + return;
> + iommu_dma_free(dev, area->pages, size, &handle);
> + dma_common_free_remap(cpu_addr, size, VM_USERMAP);
> + } else {
> + __free_pages(virt_to_page(cpu_addr), get_order(size));
> + iommu_dma_unmap_page(dev, handle, size, 0, NULL);

Just slightly paranoid, but it's better to unmap the page from the iommu
space before freeing it (in case there is some rogue device still
accessing it).
[PATCH v3 3/4] arm64: Add IOMMU dma_ops

2015-07-10 Thread Robin Murphy
Taking some inspiration from the arch/arm code, implement the
arch-specific side of the DMA mapping ops using the new IOMMU-DMA layer.

Unfortunately the device setup code has to start out as a big ugly mess
in order to work usefully right now, as 'proper' operation depends on
changes to device probe and DMA configuration ordering, IOMMU groups for
platform devices, and default domain support in arm/arm64 IOMMU drivers.
The workarounds here need only exist until that work is finished.

Signed-off-by: Robin Murphy 
---
 arch/arm64/mm/dma-mapping.c | 423 
 1 file changed, 423 insertions(+)
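For orientation before the diff itself: these ops are reached through the
ordinary DMA API once a device has been set up behind an IOMMU domain as
described above. The fragment below is hypothetical driver-side code (the
function name and buffer size are illustrative), not part of the patch:

	static int my_driver_dma_smoke_test(struct device *dev)
	{
		void *cpu;
		dma_addr_t iova;

		/* GFP_KERNEL implies __GFP_WAIT, so this takes the
		 * remapped-pages path of __iommu_alloc_attrs(); GFP_ATOMIC
		 * callers would take the atomic-pool / alloc_pages path. */
		cpu = dma_alloc_coherent(dev, SZ_64K, &iova, GFP_KERNEL);
		if (!cpu)
			return -ENOMEM;

		/* ... the device would use "iova" while the CPU uses "cpu" ... */

		dma_free_coherent(dev, SZ_64K, cpu, iova);	/* __iommu_free_attrs() */
		return 0;
	}
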

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index d16a1ce..ccadfd4 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -526,3 +526,426 @@ static int __init dma_debug_do_init(void)
return 0;
 }
 fs_initcall(dma_debug_do_init);
+
+
+#ifdef CONFIG_IOMMU_DMA
+#include 
+#include 
+#include 
+
+/* Thankfully, all cache ops are by VA so we can ignore phys here */
+static void flush_page(const void *virt, phys_addr_t phys)
+{
+   __dma_flush_range(virt, virt + PAGE_SIZE);
+}
+
+static void *__iommu_alloc_attrs(struct device *dev, size_t size,
+dma_addr_t *handle, gfp_t gfp,
+struct dma_attrs *attrs)
+{
+   bool coherent = is_device_dma_coherent(dev);
+   int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
+   void *addr;
+
+   if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
+   return NULL;
+
+   if (gfp & __GFP_WAIT) {
+   struct page **pages;
+   pgprot_t pgprot = coherent ? __pgprot(PROT_NORMAL) :
+__pgprot(PROT_NORMAL_NC);
+
+   pgprot = __get_dma_pgprot(attrs, pgprot, coherent);
+   pages = iommu_dma_alloc(dev, size, gfp, ioprot, coherent,
+   handle, coherent ? NULL : flush_page);
+   if (!pages)
+   return NULL;
+
+   addr = dma_common_pages_remap(pages, size, VM_USERMAP, pgprot,
+ __builtin_return_address(0));
+   if (!addr)
+   iommu_dma_free(dev, pages, size, handle);
+   } else {
+   struct page *page;
+   /*
+* In atomic context we can't remap anything, so we'll only
+* get the virtually contiguous buffer we need by way of a
+* physically contiguous allocation.
+*/
+   if (coherent) {
+   page = alloc_pages(gfp, get_order(size));
+   addr = page ? page_address(page) : NULL;
+   } else {
+   addr = __alloc_from_pool(size, &page, gfp);
+   }
+   if (addr) {
+   *handle = iommu_dma_map_page(dev, page, 0, size,
+ioprot, false);
+   if (iommu_dma_mapping_error(dev, *handle)) {
+   if (coherent)
+   __free_pages(page, get_order(size));
+   else
+   __free_from_pool(addr, size);
+   addr = NULL;
+   }
+   }
+   }
+   return addr;
+}
+
+static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+  dma_addr_t handle, struct dma_attrs *attrs)
+{
+   /*
+* @cpu_addr will be one of 3 things depending on how it was allocated:
+* - A remapped array of pages from iommu_dma_alloc(), for all
+*   non-atomic allocations.
+* - A non-cacheable alias from the atomic pool, for atomic
+*   allocations by non-coherent devices.
+* - A normal lowmem address, for atomic allocations by
+*   coherent devices.
+* Hence how dodgy the below logic looks...
+*/
+   if (__free_from_pool(cpu_addr, size)) {
+   iommu_dma_unmap_page(dev, handle, size, 0, NULL);
+   } else if (is_vmalloc_addr(cpu_addr)){
+   struct vm_struct *area = find_vm_area(cpu_addr);
+
+   if (WARN_ON(!area || !area->pages))
+   return;
+   iommu_dma_free(dev, area->pages, size, &handle);
+   dma_common_free_remap(cpu_addr, size, VM_USERMAP);
+   } else {
+   __free_pages(virt_to_page(cpu_addr), get_order(size));
+   iommu_dma_unmap_page(dev, handle, size, 0, NULL);
+   }
+}
+
+static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ struct dma_attrs *attrs)