On Sat, Oct 03, 2020 at 04:02:57AM +0000, John Stultz wrote:
> This adds a heap that allocates non-contiguous buffers that are
> marked as writecombined, so they are not cached by the CPU.
> 
> This is useful, as most graphics buffers are usually not touched
> by the CPU or only written into once by the CPU. So when mapping
> the buffer over and over between devices, we can skip the CPU
> syncing, which saves a lot of cache management overhead, greatly
> improving performance.
> 
> For folk using ION, there was a ION_FLAG_CACHED flag, which
> signaled if the returned buffer should be CPU cacheable or not.
> With DMA-BUF heaps, we do not yet have such a flag, and by default
> the current heaps (system and cma) produce CPU cacheable buffers.
> So for folks transitioning from ION to DMA-BUF Heaps, this fills
> in some of that missing functionality.
> 
> There has been a suggestion to make this functionality a flag
> (DMAHEAP_FLAG_UNCACHED?) on the system heap, similar to how
> ION used the ION_FLAG_CACHED. But I want to make sure an
> _UNCACHED flag would truly be a generic attribute across all
> heaps. So far that has been unclear, so having it as a separate
> heap seems better for now. (But I'm open to discussion on this
> point!)
> 
> This is a rework of earlier efforts to add an uncached system heap,
> done utilizing the existing system heap, adding just a bit of
> logic to handle the uncached case.
> 
> Feedback would be very welcome!
> 
> Many thanks to Liam Mark for his help to get this working.
> 
> Pending opensource users of this code include:
> * AOSP HiKey960 gralloc:
>   - https://android-review.googlesource.com/c/device/linaro/hikey/+/1399519
>   - Visibly improves performance over the system heap
> * AOSP Codec2 (possibly, needs more review):
>   - 
> https://android-review.googlesource.com/c/platform/frameworks/av/+/1360640/17/media/codec2/vndk/C2DmaBufAllocator.cpp#325
> 
> Cc: Sumit Semwal <sumit.sem...@linaro.org>
> Cc: Liam Mark <lm...@codeaurora.org>
> Cc: Laura Abbott <labb...@kernel.org>
> Cc: Brian Starkey <brian.star...@arm.com>
> Cc: Hridya Valsaraju <hri...@google.com>
> Cc: Suren Baghdasaryan <sur...@google.com>
> Cc: Sandeep Patil <sspa...@google.com>
> Cc: Daniel Mentz <danielme...@google.com>
> Cc: Chris Goldsworthy <cgold...@codeaurora.org>
> Cc: Ørjan Eide <orjan.e...@arm.com>
> Cc: Robin Murphy <robin.mur...@arm.com>
> Cc: Ezequiel Garcia <ezequ...@collabora.com>
> Cc: Simon Ser <cont...@emersion.fr>
> Cc: James Jones <jajo...@nvidia.com>
> Cc: linux-me...@vger.kernel.org
> Cc: dri-de...@lists.freedesktop.org
> Signed-off-by: John Stultz <john.stu...@linaro.org>
> ---
>  drivers/dma-buf/heaps/system_heap.c | 87 ++++++++++++++++++++++++++---
>  1 file changed, 79 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/dma-buf/heaps/system_heap.c 
> b/drivers/dma-buf/heaps/system_heap.c
> index 2b8d4b6abacb..952f1fd9dacf 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -22,6 +22,7 @@
>  #include <linux/vmalloc.h>
>  
>  struct dma_heap *sys_heap;
> +struct dma_heap *sys_uncached_heap;
>  
>  struct system_heap_buffer {
>       struct dma_heap *heap;
> @@ -31,6 +32,8 @@ struct system_heap_buffer {
>       struct sg_table sg_table;
>       int vmap_cnt;
>       void *vaddr;
> +
> +     bool uncached;
>  };
>  
>  struct dma_heap_attachment {
> @@ -38,6 +41,8 @@ struct dma_heap_attachment {
>       struct sg_table *table;
>       struct list_head list;
>       bool mapped;
> +
> +     bool uncached;
>  };
>  
>  #define HIGH_ORDER_GFP  (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
> @@ -94,7 +99,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
>       a->dev = attachment->dev;
>       INIT_LIST_HEAD(&a->list);
>       a->mapped = false;
> -
> +     a->uncached = buffer->uncached;
>       attachment->priv = a;
>  
>       mutex_lock(&buffer->lock);
> @@ -124,9 +129,13 @@ static struct sg_table *system_heap_map_dma_buf(struct 
> dma_buf_attachment *attac
>  {
>       struct dma_heap_attachment *a = attachment->priv;
>       struct sg_table *table = a->table;
> +     int attr = 0;
>       int ret;
>  
> -     ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> +     if (a->uncached)
> +             attr = DMA_ATTR_SKIP_CPU_SYNC;
> +
> +     ret = dma_map_sgtable(attachment->dev, table, direction, attr);
>       if (ret)
>               return ERR_PTR(ret);
>  
> @@ -139,9 +148,12 @@ static void system_heap_unmap_dma_buf(struct 
> dma_buf_attachment *attachment,
>                                     enum dma_data_direction direction)
>  {
>       struct dma_heap_attachment *a = attachment->priv;
> +     int attr = 0;
>  
> +     if (a->uncached)
> +             attr = DMA_ATTR_SKIP_CPU_SYNC;
>       a->mapped = false;
> -     dma_unmap_sgtable(attachment->dev, table, direction, 0);
> +     dma_unmap_sgtable(attachment->dev, table, direction, attr);
>  }
>  
>  static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> @@ -150,6 +162,9 @@ static int system_heap_dma_buf_begin_cpu_access(struct 
> dma_buf *dmabuf,
>       struct system_heap_buffer *buffer = dmabuf->priv;
>       struct dma_heap_attachment *a;
>  
> +     if (buffer->uncached)
> +             return 0;
> +
>       mutex_lock(&buffer->lock);
>  
>       if (buffer->vmap_cnt)
> @@ -171,6 +186,9 @@ static int system_heap_dma_buf_end_cpu_access(struct 
> dma_buf *dmabuf,
>       struct system_heap_buffer *buffer = dmabuf->priv;
>       struct dma_heap_attachment *a;
>  
> +     if (buffer->uncached)
> +             return 0;
> +
>       mutex_lock(&buffer->lock);
>  
>       if (buffer->vmap_cnt)
> @@ -194,6 +212,9 @@ static int system_heap_mmap(struct dma_buf *dmabuf, 
> struct vm_area_struct *vma)
>       struct sg_page_iter piter;
>       int ret;
>  
> +     if (buffer->uncached)
> +             vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
> +
>       for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
>               struct page *page = sg_page_iter_page(&piter);
>  
> @@ -215,8 +236,12 @@ static void *system_heap_do_vmap(struct 
> system_heap_buffer *buffer)
>       struct page **pages = vmalloc(sizeof(struct page *) * npages);
>       struct page **tmp = pages;
>       struct sg_page_iter piter;
> +     pgprot_t pgprot = PAGE_KERNEL;
>       void *vaddr;
>  
> +     if (buffer->uncached)
> +             pgprot = pgprot_writecombine(PAGE_KERNEL);

I think this should go after the 'if (!pages)' check. Best to get the
allocation failure check as close to the allocation as possible IMO.

> +
>       if (!pages)
>               return ERR_PTR(-ENOMEM);
>  
> @@ -225,7 +250,7 @@ static void *system_heap_do_vmap(struct 
> system_heap_buffer *buffer)
>               *tmp++ = sg_page_iter_page(&piter);
>       }
>  
> -     vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> +     vaddr = vmap(pages, npages, VM_MAP, pgprot);
>       vfree(pages);
>  
>       if (!vaddr)
> @@ -278,6 +303,10 @@ static void system_heap_dma_buf_release(struct dma_buf 
> *dmabuf)
>       int i;
>  
>       table = &buffer->sg_table;
> +     /* Unmap the uncached buffers from the heap device (pairs with the map 
> on allocate) */
> +     if (buffer->uncached)
> +             dma_unmap_sgtable(dma_heap_get_dev(buffer->heap), table, 
> DMA_BIDIRECTIONAL, 0);
> +
>       for_each_sg(table->sgl, sg, table->nents, i) {
>               struct page *page = sg_page(sg);
>  
> @@ -320,10 +349,11 @@ static struct page *alloc_largest_available(unsigned 
> long size,
>       return NULL;
>  }
>  
> -static int system_heap_allocate(struct dma_heap *heap,
> -                             unsigned long len,
> -                             unsigned long fd_flags,
> -                             unsigned long heap_flags)
> +static int system_heap_do_allocate(struct dma_heap *heap,
> +                                unsigned long len,
> +                                unsigned long fd_flags,
> +                                unsigned long heap_flags,
> +                                bool uncached)
>  {
>       struct system_heap_buffer *buffer;
>       DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> @@ -344,6 +374,7 @@ static int system_heap_allocate(struct dma_heap *heap,
>       mutex_init(&buffer->lock);
>       buffer->heap = heap;
>       buffer->len = len;
> +     buffer->uncached = uncached;
>  
>       INIT_LIST_HEAD(&pages);
>       i = 0;
> @@ -393,6 +424,16 @@ static int system_heap_allocate(struct dma_heap *heap,
>               /* just return, as put will call release and that will free */
>               return ret;
>       }
> +
> +     /*
> +      * For uncached buffers, we need to initially flush cpu cache, since
> +      * the __GFP_ZERO on the allocation means the zeroing was done by the
> +      * cpu and thus it is likely cached. Map (and implicitly flush) it out
> +      * now so we don't get corruption later on.
> +      */
> +     if (buffer->uncached)
> +             dma_map_sgtable(dma_heap_get_dev(heap), table, 
> DMA_BIDIRECTIONAL, 0);

Do we have to keep this mapping around for the entire lifetime of the
buffer?

Also, this problem (and solution) keeps lingering around. It really
feels like there should be a better way to solve "clean the linear
mapping all the way to DRAM", but I don't know what that should be.

> +
>       return ret;
>  
>  free_pages:
> @@ -410,10 +451,30 @@ static int system_heap_allocate(struct dma_heap *heap,
>       return ret;
>  }
>  
> +static int system_heap_allocate(struct dma_heap *heap,
> +                             unsigned long len,
> +                             unsigned long fd_flags,
> +                             unsigned long heap_flags)
> +{
> +     return system_heap_do_allocate(heap, len, fd_flags, heap_flags, false);
> +}
> +
>  static const struct dma_heap_ops system_heap_ops = {
>       .allocate = system_heap_allocate,
>  };
>  
> +static int system_uncached_heap_allocate(struct dma_heap *heap,
> +                                      unsigned long len,
> +                                      unsigned long fd_flags,
> +                                      unsigned long heap_flags)
> +{
> +     return system_heap_do_allocate(heap, len, fd_flags, heap_flags, true);
> +}
> +
> +static const struct dma_heap_ops system_uncached_heap_ops = {
> +     .allocate = system_uncached_heap_allocate,
> +};
> +
>  static int system_heap_create(void)
>  {
>       struct dma_heap_export_info exp_info;
> @@ -426,6 +487,16 @@ static int system_heap_create(void)
>       if (IS_ERR(sys_heap))
>               return PTR_ERR(sys_heap);
>  
> +     exp_info.name = "system-uncached";
> +     exp_info.ops = &system_uncached_heap_ops;
> +     exp_info.priv = NULL;
> +
> +     sys_uncached_heap = dma_heap_add(&exp_info);
> +     if (IS_ERR(sys_uncached_heap))
> +             return PTR_ERR(sys_uncached_heap);
> +

In principle, there's a race here between the heap getting registered
to sysfs and the dma_mask getting updated.

I don't think it would cause a problem in practice, but with the API
as it is, there's no way to avoid it - so I wonder if the
dma_heap_get_dev() accessor isn't the right API design.

Cheers,
-Brian

> +     dma_coerce_mask_and_coherent(dma_heap_get_dev(sys_uncached_heap), 
> DMA_BIT_MASK(64));
> +
>       return 0;
>  }
>  module_init(system_heap_create);
> -- 
> 2.17.1
> 

Reply via email to