-----Original Message-----
From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf Of 
Christian König
Sent: Monday, October 09, 2017 5:08 AM
To: Kasiviswanathan, Harish <harish.kasiviswanat...@amd.com>; 
amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH v2 1/2] drm/amdgpu: Refactor amdgpu_move_blit

Am 07.10.2017 um 00:08 schrieb Harish Kasiviswanathan:
> Add more generic function amdgpu_copy_ttm_mem_to_mem() that supports
> arbitrary copy size, offsets and two BOs (source & dest.).
>
> This is useful for KFD Cross Memory Attach feature where data needs to
> be copied from BOs from different processes
>
> v2: Add struct amdgpu_copy_mem and changed amdgpu_copy_ttm_mem_to_mem()
> function parameters to use the struct
>
> Change-Id: I848d541a84a1c2d12827d9dcf6d9054d854b4159
> Signed-off-by: Harish Kasiviswanathan <harish.kasiviswanat...@amd.com>

Looks like a nice cleanup to me, with the notes below fixed the patch is 
Reviewed-by: Christian König <christian.koe...@amd.com>.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 169 
> +++++++++++++++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  12 +++
>   2 files changed, 132 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 1086f03..dda4f08 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -289,97 +289,168 @@ static uint64_t amdgpu_mm_node_addr(struct 
> ttm_buffer_object *bo,
>       return addr;
>   }
>   
> -static int amdgpu_move_blit(struct ttm_buffer_object *bo,
> -                         bool evict, bool no_wait_gpu,
> -                         struct ttm_mem_reg *new_mem,
> -                         struct ttm_mem_reg *old_mem)
> +/**
> + * amdgpu_copy_ttm_mem_to_mem - Helper function for copy
> + *
> + * The function copies @size bytes from {src->mem + src->offset} to
> + * {dst->mem + dst->offset}. src->bo and dst->bo could be same BO for a
> + * move and different for a BO to BO copy.
> + *
> + * @f: Returns the last fence if multiple jobs are submitted.
> + */
> +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev,

Please name that amdgpu_ttm_copy_.... to note that the function is part 
of the TTM subsystem in amdgpu.

[HK] - Will fix the name.

> +                            struct amdgpu_copy_mem *src,
> +                            struct amdgpu_copy_mem *dst,
> +                            uint64_t size,
> +                            struct reservation_object *resv,
> +                            struct dma_fence **f)
>   {
> -     struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>       struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> -
> -     struct drm_mm_node *old_mm, *new_mm;
> -     uint64_t old_start, old_size, new_start, new_size;
> -     unsigned long num_pages;
> +     struct drm_mm_node *src_mm, *dst_mm;
> +     uint64_t src_node_start, dst_node_start, src_node_size,
> +              dst_node_size, src_page_offset, dst_page_offset;
>       struct dma_fence *fence = NULL;
> -     int r;
> -
> -     BUILD_BUG_ON((PAGE_SIZE % AMDGPU_GPU_PAGE_SIZE) != 0);
> +     int r = 0;
> +     const uint64_t GTT_MAX_BYTES = (AMDGPU_GTT_MAX_TRANSFER_SIZE *
> +                                     AMDGPU_GPU_PAGE_SIZE);
>   
>       if (!ring->ready) {
>               DRM_ERROR("Trying to move memory with ring turned off.\n");
>               return -EINVAL;
>       }
>   
> -     old_mm = old_mem->mm_node;
> -     old_size = old_mm->size;
> -     old_start = amdgpu_mm_node_addr(bo, old_mm, old_mem);
> +     src_mm = src->mem->mm_node;
> +     while (src->offset >= (src_mm->size << PAGE_SHIFT)) {
> +             src->offset -= (src_mm->size << PAGE_SHIFT);
> +             ++src_mm;
> +     }
> +     src_node_start = amdgpu_mm_node_addr(src->bo, src_mm, src->mem) +
> +                                          src->offset;
> +     src_node_size = (src_mm->size << PAGE_SHIFT) - src->offset;
> +     src_page_offset = src_node_start & (PAGE_SIZE - 1);
>   
> -     new_mm = new_mem->mm_node;
> -     new_size = new_mm->size;
> -     new_start = amdgpu_mm_node_addr(bo, new_mm, new_mem);
> +     dst_mm = dst->mem->mm_node;
> +     while (dst->offset >= (dst_mm->size << PAGE_SHIFT)) {
> +             dst->offset -= (dst_mm->size << PAGE_SHIFT);
> +             ++dst_mm;
> +     }
> +     dst_node_start = amdgpu_mm_node_addr(dst->bo, dst_mm, dst->mem) +
> +                                          dst->offset;
> +     dst_node_size = (dst_mm->size << PAGE_SHIFT) - dst->offset;
> +     dst_page_offset = dst_node_start & (PAGE_SIZE - 1);
>   
> -     num_pages = new_mem->num_pages;
>       mutex_lock(&adev->mman.gtt_window_lock);
> -     while (num_pages) {
> -             unsigned long cur_pages = min(min(old_size, new_size),
> -                                           
> (u64)AMDGPU_GTT_MAX_TRANSFER_SIZE);
> -             uint64_t from = old_start, to = new_start;
> +
> +     while (size) {
> +             unsigned long cur_size;
> +             uint64_t from = src_node_start, to = dst_node_start;
>               struct dma_fence *next;
>   
> -             if (old_mem->mem_type == TTM_PL_TT &&
> -                 !amdgpu_gtt_mgr_is_allocated(old_mem)) {
> -                     r = amdgpu_map_buffer(bo, old_mem, cur_pages,
> -                                           old_start, 0, ring, &from);
> +             /* Copy size cannot exceed GTT_MAX_BYTES. So if src or dst
> +              * begins at an offset, then adjust the size accordingly
> +              */
> +             cur_size = min3(min(src_node_size, dst_node_size), size,
> +                             GTT_MAX_BYTES);
> +             if (cur_size + src_page_offset > GTT_MAX_BYTES ||
> +                 cur_size + dst_page_offset > GTT_MAX_BYTES)
> +                     cur_size -= max(src_page_offset, dst_page_offset);
> +
> +             /* Map only what needs to be accessed. Map src to window 0 and
> +              * dst to window 1
> +              */
> +             if (src->mem->mem_type == TTM_PL_TT &&
> +                 !amdgpu_gtt_mgr_is_allocated(src->mem)) {
> +                     r = amdgpu_map_buffer(src->bo, src->mem,
> +                                     PFN_UP(cur_size + src_page_offset),
> +                                     src_node_start, 0, ring,
> +                                     &from);
>                       if (r)
>                               goto error;
> +                     /* Adjust the offset because amdgpu_map_buffer returns
> +                      * start of mapped page
> +                      */
> +                     from += src_page_offset;
>               }
>   
> -             if (new_mem->mem_type == TTM_PL_TT &&
> -                 !amdgpu_gtt_mgr_is_allocated(new_mem)) {
> -                     r = amdgpu_map_buffer(bo, new_mem, cur_pages,
> -                                           new_start, 1, ring, &to);
> +             if (dst->mem->mem_type == TTM_PL_TT &&
> +                 !amdgpu_gtt_mgr_is_allocated(dst->mem)) {
> +                     r = amdgpu_map_buffer(dst->bo, dst->mem,
> +                                     PFN_UP(cur_size + dst_page_offset),
> +                                     dst_node_start, 1, ring,
> +                                     &to);
>                       if (r)
>                               goto error;
> +                     to += dst_page_offset;
>               }
>   
> -             r = amdgpu_copy_buffer(ring, from, to,
> -                                    cur_pages * PAGE_SIZE,
> -                                    bo->resv, &next, false, true);
> +             r = amdgpu_copy_buffer(ring, from, to, cur_size,
> +                                    resv, &next, false, true);
>               if (r)
>                       goto error;
>   
>               dma_fence_put(fence);
>               fence = next;
>   
> -             num_pages -= cur_pages;
> -             if (!num_pages)
> +             size -= cur_size;
> +             if (!size)
>                       break;
>   
> -             old_size -= cur_pages;
> -             if (!old_size) {
> -                     old_start = amdgpu_mm_node_addr(bo, ++old_mm, old_mem);
> -                     old_size = old_mm->size;
> +             src_node_size -= cur_size;
> +             if (!src_node_size) {
> +                     src_node_start = amdgpu_mm_node_addr(src->bo, ++src_mm,
> +                                                          src->mem);
> +                     src_node_size = (src_mm->size << PAGE_SHIFT);
>               } else {
> -                     old_start += cur_pages * PAGE_SIZE;
> +                     src_node_start += cur_size;
> +                     src_page_offset = src_node_start & (PAGE_SIZE - 1);
>               }
> -
> -             new_size -= cur_pages;
> -             if (!new_size) {
> -                     new_start = amdgpu_mm_node_addr(bo, ++new_mm, new_mem);
> -                     new_size = new_mm->size;
> +             dst_node_size -= cur_size;
> +             if (!dst_node_size) {
> +                     dst_node_start = amdgpu_mm_node_addr(dst->bo, ++dst_mm,
> +                                                          dst->mem);
> +                     dst_node_size = (dst_mm->size << PAGE_SHIFT);
>               } else {
> -                     new_start += cur_pages * PAGE_SIZE;
> +                     dst_node_start += cur_size;
> +                     dst_page_offset = dst_node_start & (PAGE_SIZE - 1);
>               }
>       }
> +error:
>       mutex_unlock(&adev->mman.gtt_window_lock);
> +     if (f)
> +             *f = dma_fence_get(fence);
> +     dma_fence_put(fence);
> +     return r;
> +}
> +
> +
> +static int amdgpu_move_blit(struct ttm_buffer_object *bo,
> +                         bool evict, bool no_wait_gpu,
> +                         struct ttm_mem_reg *new_mem,
> +                         struct ttm_mem_reg *old_mem)
> +{

I think I would rather prefer to change the callers of that function, 
but not 100% sure.

How much change would that be? But not a blocking issue.

[HK]: The function is called 3 times in amdgpu_ttm.c. I don't see an elegant 
way to avoid this function. So for now I am leaving it as it is.

> +     struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
> +     struct amdgpu_copy_mem src, dst;
> +     struct dma_fence *fence = NULL;
> +     int r;
> +
> +     src.bo = bo;
> +     dst.bo = bo;
> +     src.mem = old_mem;
> +     dst.mem = new_mem;
> +     src.offset = 0;
> +     dst.offset = 0;
> +
> +     r = amdgpu_copy_ttm_mem_to_mem(adev, &src, &dst,
> +                                    new_mem->num_pages << PAGE_SHIFT,
> +                                    bo->resv, &fence);
> +     if (r)
> +             goto error;
>   
>       r = ttm_bo_pipeline_move(bo, fence, evict, new_mem);
>       dma_fence_put(fence);
>       return r;
>   
>   error:
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> -
>       if (fence)
>               dma_fence_wait(fence, false);
>       dma_fence_put(fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 7abae68..2188034 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -58,6 +58,12 @@ struct amdgpu_mman {
>       struct amd_sched_entity                 entity;
>   };
>   
> +struct amdgpu_copy_mem {
> +     struct ttm_buffer_object        *bo;
> +     struct ttm_mem_reg              *mem;
> +     unsigned long                   offset;
> +};
> +
>   extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func;
>   extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func;
>   
> @@ -72,6 +78,12 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t 
> src_offset,
>                      struct reservation_object *resv,
>                      struct dma_fence **fence, bool direct_submit,
>                      bool vm_needs_flush);
> +int amdgpu_copy_ttm_mem_to_mem(struct amdgpu_device *adev,
> +                            struct amdgpu_copy_mem *src,
> +                            struct amdgpu_copy_mem *dst,
> +                            uint64_t size,
> +                            struct reservation_object *resv,
> +                            struct dma_fence **f);
>   int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>                       uint64_t src_data,
>                       struct reservation_object *resv,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to