Hmm, I'm not sure. This change probably fixes this issue, but there may 
be other similar corner cases in other situations where the restore 
worker fails and needs to retry. The better place to call untrack in  
amdgpu_amdkfd_restore_userptr_worker would be at the very end. Anything 
that's left in the userptr_inval_list at that point needs to be untracked.

For now as a quick fix for an urgent bug, this change is Reviewed-by: 
Felix Kuehling <felix.kuehl...@amd.com>. But please revisit this and 
check if there are similar corner cases as I explained above.

Regards,
   Felix

On 3/5/2019 1:09 PM, Yang, Philip wrote:
> Userptr restore may have concurrent userptr invalidation after
> hmm_vma_fault adds the range to the hmm->ranges list, needs call
> hmm_vma_range_done to remove the range from hmm->ranges list first,
> then reschedule the restore worker. Otherwise hmm_vma_fault will add
> same range to the list, this will cause loop in the list because
> range->next point to range itself.
>
> Add function untrack_invalid_user_pages to reduce code duplication.
>
> Change-Id: I31407739dc10554f8e418c7a0e0415d3d95552f1
> Signed-off-by: Philip Yang <philip.y...@amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28 ++++++++++++++-----
>   1 file changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 314c048fcac6..783d760ccfe3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1731,6 +1731,23 @@ static int update_invalid_user_pages(struct 
> amdkfd_process_info *process_info,
>       return 0;
>   }
>   
> +/* Remove invalid userptr BOs from hmm track list
> + *
> + * Stop HMM track the userptr update
> + */
> +static void untrack_invalid_user_pages(struct amdkfd_process_info 
> *process_info)
> +{
> +     struct kgd_mem *mem, *tmp_mem;
> +     struct amdgpu_bo *bo;
> +
> +     list_for_each_entry_safe(mem, tmp_mem,
> +                              &process_info->userptr_inval_list,
> +                              validate_list.head) {
> +             bo = mem->bo;
> +             amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> +     }
> +}
> +
>   /* Validate invalid userptr BOs
>    *
>    * Validates BOs on the userptr_inval_list, and moves them back to the
> @@ -1848,12 +1865,7 @@ static int validate_invalid_user_pages(struct 
> amdkfd_process_info *process_info)
>   out_free:
>       kfree(pd_bo_list_entries);
>   out_no_mem:
> -     list_for_each_entry_safe(mem, tmp_mem,
> -                              &process_info->userptr_inval_list,
> -                              validate_list.head) {
> -             bo = mem->bo;
> -             amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> -     }
> +     untrack_invalid_user_pages(process_info);
>   
>       return ret;
>   }
> @@ -1897,8 +1909,10 @@ static void 
> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>        * and we can just restart the queues.
>        */
>       if (!list_empty(&process_info->userptr_inval_list)) {
> -             if (atomic_read(&process_info->evicted_bos) != evicted_bos)
> +             if (atomic_read(&process_info->evicted_bos) != evicted_bos) {
> +                     untrack_invalid_user_pages(process_info);
>                       goto unlock_out; /* Concurrent eviction, try again */
> +             }
>   
>               if (validate_invalid_user_pages(process_info))
>                       goto unlock_out;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to