From: Xiaogang Chen <xiaogang.c...@amd.com>

The userptr buffer restore process has the following issues:

1: amdgpu_ttm_tt_get_user_pages can fail (-EFAULT). If it fails we should
not mark mem as valid (mem->invalid = 0). In this case mem has no
associated hmm range or user_pages.

2: An mmu notifier can run concurrently and update
mem->range->notifier->invalidate_seq, but not mem->range->notifier_seq.
That leaves mem->range->notifier_seq stale when mem is in
process_info->userptr_inval_list and amdgpu_amdkfd_restore_userptr_worker
gets interrupted. On the next rescheduled attempt we compare the stale
mem->range->notifier_seq with mem->range->notifier->invalidate_seq.

Signed-off-by: Xiaogang Chen <xiaogang.c...@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 45 +++++++++++++++----
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7b1f5933ebaa..6881f1b0844c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2444,7 +2444,9 @@ static int update_invalid_user_pages(struct 
amdkfd_process_info *process_info,
                        ret = -EAGAIN;
                        goto unlock_out;
                }
-               mem->invalid = 0;
+                /* set mem valid if mem has hmm range associated */
+               if (mem->range)
+                       mem->invalid = 0;
        }
 
 unlock_out:
@@ -2576,16 +2578,28 @@ static int confirm_valid_user_pages_locked(struct 
amdkfd_process_info *process_i
        list_for_each_entry_safe(mem, tmp_mem,
                                 &process_info->userptr_inval_list,
                                 validate_list.head) {
-               bool valid = amdgpu_ttm_tt_get_user_pages_done(
-                               mem->bo->tbo.ttm, mem->range);
+               /* Only check mem with hmm range associated */
+               bool valid;
 
-               mem->range = NULL;
-               if (!valid) {
-                       WARN(!mem->invalid, "Invalid BO not marked invalid");
+               if (mem->range) {
+                       valid = amdgpu_ttm_tt_get_user_pages_done(
+                                       mem->bo->tbo.ttm, mem->range);
+
+                       mem->range = NULL;
+                       if (!valid) {
+                               WARN(!mem->invalid, "Invalid BO not marked 
invalid");
+                               ret = -EAGAIN;
+                               continue;
+                       }
+               } else
+                       /* keep mem without hmm range at userptr_inval_list */
+                       continue;
+
+               if (mem->invalid) {
+                       WARN(1, "Valid BO is marked invalid");
                        ret = -EAGAIN;
                        continue;
                }
-               WARN(mem->invalid, "Valid BO is marked invalid");
 
                list_move_tail(&mem->validate_list.head,
                               &process_info->userptr_valid_list);
@@ -2644,8 +2658,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
         * reference counting inside KFD will handle this case.
         */
        mutex_lock(&process_info->notifier_lock);
-       if (process_info->evicted_bos != evicted_bos)
+       if (process_info->evicted_bos != evicted_bos) {
+               /* mmu notifier interrupted amdgpu_amdkfd_restore_userptr_worker
+                * before reschedule next attempt update stale 
mem->range->notifier_seq
+                * inside userptr_inval_list
+                */
+               struct kgd_mem *mem, *tmp_mem;
+
+               list_for_each_entry_safe(mem, tmp_mem,
+                               &process_info->userptr_inval_list,
+                               validate_list.head) {
+
+                       if (mem->range)
+                               mem->range->notifier_seq = 
mem->range->notifier->invalidate_seq;
+               }
+
                goto unlock_notifier_out;
+       }
 
        if (confirm_valid_user_pages_locked(process_info)) {
                WARN(1, "User pages unexpectedly invalid");
-- 
2.25.1

Reply via email to