On 8/28/2025 8:31 PM, Christian König wrote:
That was actually complete nonsense and not validating the BOs
at all. The code just cleared all VM areas where it couldn't grab the
lock for a BO.

Try to fix this. Only compile tested at the moment.

v2: fix fence slot reservation as well as pointed out by Sunil.
     also validate PDs, PTs, per VM BOs and update PDEs
v3: grab the status_lock while working with the done list.

Signed-off-by: Christian König <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 136 ++++++++++------------
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  35 ++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |   2 +
  3 files changed, 97 insertions(+), 76 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 424831997cb1..abc2f96bea76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -545,108 +545,92 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
        return ret;
  }
-static int
-amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
+static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
  {
        struct ttm_operation_ctx ctx = { false, false };
-       int ret;
amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
-
-       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
-       if (ret)
-               DRM_ERROR("Fail to validate\n");
-
-       return ret;
+       return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
  }
static int
-amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
+amdgpu_userq_validate_bos(struct amdgpu_device *adev, struct drm_exec *exec,
+                         struct amdgpu_vm *vm)
  {
-       struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
-       struct amdgpu_vm *vm = &fpriv->vm;
-       struct amdgpu_device *adev = uq_mgr->adev;
+       struct ttm_operation_ctx ctx = { false, false };
        struct amdgpu_bo_va *bo_va;
-       struct ww_acquire_ctx *ticket;
-       struct drm_exec exec;
        struct amdgpu_bo *bo;
-       struct dma_resv *resv;
-       bool clear, unlock;
-       int ret = 0;
-
-       drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
-       drm_exec_until_all_locked(&exec) {
-               ret = amdgpu_vm_lock_pd(vm, &exec, 2);
-               drm_exec_retry_on_contention(&exec);
-               if (unlikely(ret)) {
-                       drm_file_err(uq_mgr->file, "Failed to lock PD\n");
-                       goto unlock_all;
-               }
-
-               /* Lock the done list */
-               list_for_each_entry(bo_va, &vm->done, base.vm_status) {
-                       bo = bo_va->base.bo;
-                       if (!bo)
-                               continue;
-
-                       ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
-                       drm_exec_retry_on_contention(&exec);
-                       if (unlikely(ret))
-                               goto unlock_all;
-               }
-       }
+       int ret;
spin_lock(&vm->status_lock);
-       while (!list_empty(&vm->moved)) {
-               bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
+       while (!list_empty(&vm->invalidated)) {
+               bo_va = list_first_entry(&vm->invalidated,
+                                        struct amdgpu_bo_va,
                                         base.vm_status);
                spin_unlock(&vm->status_lock);
- /* Per VM BOs never need to bo cleared in the page tables */
+               bo = bo_va->base.bo;
+               ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
+               if (unlikely(ret))
+                       return ret;
+
+               amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+               ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+               if (ret)
+                       return ret;
+
+               /* This moves the bo_va to the done list */
                ret = amdgpu_vm_bo_update(adev, bo_va, false);
                if (ret)
-                       goto unlock_all;
+                       return ret;
+
                spin_lock(&vm->status_lock);
        }
+       spin_unlock(&vm->status_lock);
- ticket = &exec.ticket;
-       while (!list_empty(&vm->invalidated)) {
-               bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
-                                        base.vm_status);
-               resv = bo_va->base.bo->tbo.base.resv;
-               spin_unlock(&vm->status_lock);
+       return 0;
+}
- bo = bo_va->base.bo;
-               ret = amdgpu_userq_validate_vm_bo(NULL, bo);
-               if (ret) {
-                       drm_file_err(uq_mgr->file, "Failed to validate BO\n");
-                       goto unlock_all;
-               }
+static int
+amdgpu_userq_update_vm(struct amdgpu_userq_mgr *uq_mgr)
+{
+       struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
+       struct amdgpu_device *adev = uq_mgr->adev;
+       struct amdgpu_vm *vm = &fpriv->vm;
+       struct drm_exec exec;
+       int ret;
- /* Try to reserve the BO to avoid clearing its ptes */
-               if (!adev->debug_vm && dma_resv_trylock(resv)) {
-                       clear = false;
-                       unlock = true;
-               /* The caller is already holding the reservation lock */
-               } else if (dma_resv_locking_ctx(resv) == ticket) {
-                       clear = false;
-                       unlock = false;
-               /* Somebody else is using the BO right now */
-               } else {
-                       clear = true;
-                       unlock = false;
-               }
+       drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
+       drm_exec_until_all_locked(&exec) {
+               ret = amdgpu_vm_lock_pd(vm, &exec, 1);
What difference does changing this value from 2 to 1 make? Earlier we were reserving 2 fence slots and now only 1?
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret))
+                       goto unlock_all;
- ret = amdgpu_vm_bo_update(adev, bo_va, clear);
+               ret = amdgpu_vm_lock_done(vm, &exec, 1);
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret))
+                       goto unlock_all;
- if (unlock)
-                       dma_resv_unlock(resv);
-               if (ret)
+               ret = amdgpu_vm_validate(adev, vm, NULL,
+                                        amdgpu_userq_validate_vm,
+                                        NULL);
+               if (unlikely(ret))
                        goto unlock_all;
- spin_lock(&vm->status_lock);
+               ret = amdgpu_userq_validate_bos(adev, &exec, vm);
After moving the invalidated BOs to the done list, don't we need to go through the done list once more at the end to lock all the BOs that we need?
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret))
+                       goto unlock_all;
        }
-       spin_unlock(&vm->status_lock);
+
+       ret = amdgpu_vm_handle_moved(adev, vm, NULL);
+       if (ret)
+               goto unlock_all;
+
+       ret = amdgpu_vm_update_pdes(adev, vm, false);
Was this missing earlier, or was it done some other, indirect way?
+       if (ret)
+               goto unlock_all;
ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
        if (ret)
@@ -667,7 +651,7 @@ static void amdgpu_userq_restore_worker(struct work_struct 
*work)
mutex_lock(&uq_mgr->userq_mutex); - ret = amdgpu_userq_validate_bos(uq_mgr);
+       ret = amdgpu_userq_update_vm(uq_mgr);
        if (ret) {
                drm_file_err(uq_mgr->file, "Failed to validate BOs to 
restore\n");
                goto unlock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index bf42246a3db2..16451c9bbe1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct 
drm_exec *exec,
                                    2 + num_fences);
  }
+/**
+ * amdgpu_vm_lock_done - lock all BOs on the done list
+ * @exec: drm execution context
+ * @num_fences: number of extra fences to reserve
+ *
+ * Lock the BOs on the done list in the DRM execution context.
+ */
+int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec,
+                       unsigned int num_fences)
num_fences is not used here and hence should be marked as unused.
+{
+       struct list_head *prev = &vm->done;
+       struct amdgpu_bo_va *bo_va;
+       struct amdgpu_bo *bo;
+       int ret;
+
+       /* We can only trust prev->next while holding the lock */
+       spin_lock(&vm->status_lock);
+       while (!list_is_head(prev->next, &vm->done)) {
+               bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
+               spin_unlock(&vm->status_lock);
+
+               bo = bo_va->base.bo;
+               if (bo) {
+                       ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1);
+                       if (unlikely(ret))
+                               return ret;
+               }
+               spin_lock(&vm->status_lock);
+               prev = prev->next;
+       }
+       spin_unlock(&vm->status_lock);
+
+       return 0;
+}
+
  /**
   * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
   *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index f9549f6b3d1f..0e3884dfdb6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -492,6 +492,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
struct amdgpu_vm *vm);
  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
  int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
                      unsigned int num_fences);
+int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec,
+                       unsigned int num_fences);
Since we aren't using them, we should mark them as unused.
Regards
Sunil Khatri
  bool amdgpu_vm_ready(struct amdgpu_vm *vm);
  uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm 
*vm);
  int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,

Reply via email to