On Thu, Sep 11, 2025 at 8:09 AM Christian König <[email protected]> wrote:
>
> That was actually complete nonsense and not validating the BOs
> at all. The code just cleared all VM areas where it couldn't grab the
> lock for a BO.
>
> Try to fix this. Only compile tested at the moment.
>
> v2: fix fence slot reservation as well as pointed out by Sunil.
>     also validate PDs, PTs, per VM BOs and update PDEs
> v3: grab the status_lock while working with the done list.
> v4: rename functions, add some comments, fix waiting for updates to
>     complete.
> v5: rename amdgpu_vm_lock_done_list(), add some more comments
>
> Signed-off-by: Christian König <[email protected]>
> Reviewed-by: Sunil Khatri <[email protected]>
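
The flow in v5 is much easier to follow. For anyone skimming the series,
amdgpu_userq_vm_validate() now boils down to the following (a condensed
sketch only; error handling and the drm_exec contention retries are
trimmed, see the real code below):

drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(&exec) {
        /* lock the PD and everything already on the done list */
        amdgpu_vm_lock_pd(vm, &exec, 1);
        amdgpu_vm_lock_done_list(vm, &exec, 1);

        /* validate PDs, PTs and per VM BOs ... */
        amdgpu_vm_validate(adev, vm, NULL, amdgpu_userq_validate_vm, NULL);

        /* ... then lock, validate and update the invalidated BOs */
        amdgpu_userq_bo_validate(adev, &exec, vm);
}

/* flush out updates for moved BOs and rewrite the PDEs */
amdgpu_vm_handle_moved(adev, vm, NULL);
amdgpu_vm_update_pdes(adev, vm, false);

/* wait for all PT updates before restarting the queues */
list_for_each_entry(bo_va, &vm->done, base.vm_status)
        dma_fence_wait(bo_va->last_pt_update, false);
dma_fence_wait(vm->last_update, false);
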
Series is:
Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 148 +++++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  35 +++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |   2 +
>  3 files changed, 110 insertions(+), 75 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 9608fe3b5a9e..0ccbd3c5d88d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -661,108 +661,106 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>          return ret;
>  }
>
> +static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
> +{
> +        struct ttm_operation_ctx ctx = { false, false };
> +
> +        amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> +        return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> +}
> +
> +/* Handle all BOs on the invalidated list, validate them and update the PTs */
>  static int
> -amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
> +amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
> +                         struct amdgpu_vm *vm)
>  {
>          struct ttm_operation_ctx ctx = { false, false };
> +        struct amdgpu_bo_va *bo_va;
> +        struct amdgpu_bo *bo;
>          int ret;
>
> -        amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> +        spin_lock(&vm->status_lock);
> +        while (!list_empty(&vm->invalidated)) {
> +                bo_va = list_first_entry(&vm->invalidated,
> +                                         struct amdgpu_bo_va,
> +                                         base.vm_status);
> +                spin_unlock(&vm->status_lock);
>
> -        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> -        if (ret)
> -                DRM_ERROR("Fail to validate\n");
> +                bo = bo_va->base.bo;
> +                ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
> +                if (unlikely(ret))
> +                        return ret;
>
> -        return ret;
> +                amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> +                ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> +                if (ret)
> +                        return ret;
> +
> +                /* This moves the bo_va to the done list */
> +                ret = amdgpu_vm_bo_update(adev, bo_va, false);
> +                if (ret)
> +                        return ret;
> +
> +                spin_lock(&vm->status_lock);
> +        }
> +        spin_unlock(&vm->status_lock);
> +
> +        return 0;
>  }
>
> +/* Make sure the whole VM is ready to be used */
>  static int
> -amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
> +amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
>  {
>          struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> -        struct amdgpu_vm *vm = &fpriv->vm;
>          struct amdgpu_device *adev = uq_mgr->adev;
> +        struct amdgpu_vm *vm = &fpriv->vm;
>          struct amdgpu_bo_va *bo_va;
> -        struct ww_acquire_ctx *ticket;
>          struct drm_exec exec;
> -        struct amdgpu_bo *bo;
> -        struct dma_resv *resv;
> -        bool clear, unlock;
> -        int ret = 0;
> +        int ret;
>
>          drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>          drm_exec_until_all_locked(&exec) {
> -                ret = amdgpu_vm_lock_pd(vm, &exec, 2);
> +                ret = amdgpu_vm_lock_pd(vm, &exec, 1);
>                  drm_exec_retry_on_contention(&exec);
> -                if (unlikely(ret)) {
> -                        drm_file_err(uq_mgr->file, "Failed to lock PD\n");
> +                if (unlikely(ret))
>                          goto unlock_all;
> -                }
> -
> -                /* Lock the done list */
> -                list_for_each_entry(bo_va, &vm->done, base.vm_status) {
> -                        bo = bo_va->base.bo;
> -                        if (!bo)
> -                                continue;
> -
> -                        ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> -                        drm_exec_retry_on_contention(&exec);
> -                        if (unlikely(ret))
> -                                goto unlock_all;
> -                }
> -        }
> -
> -        spin_lock(&vm->status_lock);
> -        while (!list_empty(&vm->moved)) {
> -                bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
> -                                         base.vm_status);
> -                spin_unlock(&vm->status_lock);
> -
> -                /* Per VM BOs never need to bo cleared in the page tables */
> -                ret = amdgpu_vm_bo_update(adev, bo_va, false);
> -                if (ret)
> +                ret = amdgpu_vm_lock_done_list(vm, &exec, 1);
> +                drm_exec_retry_on_contention(&exec);
> +                if (unlikely(ret))
>                          goto unlock_all;
> -                spin_lock(&vm->status_lock);
> -        }
> -
> -        ticket = &exec.ticket;
> -        while (!list_empty(&vm->invalidated)) {
> -                bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
> -                                         base.vm_status);
> -                resv = bo_va->base.bo->tbo.base.resv;
> -                spin_unlock(&vm->status_lock);
>
> -                bo = bo_va->base.bo;
> -                ret = amdgpu_userq_validate_vm_bo(NULL, bo);
> -                if (ret) {
> -                        drm_file_err(uq_mgr->file, "Failed to validate BO\n");
> +                /* This validates PDs, PTs and per VM BOs */
> +                ret = amdgpu_vm_validate(adev, vm, NULL,
> +                                         amdgpu_userq_validate_vm,
> +                                         NULL);
> +                if (unlikely(ret))
>                          goto unlock_all;
> -                }
>
> -                /* Try to reserve the BO to avoid clearing its ptes */
> -                if (!adev->debug_vm && dma_resv_trylock(resv)) {
> -                        clear = false;
> -                        unlock = true;
> -                /* The caller is already holding the reservation lock */
> -                } else if (dma_resv_locking_ctx(resv) == ticket) {
> -                        clear = false;
> -                        unlock = false;
> -                /* Somebody else is using the BO right now */
> -                } else {
> -                        clear = true;
> -                        unlock = false;
> -                }
> +                /* This locks and validates the remaining evicted BOs */
> +                ret = amdgpu_userq_bo_validate(adev, &exec, vm);
> +                drm_exec_retry_on_contention(&exec);
> +                if (unlikely(ret))
> +                        goto unlock_all;
> +        }
>
> -                ret = amdgpu_vm_bo_update(adev, bo_va, clear);
> +        ret = amdgpu_vm_handle_moved(adev, vm, NULL);
> +        if (ret)
> +                goto unlock_all;
>
> -                if (unlock)
> -                        dma_resv_unlock(resv);
> -                if (ret)
> -                        goto unlock_all;
> +        ret = amdgpu_vm_update_pdes(adev, vm, false);
> +        if (ret)
> +                goto unlock_all;
>
> -                spin_lock(&vm->status_lock);
> -        }
> -        spin_unlock(&vm->status_lock);
> +        /*
> +         * We need to wait for all VM updates to finish before restarting the
> +         * queues. Using the done list like that is now ok since everything is
> +         * locked in place.
> +         */
> +        list_for_each_entry(bo_va, &vm->done, base.vm_status)
> +                dma_fence_wait(bo_va->last_pt_update, false);
> +        dma_fence_wait(vm->last_update, false);
>
>          ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
>          if (ret)
> @@ -783,7 +781,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
>
>          mutex_lock(&uq_mgr->userq_mutex);
>
> -        ret = amdgpu_userq_validate_bos(uq_mgr);
> +        ret = amdgpu_userq_vm_validate(uq_mgr);
>          if (ret) {
>                  drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
>                  goto unlock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bd12d8ff15a4..9980c0cded94 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
>                                2 + num_fences);
>  }
>
> +/**
> + * amdgpu_vm_lock_done_list - lock all BOs on the done list
> + * @exec: drm execution context
> + * @num_fences: number of extra fences to reserve
> + *
> + * Lock the BOs on the done list in the DRM execution context.
> + */
> +int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
> +                             unsigned int num_fences)
> +{
> +        struct list_head *prev = &vm->done;
> +        struct amdgpu_bo_va *bo_va;
> +        struct amdgpu_bo *bo;
> +        int ret;
> +
> +        /* We can only trust prev->next while holding the lock */
> +        spin_lock(&vm->status_lock);
> +        while (!list_is_head(prev->next, &vm->done)) {
> +                bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
> +                spin_unlock(&vm->status_lock);
> +
> +                bo = bo_va->base.bo;
> +                if (bo) {
> +                        ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1);
> +                        if (unlikely(ret))
> +                                return ret;
> +                }
> +                spin_lock(&vm->status_lock);
> +                prev = prev->next;
> +        }
> +        spin_unlock(&vm->status_lock);
> +
> +        return 0;
> +}
> +
>  /**
>   * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
>   *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e045c1590d78..3409904b5c63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -491,6 +491,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>  int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
>                        unsigned int num_fences);
> +int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
> +                             unsigned int num_fences);
>  bool amdgpu_vm_ready(struct amdgpu_vm *vm);
>  uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>  int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> --
> 2.43.0
>
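
One more note for future readers, on the amdgpu_vm_lock_done_list()
traversal: the function only trusts prev->next while vm->status_lock is
held and re-takes the lock before advancing the cursor, which is what
makes dropping the spinlock around the sleeping drm_exec_prepare_obj()
safe. Below is a standalone userspace sketch of the same pattern
(pthread spinlock, hand-rolled circular list; every name in it is made
up for illustration). Like the driver code, it assumes visited nodes
are not removed from the list while the lock is dropped, so the cursor
stays valid across the unlock.

/*
 * Walk a circular list that is only stable under a spinlock, dropping
 * the lock around work that may sleep.  The cursor "prev" stays valid
 * across the unlock because visited nodes are never removed.
 * Build with: cc -o walk walk.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *next;
        int payload;
};

static struct node head = { &head, 0 };         /* empty circular list */
static pthread_spinlock_t list_lock;

static void work_that_may_sleep(struct node *n)
{
        printf("processing %d\n", n->payload);  /* stand-in for a blocking call */
}

static void walk(void)
{
        struct node *prev = &head;

        /* We can only trust prev->next while holding the lock */
        pthread_spin_lock(&list_lock);
        while (prev->next != &head) {
                struct node *n = prev->next;

                pthread_spin_unlock(&list_lock);
                work_that_may_sleep(n);         /* lock dropped here */
                pthread_spin_lock(&list_lock);
                prev = prev->next;              /* re-read under the lock */
        }
        pthread_spin_unlock(&list_lock);
}

int main(void)
{
        struct node a = { &head, 1 }, b = { &a, 2 };

        pthread_spin_init(&list_lock, PTHREAD_PROCESS_PRIVATE);
        head.next = &b;                         /* head -> b -> a -> head */
        walk();
        return 0;
}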
