On Thu, Sep 11, 2025 at 8:09 AM Christian König <[email protected]> wrote:
>
> That was actually complete nonsense and not validating the BOs
> at all. The code just cleared all VM areas where it couldn't grab the
> lock for a BO.
>
> Try to fix this. Only compile tested at the moment.
>
> v2: fix fence slot reservation as well as pointed out by Sunil.
>     also validate PDs, PTs, per VM BOs and update PDEs
> v3: grab the status_lock while working with the done list.
> v4: rename functions, add some comments, fix waiting for updates to
>     complete.
> v5: rename amdgpu_vm_lock_done_list(), add some more comments
>
> Signed-off-by: Christian König <[email protected]>
> Reviewed-by: Sunil Khatri <[email protected]>
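
The flow in v5 is much easier to follow. For anyone skimming the series,
amdgpu_userq_vm_validate() now boils down to the following (a condensed
sketch only; error handling and the drm_exec contention retries are
trimmed, see the real code below):

drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(&exec) {
        /* lock the PD and everything already on the done list */
        amdgpu_vm_lock_pd(vm, &exec, 1);
        amdgpu_vm_lock_done_list(vm, &exec, 1);

        /* validate PDs, PTs and per VM BOs ... */
        amdgpu_vm_validate(adev, vm, NULL, amdgpu_userq_validate_vm, NULL);

        /* ... then lock, validate and update the invalidated BOs */
        amdgpu_userq_bo_validate(adev, &exec, vm);
}

/* flush out updates for moved BOs and rewrite the PDEs */
amdgpu_vm_handle_moved(adev, vm, NULL);
amdgpu_vm_update_pdes(adev, vm, false);

/* wait for all PT updates before restarting the queues */
list_for_each_entry(bo_va, &vm->done, base.vm_status)
        dma_fence_wait(bo_va->last_pt_update, false);
dma_fence_wait(vm->last_update, false);
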
Series is:
Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 148 +++++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  35 +++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |   2 +
>  3 files changed, 110 insertions(+), 75 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 9608fe3b5a9e..0ccbd3c5d88d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -661,108 +661,106 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>          return ret;
>  }
>
> +static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
> +{
> +        struct ttm_operation_ctx ctx = { false, false };
> +
> +        amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> +        return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> +}
> +
> +/* Handle all BOs on the invalidated list, validate them and update the PTs */
>  static int
> -amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
> +amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
> +                         struct amdgpu_vm *vm)
>  {
>          struct ttm_operation_ctx ctx = { false, false };
> +        struct amdgpu_bo_va *bo_va;
> +        struct amdgpu_bo *bo;
>          int ret;
>
> -        amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> +        spin_lock(&vm->status_lock);
> +        while (!list_empty(&vm->invalidated)) {
> +                bo_va = list_first_entry(&vm->invalidated,
> +                                         struct amdgpu_bo_va,
> +                                         base.vm_status);
> +                spin_unlock(&vm->status_lock);
>
> -        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> -        if (ret)
> -                DRM_ERROR("Fail to validate\n");
> +                bo = bo_va->base.bo;
> +                ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
> +                if (unlikely(ret))
> +                        return ret;
>
> -        return ret;
> +                amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
> +                ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> +                if (ret)
> +                        return ret;
> +
> +                /* This moves the bo_va to the done list */
> +                ret = amdgpu_vm_bo_update(adev, bo_va, false);
> +                if (ret)
> +                        return ret;
> +
> +                spin_lock(&vm->status_lock);
> +        }
> +        spin_unlock(&vm->status_lock);
> +
> +        return 0;
>  }
>
> +/* Make sure the whole VM is ready to be used */
>  static int
> -amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
> +amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
>  {
>          struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> -        struct amdgpu_vm *vm = &fpriv->vm;
>          struct amdgpu_device *adev = uq_mgr->adev;
> +        struct amdgpu_vm *vm = &fpriv->vm;
>          struct amdgpu_bo_va *bo_va;
> -        struct ww_acquire_ctx *ticket;
>          struct drm_exec exec;
> -        struct amdgpu_bo *bo;
> -        struct dma_resv *resv;
> -        bool clear, unlock;
> -        int ret = 0;
> +        int ret;
>
>          drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>          drm_exec_until_all_locked(&exec) {
> -                ret = amdgpu_vm_lock_pd(vm, &exec, 2);
> +                ret = amdgpu_vm_lock_pd(vm, &exec, 1);
>                  drm_exec_retry_on_contention(&exec);
> -                if (unlikely(ret)) {
> -                        drm_file_err(uq_mgr->file, "Failed to lock PD\n");
> +                if (unlikely(ret))
>                          goto unlock_all;
> -                }
> -
> -                /* Lock the done list */
> -                list_for_each_entry(bo_va, &vm->done, base.vm_status) {
> -                        bo = bo_va->base.bo;
> -                        if (!bo)
> -                                continue;
> -
> -                        ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> -                        drm_exec_retry_on_contention(&exec);
> -                        if (unlikely(ret))
> -                                goto unlock_all;
> -                }
> -        }
> -
> -        spin_lock(&vm->status_lock);
> -        while (!list_empty(&vm->moved)) {
> -                bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
> -                                         base.vm_status);
> -                spin_unlock(&vm->status_lock);
> -
> -                /* Per VM BOs never need to bo cleared in the page tables */
> -                ret = amdgpu_vm_bo_update(adev, bo_va, false);
> -                if (ret)
> +                ret = amdgpu_vm_lock_done_list(vm, &exec, 1);
> +                drm_exec_retry_on_contention(&exec);
> +                if (unlikely(ret))
>                          goto unlock_all;
> -                spin_lock(&vm->status_lock);
> -        }
> -
> -        ticket = &exec.ticket;
> -        while (!list_empty(&vm->invalidated)) {
> -                bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
> -                                         base.vm_status);
> -                resv = bo_va->base.bo->tbo.base.resv;
> -                spin_unlock(&vm->status_lock);
>
> -                bo = bo_va->base.bo;
> -                ret = amdgpu_userq_validate_vm_bo(NULL, bo);
> -                if (ret) {
> -                        drm_file_err(uq_mgr->file, "Failed to validate BO\n");
> +                /* This validates PDs, PTs and per VM BOs */
> +                ret = amdgpu_vm_validate(adev, vm, NULL,
> +                                         amdgpu_userq_validate_vm,
> +                                         NULL);
> +                if (unlikely(ret))
>                          goto unlock_all;
> -                }
>
> -                /* Try to reserve the BO to avoid clearing its ptes */
> -                if (!adev->debug_vm && dma_resv_trylock(resv)) {
> -                        clear = false;
> -                        unlock = true;
> -                /* The caller is already holding the reservation lock */
> -                } else if (dma_resv_locking_ctx(resv) == ticket) {
> -                        clear = false;
> -                        unlock = false;
> -                /* Somebody else is using the BO right now */
> -                } else {
> -                        clear = true;
> -                        unlock = false;
> -                }
> +                /* This locks and validates the remaining evicted BOs */
> +                ret = amdgpu_userq_bo_validate(adev, &exec, vm);
> +                drm_exec_retry_on_contention(&exec);
> +                if (unlikely(ret))
> +                        goto unlock_all;
> +        }
>
> -                ret = amdgpu_vm_bo_update(adev, bo_va, clear);
> +        ret = amdgpu_vm_handle_moved(adev, vm, NULL);
> +        if (ret)
> +                goto unlock_all;
>
> -                if (unlock)
> -                        dma_resv_unlock(resv);
> -                if (ret)
> -                        goto unlock_all;
> +        ret = amdgpu_vm_update_pdes(adev, vm, false);
> +        if (ret)
> +                goto unlock_all;
>
> -                spin_lock(&vm->status_lock);
> -        }
> -        spin_unlock(&vm->status_lock);
> +        /*
> +         * We need to wait for all VM updates to finish before restarting the
> +         * queues. Using the done list like that is now ok since everything is
> +         * locked in place.
> +         */
> +        list_for_each_entry(bo_va, &vm->done, base.vm_status)
> +                dma_fence_wait(bo_va->last_pt_update, false);
> +        dma_fence_wait(vm->last_update, false);
>
>          ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
>          if (ret)
> @@ -783,7 +781,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
>
>          mutex_lock(&uq_mgr->userq_mutex);
>
> -        ret = amdgpu_userq_validate_bos(uq_mgr);
> +        ret = amdgpu_userq_vm_validate(uq_mgr);
>          if (ret) {
>                  drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
>                  goto unlock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index bd12d8ff15a4..9980c0cded94 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
>                                2 + num_fences);
>  }
>
> +/**
> + * amdgpu_vm_lock_done_list - lock all BOs on the done list
> + * @exec: drm execution context
> + * @num_fences: number of extra fences to reserve
> + *
> + * Lock the BOs on the done list in the DRM execution context.
> + */
> +int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
> +                             unsigned int num_fences)
> +{
> +        struct list_head *prev = &vm->done;
> +        struct amdgpu_bo_va *bo_va;
> +        struct amdgpu_bo *bo;
> +        int ret;
> +
> +        /* We can only trust prev->next while holding the lock */
> +        spin_lock(&vm->status_lock);
> +        while (!list_is_head(prev->next, &vm->done)) {
> +                bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
> +                spin_unlock(&vm->status_lock);
> +
> +                bo = bo_va->base.bo;
> +                if (bo) {
> +                        ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1);
> +                        if (unlikely(ret))
> +                                return ret;
> +                }
> +                spin_lock(&vm->status_lock);
> +                prev = prev->next;
> +        }
> +        spin_unlock(&vm->status_lock);
> +
> +        return 0;
> +}
> +
>  /**
>   * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
>   *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e045c1590d78..3409904b5c63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -491,6 +491,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>  int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
>                        unsigned int num_fences);
> +int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
> +                             unsigned int num_fences);
>  bool amdgpu_vm_ready(struct amdgpu_vm *vm);
>  uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
>  int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> --
> 2.43.0
>
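
One more note for future readers, on the amdgpu_vm_lock_done_list()
traversal: the function only trusts prev->next while vm->status_lock is
held and re-takes the lock before advancing the cursor, which is what
makes dropping the spinlock around the sleeping drm_exec_prepare_obj()
safe. Below is a standalone userspace sketch of the same pattern
(pthread spinlock, hand-rolled circular list; every name in it is made
up for illustration). Like the driver code, it assumes visited nodes
are not removed from the list while the lock is dropped, so the cursor
stays valid across the unlock.

/*
 * Walk a circular list that is only stable under a spinlock, dropping
 * the lock around work that may sleep.  The cursor "prev" stays valid
 * across the unlock because visited nodes are never removed.
 * Build with: cc -o walk walk.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *next;
        int payload;
};

static struct node head = { &head, 0 };         /* empty circular list */
static pthread_spinlock_t list_lock;

static void work_that_may_sleep(struct node *n)
{
        printf("processing %d\n", n->payload);  /* stand-in for a blocking call */
}

static void walk(void)
{
        struct node *prev = &head;

        /* We can only trust prev->next while holding the lock */
        pthread_spin_lock(&list_lock);
        while (prev->next != &head) {
                struct node *n = prev->next;

                pthread_spin_unlock(&list_lock);
                work_that_may_sleep(n);         /* lock dropped here */
                pthread_spin_lock(&list_lock);
                prev = prev->next;              /* re-read under the lock */
        }
        pthread_spin_unlock(&list_lock);
}

int main(void)
{
        struct node a = { &head, 1 }, b = { &a, 2 };

        pthread_spin_init(&list_lock, PTHREAD_PROCESS_PRIVATE);
        head.next = &b;                         /* head -> b -> a -> head */
        walk();
        return 0;
}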
