This patch adds support for userqueue resume. It does the following
(a minimal sketch of the underlying delayed-work pattern follows this
list):
- adds a new delayed work for resuming all the queues.
- schedules this delayed work from the suspend work.
- validates the BOs and replaces the eviction fence before resuming
  all the queues running under this instance of the userq manager.
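
For reference, below is a minimal sketch of the delayed-work pattern
the patch builds on; my_queue_mgr, my_resume_handler, my_mgr_init and
my_schedule_resume are illustrative names, not identifiers from the
driver:

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    struct my_queue_mgr {
            struct delayed_work resume_work;
    };

    static void my_resume_handler(struct work_struct *work)
    {
            /* delayed_work embeds a work_struct; recover the manager */
            struct my_queue_mgr *mgr =
                    container_of(work, struct my_queue_mgr, resume_work.work);

            /* revalidate BOs and restart the queues of 'mgr' here */
    }

    static void my_mgr_init(struct my_queue_mgr *mgr)
    {
            INIT_DELAYED_WORK(&mgr->resume_work, my_resume_handler);
    }

    static void my_schedule_resume(struct my_queue_mgr *mgr)
    {
            /* run my_resume_handler() roughly 100 ms from now */
            schedule_delayed_work(&mgr->resume_work, msecs_to_jiffies(100));
    }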

Cc: Alex Deucher <alexander.deuc...@amd.com>
Cc: Christian Koenig <christian.koe...@amd.com>
Signed-off-by: Shashank Sharma <shashank.sha...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 166 ++++++++++++++++++
 .../gpu/drm/amd/include/amdgpu_userqueue.h    |   1 +
 2 files changed, 167 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
index fdbd542e7f53..02ddd713d068 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
@@ -398,6 +398,167 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
        return r;
 }
 
+static int
+amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
+{
+       struct amdgpu_device *adev = uq_mgr->adev;
+       const struct amdgpu_userq_funcs *userq_funcs;
+       struct amdgpu_usermode_queue *queue;
+       int queue_id, r, ret = 0;
+
+       userq_funcs = adev->userq_funcs[AMDGPU_HW_IP_GFX];
+
+       /* Resume all the queues for this process */
+       idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
+               r = userq_funcs->resume(uq_mgr, queue);
+               if (r) {
+                       DRM_ERROR("Failed to resume queue %d\n", queue_id);
+                       ret = r;
+               }
+       }
+
+       return ret;
+}
+
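+/*
+ * Swap the process's eviction fence: detach the old fence from every
+ * BO on the VM's done list and attach a freshly created one, so the
+ * resumed queues are covered by an unsignaled fence again.
+ */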
+static int
+amdgpu_userqueue_replace_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
+                                 struct drm_exec *exec)
+{
+       int ret;
+       struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
+       struct amdgpu_vm *vm = &fpriv->vm;
+       struct amdgpu_eviction_fence *old_ef, *new_ef;
+       struct amdgpu_bo_va *bo_va, *tmp;
+
+       old_ef = fpriv->ev_fence;
+       new_ef = amdgpu_eviction_fence_create(fpriv);
+       if (!new_ef) {
+               DRM_ERROR("Failed to create new eviction fence\n");
+               return -ENOMEM;
+       }
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status) {
+               struct amdgpu_bo *bo = bo_va->base.bo;
+
+               /* Skip pinned BOs */
+               if (bo->tbo.pin_count)
+                       continue;
+
+               ret = drm_exec_lock_obj(exec, &bo->tbo.base);
+               if (unlikely(ret)) {
+                       DRM_ERROR("Failed to lock BO for eviction fence 
replacement\n");
+                       goto free_err;
+               }
+
+               /* replace the old eviction fence with new one */
+               amdgpu_eviction_fence_detach(fpriv, old_ef, bo);
+               ret = amdgpu_eviction_fence_attach(new_ef, bo);
+               if (ret) {
+                       DRM_ERROR("Failed to attch new eviction fence\n");
+                       goto free_err;
+               }
+       }
+
+       /* Update the new eviction fence */
+       fpriv->ev_fence = new_ef;
+       kfree(old_ef);
+       return 0;
+
+free_err:
+       kfree(new_ef);
+       return ret;
+}
+
+/*
+ * Lock and validate all BOs of this VM so the resumed queues can use
+ * them. Expects drm_exec_until_all_locked() to have been called on
+ * this exec beforehand.
+ */
+static int
+amdgpu_userqueue_validate_bos(struct amdgpu_userq_mgr *uq_mgr,
+                             struct drm_exec *exec)
+{
+       int ret;
+       struct amdgpu_bo *bo;
+       struct amdgpu_bo_va *bo_va, *tmp;
+       struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
+       struct amdgpu_vm *vm = &fpriv->vm;
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status) {
+               bo = bo_va->base.bo;
+               ret = drm_exec_lock_obj(exec, &bo->tbo.base);
+               if (unlikely(ret)) {
+                       DRM_ERROR("Failed to exec lock  for validation\n");
+                       goto unlock_all;
+               }
+       }
+
+       list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
+               bo = bo_va->base.bo;
+               ret = drm_exec_lock_obj(exec, &bo->tbo.base);
+               if (unlikely(ret)) {
+                       DRM_ERROR("Failed to lock BO for validation\n");
+                       goto unlock_all;
+               }
+
+               /* drm_exec_lock_obj() above already holds the BO's reservation */
+               ret = amdgpu_userqueue_validate_bo(bo);
+               if (ret) {
+                       DRM_ERROR("Failed to validate BO\n");
+                       goto unlock_all;
+               }
+       }
+
+       ret = amdgpu_vm_handle_moved(uq_mgr->adev, vm, NULL);
+       if (ret)
+               DRM_ERROR("Failed to handle moved BOs\n");
+
+unlock_all:
+       return ret;
+}
+
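+/*
+ * Delayed work handler: locks the VM PD, revalidates the process's
+ * BOs, replaces the eviction fence and then resumes all queues of
+ * this userq manager.
+ */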
+static void amdgpu_userqueue_resume_worker(struct work_struct *work)
+{
+       int ret;
+       struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
+       struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
+       struct amdgpu_vm *vm = &fpriv->vm;
+       struct drm_exec exec;
+
+       mutex_lock(&uq_mgr->userq_mutex);
+
+       drm_exec_init(&exec, 0, 0);
+       drm_exec_until_all_locked(&exec) {
+               ret = amdgpu_vm_lock_pd(vm, &exec, 2);
+               if (unlikely(ret)) {
+                       DRM_ERROR("Failed to lock PD\n");
+                       goto unlock_all;
+               }
+       }
+
+       ret = amdgpu_userqueue_validate_bos(uq_mgr, &exec);
+       if (ret) {
+               DRM_ERROR("Failed to validate BOs to restore\n");
+               goto unlock_all;
+       }
+
+       ret = amdgpu_userqueue_replace_ev_fence(uq_mgr, &exec);
+       if (ret) {
+               DRM_ERROR("Failed to signal eviction\n");
+               goto unlock_all;
+       }
+
+       ret = amdgpu_userqueue_resume_all(uq_mgr);
+       if (ret) {
+               DRM_ERROR("Failed to resume all queues\n");
+               goto unlock_all;
+       }
+
+unlock_all:
+       drm_exec_fini(&exec);
+       mutex_unlock(&uq_mgr->userq_mutex);
+}
+
 static int
 amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -442,6 +603,10 @@ amdgpu_userqueue_suspend_worker(struct work_struct *work)
                return;
        }
 
+       /* Schedule delayed work to resume the userqueues after 100 ms */
+       schedule_delayed_work(&uq_mgr->resume_work,
+                             msecs_to_jiffies(AMDGPU_USERQ_RESUME_TIME_MS));
+
 unlock:
        mutex_unlock(&uq_mgr->userq_mutex);
 }
@@ -475,6 +640,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi
        /* This reference is required for suspend work */
        fpriv->ev_fence->uq_mgr = userq_mgr;
        INIT_DELAYED_WORK(&userq_mgr->suspend_work, amdgpu_userqueue_suspend_worker);
+       INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userqueue_resume_worker);
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
index 647e63bf03ab..2e3fe784188e 100644
--- a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
+++ b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
@@ -73,6 +73,7 @@ struct amdgpu_userq_mgr {
        struct amdgpu_device            *adev;
 
        struct delayed_work             suspend_work;
+       struct delayed_work             resume_work;
 };
 
 int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
-- 
2.43.2
