User queue restore takes reservation locks before userq_mutex, but the create and destroy paths can take userq_mutex and then nest reset_domain->sem under it. Lockdep rightfully reports that as a possible deadlock against the restore worker and other reservation users.
Fix this by keeping reset_domain->sem outside the userq_mutex section in the create path, and by moving queue cleanup out from under userq_mutex in the destroy path. Remove the queue from the global doorbell lookup before dropping userq_mutex so IRQ paths cannot access it while teardown continues. Signed-off-by: Prike Liang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 2408f888c4d9..551426741a7f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue) /* Drop the userq reference. */ amdgpu_userq_buffer_vas_list_cleanup(adev, queue); uq_funcs->mqd_destroy(queue); - /* Use interrupt-safe locking since IRQ handlers may access these XArrays */ - xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index); amdgpu_userq_fence_driver_free(queue); queue->fence_drv = NULL; queue->userq_mgr = NULL; @@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n"); queue->state = AMDGPU_USERQ_STATE_HUNG; } - amdgpu_userq_cleanup(queue); + /* Remove the queue from the global doorbell lookup before dropping + * userq_mutex so IRQ paths can't access it while cleanup continues. + */ + xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index); mutex_unlock(&uq_mgr->userq_mutex); + amdgpu_userq_cleanup(queue); pm_runtime_put_autosuspend(adev_to_drm(adev)->dev); @@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) goto clean_fence_driver; } + /* + * Keep reset_domain->sem outside the userq_mutex section returned by + * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks + * before userq_mutex, so taking reset_domain->sem after userq_mutex + * would invert the established order and trigger lockdep. + */ + down_read(&adev->reset_domain->sem); amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr); /* don't map the queue if scheduling is halted */ @@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) r = amdgpu_userq_map_helper(queue); if (r) { drm_file_err(uq_mgr->file, "Failed to map Queue\n"); - goto clean_mqd; + goto clean_reset_domain; } } /* drop this refcount during queue destroy */ kref_init(&queue->refcount); - /* Wait for mode-1 reset to complete */ - down_read(&adev->reset_domain->sem); - r = xa_alloc(&uq_mgr->userq_xa, &qid, queue, XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL); if (r) { @@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) clean_reset_domain: up_read(&adev->reset_domain->sem); -clean_mqd: mutex_unlock(&uq_mgr->userq_mutex); uq_funcs->mqd_destroy(queue); clean_fence_driver: -- 2.34.1
