User queue restore takes reservation locks before userq_mutex, but the
create and destroy paths can take userq_mutex and then nest
reset_domain->sem under it. Lockdep rightfully reports that as a
possible deadlock against the restore worker and other reservation
users.

Fix this by keeping reset_domain->sem outside the userq_mutex section in
the create path, and by moving queue cleanup out from under userq_mutex
in the destroy path. Remove the queue from the global doorbell lookup
before dropping userq_mutex so IRQ paths cannot access it while teardown
continues.

Signed-off-by: Prike Liang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 2408f888c4d9..551426741a7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -447,8 +447,6 @@ static void amdgpu_userq_cleanup(struct 
amdgpu_usermode_queue *queue)
        /* Drop the userq reference. */
        amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
        uq_funcs->mqd_destroy(queue);
-       /* Use interrupt-safe locking since IRQ handlers may access these 
XArrays */
-       xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
        amdgpu_userq_fence_driver_free(queue);
        queue->fence_drv = NULL;
        queue->userq_mgr = NULL;
@@ -662,8 +660,12 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, 
struct amdgpu_usermode_que
                drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW 
mapping userq\n");
                queue->state = AMDGPU_USERQ_STATE_HUNG;
        }
-       amdgpu_userq_cleanup(queue);
+       /* Remove the queue from the global doorbell lookup before dropping
+        * userq_mutex so IRQ paths can't access it while cleanup continues.
+        */
+       xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
        mutex_unlock(&uq_mgr->userq_mutex);
+       amdgpu_userq_cleanup(queue);
 
        pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
 
@@ -799,6 +801,13 @@ amdgpu_userq_create(struct drm_file *filp, union 
drm_amdgpu_userq *args)
                goto clean_fence_driver;
        }
 
+       /*
+        * Keep reset_domain->sem outside the userq_mutex section returned by
+        * amdgpu_userq_ensure_ev_fence(). Restore acquires reservation locks
+        * before userq_mutex, so taking reset_domain->sem after userq_mutex
+        * would invert the established order and trigger lockdep.
+        */
+       down_read(&adev->reset_domain->sem);
        amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
 
        /* don't map the queue if scheduling is halted */
@@ -812,16 +821,13 @@ amdgpu_userq_create(struct drm_file *filp, union 
drm_amdgpu_userq *args)
                r = amdgpu_userq_map_helper(queue);
                if (r) {
                        drm_file_err(uq_mgr->file, "Failed to map Queue\n");
-                       goto clean_mqd;
+                       goto clean_reset_domain;
                }
        }
 
        /* drop this refcount during queue destroy */
        kref_init(&queue->refcount);
 
-       /* Wait for mode-1 reset to complete */
-       down_read(&adev->reset_domain->sem);
-
        r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
                     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
        if (r) {
@@ -850,7 +856,6 @@ amdgpu_userq_create(struct drm_file *filp, union 
drm_amdgpu_userq *args)
 
 clean_reset_domain:
        up_read(&adev->reset_domain->sem);
-clean_mqd:
        mutex_unlock(&uq_mgr->userq_mutex);
        uq_funcs->mqd_destroy(queue);
 clean_fence_driver:
-- 
2.34.1

Reply via email to