On 4/8/26 04:52, Prike Liang wrote:
> amdgpu_eviction_fence_suspend_worker() runs amdgpu_userq_wait_for_signal()
> with userq_mutex held. The helper walks the xarray and blocks on
> queue->last_fence while keeping that lock, so the userspace signal path
> can never take the lock while the worker sleeps waiting on the fence,
> which then triggers 120s hung task warnings.

And that is perfectly intentional.
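The pattern in question, reduced to a minimal sketch (hypothetical helpers
around a bare mutex and fence, not the actual driver paths):

#include <linux/dma-fence.h>
#include <linux/mutex.h>

/* Eviction side: sleeps on the queue fence with userq_mutex held. */
static void evict_side(struct mutex *userq_mutex, struct dma_fence *f)
{
	mutex_lock(userq_mutex);
	dma_fence_wait(f, false);	/* sleeps here ... */
	mutex_unlock(userq_mutex);
}

/* Signal side: per the commit message it needs the same mutex first. */
static void signal_side(struct mutex *userq_mutex, struct dma_fence *f)
{
	mutex_lock(userq_mutex);	/* ... so this never makes progress */
	dma_fence_signal(f);
	mutex_unlock(userq_mutex);
}

If the fence can only be signaled from a path that takes userq_mutex, the
wait above can never finish; that is the 120s hung task report.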
>
> Meanwhile, this also reworks the userq lock access in the eviction
> suspension path to resolve the lockdep/lock order issues.
>
> Signed-off-by: Prike Liang <[email protected]>
> ---
>  .../drm/amd/amdgpu/amdgpu_eviction_fence.c |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 107 ++++++++++++++----
>  2 files changed, 85 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> index 5ae477c49a53..00c450e31139 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
> @@ -73,7 +73,6 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
>  	 * allocate memory while holding this lock, but only after ensuring that
>  	 * the eviction fence is signaled.
>  	 */
> -	cookie = dma_fence_begin_signalling();
>
>  	ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
>  	amdgpu_userq_evict(uq_mgr);
> @@ -83,6 +82,7 @@ amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
>  	 * userq_mutex. Otherwise we won't resume the queues before issuing the
>  	 * next fence.
>  	 */
> +	cookie = dma_fence_begin_signalling();

Absolutely clear NAK to that. This only disables the warning but doesn't
fix the locking problem.

As far as I can see the patch here is just once more utter nonsense.

What problem exactly are you trying to solve?
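To be clear, dma_fence_begin_signalling()/dma_fence_end_signalling() are
lockdep-only annotations; they don't change any locking by themselves. A
minimal sketch of the pattern they are meant to wrap (placeholder fence,
not the driver code):

#include <linux/dma-fence.h>

static void signal_fence_annotated(struct dma_fence *fence)
{
	/* The cookie only drives lockdep; with lockdep disabled both
	 * calls are no-ops.
	 */
	bool cookie = dma_fence_begin_signalling();

	/* Nothing in here may block on memory allocation or on locks
	 * that are also held while waiting for fences.
	 */
	dma_fence_signal(fence);
	dma_fence_end_signalling(cookie);
}

Moving the begin_signalling call to just before dma_fence_signal() shrinks
the annotated section so that amdgpu_userq_evict() is no longer covered,
which silences the splat without removing the underlying dependency.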
Regards,
Christian.

>  	dma_fence_signal(ev_fence);
>  	dma_fence_end_signalling(cookie);
>  	dma_fence_put(ev_fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 9d3c39e96ac1..7691f169415b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -26,6 +26,7 @@
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
>  #include <drm/drm_drv.h>
> +#include <linux/lockdep.h>
>
>  #include "amdgpu.h"
>  #include "amdgpu_reset.h"
> @@ -34,6 +35,23 @@
>  #include "amdgpu_hmm.h"
>  #include "amdgpu_userq_fence.h"
>
> +#define AMDGPU_USERQ_FENCE_WAIT_POLL_MS 1000
> +static unsigned long
> +amdgpu_userq_fence_timeout_ms(struct amdgpu_usermode_queue *queue)
> +{
> +	struct amdgpu_device *adev = queue->userq_mgr->adev;
> +	switch (queue->queue_type) {
> +	case AMDGPU_RING_TYPE_GFX:
> +		return adev->gfx_timeout;
> +	case AMDGPU_RING_TYPE_COMPUTE:
> +		return adev->compute_timeout;
> +	case AMDGPU_RING_TYPE_SDMA:
> +		return adev->sdma_timeout;
> +	default:
> +		return adev->gfx_timeout;
> +	}
> +}
> +
>  u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>  {
>  	int i;
> @@ -176,29 +194,12 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
>   */
>  void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
>  {
> -	struct amdgpu_device *adev;
>  	unsigned long timeout_ms;
>
>  	if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
>  		return;
>
> -	adev = queue->userq_mgr->adev;
> -	/* Determine timeout based on queue type */
> -	switch (queue->queue_type) {
> -	case AMDGPU_RING_TYPE_GFX:
> -		timeout_ms = adev->gfx_timeout;
> -		break;
> -	case AMDGPU_RING_TYPE_COMPUTE:
> -		timeout_ms = adev->compute_timeout;
> -		break;
> -	case AMDGPU_RING_TYPE_SDMA:
> -		timeout_ms = adev->sdma_timeout;
> -		break;
> -	default:
> -		timeout_ms = adev->gfx_timeout;
> -		break;
> -	}
> -
> +	timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
>  	/* Store the fence to monitor and schedule hang detection */
>  	WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
>  	schedule_delayed_work(&queue->hang_detect_work, msecs_to_jiffies(timeout_ms));
> @@ -1274,16 +1275,76 @@ void amdgpu_userq_reset_work(struct work_struct *work)
>  static void
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> -	struct amdgpu_usermode_queue *queue;
> -	unsigned long queue_id;
> +	lockdep_assert_held(&uq_mgr->userq_mutex);
>
> -	xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> -		struct dma_fence *f = queue->last_fence;
> +	/* Rescan the userq xarray after each fence poll interval to get
> +	 * newly added queues or fences.
> +	 */
> +	for (;;) {
> +		struct amdgpu_usermode_queue *queue;
> +		unsigned long queue_id = 0;
> +		struct dma_fence *f = NULL;
> +		unsigned long timeout_ms = 0;
> +		u64 context = 0, seqno = 0;
> +		bool signaled = false;
> +
> +		xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
> +			struct dma_fence *tmp = queue->last_fence;
> +
> +			if (!tmp || dma_fence_is_signaled(tmp))
> +				continue;
> +
> +			f = dma_fence_get(tmp);
> +			timeout_ms = amdgpu_userq_fence_timeout_ms(queue);
> +			context = tmp->context;
> +			seqno = tmp->seqno;
> +			break;
> +		}
>
>  		if (!f)
> +			return;
> +
> +		if (!timeout_ms)
> +			timeout_ms = 1;
> +
> +		/*
> +		 * We can't use dma_fence_wait() here. Waiting there and then
> +		 * reacquiring userq_mutex creates a lockdep cycle through
> +		 * dma_fence_map:
> +		 *   userq_mutex -> reservation_ww_class_mutex -> dma_fence_map
> +		 * and
> +		 *   dma_fence_map -> userq_mutex
> +		 * Instead, drop the mutex, sleep in bounded intervals, then
> +		 * reacquire and poll the fence signaled bit.
> +		 */
> +		while (timeout_ms) {
> +			unsigned long interval_ms;
> +
> +			if (dma_fence_is_signaled(f)) {
> +				signaled = true;
> +				break;
> +			}
> +
> +			interval_ms = min(timeout_ms,
> +					  (unsigned long)AMDGPU_USERQ_FENCE_WAIT_POLL_MS);
> +			mutex_unlock(&uq_mgr->userq_mutex);
> +			msleep(interval_ms);
> +			mutex_lock(&uq_mgr->userq_mutex);
> +			timeout_ms -= interval_ms;
> +		}
> +
> +		if (!signaled && dma_fence_is_signaled(f))
> +			signaled = true;
> +
> +		dma_fence_put(f);
> +
> +		if (signaled)
>  			continue;
>
> -		dma_fence_wait(f, false);
> +		drm_dbg(adev_to_drm(uq_mgr->adev),
> +			"Timed out waiting for fence=%llu:%llu during eviction\n",
> +			context, seqno);
> +		amdgpu_userq_detect_and_reset_queues(uq_mgr);
>  	}
>  }
>
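The bounded drop-and-poll wait the new loop implements, reduced to its
general shape (a generic sketch with placeholder names, not the driver
code):

#include <linux/delay.h>
#include <linux/dma-fence.h>
#include <linux/lockdep.h>
#include <linux/minmax.h>
#include <linux/mutex.h>

/* Wait up to @timeout_ms for @f to signal, dropping @lock while
 * sleeping so that signaling paths which need @lock can make progress.
 * Returns true if the fence signaled within the timeout.
 */
static bool poll_fence_unlocked(struct mutex *lock, struct dma_fence *f,
				unsigned long timeout_ms,
				unsigned long interval_ms)
{
	lockdep_assert_held(lock);

	while (timeout_ms) {
		unsigned long step = min(timeout_ms, interval_ms);

		if (dma_fence_is_signaled(f))
			return true;

		mutex_unlock(lock);
		msleep(step);
		mutex_lock(lock);
		timeout_ms -= step;
	}

	return dma_fence_is_signaled(f);
}

Because the mutex is released on every interval, anything it protects can
change across the sleep, which is why the hunk above restarts the xarray
scan after each wait instead of continuing from the old iterator.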
