CPU0: hang_detect_work → directly calls reset_work()
CPU1: evict_all → queues reset_work (via workqueue)
As a result, two reset threads can run at the same time. To prevent that, add a per-queue-manager flag so that a second, concurrent invocation of the reset work exits early instead of duplicating the reset. Signed-off-by: Sunil Khatri <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 16 ++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 1 + 2 files changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 0a1fc45f5b4e..1440f51b667f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -109,6 +109,19 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work) if (!amdgpu_gpu_recovery) return; + /* + * Prevent concurrent/duplicate reset executions. Both hang_detect_work + * (direct call) and evict_all (via schedule+flush_work) can invoke this + * function simultaneously. Use an atomic test-and-set so only the first + * caller proceeds; the second exits early. + * + * Note: amdgpu_in_reset() cannot be used here because in_gpu_reset is + * only set deep inside amdgpu_device_gpu_recover(), well after we've + * already entered this function. 
+ */ + if (atomic_cmpxchg(&uq_mgr->reset_in_progress, 0, 1) != 0) + return; + /* * Iterate through all queue types to detect and reset problematic queues * Process each queue type in the defined order @@ -145,6 +158,8 @@ static void amdgpu_userq_mgr_reset_work(struct work_struct *work) amdgpu_device_gpu_recover(adev, NULL, &reset_context); } + + atomic_set(&uq_mgr->reset_in_progress, 0); } static void amdgpu_userq_hang_detect_work(struct work_struct *work) @@ -1304,6 +1319,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker); INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work); + atomic_set(&userq_mgr->reset_in_progress, 0); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 49b33e2d6932..2748ecc0f6c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -129,6 +129,7 @@ struct amdgpu_userq_mgr { * Reset work which is used when eviction fails. */ struct work_struct reset_work; + atomic_t reset_in_progress; atomic_t userq_count[AMDGPU_RING_TYPE_MAX]; }; -- 2.34.1
