Add robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:
1. Queue detection and reset logic:
- amdgpu_userq_detect_and_reset_queues() identifies failed queues
- Per-IP detect_and_reset callbacks for targeted recovery
- Falls back to full GPU reset when needed
2. Reset infrastructure:
- Adds a userq_reset_work work item for async reset handling
- Implements pre/post reset handlers for queue state management
- Integrates with existing GPU reset framework
3. Error handling improvements:
- Enhanced state tracking with HUNG state
- Automatic reset triggering on critical failures
- VRAM loss handling during recovery
4. Integration points:
- Added to device init/reset paths
- Called during queue destroy, suspend, and isolation events
- Handles both individual queue and full GPU resets
The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.
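For reviewers, the overall recovery strategy can be summarized by the
simplified sketch below. This is illustration only, not part of the diff:
the types and helper names here are placeholders, while the real
implementation (amdgpu_userq_detect_and_reset_queues() and
amdgpu_userq_reset_work() in amdgpu_userq.c) additionally handles the
userq mutexes and the per-queue-type usage counters.

  /* Illustrative sketch: try targeted per-IP detect-and-reset first,
   * and only escalate to a full GPU reset when a targeted reset fails.
   * All names below are placeholders, not amdgpu symbols.
   */
  #include <stdbool.h>

  enum ring_type { RING_GFX, RING_COMPUTE, RING_SDMA, RING_MAX };

  struct userq_mgr_sketch {
          int queue_count[RING_MAX];                   /* active user queues per type */
          int (*detect_and_reset[RING_MAX])(int type); /* per-IP callbacks */
          void (*full_gpu_reset)(void);                /* schedule + flush reset work */
  };

  static int detect_and_reset_queues_sketch(struct userq_mgr_sketch *mgr)
  {
          bool need_full_reset = false;
          int type, r = 0;

          for (type = 0; type < RING_MAX; type++) {
                  /* skip IP types with no active user queues or no callback */
                  if (!mgr->queue_count[type] || !mgr->detect_and_reset[type])
                          continue;
                  r = mgr->detect_and_reset[type](type);
                  if (r) {
                          /* targeted reset failed, escalate to a full reset */
                          need_full_reset = true;
                          break;
                  }
          }

          if (need_full_reset)
                  mgr->full_gpu_reset();

          return r;
  }

In the patch itself the escalation path schedules adev->userq_reset_work
on the reset domain and flushes it, so callers see the full reset complete
before returning.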
v2: add detection and reset calls when preemption/unmap fails.
    add a per-device userq counter for each user queue type. (Alex)
v3: make sure we hold the adev->userq_mutex when we call
amdgpu_userq_detect_and_reset_queues. (Alex)
warn if the adev->userq_mutex is not held.
v4: make sure all of the uqm->userq_mutex locks are held.
warn if the uqm->userq_mutex is not held.
v5: use an array for the user queue type counters. (Alex)
    all of the uqm->userq_mutex locks need to be held when calling
    detect and reset. (Alex)
v6: simplify the userq locking (Alex)
Signed-off-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 192 +++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
5 files changed, 193 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c56839528843..93c255d1ddfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1311,6 +1311,7 @@ struct amdgpu_device {
struct list_head userq_mgr_list;
struct mutex userq_mutex;
bool userq_halt_for_enforce_isolation;
+ struct work_struct userq_reset_work;
struct amdgpu_uid *uid_info;
/* KFD
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b20383021b50..f1d3c16c67cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+ INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
adev->gfx.gfx_off_req_count = 1;
adev->gfx.gfx_off_residency = 0;
@@ -5951,6 +5952,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
+ r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+ if (r)
+ goto out;
+
drm_client_dev_resume(adev_to_drm(tmp_adev), false);
/*
@@ -6173,6 +6178,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
+ cancel_work(&adev->userq_reset_work);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);
@@ -6293,6 +6299,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
+ amdgpu_userq_pre_reset(tmp_adev);
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 80b85547c810..cf21f36b5761 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -83,6 +83,7 @@ enum amdgpu_ring_type {
AMDGPU_RING_TYPE_MES,
AMDGPU_RING_TYPE_UMSCH_MM,
AMDGPU_RING_TYPE_CPER,
+ AMDGPU_RING_TYPE_MAX,
};
enum amdgpu_ib_pool_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index af92450ea6eb..adfacfa495c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -25,8 +25,10 @@
#include <drm/drm_auth.h>
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
#include "amdgpu.h"
+#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_userq_fence.h"
@@ -44,6 +46,67 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
return userq_ip_mask;
}
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+ if (amdgpu_device_should_recover_gpu(adev)) {
+ amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->userq_reset_work);
+ /* Wait for the reset job to complete */
+ flush_work(&adev->userq_reset_work);
+ }
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_gfx_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
+ const struct amdgpu_userq_funcs *userq_compute_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
+ const struct amdgpu_userq_funcs *userq_sdma_funcs =
+ adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
+ bool gpu_reset = false;
+ int r = 0;
+
+ /* warning if global mutex is not held */
+ WARN_ON(!mutex_is_locked(&adev->userq_mutex));
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ } else if (amdgpu_gpu_recovery) {
+ if ((atomic_read(&uq_mgr->userq_count[AMDGPU_RING_TYPE_COMPUTE]) > 0) && userq_compute_funcs->detect_and_reset) {
+ r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
+ if (r) {
+ gpu_reset = true;
+ goto gpu_reset;
+ }
+ }
+
+ if ((atomic_read(&uq_mgr->userq_count[AMDGPU_RING_TYPE_GFX]) > 0) && userq_gfx_funcs->detect_and_reset) {
+ r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
+ if (r) {
+ gpu_reset = true;
+ goto gpu_reset;
+ }
+ }
+
+ if ((atomic_read(&uq_mgr->userq_count[AMDGPU_RING_TYPE_SDMA]) > 0) && userq_sdma_funcs->detect_and_reset) {
+ r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
+ if (r) {
+ gpu_reset = true;
+ goto gpu_reset;
+ }
+ }
+ }
+
+gpu_reset:
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
+ return r;
+}
+
int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr,
u64 expected_size)
{
@@ -83,17 +146,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool found_hung_queue = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->preempt(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ found_hung_queue = true;
} else {
queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
}
}
+ if (found_hung_queue)
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
return r;
}
@@ -125,16 +193,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool found_hung_queue = false;
int r = 0;
if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
r = userq_funcs->unmap(uq_mgr, queue);
- if (r)
+ if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
- else
+ found_hung_queue = true;
+ } else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+ }
}
+
+ if (found_hung_queue)
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
return r;
}
@@ -145,16 +220,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
r = userq_funcs->map(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ gpu_reset = true;
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -378,10 +459,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
amdgpu_bo_unreserve(queue->db_obj.obj);
}
amdgpu_bo_unref(&queue->db_obj.obj);
-
+ atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
#if defined(CONFIG_DEBUG_FS)
debugfs_remove_recursive(queue->debugfs_queue);
#endif
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
r = amdgpu_userq_unmap_helper(uq_mgr, queue);
/*TODO: It requires a reset for userq hw unmap error*/
if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -587,6 +669,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
kfree(queue_name);
args->out.queue_id = qid;
+ atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
unlock:
mutex_unlock(&adev->userq_mutex);
@@ -847,6 +930,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
int queue_id;
int ret = 0, r;
+ amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
r = amdgpu_userq_preempt_helper(uq_mgr, queue);
@@ -859,6 +943,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
return ret;
}
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+ struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+ userq_reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -885,22 +986,19 @@ void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
{
- int ret;
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ int ret;
/* Wait for any pending userqueue fence work to finish */
ret = amdgpu_userq_wait_for_signal(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
ret = amdgpu_userq_evict_all(uq_mgr);
- if (ret) {
- drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -909,7 +1007,6 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
cancel_delayed_work_sync(&uq_mgr->resume_work);
return;
}
-
/* Schedule a resume work */
schedule_delayed_work(&uq_mgr->resume_work, 0);
}
@@ -917,12 +1014,18 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
struct amdgpu_device *adev)
{
+ int i;
+
idr_init_base(&userq_mgr->userq_idr, 1);
userq_mgr->adev = adev;
userq_mgr->file = file_priv;
mutex_lock(&adev->userq_mutex);
list_add(&userq_mgr->list, &adev->userq_mgr_list);
+ /* Initialize all queue type counters to zero */
+ for (i = 0; i < AMDGPU_RING_TYPE_MAX; i++) {
+ atomic_set(&userq_mgr->userq_count[i], 0);
+ }
mutex_unlock(&adev->userq_mutex);
INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
@@ -939,6 +1042,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
cancel_delayed_work_sync(&userq_mgr->resume_work);
mutex_lock(&adev->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(userq_mgr);
idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -967,6 +1071,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
return 0;
mutex_lock(&adev->userq_mutex);
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
cancel_delayed_work_sync(&uqm->resume_work);
+ amdgpu_userq_detect_and_reset_queues(uqm);
idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
@@ -1021,13 +1126,15 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
adev->userq_halt_for_enforce_isolation = true;
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
cancel_delayed_work_sync(&uqm->resume_work);
+ amdgpu_userq_detect_and_reset_queues(uqm);
idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
(queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
(queue->xcp_id == idx)) {
r = amdgpu_userq_preempt_helper(uqm, queue);
- if (r)
+ if (r) {
ret = r;
+ }
}
}
}
@@ -1066,3 +1173,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
mutex_unlock(&adev->userq_mutex);
return ret;
}
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+ const struct amdgpu_userq_funcs *userq_funcs;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm, *tmp;
+ int queue_id;
+
+ list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+ cancel_delayed_work_sync(&uqm->resume_work);
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ amdgpu_userq_wait_for_last_fence(uqm, queue);
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ userq_funcs->unmap(uqm, queue);
+ /* just mark all queues as hung at this point.
+ * if unmap succeeds, we could map again
+ * in amdgpu_userq_post_reset() if vram is not lost
+ */
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_fence_driver_force_completion(queue);
+ }
+ }
+ }
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+ /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+ * at this point, we should be able to map it again
+ * and continue if vram is not lost.
+ */
+ struct amdgpu_userq_mgr *uqm;
+ struct amdgpu_usermode_queue *queue;
+ const struct amdgpu_userq_funcs *userq_funcs;
+ int queue_id, r = 0;
+
+ list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+
+ r = userq_funcs->map(uqm, queue); // Re-map queue
+ if (r) {
+ dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
+ continue;
+ }
+ queue->state = AMDGPU_USERQ_STATE_MAPPED;
+ }
+ }
+
+ /* Restart resume work after reset */
+ //queue_delayed_work(system_wq, &uqm->resume_work, msecs_to_jiffies(100));
+ }
+
+ return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 2d63308d55c3..7cd5344c0344 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -93,6 +93,7 @@ struct amdgpu_userq_mgr {
struct delayed_work resume_work;
struct list_head list;
struct drm_file *file;
+ atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
};
struct amdgpu_db_info {
@@ -136,6 +137,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr,
u64 expected_size);
#endif
--
2.49.0