On 1/13/26 09:21, Jesse.Zhang wrote:
> In error scenarios (e.g., malformed commands), user queue fences may never
> be signaled, causing processes to wait indefinitely. To address this while
> preserving the requirement of infinite fence waits, implement an independent
> timeout detection mechanism:
> 
> 1. Initialize the hang detection work when creating a user queue (one-time setup)
> 2. Start the work with a queue-type-specific timeout (gfx/compute/sdma) when
>    the last fence is created via amdgpu_userq_signal_ioctl (per-fence timing)
> 3. Trigger the queue reset logic if the timeout expires before the fence is signaled
> 
> v2: make the timeout per queue type (adev->gfx_timeout vs adev->compute_timeout
>     vs adev->sdma_timeout) to be consistent with kernel queues. (Alex)
> v3: the timeout detection must be independent from the fence, i.e. don't wait
>     for a timeout on the fence, but rather start the timeout as soon as the
>     fence is initialized. (Christian)
> v4: replace the timer with the `hang_detect_work` delayed work.
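
For anyone skimming the thread, the whole mechanism boils down to the condensed
sketch below. This is illustrative only: the userq_example/example_* names are
made up for this summary, and example_reset_queue() stands in for the real
amdgpu_userq_detect_and_reset_queues() path; the actual code is in the patch
further down.

#include <linux/workqueue.h>
#include <linux/dma-fence.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

struct userq_example {
        struct delayed_work hang_detect_work;   /* one per queue, set up once */
        struct dma_fence *hang_detect_fence;    /* fence currently being watched */
        struct dma_fence *last_fence;           /* last fence installed for the queue */
};

/* Stand-in for the real reset path (amdgpu_userq_detect_and_reset_queues()). */
static void example_reset_queue(struct userq_example *q)
{
        pr_warn("user queue fence timed out, resetting queue\n");
}

/* Step 3: when the delay expires, reset only if the fence is still pending. */
static void example_hang_detect_fn(struct work_struct *work)
{
        struct userq_example *q = container_of(work, struct userq_example,
                                               hang_detect_work.work);
        struct dma_fence *fence = READ_ONCE(q->hang_detect_fence);

        if (fence && !dma_fence_is_signaled(fence))
                example_reset_queue(q);
}

/* Step 1: one-time setup when the queue is created. */
static void example_init(struct userq_example *q)
{
        INIT_DELAYED_WORK(&q->hang_detect_work, example_hang_detect_fn);
        q->hang_detect_fence = NULL;
}

/* Step 2: arm the watchdog each time a new last fence is installed
 * (amdgpu_userq_signal_ioctl); timeout_ms is the queue-type specific
 * adev->gfx/compute/sdma_timeout value. */
static void example_arm(struct userq_example *q, unsigned long timeout_ms)
{
        WRITE_ONCE(q->hang_detect_fence, q->last_fence);
        schedule_delayed_work(&q->hang_detect_work, msecs_to_jiffies(timeout_ms));
}

/* Queue destroy: make sure the work cannot run after the queue is freed. */
static void example_fini(struct userq_example *q)
{
        cancel_delayed_work_sync(&q->hang_detect_work);
        q->hang_detect_fence = NULL;
}

The key design point (v3 above) is that the delay is armed when the fence is
installed, independent of any waiter, so userspace can still wait on the fence
forever while the kernel detects the hang on its own.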

I don't have time to wrap my head around all that logic again, but from a one 
mile high view that now looks solid to me.

Feel free to add my acked-by. If you need an in-depth review and Alex doesn't 
have time, then please just ping me.

Thanks,
Christian.

> 
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c     | 70 ++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h     |  3 +
>  .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c   |  1 +
>  3 files changed, 73 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 98110f543307..664a15278c1d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -148,6 +148,69 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>       return r;
>  }
>  
> +static void amdgpu_userq_hang_detect_work(struct work_struct *work)
> +{
> +     struct amdgpu_usermode_queue *queue = container_of(work,
> +                                                       struct amdgpu_usermode_queue,
> +                                                       hang_detect_work.work);
> +     struct dma_fence *fence;
> +     struct amdgpu_userq_mgr *uq_mgr;
> +
> +     if (!queue || !queue->userq_mgr)
> +             return;
> +
> +     uq_mgr = queue->userq_mgr;
> +     fence = READ_ONCE(queue->hang_detect_fence);
> +     /* Fence already signaled – no action needed */
> +     if (!fence || dma_fence_is_signaled(fence))
> +             return;
> +
> +     mutex_lock(&uq_mgr->userq_mutex);
> +     amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +     mutex_unlock(&uq_mgr->userq_mutex);
> +}
> +
> +/*
> + * Start hang detection for a user queue fence. A delayed work will be scheduled
> + * to check if the fence is still pending after the timeout period.
> + */
> +void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
> +{
> +     struct amdgpu_device *adev;
> +     unsigned long timeout_ms;
> +
> +     if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
> +             return;
> +
> +     adev = queue->userq_mgr->adev;
> +     /* Determine timeout based on queue type */
> +     switch (queue->queue_type) {
> +     case AMDGPU_RING_TYPE_GFX:
> +             timeout_ms = adev->gfx_timeout;
> +             break;
> +     case AMDGPU_RING_TYPE_COMPUTE:
> +             timeout_ms = adev->compute_timeout;
> +             break;
> +     case AMDGPU_RING_TYPE_SDMA:
> +             timeout_ms = adev->sdma_timeout;
> +             break;
> +     default:
> +             timeout_ms = adev->gfx_timeout;
> +             break;
> +     }
> +
> +     /* Store the fence to monitor and schedule hang detection */
> +     WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
> +     schedule_delayed_work(&queue->hang_detect_work,
> +                  msecs_to_jiffies(timeout_ms));
> +}
> +
> +static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue)
> +{
> +     INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work);
> +     queue->hang_detect_fence = NULL;
> +}
> +
>  static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
>                                          struct amdgpu_bo_va_mapping *va_map, u64 addr)
>  {
> @@ -572,7 +635,6 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>  
>       cancel_delayed_work_sync(&uq_mgr->resume_work);
>       mutex_lock(&uq_mgr->userq_mutex);
> -
>       queue = amdgpu_userq_find(uq_mgr, queue_id);
>       if (!queue) {
>               drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to destroy\n");
> @@ -580,6 +642,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>               return -EINVAL;
>       }
>       amdgpu_userq_wait_for_last_fence(queue);
> +     /* Cancel any pending hang detection work and cleanup */
> +     if (queue->hang_detect_fence) {
> +             cancel_delayed_work_sync(&queue->hang_detect_work);
> +             queue->hang_detect_fence = NULL;
> +     }
>       r = amdgpu_bo_reserve(queue->db_obj.obj, true);
>       if (!r) {
>               amdgpu_bo_unpin(queue->db_obj.obj);
> @@ -818,6 +885,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>       queue->debugfs_queue = debugfs_create_dir(queue_name, filp->debugfs_client);
>       debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue, &amdgpu_mqd_info_fops);
>  #endif
> +     amdgpu_userq_init_hang_detect_work(queue);
>       kfree(queue_name);
>  
>       args->out.queue_id = qid;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 1eaa94f8a291..06a06272b41a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -72,6 +72,8 @@ struct amdgpu_usermode_queue {
>       u32                     xcp_id;
>       int                     priority;
>       struct dentry           *debugfs_queue;
> +     struct delayed_work hang_detect_work;
> +     struct dma_fence *hang_detect_fence;
>  
>       struct list_head        userq_va_list;
>  };
> @@ -146,6 +148,7 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  void amdgpu_userq_reset_work(struct work_struct *work);
>  void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
>  int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue);
>  
>  int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
>                                  struct amdgpu_usermode_queue *queue,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> index 25f178536469..374fbd0e859a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
> @@ -569,6 +569,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
>  
>       dma_fence_put(queue->last_fence);
>       queue->last_fence = dma_fence_get(fence);
> +     amdgpu_userq_start_hang_detect_work(queue);
>       mutex_unlock(&userq_mgr->userq_mutex);
>  
>       drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
