amdgpu: fix hung reset queue array return for hws user compute

Alex Deucher Mon, 13 Oct 2025 19:05:47 -0700

On Thu, Oct 9, 2025 at 2:50 PM Jonathan Kim <[email protected]> wrote:
>
> By design the MES will return an array result that is twice the number
> of hung doorbells it can report.
>
> i.e. if up k reported doorbells are supported, then the
> second half of the array, also of length k, holds the HQD information
> (type/queue/pipe) where queue 1 corresponds to index 0 and k,
> queue 2 corresponds to index 1 and k + 1 etc ...


Has this always been the case?  If not, what mes version changed this?

>
> The driver will use the HDQ info to target queue/pipe reset for
> hardware scheduled user compute queues.
>
> Signed-off-by: Jonathan Kim <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c    | 20 ++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h    |  1 +
>  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c |  6 +++---
>  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c     |  8 +++++---
>  drivers/gpu/drm/amd/amdgpu/mes_v12_0.c     |  8 +++++---
>  5 files changed, 30 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index c698e183beda..1af3ddb6f65c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -419,12 +419,24 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct 
> amdgpu_device *adev,
>         if (r) {
>                 dev_err(adev->dev, "failed to detect and reset\n");
>         } else {
> +               int array_hqd_info_offset = 
> adev->mes.hung_queue_hqd_info_offset;
> +               int array_size = adev->mes.hung_queue_db_array_size;
>                 *hung_db_num = 0;
> -               for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
> -                       if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
> +
> +               for (i = 0; i < array_hqd_info_offset; i++) {
> +                       if (db_array[i] == AMDGPU_MES_INVALID_DB_OFFSET)
> +                               continue;
> +
> +                       hung_db_array[i] = db_array[i];
> +                       *hung_db_num += 1;
> +               }
> +
> +               for (i = array_hqd_info_offset; i < array_size; i++) {
> +                       if (!hung_db_num || queue_type != 
> AMDGPU_RING_TYPE_COMPUTE)

Might be better to move the queue_type check before the loop.

> +                               break;
> +
> +                       if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET)
>                                 hung_db_array[i] = db_array[i];

What is the point of this?  If this is hqd info, then it's not a
doorbell, so why add it to the list?  Maybe drop the hqd array
handling until we actually use it.

Alex

> -                               *hung_db_num += 1;
> -                       }
>                 }
>         }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 6b506fc72f58..97c137c90f97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -149,6 +149,7 @@ struct amdgpu_mes {
>         void                *resource_1_addr[AMDGPU_MAX_MES_PIPES];
>
>         int                             hung_queue_db_array_size;
> +       int                             hung_queue_hqd_info_offset;
>         struct amdgpu_bo                *hung_queue_db_array_gpu_obj;
>         uint64_t                        hung_queue_db_array_gpu_addr;
>         void                            *hung_queue_db_array_cpu_addr;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 829129ad7bd1..5c63480dda9c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -208,10 +208,10 @@ static int mes_userq_detect_and_reset(struct 
> amdgpu_device *adev,
>         struct amdgpu_userq_mgr *uqm, *tmp;
>         unsigned int hung_db_num = 0;
>         int queue_id, r, i;
> -       u32 db_array[4];
> +       u32 db_array[8];
>
> -       if (db_array_size > 4) {
> -               dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
> +       if (db_array_size > 8) {
> +               dev_err(adev->dev, "DB array size (%d vs 8) too small\n",
>                         db_array_size);
>                 return -EINVAL;
>         }
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index e82188431f79..da575bb1377f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -66,7 +66,8 @@ static int mes_v11_0_kiq_hw_fini(struct amdgpu_device 
> *adev);
>  #define GFX_MES_DRAM_SIZE      0x80000
>  #define MES11_HW_RESOURCE_1_SIZE (128 * AMDGPU_GPU_PAGE_SIZE)
>
> -#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 4
> +#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset, [4:7] = hqd 
> info */
> +#define MES11_HUNG_HQD_INFO_OFFSET     4
>
>  static void mes_v11_0_ring_set_wptr(struct amdgpu_ring *ring)
>  {
> @@ -1720,8 +1721,9 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block 
> *ip_block)
>         struct amdgpu_device *adev = ip_block->adev;
>         int pipe, r;
>
> -       adev->mes.hung_queue_db_array_size =
> -               MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
> +       adev->mes.hung_queue_db_array_size = MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
> +       adev->mes.hung_queue_hqd_info_offset = MES11_HUNG_HQD_INFO_OFFSET;
> +
>         for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
>                 if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE)
>                         continue;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index be8a352f9771..79dd2261ad04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -47,7 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device 
> *adev);
>
>  #define MES_EOP_SIZE   2048
>
> -#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
> +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd 
> info */
> +#define MES12_HUNG_HQD_INFO_OFFSET     4
>
>  static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
>  {
> @@ -1899,8 +1900,9 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block 
> *ip_block)
>         struct amdgpu_device *adev = ip_block->adev;
>         int pipe, r;
>
> -       adev->mes.hung_queue_db_array_size =
> -               MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> +       adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
> +       adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;
> +
>         for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
>                 r = amdgpu_mes_init_microcode(adev, pipe);
>                 if (r)
> --
> 2.34.1
>

Re: [PATCH 3/6] drm/amdgpu: fix hung reset queue array return for hws user compute

Reply via email to