On Thu, Oct 9, 2025 at 2:50 PM Jonathan Kim <[email protected]> wrote: > > By design the MES will return an array result that is twice the number > of hung doorbells it can report. > > i.e. if up k reported doorbells are supported, then the > second half of the array, also of length k, holds the HQD information > (type/queue/pipe) where queue 1 corresponds to index 0 and k, > queue 2 corresponds to index 1 and k + 1 etc ...
Has this always been the case? If not, what mes version changed this? > > The driver will use the HDQ info to target queue/pipe reset for > hardware scheduled user compute queues. > > Signed-off-by: Jonathan Kim <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 ++++++++++++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + > drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 6 +++--- > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 8 +++++--- > drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 8 +++++--- > 5 files changed, 30 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > index c698e183beda..1af3ddb6f65c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > @@ -419,12 +419,24 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct > amdgpu_device *adev, > if (r) { > dev_err(adev->dev, "failed to detect and reset\n"); > } else { > + int array_hqd_info_offset = > adev->mes.hung_queue_hqd_info_offset; > + int array_size = adev->mes.hung_queue_db_array_size; > *hung_db_num = 0; > - for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) { > - if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) { > + > + for (i = 0; i < array_hqd_info_offset; i++) { > + if (db_array[i] == AMDGPU_MES_INVALID_DB_OFFSET) > + continue; > + > + hung_db_array[i] = db_array[i]; > + *hung_db_num += 1; > + } > + > + for (i = array_hqd_info_offset; i < array_size; i++) { > + if (!hung_db_num || queue_type != > AMDGPU_RING_TYPE_COMPUTE) Might be better to move the queue_type check before the loop. > + break; > + > + if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) > hung_db_array[i] = db_array[i]; What is the point of this? If this is hqd info, then it's not a doorbell, so why add it to the list? Maybe drop the hqd array handling until we actually use it. Alex > - *hung_db_num += 1; > - } > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > index 6b506fc72f58..97c137c90f97 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > @@ -149,6 +149,7 @@ struct amdgpu_mes { > void *resource_1_addr[AMDGPU_MAX_MES_PIPES]; > > int hung_queue_db_array_size; > + int hung_queue_hqd_info_offset; > struct amdgpu_bo *hung_queue_db_array_gpu_obj; > uint64_t hung_queue_db_array_gpu_addr; > void *hung_queue_db_array_cpu_addr; > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c > b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c > index 829129ad7bd1..5c63480dda9c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c > +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c > @@ -208,10 +208,10 @@ static int mes_userq_detect_and_reset(struct > amdgpu_device *adev, > struct amdgpu_userq_mgr *uqm, *tmp; > unsigned int hung_db_num = 0; > int queue_id, r, i; > - u32 db_array[4]; > + u32 db_array[8]; > > - if (db_array_size > 4) { > - dev_err(adev->dev, "DB array size (%d vs 4) too small\n", > + if (db_array_size > 8) { > + dev_err(adev->dev, "DB array size (%d vs 8) too small\n", > db_array_size); > return -EINVAL; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > index e82188431f79..da575bb1377f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > @@ -66,7 +66,8 @@ static int mes_v11_0_kiq_hw_fini(struct amdgpu_device > *adev); > #define GFX_MES_DRAM_SIZE 0x80000 > #define MES11_HW_RESOURCE_1_SIZE (128 * AMDGPU_GPU_PAGE_SIZE) > > -#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 4 > +#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset, [4:7] = hqd > info */ > +#define MES11_HUNG_HQD_INFO_OFFSET 4 > > static void mes_v11_0_ring_set_wptr(struct amdgpu_ring *ring) > { > @@ -1720,8 +1721,9 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block > *ip_block) > struct amdgpu_device *adev = ip_block->adev; > int pipe, r; > > - adev->mes.hung_queue_db_array_size = > - MES11_HUNG_DB_OFFSET_ARRAY_SIZE; > + adev->mes.hung_queue_db_array_size = MES11_HUNG_DB_OFFSET_ARRAY_SIZE; > + adev->mes.hung_queue_hqd_info_offset = MES11_HUNG_HQD_INFO_OFFSET; > + > for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { > if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE) > continue; > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > index be8a352f9771..79dd2261ad04 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c > @@ -47,7 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device > *adev); > > #define MES_EOP_SIZE 2048 > > -#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4 > +#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd > info */ > +#define MES12_HUNG_HQD_INFO_OFFSET 4 > > static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring) > { > @@ -1899,8 +1900,9 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block > *ip_block) > struct amdgpu_device *adev = ip_block->adev; > int pipe, r; > > - adev->mes.hung_queue_db_array_size = > - MES12_HUNG_DB_OFFSET_ARRAY_SIZE; > + adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE; > + adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET; > + > for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) { > r = amdgpu_mes_init_microcode(adev, pipe); > if (r) > -- > 2.34.1 >
