On Thu, Sep 18, 2025 at 9:04 PM Mario Limonciello (AMD)
<[email protected]> wrote:
>
> From: Mario Limonciello <[email protected]>
>
> The MES set resources packet has an optional bit 'lr_compute_wa'
> which can be used for preventing MES hangs on long compute jobs.
>
> Set this bit by default.
>
> Co-developed-by: Yifan Zhang <[email protected]>
> Signed-off-by: Yifan Zhang <[email protected]>
> Signed-off-by: Mario Limonciello <[email protected]>

Acked-by: Alex Deucher <[email protected]>

> ---
> v3:
>  * gate on fw version
> v2:
>  * drop module parameter
>  * add more description to commit text
> ---
>  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        | 6 ++++++
>  drivers/gpu/drm/amd/amdgpu/mes_v12_0.c        | 5 +++++
>  drivers/gpu/drm/amd/include/mes_v11_api_def.h | 3 ++-
>  drivers/gpu/drm/amd/include/mes_v12_api_def.h | 3 ++-
>  4 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 3b91ea601add4..e82188431f796 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -713,6 +713,12 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
>         mes_set_hw_res_pkt.enable_reg_active_poll = 1;
>         mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
>         mes_set_hw_res_pkt.oversubscription_timer = 50;
> +       if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x7f)
> +               mes_set_hw_res_pkt.enable_lr_compute_wa = 1;
> +       else
> +               dev_info_once(mes->adev->dev,
> +                             "MES FW version must be >= 0x7f to enable LR compute workaround.\n");
> +
>         if (amdgpu_mes_log_enable) {
>                 mes_set_hw_res_pkt.enable_mes_event_int_logging = 1;
>                 mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr =
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 998893dff08e9..aff06f06aeeec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -769,6 +769,11 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
>         mes_set_hw_res_pkt.use_different_vmid_compute = 1;
>         mes_set_hw_res_pkt.enable_reg_active_poll = 1;
>         mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
> +       if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x82)
> +               mes_set_hw_res_pkt.enable_lr_compute_wa = 1;
> +       else
> +               dev_info_once(adev->dev,
> +                             "MES FW version must be >= 0x82 to enable LR compute workaround.\n");
>
>         /*
>          * Keep oversubscribe timer for sdma . When we have unmapped doorbell
> diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> index 15680c3f49704..ab1cfc92dbeb1 100644
> --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> @@ -238,7 +238,8 @@ union MESAPI_SET_HW_RESOURCES {
>                                 uint32_t enable_mes_sch_stb_log : 1;
>                                 uint32_t limit_single_process : 1;
>                                 uint32_t is_strix_tmz_wa_enabled  :1;
> -                               uint32_t reserved : 13;
> +                               uint32_t enable_lr_compute_wa : 1;
> +                               uint32_t reserved : 12;
>                         };
>                         uint32_t        uint32_t_all;
>                 };
> diff --git a/drivers/gpu/drm/amd/include/mes_v12_api_def.h b/drivers/gpu/drm/amd/include/mes_v12_api_def.h
> index c04bd351b2505..69611c7e30e35 100644
> --- a/drivers/gpu/drm/amd/include/mes_v12_api_def.h
> +++ b/drivers/gpu/drm/amd/include/mes_v12_api_def.h
> @@ -287,7 +287,8 @@ union MESAPI_SET_HW_RESOURCES {
>                                 uint32_t limit_single_process : 1;
>                                 uint32_t unmapped_doorbell_handling: 2;
>                                 uint32_t enable_mes_fence_int: 1;
> -                               uint32_t reserved : 10;
> +                               uint32_t enable_lr_compute_wa : 1;
> +                               uint32_t reserved : 9;
>                         };
>                         uint32_t uint32_all;
>                 };
> --
> 2.43.0
>

Reply via email to