[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Asad Kamal <asad.ka...@amd.com>

Thanks & Regards
Asad

-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Lazar, Lijo
Sent: Tuesday, July 9, 2024 3:03 PM
To: Slivka, Danijel <danijel.sli...@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Wang, Yang(Kevin) <kevinyang.w...@amd.com>; Feng, Kenneth 
<kenneth.f...@amd.com>
Subject: Re: [PATCH v2] drm/amd/pm: Ignore initial value in smu response 
register



On 7/8/2024 7:01 PM, Danijel Slivka wrote:
> Why:
> If the reg mmMP1_SMN_C2PMSG_90 is being written to during amdgpu
> driver load or driver unload, subsequent amdgpu driver load will fail
> at smu_hw_init. The default of mmMP1_SMN_C2PMSG_90 register at a clean
> environment is 0x1 and if value differs from expected, amdgpu driver
> load will fail.
>
> How to fix:
> Ignore the initial value in smu response register before the first smu
> message is sent,if smc in SMU_FW_INIT state, just proceed further to
> send the message. If register holds an unexpected value after smu
> message was sent set, smc_state to SMU_FW_HANG state and no further
> smu messages will be sent.
>
> v2:
> Set SMU_FW_INIT state at the start of smu hw_init/resume.
> Check smc_fw_state before sending smu message if in hang state skip
> sending message.
> Set SMU_FW_HANG only in case unexpected value is detected
>
> Signed-off-by: Danijel Slivka <danijel.sli...@amd.com>

Patch looks good to me

        Reviewed-by: Lijo Lazar <lijo.la...@amd.com>

Copying Kenneth/Kevin as well.

Thanks,
Lijo

> ---
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     |  2 ++
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  7 ++++
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 34 ++++++++++++++++---
>  3 files changed, 38 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index d79bdb1e8cdf..fb8643d25d1b 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -1755,6 +1755,8 @@ static int smu_start_smc_engine(struct smu_context *smu)
>       struct amdgpu_device *adev = smu->adev;
>       int ret = 0;
>
> +     smu->smc_fw_state = SMU_FW_INIT;
> +
>       if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
>               if (amdgpu_ip_version(adev, MP1_HWIP, 0) < IP_VERSION(11, 0, 
> 0)) {
>                       if (smu->ppt_funcs->load_microcode) { diff --git
> a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index a34c802f52be..b44a185d07e8 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -495,6 +495,12 @@ struct stb_context {
>       spinlock_t lock;
>  };
>
> +enum smu_fw_status {
> +     SMU_FW_INIT = 0,
> +     SMU_FW_RUNTIME,
> +     SMU_FW_HANG,
> +};
> +
>  #define WORKLOAD_POLICY_MAX 7
>
>  /*
> @@ -562,6 +568,7 @@ struct smu_context {
>       uint32_t smc_fw_if_version;
>       uint32_t smc_fw_version;
>       uint32_t smc_fw_caps;
> +     uint8_t smc_fw_state;
>
>       bool uploading_custom_pp_table;
>       bool dc_controlled_by_gpio;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 5592fd825aa3..d7c983a1f3f5 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -315,11 +315,20 @@ int smu_cmn_send_msg_without_waiting(struct smu_context 
> *smu,
>       if (adev->no_hw_access)
>               return 0;
>
> -     reg = __smu_cmn_poll_stat(smu);
> -     res = __smu_cmn_reg2errno(smu, reg);
> -     if (reg == SMU_RESP_NONE ||
> -         res == -EREMOTEIO)
> +     if (smu->smc_fw_state == SMU_FW_HANG) {
> +             dev_err(adev->dev, "SMU is in hanged state, failed to send smu
> +message!\n");
>               goto Out;
> +     }
> +
> +     if (smu->smc_fw_state == SMU_FW_INIT) {
> +             smu->smc_fw_state = SMU_FW_RUNTIME;
> +     } else {
> +             reg = __smu_cmn_poll_stat(smu);
> +             res = __smu_cmn_reg2errno(smu, reg);
> +             if (reg == SMU_RESP_NONE || res == -EREMOTEIO)
> +                     goto Out;
> +     }
> +
>       __smu_cmn_send_msg(smu, msg_index, param);
>       res = 0;
>  Out:
> @@ -350,6 +359,9 @@ int smu_cmn_wait_for_response(struct smu_context *smu)
>       reg = __smu_cmn_poll_stat(smu);
>       res = __smu_cmn_reg2errno(smu, reg);
>
> +     if (res == -EREMOTEIO)
> +             smu->smc_fw_state = SMU_FW_HANG;
> +
>       if (unlikely(smu->adev->pm.smu_debug_mask & SMU_DEBUG_HALT_ON_ERROR) &&
>           res && (res != -ETIME)) {
>               amdgpu_device_halt(smu->adev);
> @@ -418,6 +430,15 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
> *smu,
>                       goto Out;
>       }
>
> +     if (smu->smc_fw_state == SMU_FW_HANG) {
> +             dev_err(adev->dev, "SMU is in hanged state, failed to send smu 
> message!\n");
> +             goto Out;
> +     } else if (smu->smc_fw_state == SMU_FW_INIT) {
> +             /* Ignore initial smu response register value */
> +             poll = false;
> +             smu->smc_fw_state = SMU_FW_RUNTIME;
> +     }
> +
>       if (poll) {
>               reg = __smu_cmn_poll_stat(smu);
>               res = __smu_cmn_reg2errno(smu, reg); @@ -429,8 +450,11 @@ int
> smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>       __smu_cmn_send_msg(smu, (uint16_t) index, param);
>       reg = __smu_cmn_poll_stat(smu);
>       res = __smu_cmn_reg2errno(smu, reg);
> -     if (res != 0)
> +     if (res != 0) {
> +             if (res == -EREMOTEIO)
> +                     smu->smc_fw_state = SMU_FW_HANG;
>               __smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +     }
>       if (read_arg) {
>               smu_cmn_read_arg(smu, read_arg);
>               dev_dbg(adev->dev, "smu send message: %s(%d) param: 0x%08x, 
> resp:
> 0x%08x,\

Reply via email to