Hey guys,


This patch caused some kind of reversion with smu_reset on Navi10. I'm
no expert since everything I know comes from just reading through the
code, so this could be some kind of intended behavior, but after this
patch, if you write a pptable to the sysfs pp_table interface on navi10,
then the SMU will fail to reset successfully, and the result is
seemingly an unrecoverable situation.



I put in a report on bugzilla with dmesg logs
:
https://bugs.freedesktop.org/show_bug.cgi?id=112234


Finding this change was the result of a bisect to find where the issue
started, and reverting the changes to smu_hw_fini resolved the issue.
Any advice on possible proper fixes?

Thanks in advance,

Matt

On 9/2/19 9:44 PM, Quan, Evan wrote:
> These are needed for smu_reset support.
> 
> Change-Id: If29ede4b99758adb08fd4e16665f44fd893ec99b
> Signed-off-by: Evan Quan <evan.q...@amd.com>
> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c     | 17 +++++++++++++++++
>  drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h |  3 +++
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c      | 10 ++++++++++
>  3 files changed, 30 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index d5ee13a78eb7..3cf8d944f890 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -1286,6 +1286,11 @@ static int smu_hw_init(void *handle)
>       return ret;
>  }
>  
> +static int smu_stop_dpms(struct smu_context *smu)
> +{
> +     return smu_send_smc_msg(smu, SMU_MSG_DisableAllSmuFeatures);
> +}
> +
>  static int smu_hw_fini(void *handle)
>  {
>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -1298,6 +1303,18 @@ static int smu_hw_fini(void *handle)
>               smu_powergate_vcn(&adev->smu, true);
>       }
>  
> +     ret = smu_stop_thermal_control(smu);
> +     if (ret) {
> +             pr_warn("Fail to stop thermal control!\n");
> +             return ret;
> +     }
> +
> +     ret = smu_stop_dpms(smu);
> +     if (ret) {
> +             pr_warn("Fail to stop Dpms!\n");
> +             return ret;
> +     }
> +
>       kfree(table_context->driver_pptable);
>       table_context->driver_pptable = NULL;
>  
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index b19224cb6d6d..8e4b0ad24712 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -498,6 +498,7 @@ struct smu_funcs
>       int (*get_current_clk_freq)(struct smu_context *smu, enum smu_clk_type 
> clk_id, uint32_t *value);
>       int (*init_max_sustainable_clocks)(struct smu_context *smu);
>       int (*start_thermal_control)(struct smu_context *smu);
> +     int (*stop_thermal_control)(struct smu_context *smu);
>       int (*read_sensor)(struct smu_context *smu, enum amd_pp_sensors sensor,
>                          void *data, uint32_t *size);
>       int (*set_deep_sleep_dcefclk)(struct smu_context *smu, uint32_t clk);
> @@ -647,6 +648,8 @@ struct smu_funcs
>       ((smu)->ppt_funcs->set_thermal_fan_table ? 
> (smu)->ppt_funcs->set_thermal_fan_table((smu)) : 0)
>  #define smu_start_thermal_control(smu) \
>       ((smu)->funcs->start_thermal_control? 
> (smu)->funcs->start_thermal_control((smu)) : 0)
> +#define smu_stop_thermal_control(smu) \
> +     ((smu)->funcs->stop_thermal_control? 
> (smu)->funcs->stop_thermal_control((smu)) : 0)
>  #define smu_read_sensor(smu, sensor, data, size) \
>       ((smu)->ppt_funcs->read_sensor? (smu)->ppt_funcs->read_sensor((smu), 
> (sensor), (data), (size)) : 0)
>  #define smu_smc_read_sensor(smu, sensor, data, size) \
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index db5e94ce54af..1a38af84394e 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1209,6 +1209,15 @@ static int smu_v11_0_start_thermal_control(struct 
> smu_context *smu)
>       return ret;
>  }
>  
> +static int smu_v11_0_stop_thermal_control(struct smu_context *smu)
> +{
> +     struct amdgpu_device *adev = smu->adev;
> +
> +     WREG32_SOC15(THM, 0, mmTHM_THERMAL_INT_ENA, 0);
> +
> +     return 0;
> +}
> +
>  static uint16_t convert_to_vddc(uint8_t vid)
>  {
>       return (uint16_t) ((6200 - (vid * 25)) / SMU11_VOLTAGE_SCALE);
> @@ -1783,6 +1792,7 @@ static const struct smu_funcs smu_v11_0_funcs = {
>       .get_current_clk_freq = smu_v11_0_get_current_clk_freq,
>       .init_max_sustainable_clocks = smu_v11_0_init_max_sustainable_clocks,
>       .start_thermal_control = smu_v11_0_start_thermal_control,
> +     .stop_thermal_control = smu_v11_0_stop_thermal_control,
>       .read_sensor = smu_v11_0_read_sensor,
>       .set_deep_sleep_dcefclk = smu_v11_0_set_deep_sleep_dcefclk,
>       .display_clock_voltage_request = 
> smu_v11_0_display_clock_voltage_request,
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to