RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)

2021-11-16 Thread Zhou1, Tao
Thanks for your review. I'll add the {} before pushing.

> -----Original Message-----
> From: Quan, Evan 
> Sent: Wednesday, November 17, 2021 9:50 AM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang,
> Hawking ; Clements, John
> ; Yang, Stanley ; Lazar,
> Lijo ; Wang, Yang(Kevin) 
> Subject: RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
> 
> [AMD Official Use Only]
> 
> With the concern from Guchun addressed, the patch is reviewed-by: Evan Quan
> 
> 
> > -----Original Message-----
> > From: Zhou1, Tao 
> > Sent: Tuesday, November 16, 2021 6:29 PM
> > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> > ; Clements, John ;
> Yang,
> > Stanley ; Quan, Evan ;
> Lazar,
> > Lijo ; Wang,
> > Yang(Kevin) 
> > Cc: Zhou1, Tao 
> > Subject: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
> >
> > If gpu reset is triggered by ras fatal error, tell it to smu in mode-1
> > reset message.
> >
> > v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran
> > specific currently.
> >
> > Signed-off-by: Tao Zhou 
> > ---
> >  drivers/gpu/drm/amd/pm/inc/smu_v13_0.h|  3 +-
> >  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36
> > ++-
> >  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
> >  3 files changed, 37 insertions(+), 23 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> > b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> > index e5d3b0d1a032..bbc608c990b0 100644
> > --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> > +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> > @@ -29,6 +29,8 @@
> >  #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04  #define
> > SMU13_DRIVER_IF_VERSION_ALDE 0x07
> >
> > +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms
> > +
> >  /* MP Apertures */
> >  #define MP0_Public 0x0380
> >  #define MP0_SRAM   0x0390
> > @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context
> > *smu, enum smu_baco_state state)  int smu_v13_0_baco_enter(struct
> > smu_context *smu);  int smu_v13_0_baco_exit(struct smu_context *smu);
> >
> > -int smu_v13_0_mode1_reset(struct smu_context *smu);  int
> > smu_v13_0_mode2_reset(struct smu_context *smu);
> >
> >  int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum
> > smu_clk_type clk_type, diff --git
> > a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index 59a7d276541d..e50d4491aa96 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct
> > smu_context *smu,
> > return sizeof(struct gpu_metrics_v1_3);  }
> >
> > +static int aldebaran_mode1_reset(struct smu_context *smu) {
> > +   u32 smu_version, fatal_err, param;
> > +   int ret = 0;
> > +   struct amdgpu_device *adev = smu->adev;
> > +   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> > +
> > +   fatal_err = 0;
> > +   param = SMU_RESET_MODE_1;
> > +
> > +   /*
> > +   * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
> > +   */
> > +   smu_cmn_get_smc_version(smu, NULL, &smu_version);
> > +   if (smu_version < 0x00440700)
> > +   ret = smu_cmn_send_smc_msg(smu,
> > SMU_MSG_Mode1Reset, NULL);
> > +   else {
> > +   /* fatal error triggered by ras, PMFW supports the flag
> > +  from 68.44.0 */
> > +   if ((smu_version >= 0x00442c00) && ras &&
> > +   atomic_read(&ras->in_recovery))
> > +   fatal_err = 1;
> > +
> > +   param |= (fatal_err << 16);
> > +   ret = smu_cmn_send_smc_msg_with_param(smu,
> > +   SMU_MSG_GfxDeviceDriverReset,
> > param, NULL);
> > +   }
> > +
> > +   if (!ret)
> > +   msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
> > +
> > +   return ret;
> > +}
> > +
> >  static int aldebaran_mode2_reset(struct smu_context *smu)  {
> > u32 smu_version;
> > @@ -1925,7 +1959,7 @@ static const struct pptable_funcs
> > aldebaran_ppt_funcs = {
> > .get_gpu_metrics = aldebaran_get_gpu_metrics,
> > .mode1_reset_is_support = aldebaran_is_mode1_reset_supported,
> > .mode2_reset_is_support = aldebaran_is_mode2_reset_s

RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)

2021-11-16 Thread Quan, Evan
[AMD Official Use Only]

With the concern from Guchun addressed, the patch is reviewed-by: Evan Quan 


> -----Original Message-----
> From: Zhou1, Tao 
> Sent: Tuesday, November 16, 2021 6:29 PM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> ; Clements, John ;
> Yang, Stanley ; Quan, Evan
> ; Lazar, Lijo ; Wang,
> Yang(Kevin) 
> Cc: Zhou1, Tao 
> Subject: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
> 
> If gpu reset is triggered by ras fatal error, tell it to smu in mode-1
> reset message.
> 
> v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran
> specific currently.
> 
> Signed-off-by: Tao Zhou 
> ---
>  drivers/gpu/drm/amd/pm/inc/smu_v13_0.h|  3 +-
>  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36
> ++-
>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
>  3 files changed, 37 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> index e5d3b0d1a032..bbc608c990b0 100644
> --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> @@ -29,6 +29,8 @@
>  #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04
>  #define SMU13_DRIVER_IF_VERSION_ALDE 0x07
> 
> +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms
> +
>  /* MP Apertures */
>  #define MP0_Public   0x0380
>  #define MP0_SRAM 0x0390
> @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context
> *smu, enum smu_baco_state state)
>  int smu_v13_0_baco_enter(struct smu_context *smu);
>  int smu_v13_0_baco_exit(struct smu_context *smu);
> 
> -int smu_v13_0_mode1_reset(struct smu_context *smu);
>  int smu_v13_0_mode2_reset(struct smu_context *smu);
> 
>  int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum
> smu_clk_type clk_type,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 59a7d276541d..e50d4491aa96 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct
> smu_context *smu,
>   return sizeof(struct gpu_metrics_v1_3);
>  }
> 
> +static int aldebaran_mode1_reset(struct smu_context *smu)
> +{
> + u32 smu_version, fatal_err, param;
> + int ret = 0;
> + struct amdgpu_device *adev = smu->adev;
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + fatal_err = 0;
> + param = SMU_RESET_MODE_1;
> +
> + /*
> + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
> + */
> + smu_cmn_get_smc_version(smu, NULL, &smu_version);
> + if (smu_version < 0x00440700)
> + ret = smu_cmn_send_smc_msg(smu,
> SMU_MSG_Mode1Reset, NULL);
> + else {
> + /* fatal error triggered by ras, PMFW supports the flag
> +from 68.44.0 */
> + if ((smu_version >= 0x00442c00) && ras &&
> + atomic_read(&ras->in_recovery))
> + fatal_err = 1;
> +
> + param |= (fatal_err << 16);
> + ret = smu_cmn_send_smc_msg_with_param(smu,
> + SMU_MSG_GfxDeviceDriverReset,
> param, NULL);
> + }
> +
> + if (!ret)
> + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
> +
> + return ret;
> +}
> +
>  static int aldebaran_mode2_reset(struct smu_context *smu)
>  {
>   u32 smu_version;
> @@ -1925,7 +1959,7 @@ static const struct pptable_funcs
> aldebaran_ppt_funcs = {
>   .get_gpu_metrics = aldebaran_get_gpu_metrics,
>   .mode1_reset_is_support = aldebaran_is_mode1_reset_supported,
>   .mode2_reset_is_support = aldebaran_is_mode2_reset_supported,
> - .mode1_reset = smu_v13_0_mode1_reset,
> + .mode1_reset = aldebaran_mode1_reset,
>   .set_mp1_state = aldebaran_set_mp1_state,
>   .mode2_reset = aldebaran_mode2_reset,
>   .wait_for_event = smu_v13_0_wait_for_event,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 35145db6eedf..4d96099a9bb1 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -60,8 +60,6 @@ MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin");
> 
>  #define SMU13_VOLTAGE_SCALE 4
> 
> -#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms
> -
>  #define LINK_WIDTH_MAX   6
>  #define LINK_SPEED_MAX

RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)

2021-11-16 Thread Chen, Guchun
[Public]

A coding style problem.

Braces ({}) are needed around the branch after if (smu_version < 0x00440700), because the matching else branch already uses braces.

> + if (smu_version < 0x00440700)
> + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
> + else {
> + /* fatal error triggered by ras, PMFW supports the flag
> +from 68.44.0 */
> + if ((smu_version >= 0x00442c00) && ras &&
> + atomic_read(&ras->in_recovery))
> + fatal_err = 1;
> +
> + param |= (fatal_err << 16);
> + ret = smu_cmn_send_smc_msg_with_param(smu,
> + SMU_MSG_GfxDeviceDriverReset, param, 
> NULL);
> + }

Regards,
Guchun

-----Original Message-----
From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Tuesday, November 16, 2021 6:41 PM
To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang, 
Hawking ; Clements, John ; Yang, 
Stanley ; Quan, Evan ; Wang, 
Yang(Kevin) 
Subject: Re: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)



On 11/16/2021 3:58 PM, Tao Zhou wrote:
> If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 
> reset message.
> 
> v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran 
> specific currently.
> 
> Signed-off-by: Tao Zhou 

Reviewed-by: Lijo Lazar 

Thanks,
Lijo

> ---
>   drivers/gpu/drm/amd/pm/inc/smu_v13_0.h|  3 +-
>   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 ++-
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
>   3 files changed, 37 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h 
> b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> index e5d3b0d1a032..bbc608c990b0 100644
> --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
> @@ -29,6 +29,8 @@
>   #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04
>   #define SMU13_DRIVER_IF_VERSION_ALDE 0x07
>   
> +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms
> +
>   /* MP Apertures */
>   #define MP0_Public  0x0380
>   #define MP0_SRAM0x0390
> @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context *smu, 
> enum smu_baco_state state)
>   int smu_v13_0_baco_enter(struct smu_context *smu);
>   int smu_v13_0_baco_exit(struct smu_context *smu);
>   
> -int smu_v13_0_mode1_reset(struct smu_context *smu);
>   int smu_v13_0_mode2_reset(struct smu_context *smu);
>   
>   int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum 
> smu_clk_type clk_type, diff --git 
> a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 59a7d276541d..e50d4491aa96 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct 
> smu_context *smu,
>   return sizeof(struct gpu_metrics_v1_3);
>   }
>   
> +static int aldebaran_mode1_reset(struct smu_context *smu) {
> + u32 smu_version, fatal_err, param;
> + int ret = 0;
> + struct amdgpu_device *adev = smu->adev;
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + fatal_err = 0;
> + param = SMU_RESET_MODE_1;
> +
> + /*
> + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
> + */
> + smu_cmn_get_smc_version(smu, NULL, &smu_version);
> + if (smu_version < 0x00440700)
> + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
> + else {
> + /* fatal error triggered by ras, PMFW supports the flag
> +from 68.44.0 */
> + if ((smu_version >= 0x00442c00) && ras &&
> + atomic_read(&ras->in_recovery))
> + fatal_err = 1;
> +
> + param |= (fatal_err << 16);
> + ret = smu_cmn_send_smc_msg_with_param(smu,
> + SMU_MSG_GfxDeviceDriverReset, param, 
> NULL);
> + }
> +
> + if (!ret)
> + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
> +
> + return ret;
> +}
> +
>   static int aldebaran_mode2_reset(struct smu_context *smu)
>   {
>   u32 smu_version;
> @@ -1925,7 +1959,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = 
> {
>   .get_gpu_metrics = aldebaran_get_gpu_metrics,
>   .mode1_reset_is_support = aldebaran_is_mode1_reset_supported,
>   .mode2_reset_is_support = aldebaran_is_mode2_reset_supported,
> - .mode1_reset = smu_v13_0_mode1_reset,
> + .mode1_reset = aldebaran_mode1_

Re: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)

2021-11-16 Thread Lazar, Lijo




On 11/16/2021 3:58 PM, Tao Zhou wrote:

If gpu reset is triggered by ras fatal error, tell it to smu in mode-1
reset message.

v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran
specific currently.

Signed-off-by: Tao Zhou 


Reviewed-by: Lijo Lazar 

Thanks,
Lijo


---
  drivers/gpu/drm/amd/pm/inc/smu_v13_0.h|  3 +-
  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 ++-
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
  3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h 
b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
index e5d3b0d1a032..bbc608c990b0 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
@@ -29,6 +29,8 @@
  #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04
  #define SMU13_DRIVER_IF_VERSION_ALDE 0x07
  
+#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms

+
  /* MP Apertures */
  #define MP0_Public0x0380
  #define MP0_SRAM  0x0390
@@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context *smu, enum 
smu_baco_state state)
  int smu_v13_0_baco_enter(struct smu_context *smu);
  int smu_v13_0_baco_exit(struct smu_context *smu);
  
-int smu_v13_0_mode1_reset(struct smu_context *smu);

  int smu_v13_0_mode2_reset(struct smu_context *smu);
  
  int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type,

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 59a7d276541d..e50d4491aa96 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct 
smu_context *smu,
return sizeof(struct gpu_metrics_v1_3);
  }
  
+static int aldebaran_mode1_reset(struct smu_context *smu)

+{
+   u32 smu_version, fatal_err, param;
+   int ret = 0;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   fatal_err = 0;
+   param = SMU_RESET_MODE_1;
+
+   /*
+   * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
+   */
+   smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (smu_version < 0x00440700)
+   ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
+   else {
+   /* fatal error triggered by ras, PMFW supports the flag
+  from 68.44.0 */
+   if ((smu_version >= 0x00442c00) && ras &&
+   atomic_read(&ras->in_recovery))
+   fatal_err = 1;
+
+   param |= (fatal_err << 16);
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_GfxDeviceDriverReset, param, 
NULL);
+   }
+
+   if (!ret)
+   msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
+
+   return ret;
+}
+
  static int aldebaran_mode2_reset(struct smu_context *smu)
  {
u32 smu_version;
@@ -1925,7 +1959,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
.get_gpu_metrics = aldebaran_get_gpu_metrics,
.mode1_reset_is_support = aldebaran_is_mode1_reset_supported,
.mode2_reset_is_support = aldebaran_is_mode2_reset_supported,
-   .mode1_reset = smu_v13_0_mode1_reset,
+   .mode1_reset = aldebaran_mode1_reset,
.set_mp1_state = aldebaran_set_mp1_state,
.mode2_reset = aldebaran_mode2_reset,
.wait_for_event = smu_v13_0_wait_for_event,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 35145db6eedf..4d96099a9bb1 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -60,8 +60,6 @@ MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin");
  
  #define SMU13_VOLTAGE_SCALE 4
  
-#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms

-
  #define LINK_WIDTH_MAX6
  #define LINK_SPEED_MAX3
  
@@ -1424,25 +1422,6 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu)

return ret;
  }
  
-int smu_v13_0_mode1_reset(struct smu_context *smu)

-{
-   u32 smu_version;
-   int ret = 0;
-   /*
-   * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
-   */
-   smu_cmn_get_smc_version(smu, NULL, &smu_version);
-   if (smu_version < 0x00440700)
-   ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
-   else
-   ret = smu_cmn_send_smc_msg_with_param(smu, 
SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
-
-   if (!ret)
-   msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
-
-   return ret;
-}
-
  static int smu_v13_0_wait_for_reset_complete(struct smu_context *smu,
 uint64_t event_

[PATCH] drm/amdgpu: support new mode-1 reset interface (v2)

2021-11-16 Thread Tao Zhou
If a GPU reset is triggered by a RAS fatal error, report that to the SMU
in the mode-1 reset message.

v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran
specific currently.

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/pm/inc/smu_v13_0.h|  3 +-
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 ++-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h 
b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
index e5d3b0d1a032..bbc608c990b0 100644
--- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
+++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
@@ -29,6 +29,8 @@
 #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04
 #define SMU13_DRIVER_IF_VERSION_ALDE 0x07
 
+#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms
+
 /* MP Apertures */
 #define MP0_Public 0x0380
 #define MP0_SRAM   0x0390
@@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context *smu, enum 
smu_baco_state state)
 int smu_v13_0_baco_enter(struct smu_context *smu);
 int smu_v13_0_baco_exit(struct smu_context *smu);
 
-int smu_v13_0_mode1_reset(struct smu_context *smu);
 int smu_v13_0_mode2_reset(struct smu_context *smu);
 
 int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type 
clk_type,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 59a7d276541d..e50d4491aa96 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct 
smu_context *smu,
return sizeof(struct gpu_metrics_v1_3);
 }
 
+static int aldebaran_mode1_reset(struct smu_context *smu)
+{
+   u32 smu_version, fatal_err, param;
+   int ret = 0;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   fatal_err = 0;
+   param = SMU_RESET_MODE_1;
+
+   /*
+   * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
+   */
+   smu_cmn_get_smc_version(smu, NULL, &smu_version);
+   if (smu_version < 0x00440700)
+   ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
+   else {
+   /* fatal error triggered by ras, PMFW supports the flag
+  from 68.44.0 */
+   if ((smu_version >= 0x00442c00) && ras &&
+   atomic_read(&ras->in_recovery))
+   fatal_err = 1;
+
+   param |= (fatal_err << 16);
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_GfxDeviceDriverReset, param, 
NULL);
+   }
+
+   if (!ret)
+   msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
+
+   return ret;
+}
+
 static int aldebaran_mode2_reset(struct smu_context *smu)
 {
u32 smu_version;
@@ -1925,7 +1959,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
.get_gpu_metrics = aldebaran_get_gpu_metrics,
.mode1_reset_is_support = aldebaran_is_mode1_reset_supported,
.mode2_reset_is_support = aldebaran_is_mode2_reset_supported,
-   .mode1_reset = smu_v13_0_mode1_reset,
+   .mode1_reset = aldebaran_mode1_reset,
.set_mp1_state = aldebaran_set_mp1_state,
.mode2_reset = aldebaran_mode2_reset,
.wait_for_event = smu_v13_0_wait_for_event,
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 35145db6eedf..4d96099a9bb1 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -60,8 +60,6 @@ MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin");
 
 #define SMU13_VOLTAGE_SCALE 4
 
-#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500  //500ms
-
 #define LINK_WIDTH_MAX 6
 #define LINK_SPEED_MAX 3
 
@@ -1424,25 +1422,6 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu)
return ret;
 }
 
-int smu_v13_0_mode1_reset(struct smu_context *smu)
-{
-   u32 smu_version;
-   int ret = 0;
-   /*
-   * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
-   */
-   smu_cmn_get_smc_version(smu, NULL, &smu_version);
-   if (smu_version < 0x00440700)
-   ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
-   else
-   ret = smu_cmn_send_smc_msg_with_param(smu, 
SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
-
-   if (!ret)
-   msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
-
-   return ret;
-}
-
 static int smu_v13_0_wait_for_reset_complete(struct smu_context *smu,
 uint64_t event_arg)
 {
-- 
2.17.1



Re: [PATCH] drm/amdgpu: support new mode-1 reset interface

2021-11-16 Thread Lazar, Lijo




On 11/16/2021 2:17 PM, Zhou1, Tao wrote:

[AMD Official Use Only]

Hi Lijo,

Your concern is reasonable, but in fact smu_v13_0_mode1_reset is currently used 
only by ALDEBARAN. I assume the PMFW of future SMU v13 ASICs will follow this 
design; otherwise, we could move the implementation into xxx_ppt.c.



Actually, this is meant to be common logic for SMU13-based ASICs. A 
version check in a common file is not maintainable. I see there was 
already a version check there before; even that is not proper :)


It is better to do it properly when the support is added rather than 
planning to refactor later for future ASICs.


Thanks,
Lijo


Regards,
Tao


-----Original Message-----
From: Lazar, Lijo 
Sent: Tuesday, November 16, 2021 3:44 PM
To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang,
Hawking ; Clements, John
; Yang, Stanley ; Quan,
Evan 
Subject: Re: [PATCH] drm/amdgpu: support new mode-1 reset interface



On 11/16/2021 12:53 PM, Tao Zhou wrote:

If gpu reset is triggered by ras fatal error, tell it to smu in mode-1
reset message.

Signed-off-by: Tao Zhou 
---
   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21

---

   1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 35145db6eedf..6f3d064a8232 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct
smu_context *smu)

   int smu_v13_0_mode1_reset(struct smu_context *smu)
   {
-   u32 smu_version;
+   u32 smu_version, fatal_err, param;
 int ret = 0;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   fatal_err = 0;
+   param = SMU_RESET_MODE_1;
+
 /*
 * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
 */
 smu_cmn_get_smc_version(smu, NULL, &smu_version);
 if (smu_version < 0x00440700)
 ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset,

NULL);

-   else
-   ret = smu_cmn_send_smc_msg_with_param(smu,

SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);

+   else {
+   /* fatal error triggered by ras, PMFW supports the flag
+  from 68.44.0 */
+   if ((smu_version >= 0x00442c00) && ras &&
+   atomic_read(&ras->in_recovery))
+   fatal_err = 1;
+


Judging from the PMFW version, this looks specific to Aldebaran. Since there is
a version check as well, the implementation needs to be moved to aldebaran_ppt.c.

Thanks,
Lijo


+   param |= (fatal_err << 16);
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_GfxDeviceDriverReset,

param, NULL);

+   }

 if (!ret)
 msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);



RE: [PATCH] drm/amdgpu: support new mode-1 reset interface

2021-11-16 Thread Zhou1, Tao
[AMD Official Use Only]

Hi Lijo,

Your concern is reasonable, but in fact smu_v13_0_mode1_reset is currently used 
only by ALDEBARAN. I assume the PMFW of future SMU v13 ASICs will follow this 
design; otherwise, we could move the implementation into xxx_ppt.c.

Regards,
Tao

> -----Original Message-----
> From: Lazar, Lijo 
> Sent: Tuesday, November 16, 2021 3:44 PM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang,
> Hawking ; Clements, John
> ; Yang, Stanley ; Quan,
> Evan 
> Subject: Re: [PATCH] drm/amdgpu: support new mode-1 reset interface
>
>
>
> On 11/16/2021 12:53 PM, Tao Zhou wrote:
> > If gpu reset is triggered by ras fatal error, tell it to smu in mode-1
> > reset message.
> >
> > Signed-off-by: Tao Zhou 
> > ---
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21
> ---
> >   1 file changed, 18 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index 35145db6eedf..6f3d064a8232 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct
> > smu_context *smu)
> >
> >   int smu_v13_0_mode1_reset(struct smu_context *smu)
> >   {
> > -   u32 smu_version;
> > +   u32 smu_version, fatal_err, param;
> > int ret = 0;
> > +   struct amdgpu_device *adev = smu->adev;
> > +   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> > +
> > +   fatal_err = 0;
> > +   param = SMU_RESET_MODE_1;
> > +
> > /*
> > * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
> > */
> > smu_cmn_get_smc_version(smu, NULL, &smu_version);
> > if (smu_version < 0x00440700)
> > ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset,
> NULL);
> > -   else
> > -   ret = smu_cmn_send_smc_msg_with_param(smu,
> SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
> > +   else {
> > +   /* fatal error triggered by ras, PMFW supports the flag
> > +  from 68.44.0 */
> > +   if ((smu_version >= 0x00442c00) && ras &&
> > +   atomic_read(&ras->in_recovery))
> > +   fatal_err = 1;
> > +
>
> Judging from the PMFW version, this looks specific to Aldebaran. Since there is
> a version check as well, the implementation needs to be moved to aldebaran_ppt.c.
>
> Thanks,
> Lijo
>
> > +   param |= (fatal_err << 16);
> > +   ret = smu_cmn_send_smc_msg_with_param(smu,
> > +   SMU_MSG_GfxDeviceDriverReset,
> param, NULL);
> > +   }
> >
> > if (!ret)
> > msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
> >


Re: [PATCH] drm/amdgpu: support new mode-1 reset interface

2021-11-15 Thread Lazar, Lijo




On 11/16/2021 12:53 PM, Tao Zhou wrote:

If gpu reset is triggered by ras fatal error, tell it to smu in mode-1
reset message.

Signed-off-by: Tao Zhou 
---
  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
  1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 35145db6eedf..6f3d064a8232 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu)
  
  int smu_v13_0_mode1_reset(struct smu_context *smu)

  {
-   u32 smu_version;
+   u32 smu_version, fatal_err, param;
int ret = 0;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   fatal_err = 0;
+   param = SMU_RESET_MODE_1;
+
/*
* PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
*/
smu_cmn_get_smc_version(smu, NULL, &smu_version);
if (smu_version < 0x00440700)
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
-   else
-   ret = smu_cmn_send_smc_msg_with_param(smu, 
SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
+   else {
+   /* fatal error triggered by ras, PMFW supports the flag
+  from 68.44.0 */
+   if ((smu_version >= 0x00442c00) && ras &&
+   atomic_read(&ras->in_recovery))
+   fatal_err = 1;
+


Judging from the PMFW version, this looks specific to Aldebaran. Since there 
is a version check as well, the implementation needs to be moved to 
aldebaran_ppt.c.


Thanks,
Lijo


+   param |= (fatal_err << 16);
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_GfxDeviceDriverReset, param, 
NULL);
+   }
  
  	if (!ret)

msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);



RE: [PATCH] drm/amdgpu: support new mode-1 reset interface

2021-11-15 Thread Zhang, Hawking
[AMD Official Use Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-----Original Message-----
From: Zhou1, Tao  
Sent: Tuesday, November 16, 2021 15:24
To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Clements, John ; Yang, Stanley ; 
Quan, Evan 
Cc: Zhou1, Tao 
Subject: [PATCH] drm/amdgpu: support new mode-1 reset interface

If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset 
message.

Signed-off-by: Tao Zhou 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 35145db6eedf..6f3d064a8232 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu)
 
 int smu_v13_0_mode1_reset(struct smu_context *smu)  {
-   u32 smu_version;
+   u32 smu_version, fatal_err, param;
int ret = 0;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   fatal_err = 0;
+   param = SMU_RESET_MODE_1;
+
/*
* PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
*/
smu_cmn_get_smc_version(smu, NULL, &smu_version);
if (smu_version < 0x00440700)
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
-   else
-   ret = smu_cmn_send_smc_msg_with_param(smu, 
SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
+   else {
+   /* fatal error triggered by ras, PMFW supports the flag
+  from 68.44.0 */
+   if ((smu_version >= 0x00442c00) && ras &&
+   atomic_read(&ras->in_recovery))
+   fatal_err = 1;
+
+   param |= (fatal_err << 16);
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_GfxDeviceDriverReset, param, 
NULL);
+   }
 
if (!ret)
msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
--
2.17.1


[PATCH] drm/amdgpu: support new mode-1 reset interface

2021-11-15 Thread Tao Zhou
If a GPU reset is triggered by a RAS fatal error, report that to the SMU
in the mode-1 reset message.

Signed-off-by: Tao Zhou 
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 ---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 35145db6eedf..6f3d064a8232 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu)
 
 int smu_v13_0_mode1_reset(struct smu_context *smu)
 {
-   u32 smu_version;
+   u32 smu_version, fatal_err, param;
int ret = 0;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   fatal_err = 0;
+   param = SMU_RESET_MODE_1;
+
/*
* PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
*/
smu_cmn_get_smc_version(smu, NULL, &smu_version);
if (smu_version < 0x00440700)
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
-   else
-   ret = smu_cmn_send_smc_msg_with_param(smu, 
SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
+   else {
+   /* fatal error triggered by ras, PMFW supports the flag
+  from 68.44.0 */
+   if ((smu_version >= 0x00442c00) && ras &&
+   atomic_read(&ras->in_recovery))
+   fatal_err = 1;
+
+   param |= (fatal_err << 16);
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_GfxDeviceDriverReset, param, 
NULL);
+   }
 
if (!ret)
msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
-- 
2.17.1