RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
Thanks for your review, I'll add {} before push. > -Original Message- > From: Quan, Evan > Sent: Wednesday, November 17, 2021 9:50 AM > To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang, > Hawking ; Clements, John > ; Yang, Stanley ; Lazar, > Lijo ; Wang, Yang(Kevin) > Subject: RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2) > > [AMD Official Use Only] > > With the concern from Guchun addressed, the patch is reviewed-by: Evan Quan > > > > -Original Message- > > From: Zhou1, Tao > > Sent: Tuesday, November 16, 2021 6:29 PM > > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking > > ; Clements, John ; > Yang, > > Stanley ; Quan, Evan ; > Lazar, > > Lijo ; Wang, > > Yang(Kevin) > > Cc: Zhou1, Tao > > Subject: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2) > > > > If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 > > reset message. > > > > v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran > > specific currently. > > > > Signed-off-by: Tao Zhou > > --- > > drivers/gpu/drm/amd/pm/inc/smu_v13_0.h| 3 +- > > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 > > ++- > > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- > > 3 files changed, 37 insertions(+), 23 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > > b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > > index e5d3b0d1a032..bbc608c990b0 100644 > > --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > > +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > > @@ -29,6 +29,8 @@ > > #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 #define > > SMU13_DRIVER_IF_VERSION_ALDE 0x07 > > > > +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms > > + > > /* MP Apertures */ > > #define MP0_Public 0x0380 > > #define MP0_SRAM 0x0390 > > @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context > > *smu, enum smu_baco_state state) int smu_v13_0_baco_enter(struct > > smu_context *smu); int smu_v13_0_baco_exit(struct smu_context *smu); > > > > -int 
smu_v13_0_mode1_reset(struct smu_context *smu); int > > smu_v13_0_mode2_reset(struct smu_context *smu); > > > > int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum > > smu_clk_type clk_type, diff --git > > a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > index 59a7d276541d..e50d4491aa96 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > > @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct > > smu_context *smu, > > return sizeof(struct gpu_metrics_v1_3); } > > > > +static int aldebaran_mode1_reset(struct smu_context *smu) { > > + u32 smu_version, fatal_err, param; > > + int ret = 0; > > + struct amdgpu_device *adev = smu->adev; > > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > + > > + fatal_err = 0; > > + param = SMU_RESET_MODE_1; > > + > > + /* > > + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 > > + */ > > + smu_cmn_get_smc_version(smu, NULL, &smu_version); > > + if (smu_version < 0x00440700) > > + ret = smu_cmn_send_smc_msg(smu, > > SMU_MSG_Mode1Reset, NULL); > > + else { > > + /* fatal error triggered by ras, PMFW supports the flag > > + from 68.44.0 */ > > + if ((smu_version >= 0x00442c00) && ras && > > + atomic_read(&ras->in_recovery)) > > + fatal_err = 1; > > + > > + param |= (fatal_err << 16); > > + ret = smu_cmn_send_smc_msg_with_param(smu, > > + SMU_MSG_GfxDeviceDriverReset, > > param, NULL); > > + } > > + > > + if (!ret) > > + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); > > + > > + return ret; > > +} > > + > > static int aldebaran_mode2_reset(struct smu_context *smu) { > > u32 smu_version; > > @@ -1925,7 +1959,7 @@ static const struct pptable_funcs > > aldebaran_ppt_funcs = { > > .get_gpu_metrics = aldebaran_get_gpu_metrics, > > .mode1_reset_is_support = aldebaran_is_mode1_reset_supported, > > .mode2_reset_is_support = aldebaran_is_mode2_reset_s
RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
[AMD Official Use Only] With the concern from Guchun addressed, the patch is reviewed-by: Evan Quan > -Original Message- > From: Zhou1, Tao > Sent: Tuesday, November 16, 2021 6:29 PM > To: amd-gfx@lists.freedesktop.org; Zhang, Hawking > ; Clements, John ; > Yang, Stanley ; Quan, Evan > ; Lazar, Lijo ; Wang, > Yang(Kevin) > Cc: Zhou1, Tao > Subject: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2) > > If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 > reset message. > > v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran > specific currently. > > Signed-off-by: Tao Zhou > --- > drivers/gpu/drm/amd/pm/inc/smu_v13_0.h| 3 +- > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 > ++- > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- > 3 files changed, 37 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > index e5d3b0d1a032..bbc608c990b0 100644 > --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > @@ -29,6 +29,8 @@ > #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 > #define SMU13_DRIVER_IF_VERSION_ALDE 0x07 > > +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms > + > /* MP Apertures */ > #define MP0_Public 0x0380 > #define MP0_SRAM 0x0390 > @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context > *smu, enum smu_baco_state state) > int smu_v13_0_baco_enter(struct smu_context *smu); > int smu_v13_0_baco_exit(struct smu_context *smu); > > -int smu_v13_0_mode1_reset(struct smu_context *smu); > int smu_v13_0_mode2_reset(struct smu_context *smu); > > int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum > smu_clk_type clk_type, > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > index 59a7d276541d..e50d4491aa96 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > +++ 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct > smu_context *smu, > return sizeof(struct gpu_metrics_v1_3); > } > > +static int aldebaran_mode1_reset(struct smu_context *smu) > +{ > + u32 smu_version, fatal_err, param; > + int ret = 0; > + struct amdgpu_device *adev = smu->adev; > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > + > + fatal_err = 0; > + param = SMU_RESET_MODE_1; > + > + /* > + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 > + */ > + smu_cmn_get_smc_version(smu, NULL, &smu_version); > + if (smu_version < 0x00440700) > + ret = smu_cmn_send_smc_msg(smu, > SMU_MSG_Mode1Reset, NULL); > + else { > + /* fatal error triggered by ras, PMFW supports the flag > +from 68.44.0 */ > + if ((smu_version >= 0x00442c00) && ras && > + atomic_read(&ras->in_recovery)) > + fatal_err = 1; > + > + param |= (fatal_err << 16); > + ret = smu_cmn_send_smc_msg_with_param(smu, > + SMU_MSG_GfxDeviceDriverReset, > param, NULL); > + } > + > + if (!ret) > + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); > + > + return ret; > +} > + > static int aldebaran_mode2_reset(struct smu_context *smu) > { > u32 smu_version; > @@ -1925,7 +1959,7 @@ static const struct pptable_funcs > aldebaran_ppt_funcs = { > .get_gpu_metrics = aldebaran_get_gpu_metrics, > .mode1_reset_is_support = aldebaran_is_mode1_reset_supported, > .mode2_reset_is_support = aldebaran_is_mode2_reset_supported, > - .mode1_reset = smu_v13_0_mode1_reset, > + .mode1_reset = aldebaran_mode1_reset, > .set_mp1_state = aldebaran_set_mp1_state, > .mode2_reset = aldebaran_mode2_reset, > .wait_for_event = smu_v13_0_wait_for_event, > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > index 35145db6eedf..4d96099a9bb1 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > @@ -60,8 +60,6 @@ 
MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin"); > > #define SMU13_VOLTAGE_SCALE 4 > > -#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms > - > #define LINK_WIDTH_MAX 6 > #define LINK_SPEED_MAX
RE: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
[Public] One coding style problem: braces {} are needed around the branch after if (smu_version < 0x00440700) as well, because the else branch of the same statement uses braces. if (smu_version < 0x00440700) > + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); > + else { > + /* fatal error triggered by ras, PMFW supports the flag > +from 68.44.0 */ > + if ((smu_version >= 0x00442c00) && ras && > + atomic_read(&ras->in_recovery)) > + fatal_err = 1; > + > + param |= (fatal_err << 16); > + ret = smu_cmn_send_smc_msg_with_param(smu, > + SMU_MSG_GfxDeviceDriverReset, param, > NULL); > + } Regards, Guchun -Original Message- From: amd-gfx On Behalf Of Lazar, Lijo Sent: Tuesday, November 16, 2021 6:41 PM To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Clements, John ; Yang, Stanley ; Quan, Evan ; Wang, Yang(Kevin) Subject: Re: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2) On 11/16/2021 3:58 PM, Tao Zhou wrote: > If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 > reset message. > > v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran > specific currently. 
> > Signed-off-by: Tao Zhou Reviewed-by: Lijo Lazar Thanks, Lijo > --- > drivers/gpu/drm/amd/pm/inc/smu_v13_0.h| 3 +- > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 ++- > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- > 3 files changed, 37 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > index e5d3b0d1a032..bbc608c990b0 100644 > --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h > @@ -29,6 +29,8 @@ > #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 > #define SMU13_DRIVER_IF_VERSION_ALDE 0x07 > > +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms > + > /* MP Apertures */ > #define MP0_Public 0x0380 > #define MP0_SRAM0x0390 > @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context *smu, > enum smu_baco_state state) > int smu_v13_0_baco_enter(struct smu_context *smu); > int smu_v13_0_baco_exit(struct smu_context *smu); > > -int smu_v13_0_mode1_reset(struct smu_context *smu); > int smu_v13_0_mode2_reset(struct smu_context *smu); > > int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum > smu_clk_type clk_type, diff --git > a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > index 59a7d276541d..e50d4491aa96 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct > smu_context *smu, > return sizeof(struct gpu_metrics_v1_3); > } > > +static int aldebaran_mode1_reset(struct smu_context *smu) { > + u32 smu_version, fatal_err, param; > + int ret = 0; > + struct amdgpu_device *adev = smu->adev; > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > + > + fatal_err = 0; > + param = SMU_RESET_MODE_1; > + > + /* > + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 > + */ > + smu_cmn_get_smc_version(smu, NULL, 
&smu_version); > + if (smu_version < 0x00440700) > + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); > + else { > + /* fatal error triggered by ras, PMFW supports the flag > +from 68.44.0 */ > + if ((smu_version >= 0x00442c00) && ras && > + atomic_read(&ras->in_recovery)) > + fatal_err = 1; > + > + param |= (fatal_err << 16); > + ret = smu_cmn_send_smc_msg_with_param(smu, > + SMU_MSG_GfxDeviceDriverReset, param, > NULL); > + } > + > + if (!ret) > + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); > + > + return ret; > +} > + > static int aldebaran_mode2_reset(struct smu_context *smu) > { > u32 smu_version; > @@ -1925,7 +1959,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = > { > .get_gpu_metrics = aldebaran_get_gpu_metrics, > .mode1_reset_is_support = aldebaran_is_mode1_reset_supported, > .mode2_reset_is_support = aldebaran_is_mode2_reset_supported, > - .mode1_reset = smu_v13_0_mode1_reset, > + .mode1_reset = aldebaran_mode1_
Re: [PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
On 11/16/2021 3:58 PM, Tao Zhou wrote: If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset message. v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran specific currently. Signed-off-by: Tao Zhou Reviewed-by: Lijo Lazar Thanks, Lijo --- drivers/gpu/drm/amd/pm/inc/smu_v13_0.h| 3 +- .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 ++- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h index e5d3b0d1a032..bbc608c990b0 100644 --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h @@ -29,6 +29,8 @@ #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 #define SMU13_DRIVER_IF_VERSION_ALDE 0x07 +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms + /* MP Apertures */ #define MP0_Public0x0380 #define MP0_SRAM 0x0390 @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state) int smu_v13_0_baco_enter(struct smu_context *smu); int smu_v13_0_baco_exit(struct smu_context *smu); -int smu_v13_0_mode1_reset(struct smu_context *smu); int smu_v13_0_mode2_reset(struct smu_context *smu); int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 59a7d276541d..e50d4491aa96 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu, return sizeof(struct gpu_metrics_v1_3); } +static int aldebaran_mode1_reset(struct smu_context *smu) +{ + u32 smu_version, fatal_err, param; + int ret = 0; + struct amdgpu_device *adev = smu->adev; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + fatal_err = 0; + param = SMU_RESET_MODE_1; + + 
/* + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 + */ + smu_cmn_get_smc_version(smu, NULL, &smu_version); + if (smu_version < 0x00440700) + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); + else { + /* fatal error triggered by ras, PMFW supports the flag + from 68.44.0 */ + if ((smu_version >= 0x00442c00) && ras && + atomic_read(&ras->in_recovery)) + fatal_err = 1; + + param |= (fatal_err << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GfxDeviceDriverReset, param, NULL); + } + + if (!ret) + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); + + return ret; +} + static int aldebaran_mode2_reset(struct smu_context *smu) { u32 smu_version; @@ -1925,7 +1959,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = { .get_gpu_metrics = aldebaran_get_gpu_metrics, .mode1_reset_is_support = aldebaran_is_mode1_reset_supported, .mode2_reset_is_support = aldebaran_is_mode2_reset_supported, - .mode1_reset = smu_v13_0_mode1_reset, + .mode1_reset = aldebaran_mode1_reset, .set_mp1_state = aldebaran_set_mp1_state, .mode2_reset = aldebaran_mode2_reset, .wait_for_event = smu_v13_0_wait_for_event, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 35145db6eedf..4d96099a9bb1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -60,8 +60,6 @@ MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin"); #define SMU13_VOLTAGE_SCALE 4 -#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms - #define LINK_WIDTH_MAX6 #define LINK_SPEED_MAX3 @@ -1424,25 +1422,6 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu) return ret; } -int smu_v13_0_mode1_reset(struct smu_context *smu) -{ - u32 smu_version; - int ret = 0; - /* - * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 - */ - smu_cmn_get_smc_version(smu, NULL, &smu_version); - if (smu_version < 0x00440700) - ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); - else - ret = 
smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL); - - if (!ret) - msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); - - return ret; -} - static int smu_v13_0_wait_for_reset_complete(struct smu_context *smu, uint64_t event_
[PATCH] drm/amdgpu: support new mode-1 reset interface (v2)
If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset message. v2: move mode-1 reset function to aldebaran_ppt.c since it's aldebaran specific currently. Signed-off-by: Tao Zhou --- drivers/gpu/drm/amd/pm/inc/smu_v13_0.h| 3 +- .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c| 36 ++- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h index e5d3b0d1a032..bbc608c990b0 100644 --- a/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/inc/smu_v13_0.h @@ -29,6 +29,8 @@ #define SMU13_DRIVER_IF_VERSION_YELLOW_CARP 0x04 #define SMU13_DRIVER_IF_VERSION_ALDE 0x07 +#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms + /* MP Apertures */ #define MP0_Public 0x0380 #define MP0_SRAM 0x0390 @@ -216,7 +218,6 @@ int smu_v13_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state) int smu_v13_0_baco_enter(struct smu_context *smu); int smu_v13_0_baco_exit(struct smu_context *smu); -int smu_v13_0_mode1_reset(struct smu_context *smu); int smu_v13_0_mode2_reset(struct smu_context *smu); int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 59a7d276541d..e50d4491aa96 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1765,6 +1765,40 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu, return sizeof(struct gpu_metrics_v1_3); } +static int aldebaran_mode1_reset(struct smu_context *smu) +{ + u32 smu_version, fatal_err, param; + int ret = 0; + struct amdgpu_device *adev = smu->adev; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + fatal_err = 0; + param = SMU_RESET_MODE_1; + + /* + * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 + */ + 
smu_cmn_get_smc_version(smu, NULL, &smu_version); + if (smu_version < 0x00440700) + ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); + else { + /* fatal error triggered by ras, PMFW supports the flag + from 68.44.0 */ + if ((smu_version >= 0x00442c00) && ras && + atomic_read(&ras->in_recovery)) + fatal_err = 1; + + param |= (fatal_err << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GfxDeviceDriverReset, param, NULL); + } + + if (!ret) + msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); + + return ret; +} + static int aldebaran_mode2_reset(struct smu_context *smu) { u32 smu_version; @@ -1925,7 +1959,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = { .get_gpu_metrics = aldebaran_get_gpu_metrics, .mode1_reset_is_support = aldebaran_is_mode1_reset_supported, .mode2_reset_is_support = aldebaran_is_mode2_reset_supported, - .mode1_reset = smu_v13_0_mode1_reset, + .mode1_reset = aldebaran_mode1_reset, .set_mp1_state = aldebaran_set_mp1_state, .mode2_reset = aldebaran_mode2_reset, .wait_for_event = smu_v13_0_wait_for_event, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 35145db6eedf..4d96099a9bb1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -60,8 +60,6 @@ MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin"); #define SMU13_VOLTAGE_SCALE 4 -#define SMU13_MODE1_RESET_WAIT_TIME_IN_MS 500 //500ms - #define LINK_WIDTH_MAX 6 #define LINK_SPEED_MAX 3 @@ -1424,25 +1422,6 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu) return ret; } -int smu_v13_0_mode1_reset(struct smu_context *smu) -{ - u32 smu_version; - int ret = 0; - /* - * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 - */ - smu_cmn_get_smc_version(smu, NULL, &smu_version); - if (smu_version < 0x00440700) - ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); - else - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, 
SMU_RESET_MODE_1, NULL); - - if (!ret) - msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); - - return ret; -} - static int smu_v13_0_wait_for_reset_complete(struct smu_context *smu, uint64_t event_arg) { -- 2.17.1
Re: [PATCH] drm/amdgpu: support new mode-1 reset interface
On 11/16/2021 2:17 PM, Zhou1, Tao wrote: [AMD Official Use Only] Hi Lijo, Your concern is reasonable, but in fact smu_v13_0_mode1_reset is used only by ALDEBARAN currently. I assume the PMFW of new smu v13 ASIC in the future will follow this design, otherwise we could move the implementation into xxx_ppt.c. Actually, this is meant to be common logic for SMU13-based ASICs. A version check in a common file is not maintainable. I see there was already a version check there before; even that is not proper :) It is better to do it properly when support is added rather than thinking of refactoring with future ASICs. Thanks, Lijo Regards, Tao -Original Message- From: Lazar, Lijo Sent: Tuesday, November 16, 2021 3:44 PM To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Clements, John ; Yang, Stanley ; Quan, Evan Subject: Re: [PATCH] drm/amdgpu: support new mode-1 reset interface On 11/16/2021 12:53 PM, Tao Zhou wrote: If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset message. 
Signed-off-by: Tao Zhou --- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 35145db6eedf..6f3d064a8232 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu) int smu_v13_0_mode1_reset(struct smu_context *smu) { - u32 smu_version; + u32 smu_version, fatal_err, param; int ret = 0; + struct amdgpu_device *adev = smu->adev; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + fatal_err = 0; + param = SMU_RESET_MODE_1; + /* * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 */ smu_cmn_get_smc_version(smu, NULL, &smu_version); if (smu_version < 0x00440700) ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); - else - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL); + else { + /* fatal error triggered by ras, PMFW supports the flag + from 68.44.0 */ + if ((smu_version >= 0x00442c00) && ras && + atomic_read(&ras->in_recovery)) + fatal_err = 1; + From PMFW version, this looks specific to aldebaran. Since there is version check as well, the implementation needs to be moved to aldebaran_ppt.c Thanks, Lijo + param |= (fatal_err << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GfxDeviceDriverReset, param, NULL); + } if (!ret) msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
RE: [PATCH] drm/amdgpu: support new mode-1 reset interface
[AMD Official Use Only] Hi Lijo, Your concern is reasonable, but in fact smu_v13_0_mode1_reset is used only by ALDEBARAN currently. I assume the PMFW of new smu v13 ASIC in the future will follow this design, otherwise we could move the implementation into xxx_ppt.c. Regards, Tao > -Original Message- > From: Lazar, Lijo > Sent: Tuesday, November 16, 2021 3:44 PM > To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org; Zhang, > Hawking ; Clements, John > ; Yang, Stanley ; Quan, > Evan > Subject: Re: [PATCH] drm/amdgpu: support new mode-1 reset interface > > > > On 11/16/2021 12:53 PM, Tao Zhou wrote: > > If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 > > reset message. > > > > Signed-off-by: Tao Zhou > > --- > > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 > --- > > 1 file changed, 18 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > index 35145db6eedf..6f3d064a8232 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > > @@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct > > smu_context *smu) > > > > int smu_v13_0_mode1_reset(struct smu_context *smu) > > { > > - u32 smu_version; > > + u32 smu_version, fatal_err, param; > > int ret = 0; > > + struct amdgpu_device *adev = smu->adev; > > + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > + > > + fatal_err = 0; > > + param = SMU_RESET_MODE_1; > > + > > /* > > * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 > > */ > > smu_cmn_get_smc_version(smu, NULL, &smu_version); > > if (smu_version < 0x00440700) > > ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, > NULL); > > - else > > - ret = smu_cmn_send_smc_msg_with_param(smu, > SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL); > > + else { > > + /* fatal error triggered by ras, PMFW supports the flag > > + from 68.44.0 */ > > + if ((smu_version >= 
0x00442c00) && ras && > > + atomic_read(&ras->in_recovery)) > > + fatal_err = 1; > > + > > From PMFW version, this looks specific to aldebaran. Since there is version > check as well, the implementation needs to be moved to aldebaran_ppt.c > > Thanks, > Lijo > > > + param |= (fatal_err << 16); > > + ret = smu_cmn_send_smc_msg_with_param(smu, > > + SMU_MSG_GfxDeviceDriverReset, > param, NULL); > > + } > > > > if (!ret) > > msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); > >
Re: [PATCH] drm/amdgpu: support new mode-1 reset interface
On 11/16/2021 12:53 PM, Tao Zhou wrote: If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset message. Signed-off-by: Tao Zhou --- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 35145db6eedf..6f3d064a8232 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu) int smu_v13_0_mode1_reset(struct smu_context *smu) { - u32 smu_version; + u32 smu_version, fatal_err, param; int ret = 0; + struct amdgpu_device *adev = smu->adev; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + fatal_err = 0; + param = SMU_RESET_MODE_1; + /* * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 */ smu_cmn_get_smc_version(smu, NULL, &smu_version); if (smu_version < 0x00440700) ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); - else - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL); + else { + /* fatal error triggered by ras, PMFW supports the flag + from 68.44.0 */ + if ((smu_version >= 0x00442c00) && ras && + atomic_read(&ras->in_recovery)) + fatal_err = 1; + From PMFW version, this looks specific to aldebaran. Since there is version check as well, the implementation needs to be moved to aldebaran_ppt.c Thanks, Lijo + param |= (fatal_err << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GfxDeviceDriverReset, param, NULL); + } if (!ret) msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
RE: [PATCH] drm/amdgpu: support new mode-1 reset interface
[AMD Official Use Only] Reviewed-by: Hawking Zhang Regards, Hawking -Original Message- From: Zhou1, Tao Sent: Tuesday, November 16, 2021 15:24 To: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; Clements, John ; Yang, Stanley ; Quan, Evan Cc: Zhou1, Tao Subject: [PATCH] drm/amdgpu: support new mode-1 reset interface If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset message. Signed-off-by: Tao Zhou --- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 35145db6eedf..6f3d064a8232 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu) int smu_v13_0_mode1_reset(struct smu_context *smu) { - u32 smu_version; + u32 smu_version, fatal_err, param; int ret = 0; + struct amdgpu_device *adev = smu->adev; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + fatal_err = 0; + param = SMU_RESET_MODE_1; + /* * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 */ smu_cmn_get_smc_version(smu, NULL, &smu_version); if (smu_version < 0x00440700) ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); - else - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL); + else { + /* fatal error triggered by ras, PMFW supports the flag + from 68.44.0 */ + if ((smu_version >= 0x00442c00) && ras && + atomic_read(&ras->in_recovery)) + fatal_err = 1; + + param |= (fatal_err << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GfxDeviceDriverReset, param, NULL); + } if (!ret) msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); -- 2.17.1
[PATCH] drm/amdgpu: support new mode-1 reset interface
If gpu reset is triggered by ras fatal error, tell it to smu in mode-1 reset message. Signed-off-by: Tao Zhou --- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c| 21 --- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 35145db6eedf..6f3d064a8232 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1426,16 +1426,31 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu) int smu_v13_0_mode1_reset(struct smu_context *smu) { - u32 smu_version; + u32 smu_version, fatal_err, param; int ret = 0; + struct amdgpu_device *adev = smu->adev; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + fatal_err = 0; + param = SMU_RESET_MODE_1; + /* * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07 */ smu_cmn_get_smc_version(smu, NULL, &smu_version); if (smu_version < 0x00440700) ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL); - else - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL); + else { + /* fatal error triggered by ras, PMFW supports the flag + from 68.44.0 */ + if ((smu_version >= 0x00442c00) && ras && + atomic_read(&ras->in_recovery)) + fatal_err = 1; + + param |= (fatal_err << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GfxDeviceDriverReset, param, NULL); + } if (!ret) msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS); -- 2.17.1