Re: [PATCH RESEND] drm/amdgpu: Remove the vega10 from ras support list

2022-01-28 Thread Ma, Jun


On 1/27/2022 10:36 PM, Chen, Guchun wrote:
> [AMD Official Use Only]
> 
> Hi Jun,
> 
> In RAS code, we have this special handling for Vega10. Can you elaborate it 
> please? Any problem you have observed?

Ok, thanks for review. I'll confirm this.

> 
> Regards,
> Guchun
> 
> -Original Message-
> From: Ma, Jun  
> Sent: Thursday, January 27, 2022 7:47 PM
> To: amd-gfx@lists.freedesktop.org; brahma_sw_dev 
> Cc: Zhang, Hawking ; Zhou1, Tao ; 
> Ma, Jun 
> Subject: [PATCH RESEND] drm/amdgpu: Remove the vega10 from ras support list
> 
> Remove vega10 from the ras support check function.
> Base on this change, the ras initial function is optimized.
> 
> Signed-off-by: majun 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
>  1 file changed, 20 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 37e9b7e82993..aa1de974e07e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2129,8 +2129,7 @@ static int amdgpu_ras_recovery_fini(struct 
> amdgpu_device *adev)
>  
>  static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  {
> - return adev->asic_type == CHIP_VEGA10 ||
> - adev->asic_type == CHIP_VEGA20 ||
> + return adev->asic_type == CHIP_VEGA20 ||
>   adev->asic_type == CHIP_ARCTURUS ||
>   adev->asic_type == CHIP_ALDEBARAN ||
>   adev->asic_type == CHIP_SIENNA_CICHLID; @@ -2164,13 +2163,13 @@ 
> static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
>   * we have to initialize ras as normal. but need check if operation is
>   * allowed or not in each function.
>   */
> -static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
> +static bool amdgpu_ras_check_supported(struct amdgpu_device *adev)
>  {
>   adev->ras_hw_enabled = adev->ras_enabled = 0;
>  
>   if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
>   !amdgpu_ras_asic_supported(adev))
> - return;
> + return false;
>  
>   if (!adev->gmc.xgmi.connected_to_cpu) {
>   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { @@ -2203,6 
> +2202,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
>  
>   adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
>   adev->ras_hw_enabled & amdgpu_ras_mask;
> +
> + return true;
>  }
>  
>  static void amdgpu_ras_counte_dw(struct work_struct *work) @@ -2236,6 
> +2237,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   int r;
>   bool df_poison, umc_poison;
>  
> + if (!amdgpu_ras_check_supported(adev))
> + return -EINVAL;
> +
>   if (con)
>   return 0;
>  
> @@ -2250,28 +2254,24 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   INIT_DELAYED_WORK(>ras_counte_delay_work, amdgpu_ras_counte_dw);
>   atomic_set(>ras_ce_count, 0);
>   atomic_set(>ras_ue_count, 0);
> -
>   con->objs = (struct ras_manager *)(con + 1);
> + con->features = 0;
>  
>   amdgpu_ras_set_context(adev, con);
>  
> - amdgpu_ras_check_supported(adev);
> -
> - if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
> - /* set gfx block ras context feature for VEGA20 Gaming
> -  * send ras disable cmd to ras ta during ras late init.
> -  */
> - if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
> + if (!adev->ras_enabled) {
> + /* set gfx block ras context feature for VEGA20 Gaming
> +  * send ras disable cmd to ras ta during ras late init.
> +  */
> + if (adev->asic_type == CHIP_VEGA20) {
>   con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
> -
>   return 0;
> + } else {
> + r = 0;
> + goto release_con;
>   }
> -
> - r = 0;
> - goto release_con;
>   }
>  
> - con->features = 0;
>   INIT_LIST_HEAD(>head);
>   /* Might need get this flag from vbios. */
>   con->flags = RAS_DEFAULT_FLAGS;
> @@ -2545,7 +2545,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>  
>  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  {
> - amdgpu_ras_check_supported(adev);
> + if (!amdgpu_ras_check_supported(adev))
> + return;
> +
>   if (!adev->ras_hw_enabled)
>   return;
>  
> --
> 2.25.1


RE: [PATCH RESEND] drm/amdgpu: Remove the vega10 from ras support list

2022-01-27 Thread Chen, Guchun
[AMD Official Use Only]

Hi Jun,

In RAS code, we have this special handling for Vega10. Can you elaborate it 
please? Any problem you have observed?

Regards,
Guchun

-Original Message-
From: Ma, Jun  
Sent: Thursday, January 27, 2022 7:47 PM
To: amd-gfx@lists.freedesktop.org; brahma_sw_dev 
Cc: Zhang, Hawking ; Zhou1, Tao ; Ma, 
Jun 
Subject: [PATCH RESEND] drm/amdgpu: Remove the vega10 from ras support list

Remove vega10 from the ras support check function.
Base on this change, the ras initial function is optimized.

Signed-off-by: majun 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 37e9b7e82993..aa1de974e07e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2129,8 +2129,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
 
 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  {
-   return adev->asic_type == CHIP_VEGA10 ||
-   adev->asic_type == CHIP_VEGA20 ||
+   return adev->asic_type == CHIP_VEGA20 ||
adev->asic_type == CHIP_ARCTURUS ||
adev->asic_type == CHIP_ALDEBARAN ||
adev->asic_type == CHIP_SIENNA_CICHLID; @@ -2164,13 +2163,13 @@ 
static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
  * we have to initialize ras as normal. but need check if operation is
  * allowed or not in each function.
  */
-static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
+static bool amdgpu_ras_check_supported(struct amdgpu_device *adev)
 {
adev->ras_hw_enabled = adev->ras_enabled = 0;
 
if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
!amdgpu_ras_asic_supported(adev))
-   return;
+   return false;
 
if (!adev->gmc.xgmi.connected_to_cpu) {
if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { @@ -2203,6 
+2202,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
+
+   return true;
 }
 
 static void amdgpu_ras_counte_dw(struct work_struct *work) @@ -2236,6 +2237,9 
@@ int amdgpu_ras_init(struct amdgpu_device *adev)
int r;
bool df_poison, umc_poison;
 
+   if (!amdgpu_ras_check_supported(adev))
+   return -EINVAL;
+
if (con)
return 0;
 
@@ -2250,28 +2254,24 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
INIT_DELAYED_WORK(>ras_counte_delay_work, amdgpu_ras_counte_dw);
atomic_set(>ras_ce_count, 0);
atomic_set(>ras_ue_count, 0);
-
con->objs = (struct ras_manager *)(con + 1);
+   con->features = 0;
 
amdgpu_ras_set_context(adev, con);
 
-   amdgpu_ras_check_supported(adev);
-
-   if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
-   /* set gfx block ras context feature for VEGA20 Gaming
-* send ras disable cmd to ras ta during ras late init.
-*/
-   if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
+   if (!adev->ras_enabled) {
+   /* set gfx block ras context feature for VEGA20 Gaming
+* send ras disable cmd to ras ta during ras late init.
+*/
+   if (adev->asic_type == CHIP_VEGA20) {
con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
-
return 0;
+   } else {
+   r = 0;
+   goto release_con;
}
-
-   r = 0;
-   goto release_con;
}
 
-   con->features = 0;
INIT_LIST_HEAD(>head);
/* Might need get this flag from vbios. */
con->flags = RAS_DEFAULT_FLAGS;
@@ -2545,7 +2545,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  {
-   amdgpu_ras_check_supported(adev);
+   if (!amdgpu_ras_check_supported(adev))
+   return;
+
if (!adev->ras_hw_enabled)
return;
 
--
2.25.1


[PATCH RESEND] drm/amdgpu: Remove the vega10 from ras support list

2022-01-27 Thread majun
Remove vega10 from the ras support check function.
Base on this change, the ras initial function is optimized.

Signed-off-by: majun 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 37e9b7e82993..aa1de974e07e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2129,8 +2129,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
 
 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 {
-   return adev->asic_type == CHIP_VEGA10 ||
-   adev->asic_type == CHIP_VEGA20 ||
+   return adev->asic_type == CHIP_VEGA20 ||
adev->asic_type == CHIP_ARCTURUS ||
adev->asic_type == CHIP_ALDEBARAN ||
adev->asic_type == CHIP_SIENNA_CICHLID;
@@ -2164,13 +2163,13 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
  * we have to initialize ras as normal. but need check if operation is
  * allowed or not in each function.
  */
-static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
+static bool amdgpu_ras_check_supported(struct amdgpu_device *adev)
 {
adev->ras_hw_enabled = adev->ras_enabled = 0;
 
if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
!amdgpu_ras_asic_supported(adev))
-   return;
+   return false;
 
if (!adev->gmc.xgmi.connected_to_cpu) {
if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
@@ -2203,6 +2202,8 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
 
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
+
+   return true;
 }
 
 static void amdgpu_ras_counte_dw(struct work_struct *work)
@@ -2236,6 +2237,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
int r;
bool df_poison, umc_poison;
 
+   if (!amdgpu_ras_check_supported(adev))
+   return -EINVAL;
+
if (con)
return 0;
 
@@ -2250,28 +2254,24 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
INIT_DELAYED_WORK(>ras_counte_delay_work, amdgpu_ras_counte_dw);
atomic_set(>ras_ce_count, 0);
atomic_set(>ras_ue_count, 0);
-
con->objs = (struct ras_manager *)(con + 1);
+   con->features = 0;
 
amdgpu_ras_set_context(adev, con);
 
-   amdgpu_ras_check_supported(adev);
-
-   if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
-   /* set gfx block ras context feature for VEGA20 Gaming
-* send ras disable cmd to ras ta during ras late init.
-*/
-   if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
+   if (!adev->ras_enabled) {
+   /* set gfx block ras context feature for VEGA20 Gaming
+* send ras disable cmd to ras ta during ras late init.
+*/
+   if (adev->asic_type == CHIP_VEGA20) {
con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
-
return 0;
+   } else {
+   r = 0;
+   goto release_con;
}
-
-   r = 0;
-   goto release_con;
}
 
-   con->features = 0;
INIT_LIST_HEAD(>head);
/* Might need get this flag from vbios. */
con->flags = RAS_DEFAULT_FLAGS;
@@ -2545,7 +2545,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
-   amdgpu_ras_check_supported(adev);
+   if (!amdgpu_ras_check_supported(adev))
+   return;
+
if (!adev->ras_hw_enabled)
return;
 
-- 
2.25.1