RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block

2021-12-06 Thread Zhou1, Tao
[AMD Official Use Only]

The error injection has no difference among RAS blocks except GFX and XGMI.
I agree to move the xgmi error injection to amdgpu_xgmi.c, but I don't think 
it's necessary to implement specific error injection functions for all other 
RAS blocks.

Regards,
Tao

> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Chai,
> Thomas 
> Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from
> amdgpu_ras.c to each block
> 
> Move each block error inject function from amdgpu_ras.c to each block.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 62 +---
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 18 +++
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 16 ++
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c  | 16 ++
> drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c  | 16 ++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_1.c| 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c| 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v8_7.c| 16 ++
>  12 files changed, 201 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2e38bd3d3d45..87b625d305c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>   return 0;
>  }
> 
> -/* Trigger XGMI/WAFL error */
> -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> -  struct ta_ras_trigger_error_input *block_info)
> -{
> - int ret;
> -
> - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> - dev_warn(adev->dev, "Failed to disallow df cstate");
> 
> - if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> - dev_warn(adev->dev, "Failed to disallow XGMI power down");
> -
> - ret = psp_ras_trigger_error(&adev->psp, block_info);
> -
> - if (amdgpu_ras_intr_triggered())
> - return ret;
> -
> - if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> - dev_warn(adev->dev, "Failed to allow XGMI power down");
> -
> - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> - dev_warn(adev->dev, "Failed to allow df cstate");
> -
> - return ret;
> -}
> 
>  /* wrapper of psp_ras_trigger_error */
>  int amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41
> +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>   if (!obj)
>   return -EINVAL;
> 
> + if (!block_obj || !block_obj->ops)  {
> + dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> + return -EINVAL;
> + }
> +
>   /* Calculate XGMI relative offset */
>   if (adev->gmc.xgmi.num_physical_nodes > 1) {
> - block_info.address =
> - amdgpu_xgmi_get_relative_phy_addr(adev,
> -   block_info.address);
> + block_info.address =
> amdgpu_xgmi_get_relative_phy_addr(adev,
> +block_info.address);
>   }
> 
> - switch (info->head.block) {
> - case AMDGPU_RAS_BLOCK__GFX:
> - if (!block_obj || !block_obj->ops)  {
> - dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> - return -EINVAL;
> - }
> - if (block_obj->ops->ras_error_inject)
> + if (block_obj->ops->ras_error_inject) {
> + if(info->head.block == AMDGPU_RAS_BLOCK__GFX)
>   ret = block_obj->ops->ras_error_inject(adev, info);
> - break;
> - case AMDGPU_RAS_BLOCK__UMC:
> - case AMDGPU_RAS_BLOCK__SDMA:
> - case AMDGPU_RAS_BLOCK__MMHUB:
> - case AMDGPU_RAS_BLOCK__PCIE_BIF:
> - case AMDGPU_RAS_BLOCK__MCA:
> - ret = psp_ras_trigger_error(&adev->psp, &block_info);
> - break;
> - case AMDGPU_RAS_BLOCK__XGMI_WAFL:
> - ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
> - break;
> - default:
> - dev_info(adev->dev, "%s error injection is not supported yet\n",
> -  get_ras_block_str(&info->head));
> - ret = -EINVAL;
> + else
> + ret = block_obj->ops->ras_error_inject(adev,
> &block_info);
>   }
> 
>   if (ret)
> - dev_err(adev->dev, "ras inject %s failed %d\n",
> - get_ras_block_str(&info->head), ret);
> + dev_

RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block

2021-12-06 Thread Chai, Thomas
I can add a default error injection function in amdgpuras_c,  if some block 
don't define special  . ras_error_inject function, it will use default error 
injection in amdgpuras_c.

-Original Message-
From: Zhou1, Tao  
Sent: Monday, December 6, 2021 3:34 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from 
amdgpu_ras.c to each block

[AMD Official Use Only]

The error injection has no difference among RAS blocks except GFX and XGMI.
I agree to move the xgmi error injection to amdgpu_xgmi.c, but I don't think 
it's necessary to implement specific error injection functions for all other 
RAS blocks.

Regards,
Tao

> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Chai, Thomas 
> 
> Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from 
> amdgpu_ras.c to each block
> 
> Move each block error inject function from amdgpu_ras.c to each block.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 62 
> +--- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 
> +++
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 18 +++
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 16 ++ 
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c  | 16 ++ 
> drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c  | 16 ++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_1.c| 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c| 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v8_7.c| 16 ++
>  12 files changed, 201 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2e38bd3d3d45..87b625d305c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>   return 0;
>  }
> 
> -/* Trigger XGMI/WAFL error */
> -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> -  struct ta_ras_trigger_error_input *block_info)
> -{
> - int ret;
> -
> - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> - dev_warn(adev->dev, "Failed to disallow df cstate");
> 
> - if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> - dev_warn(adev->dev, "Failed to disallow XGMI power down");
> -
> - ret = psp_ras_trigger_error(&adev->psp, block_info);
> -
> - if (amdgpu_ras_intr_triggered())
> - return ret;
> -
> - if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> - dev_warn(adev->dev, "Failed to allow XGMI power down");
> -
> - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> - dev_warn(adev->dev, "Failed to allow df cstate");
> -
> - return ret;
> -}
> 
>  /* wrapper of psp_ras_trigger_error */  int 
> amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41
> +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>   if (!obj)
>   return -EINVAL;
> 
> + if (!block_obj || !block_obj->ops)  {
> + dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> + return -EINVAL;
> + }
> +
>   /* Calculate XGMI relative offset */
>   if (adev->gmc.xgmi.num_physical_nodes > 1) {
> - block_info.address =
> - amdgpu_xgmi_get_relative_phy_addr(adev,
> -   block_info.address);
> + block_info.address =
> amdgpu_xgmi_get_relative_phy_addr(adev,
> +block_info.address);
>   }
> 
> - switch (info->head.block) {
> - case AMDGPU_RAS_BLOCK__GFX:
> - if (!block_obj || !block_obj->ops)  {
> - dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> - return -EINVAL;
> - }
> - if (block_obj->ops->ras_error_inject)
> + if (block_obj->ops->ras_error_inject) {
> + if(info->head.block == AMDGPU_RAS_BLOCK__GFX)
>   ret = block_obj->ops->ras_error_inject(adev, info);
&