RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block
I can add a default error injection function in amdgpuras_c, if some block don't define special . ras_error_inject function, it will use default error injection in amdgpuras_c. -Original Message- From: Zhou1, Tao Sent: Monday, December 6, 2021 3:34 PM To: Chai, Thomas ; amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking Subject: RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block [AMD Official Use Only] The error injection has no difference among RAS blocks except GFX and XGMI. I agree to move the xgmi error injection to amdgpu_xgmi.c, but I don't think it's necessary to implement specific error injection functions for all other RAS blocks. Regards, Tao > -Original Message- > From: Chai, Thomas > Sent: Wednesday, December 1, 2021 6:53 PM > To: amd-gfx@lists.freedesktop.org > Cc: Chai, Thomas ; Zhang, Hawking > ; Zhou1, Tao ; Chai, Thomas > > Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from > amdgpu_ras.c to each block > > Move each block error inject function from amdgpu_ras.c to each block. > > Signed-off-by: yipechai > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 > +--- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 > +++ > drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 18 +++ > drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_1.c| 16 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c| 16 ++ > drivers/gpu/drm/amd/amdgpu/umc_v8_7.c| 16 ++ > 12 files changed, 201 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2e38bd3d3d45..87b625d305c9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct > amdgpu_device *adev, > return 0; > } > > -/* Trigger XGMI/WAFL error */ > -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, > - struct ta_ras_trigger_error_input *block_info) > -{ > - int ret; > - > - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) > - dev_warn(adev->dev, "Failed to disallow df cstate"); > > - if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) > - dev_warn(adev->dev, "Failed to disallow XGMI power down"); > - > - ret = psp_ras_trigger_error(&adev->psp, block_info); > - > - if (amdgpu_ras_intr_triggered()) > - return ret; > - > - if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) > - dev_warn(adev->dev, "Failed to allow XGMI power down"); > - > - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) > - dev_warn(adev->dev, "Failed to allow df cstate"); > - > - return ret; > -} > > /* wrapper of psp_ras_trigger_error */ int > amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41 > +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, > if (!obj) > return -EINVAL; > > + if (!block_obj || !block_obj->ops) { > + dev_info(adev->dev, "%s don't config ras function \n", > get_ras_block_str(&info->head)); > + return -EINVAL; > + } > + > /* Calculate XGMI relative offset */ > if (adev->gmc.xgmi.num_physical_nodes > 1) { > - block_info.address = > - amdgpu_xgmi_get_relative_phy_addr(adev, > - block_info.address); > + block_info.address = > amdgpu_xgmi_get_relative_phy_addr(adev, > +block_info.address); > } > > - switch (info->head.block) { > - case AMDGPU_RAS_BLOCK__GFX: > - if (!block_obj || !block_obj->ops) { > - dev_info(adev->dev, "%s don't config ras function \n", > get_ras_block_str(&info->head)); > - return -EINVAL; > - } > - if (block_obj->ops->ras_error_inject) > + if (block_obj->ops->ras_error_inject) { > + if(info->head.block == AMDGPU_RAS_BLOCK__GFX) > ret = block_obj->ops->ras_error_inject(adev, info); &
RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block
[AMD Official Use Only] The error injection has no difference among RAS blocks except GFX and XGMI. I agree to move the xgmi error injection to amdgpu_xgmi.c, but I don't think it's necessary to implement specific error injection functions for all other RAS blocks. Regards, Tao > -Original Message- > From: Chai, Thomas > Sent: Wednesday, December 1, 2021 6:53 PM > To: amd-gfx@lists.freedesktop.org > Cc: Chai, Thomas ; Zhang, Hawking > ; Zhou1, Tao ; Chai, > Thomas > Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from > amdgpu_ras.c to each block > > Move each block error inject function from amdgpu_ras.c to each block. > > Signed-off-by: yipechai > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 +--- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++ > drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 18 +++ > drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c | 16 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_1.c| 16 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c| 16 ++ > drivers/gpu/drm/amd/amdgpu/umc_v8_7.c| 16 ++ > 12 files changed, 201 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2e38bd3d3d45..87b625d305c9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct > amdgpu_device *adev, > return 0; > } > > -/* Trigger XGMI/WAFL error */ > -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, > - struct ta_ras_trigger_error_input *block_info) > -{ > - int ret; > - > - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) > - dev_warn(adev->dev, "Failed to disallow df cstate"); > > - if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) > - dev_warn(adev->dev, "Failed to disallow XGMI power down"); > - > - ret = psp_ras_trigger_error(&adev->psp, block_info); > - > - if (amdgpu_ras_intr_triggered()) > - return ret; > - > - if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) > - dev_warn(adev->dev, "Failed to allow XGMI power down"); > - > - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) > - dev_warn(adev->dev, "Failed to allow df cstate"); > - > - return ret; > -} > > /* wrapper of psp_ras_trigger_error */ > int amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41 > +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, > if (!obj) > return -EINVAL; > > + if (!block_obj || !block_obj->ops) { > + dev_info(adev->dev, "%s don't config ras function \n", > get_ras_block_str(&info->head)); > + return -EINVAL; > + } > + > /* Calculate XGMI relative offset */ > if (adev->gmc.xgmi.num_physical_nodes > 1) { > - block_info.address = > - amdgpu_xgmi_get_relative_phy_addr(adev, > - block_info.address); > + block_info.address = > amdgpu_xgmi_get_relative_phy_addr(adev, > +block_info.address); > } > > - switch (info->head.block) { > - case AMDGPU_RAS_BLOCK__GFX: > - if (!block_obj || !block_obj->ops) { > - dev_info(adev->dev, "%s don't config ras function \n", > get_ras_block_str(&info->head)); > - return -EINVAL; > - } > - if (block_obj->ops->ras_error_inject) > + if (block_obj->ops->ras_error_inject) { > + if(info->head.block == AMDGPU_RAS_BLOCK__GFX) > ret = block_obj->ops->ras_error_inject(adev, info); > - break; > - case AMDGPU_RAS_BLOCK__UMC: > - case AMDGPU_RAS_BLOCK__SDMA: > - case AMDGPU_RAS_BLOCK__MMHUB: > - case AMDGPU_RAS_BLOCK__PCIE_BIF: > - case AMDGPU_RAS_BLOCK__MCA: > - ret = psp_ras_trigger_error(&adev->psp, &block_info); > - break; > - case AMDGPU_RAS_BLOCK__XGMI_WAFL: > - ret = amdgpu_ras_error_inject_xgmi(ad
[PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block
Move each block error inject function from amdgpu_ras.c to each block. Signed-off-by: yipechai --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++ drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 18 +++ drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 16 ++ drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 16 ++ drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c | 16 ++ drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 16 ++ drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 ++ drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c | 16 ++ drivers/gpu/drm/amd/amdgpu/umc_v6_1.c| 16 ++ drivers/gpu/drm/amd/amdgpu/umc_v6_7.c| 16 ++ drivers/gpu/drm/amd/amdgpu/umc_v8_7.c| 16 ++ 12 files changed, 201 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2e38bd3d3d45..87b625d305c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, return 0; } -/* Trigger XGMI/WAFL error */ -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, -struct ta_ras_trigger_error_input *block_info) -{ - int ret; - - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) - dev_warn(adev->dev, "Failed to disallow df cstate"); - if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) - dev_warn(adev->dev, "Failed to disallow XGMI power down"); - - ret = psp_ras_trigger_error(&adev->psp, block_info); - - if (amdgpu_ras_intr_triggered()) - return ret; - - if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) - dev_warn(adev->dev, "Failed to allow XGMI power down"); - - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) - dev_warn(adev->dev, "Failed to allow df cstate"); - - return ret; -} /* wrapper of psp_ras_trigger_error */ int amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41 +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, if (!obj) return -EINVAL; + if (!block_obj || !block_obj->ops) { + dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head)); + return -EINVAL; + } + /* Calculate XGMI relative offset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { - block_info.address = - amdgpu_xgmi_get_relative_phy_addr(adev, - block_info.address); + block_info.address = amdgpu_xgmi_get_relative_phy_addr(adev, block_info.address); } - switch (info->head.block) { - case AMDGPU_RAS_BLOCK__GFX: - if (!block_obj || !block_obj->ops) { - dev_info(adev->dev, "%s don't config ras function \n", get_ras_block_str(&info->head)); - return -EINVAL; - } - if (block_obj->ops->ras_error_inject) + if (block_obj->ops->ras_error_inject) { + if(info->head.block == AMDGPU_RAS_BLOCK__GFX) ret = block_obj->ops->ras_error_inject(adev, info); - break; - case AMDGPU_RAS_BLOCK__UMC: - case AMDGPU_RAS_BLOCK__SDMA: - case AMDGPU_RAS_BLOCK__MMHUB: - case AMDGPU_RAS_BLOCK__PCIE_BIF: - case AMDGPU_RAS_BLOCK__MCA: - ret = psp_ras_trigger_error(&adev->psp, &block_info); - break; - case AMDGPU_RAS_BLOCK__XGMI_WAFL: - ret = amdgpu_ras_error_inject_xgmi(adev, &block_info); - break; - default: - dev_info(adev->dev, "%s error injection is not supported yet\n", -get_ras_block_str(&info->head)); - ret = -EINVAL; + else + ret = block_obj->ops->ras_error_inject(adev, &block_info); } if (ret) - dev_err(adev->dev, "ras inject %s failed %d\n", - get_ras_block_str(&info->head), ret); + dev_err(adev->dev, "ras inject %s failed %d\n", get_ras_block_str(&info->head), ret); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index da541c7b1ec2..298742afba99 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -940,6 +940,33 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, err_data->ce_count += ce_cnt; } +/* Trigger XGMI/WAFL error */ +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, +void *inject_if) +{ + int ret = 0;; + struct ta_ras