RE: [PATCH V3 12/12] drm/amdgpu: Removed redundant ras code

2021-12-28 Thread Zhang, Hawking
[AMD Official Use Only]

Thank you, Thomas. V3 looks good to me. @Zhou1, Tao/@Clements, John/@Yang, 
Stanley, please also take a look and raise any concerns you have.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of yipechai
Sent: Wednesday, December 29, 2021 14:32
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: [PATCH V3 12/12] drm/amdgpu: Removed redundant ras code

Removed redundant ras code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 82 ++---
 1 file changed, 20 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 21765e05b003..17de79be6d8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -910,51 +910,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
return -EINVAL;

block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n",
+   get_ras_block_str(>head));
+   return -EINVAL;
+   }

-   switch (info->head.block) {
-   case AMDGPU_RAS_BLOCK__UMC:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(>head));
-   return -EINVAL;
-   }
+   if (block_obj->hw_ops->query_ras_error_count)
+   block_obj->hw_ops->query_ras_error_count(adev, _data);

-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
-   /* umc query_ras_error_address is also responsible for clearing
-* error status
-*/
-   if (block_obj->hw_ops->query_ras_error_address)
-   block_obj->hw_ops->query_ras_error_address(adev, 
_data);
-   break;
-   case AMDGPU_RAS_BLOCK__SDMA:
-   case AMDGPU_RAS_BLOCK__GFX:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(>head));
-   return -EINVAL;
-   }
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
+   if (info->head.block == AMDGPU_RAS_BLOCK__UMC)
+   block_obj->hw_ops->query_ras_error_address(adev, _data);

+   if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+   (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+   (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
-   break;
-   case AMDGPU_RAS_BLOCK__PCIE_BIF:
-   case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-   case AMDGPU_RAS_BLOCK__HDP:
-   case AMDGPU_RAS_BLOCK__MCA:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(>head));
-   return -EINVAL;
-   }
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
-   break;
-   default:
-   break;
}

obj->err_data.ue_count += err_data.ue_count; @@ -1016,32 +988,18 @@ int 
amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
if (!amdgpu_ras_is_supported(adev, block))
return -EINVAL;

-   switch (block) {
-   case AMDGPU_RAS_BLOCK__GFX:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function 
\n", ras_block_str(block));
-   return -EINVAL;
-   }
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n", 
ras_block_str(block));
+   return -EINVAL;
+   }

-   if (block_obj->hw_ops->reset_ras_error_count)
-   block_obj->hw_ops->reset_ras_error_count(adev);
+   if (block_obj->hw_ops->reset_ras_error_count)
+   block_obj->hw_ops->reset_ras_error_count(adev);

+   if ((block == AMDGPU_RAS_BLOCK__GFX) ||
+   (block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->reset_ras_error_status)

[PATCH V3 12/12] drm/amdgpu: Removed redundant ras code

2021-12-28 Thread yipechai
Removed redundant ras code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 82 ++---
 1 file changed, 20 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 21765e05b003..17de79be6d8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -910,51 +910,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
return -EINVAL;
 
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n",
+   get_ras_block_str(>head));
+   return -EINVAL;
+   }
 
-   switch (info->head.block) {
-   case AMDGPU_RAS_BLOCK__UMC:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(>head));
-   return -EINVAL;
-   }
+   if (block_obj->hw_ops->query_ras_error_count)
+   block_obj->hw_ops->query_ras_error_count(adev, _data);
 
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
-   /* umc query_ras_error_address is also responsible for clearing
-* error status
-*/
-   if (block_obj->hw_ops->query_ras_error_address)
-   block_obj->hw_ops->query_ras_error_address(adev, 
_data);
-   break;
-   case AMDGPU_RAS_BLOCK__SDMA:
-   case AMDGPU_RAS_BLOCK__GFX:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(>head));
-   return -EINVAL;
-   }
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
+   if (info->head.block == AMDGPU_RAS_BLOCK__UMC)
+   block_obj->hw_ops->query_ras_error_address(adev, _data);
 
+   if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+   (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+   (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
-   break;
-   case AMDGPU_RAS_BLOCK__PCIE_BIF:
-   case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-   case AMDGPU_RAS_BLOCK__HDP:
-   case AMDGPU_RAS_BLOCK__MCA:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(>head));
-   return -EINVAL;
-   }
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
-   break;
-   default:
-   break;
}
 
obj->err_data.ue_count += err_data.ue_count;
@@ -1016,32 +988,18 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
if (!amdgpu_ras_is_supported(adev, block))
return -EINVAL;
 
-   switch (block) {
-   case AMDGPU_RAS_BLOCK__GFX:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function 
\n", ras_block_str(block));
-   return -EINVAL;
-   }
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n", 
ras_block_str(block));
+   return -EINVAL;
+   }
 
-   if (block_obj->hw_ops->reset_ras_error_count)
-   block_obj->hw_ops->reset_ras_error_count(adev);
+   if (block_obj->hw_ops->reset_ras_error_count)
+   block_obj->hw_ops->reset_ras_error_count(adev);
 
+   if ((block == AMDGPU_RAS_BLOCK__GFX) ||
+   (block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->reset_ras_error_status)
block_obj->hw_ops->reset_ras_error_status(adev);
-   break;
-   case AMDGPU_RAS_BLOCK__SDMA:
-   case AMDGPU_RAS_BLOCK__HDP:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function 
\n", ras_block_str(block));
-   return -EINVAL;
-   }
-
-   if (block_obj->hw_ops->reset_ras_error_count)
-   

[PATCH V3 11/12] drm/amdgpu: Adjust error inject function code style in amdgpu_ras.c

2021-12-28 Thread yipechai
1. Move the xgmi-specific error inject function from amdgpu_ras.c to the xgmi
block.
2. Use psp_ras_trigger_error as the default error inject function in
amdgpu_ras.c. If a ras block does not define .ras_error_inject, the default
error inject function takes effect.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 60 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 +++
 2 files changed, 39 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index edba3ee292c8..21765e05b003 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1047,32 +1047,6 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
return 0;
 }
 
-/* Trigger XGMI/WAFL error */
-static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
-struct ta_ras_trigger_error_input *block_info)
-{
-   int ret;
-
-   if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
-   dev_warn(adev->dev, "Failed to disallow df cstate");
-
-   if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
-   dev_warn(adev->dev, "Failed to disallow XGMI power down");
-
-   ret = psp_ras_trigger_error(>psp, block_info);
-
-   if (amdgpu_ras_intr_triggered())
-   return ret;
-
-   if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
-   dev_warn(adev->dev, "Failed to allow XGMI power down");
-
-   if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-   dev_warn(adev->dev, "Failed to allow df cstate");
-
-   return ret;
-}
-
 /* wrapper of psp_ras_trigger_error */
 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info)
@@ -1091,6 +1065,11 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
if (!obj)
return -EINVAL;
 
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n", 
get_ras_block_str(>head));
+   return -EINVAL;
+   }
+
/* Calculate XGMI relative offset */
if (adev->gmc.xgmi.num_physical_nodes > 1) {
block_info.address =
@@ -1098,30 +1077,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
  block_info.address);
}
 
-   switch (info->head.block) {
-   case AMDGPU_RAS_BLOCK__GFX:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function 
\n", get_ras_block_str(>head));
-   return -EINVAL;
-   }
-
+   if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
if (block_obj->hw_ops->ras_error_inject)
ret = block_obj->hw_ops->ras_error_inject(adev, info);
-   break;
-   case AMDGPU_RAS_BLOCK__UMC:
-   case AMDGPU_RAS_BLOCK__SDMA:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   case AMDGPU_RAS_BLOCK__PCIE_BIF:
-   case AMDGPU_RAS_BLOCK__MCA:
-   ret = psp_ras_trigger_error(>psp, _info);
-   break;
-   case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-   ret = amdgpu_ras_error_inject_xgmi(adev, _info);
-   break;
-   default:
-   dev_info(adev->dev, "%s error injection is not supported yet\n",
-get_ras_block_str(>head));
-   ret = -EINVAL;
+   } else {
+   /* If defined special ras_error_inject(e.g: xgmi), implement 
special ras_error_inject */
+   if (block_obj->hw_ops->ras_error_inject)
+   ret = block_obj->hw_ops->ras_error_inject(adev, 
_info);
+   else  /*If not defined .ras_error_inject, use default 
ras_error_inject*/
+   ret = psp_ras_trigger_error(>psp, _info);
}
 
if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e3c6898c5d13..a2bd3a2abe72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -940,9 +940,36 @@ static void amdgpu_xgmi_query_ras_error_count(struct 
amdgpu_device *adev,
err_data->ce_count += ce_cnt;
 }
 
+/* Trigger XGMI/WAFL error */
+static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,  void 
*inject_if)
+{
+   int ret = 0;;
+   struct ta_ras_trigger_error_input *block_info =  (struct 
ta_ras_trigger_error_input *)inject_if;
+
+   if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
+   dev_warn(adev->dev, "Failed to disallow df cstate");
+
+   if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
+   dev_warn(adev->dev, "Failed to disallow XGMI power down");
+
+   ret = psp_ras_trigger_error(>psp, block_info);
+
+   if 

[PATCH V3 09/12] drm/amdgpu: Modify sdma block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the sdma block to fit the unified ras block data and ops.
2. Rename amdgpu_sdma_ras_funcs to amdgpu_sdma_ras, and remove the _funcs
suffix from the corresponding variable names.
3. Remove the const qualifier from the sdma ras variable so that the sdma ras
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the sdma ras block into
the amdgpu device ras block linked list.
5. Remove the sdma code in amdgpu_ras.c that becomes redundant once the
unified ras block is used.
6. Fill the unified ras block fields .name, .block, .ras_late_init and
.ras_fini for all sdma versions. If the selected sdma version defines
.ras_late_init and .ras_fini, those functions take effect; otherwise they
default to amdgpu_sdma_ras_late_init and amdgpu_sdma_ras_fini.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  9 
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 12 ++---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 58 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 25 --
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4.h   |  2 +-
 5 files changed, 71 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5a8fccfdb0bb..4ca51f623751 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -953,12 +953,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
block_obj->hw_ops->query_ras_error_address(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
-   if (adev->sdma.funcs->query_ras_error_count) {
-   for (i = 0; i < adev->sdma.num_instances; i++)
-   adev->sdma.funcs->query_ras_error_count(adev, i,
-   
_data);
-   }
-   break;
case AMDGPU_RAS_BLOCK__GFX:
case AMDGPU_RAS_BLOCK__MMHUB:
if (!block_obj || !block_obj->hw_ops)   {
@@ -1064,9 +1058,6 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device 
*adev,
block_obj->hw_ops->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__SDMA:
-   if (adev->sdma.funcs->reset_ras_error_count)
-   adev->sdma.funcs->reset_ras_error_count(adev);
-   break;
case AMDGPU_RAS_BLOCK__HDP:
if (!block_obj || !block_obj->hw_ops)   {
dev_info(adev->dev, "%s doesn't config ras function 
\n", ras_block_str(block));
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index f8fb755e3aa6..eaee12ab6518 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -23,6 +23,7 @@
 
 #ifndef __AMDGPU_SDMA_H__
 #define __AMDGPU_SDMA_H__
+#include "amdgpu_ras.h"
 
 /* max number of IP instances */
 #define AMDGPU_MAX_SDMA_INSTANCES  8
@@ -50,13 +51,8 @@ struct amdgpu_sdma_instance {
boolburst_nop;
 };
 
-struct amdgpu_sdma_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev,
-   void *ras_ih_info);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   int (*query_ras_error_count)(struct amdgpu_device *adev,
-   uint32_t instance, void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+struct amdgpu_sdma_ras {
+   struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_sdma {
@@ -73,7 +69,7 @@ struct amdgpu_sdma {
uint32_tsrbm_soft_reset;
boolhas_page_queue;
struct ras_common_if*ras_if;
-   const struct amdgpu_sdma_ras_funcs  *funcs;
+   struct amdgpu_sdma_ras  *ras;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 69c9e460c1eb..5500f93f6ecd 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1898,13 +1898,13 @@ static int sdma_v4_0_late_init(void *handle)
sdma_v4_0_setup_ulv(adev);
 
if (!amdgpu_persistent_edc_harvesting_supported(adev)) {
-   if (adev->sdma.funcs &&
-   adev->sdma.funcs->reset_ras_error_count)
-   adev->sdma.funcs->reset_ras_error_count(adev);
+   if (adev->sdma.ras && adev->sdma.ras->ras_block.hw_ops &&
+   adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count)
+   
adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count(adev);
}
 
-   if (adev->sdma.funcs && adev->sdma.funcs->ras_late_init)
-   return adev->sdma.funcs->ras_late_init(adev, _info);
+   if (adev->sdma.ras && 

[PATCH V3 10/12] drm/amdgpu: Modify mca block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the mca block to fit the unified ras block data and ops.
2. Define a special .ras_block_match function for the mca block to identify
itself.
3. Rename amdgpu_mca_ras_funcs to amdgpu_mca_ras_block (amdgpu_mca_ras was
already in use), and remove the _funcs suffix from the corresponding variable
names.
4. Remove the const qualifier from the mca ras variable so that the mca ras
block can be inserted into the amdgpu device ras block linked list.
5. Invoke amdgpu_ras_register_ras_block to register the mca ras block into
the amdgpu device ras block linked list.
6. Remove the mca code in amdgpu_ras.c that becomes redundant once the unified
ras block is used.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 11 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 14 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 85 ++---
 5 files changed, 78 insertions(+), 76 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 429d89188d94..b7306724898d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -463,23 +463,20 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
return r;
}
 
-   if (adev->mca.mp0.ras_funcs &&
-   adev->mca.mp0.ras_funcs->ras_late_init) {
-   r = adev->mca.mp0.ras_funcs->ras_late_init(adev);
+   if (adev->mca.mp0.ras && adev->mca.mp0.ras->ras_block.ras_late_init) {
+   r = adev->mca.mp0.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
 
-   if (adev->mca.mp1.ras_funcs &&
-   adev->mca.mp1.ras_funcs->ras_late_init) {
-   r = adev->mca.mp1.ras_funcs->ras_late_init(adev);
+   if (adev->mca.mp1.ras && adev->mca.mp1.ras->ras_block.ras_late_init) {
+   r = adev->mca.mp1.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
 
-   if (adev->mca.mpio.ras_funcs &&
-   adev->mca.mpio.ras_funcs->ras_late_init) {
-   r = adev->mca.mpio.ras_funcs->ras_late_init(adev);
+   if (adev->mca.mpio.ras && adev->mca.mpio.ras->ras_block.ras_late_init) {
+   r = adev->mca.mpio.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index ce538f4819f9..52a60c2316a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -74,20 +74,23 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device 
*adev,
 int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
 struct amdgpu_mca_ras *mca_dev)
 {
+   char sysfs_name[32] = {0};
int r;
struct ras_ih_if ih_info = {
.cb = NULL,
};
-   struct ras_fs_if fs_info = {
-   .sysfs_name = mca_dev->ras_funcs->sysfs_name,
+   struct ras_fs_if fs_info= {
+   .sysfs_name = sysfs_name,
};
 
+   snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", 
mca_dev->ras->ras_block.name);
+
if (!mca_dev->ras_if) {
mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
if (!mca_dev->ras_if)
return -ENOMEM;
-   mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
-   mca_dev->ras_if->sub_block_index = 
mca_dev->ras_funcs->ras_sub_block;
+   mca_dev->ras_if->block = mca_dev->ras->ras_block.block;
+   mca_dev->ras_if->sub_block_index = 
mca_dev->ras->ras_block.sub_block_index;
mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
}
ih_info.head = fs_info.head = *mca_dev->ras_if;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index c74bc7177066..be030c4031d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -21,21 +21,13 @@
 #ifndef __AMDGPU_MCA_H__
 #define __AMDGPU_MCA_H__
 
-struct amdgpu_mca_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   void (*query_ras_error_address)(struct amdgpu_device *adev,
-   void *ras_error_status);
-   uint32_t ras_block;
-   uint32_t ras_sub_block;
-   const char* sysfs_name;
+struct amdgpu_mca_ras_block {
+   struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_mca_ras {
struct ras_common_if *ras_if;
-   const struct 

[PATCH V3 08/12] drm/amdgpu: Modify umc block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the umc block to fit the unified ras block data and ops.
2. Rename amdgpu_umc_ras_funcs to amdgpu_umc_ras, and remove the _funcs suffix
from the corresponding variable names.
3. Remove the const qualifier from the umc ras variable so that the umc ras
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the umc ras block into
the amdgpu device ras block linked list.
5. Remove the umc code in amdgpu_ras.c that becomes redundant once the unified
ras block is used.
6. Fill the unified ras block fields .name, .block, .ras_late_init and
.ras_fini for all umc versions. If the selected umc version defines
.ras_late_init and .ras_fini, those functions take effect; otherwise they
default to amdgpu_umc_ras_late_init and amdgpu_umc_ras_fini.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 14 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 16 +++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 21 ++---
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c   | 12 
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 12 
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/umc_v8_7.c   | 12 
 drivers/gpu/drm/amd/amdgpu/umc_v8_7.h   |  2 +-
 12 files changed, 92 insertions(+), 53 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index b12da46c483a..429d89188d94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -434,9 +434,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 {
int r;
 
-   if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->ras_late_init) {
-   r = adev->umc.ras_funcs->ras_late_init(adev);
+   if (adev->umc.ras && adev->umc.ras->ras_block.ras_late_init) {
+   r = adev->umc.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
@@ -490,9 +489,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
 
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
 {
-   if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->ras_fini)
-   adev->umc.ras_funcs->ras_fini(adev);
+   if (adev->umc.ras && adev->umc.ras->ras_block.ras_fini)
+   adev->umc.ras->ras_block.ras_fini(adev);
 
if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ras_fini)
adev->mmhub.ras->ras_block.ras_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 26bd9da31ffc..5a8fccfdb0bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -938,15 +938,19 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
 
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
-   if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_error_count)
-   adev->umc.ras_funcs->query_ras_error_count(adev, 
_data);
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n",
+   get_ras_block_str(>head));
+   return -EINVAL;
+   }
+
+   if (block_obj->hw_ops->query_ras_error_count)
+   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
/* umc query_ras_error_address is also responsible for clearing
 * error status
 */
-   if (adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_error_address)
-   adev->umc.ras_funcs->query_ras_error_address(adev, 
_data);
+   if (block_obj->hw_ops->query_ras_error_address)
+   block_obj->hw_ops->query_ras_error_address(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
@@ -2374,12 +2378,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Init poison supported flag, the default value is false */
if (adev->df.funcs &&
adev->df.funcs->query_ras_poison_mode &&
-   adev->umc.ras_funcs &&
-   adev->umc.ras_funcs->query_ras_poison_mode) {
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
df_poison =
adev->df.funcs->query_ras_poison_mode(adev);
umc_poison =
-   adev->umc.ras_funcs->query_ras_poison_mode(adev);
+   

[PATCH V3 07/12] drm/amdgpu: Modify nbio block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the nbio block to fit the unified ras block data and ops.
2. Rename amdgpu_nbio_ras_funcs to amdgpu_nbio_ras, and remove the _funcs
suffix from the corresponding variable names.
3. Remove the const qualifier from the nbio ras variable so that the nbio ras
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the nbio ras block into
the amdgpu device ras block linked list.
5. Remove the nbio code in amdgpu_ras.c that becomes redundant once the
unified ras block is used.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c  | 12 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 22 ++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 17 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/soc15.c   | 18 --
 7 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 5208b2dd176a..24feceb51289 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -208,13 +208,13 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg)
 * ack the interrupt if it is there
 */
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) {
-   if (adev->nbio.ras_funcs &&
-   adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring)
-   
adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring(adev);
+   if (adev->nbio.ras &&
+   adev->nbio.ras->handle_ras_controller_intr_no_bifring)
+   
adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
 
-   if (adev->nbio.ras_funcs &&
-   
adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring)
-   
adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring(adev);
+   if (adev->nbio.ras &&
+   adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
+   
adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
}
 
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index 6201a5f4b4fa..f54c183f1b1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -22,7 +22,7 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
-int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev)
+int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
int r;
struct ras_ih_if ih_info = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 843052205bd5..4afb76d3cd97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -47,15 +47,12 @@ struct nbio_hdp_flush_reg {
u32 ref_and_mask_sdma7;
 };
 
-struct amdgpu_nbio_ras_funcs {
+struct amdgpu_nbio_ras {
+   struct amdgpu_ras_block_object ras_block;
void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device 
*adev);
void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device 
*adev);
int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio_funcs {
@@ -104,9 +101,9 @@ struct amdgpu_nbio {
struct amdgpu_irq_src ras_err_event_athub_irq;
struct ras_common_if *ras_if;
const struct amdgpu_nbio_funcs *funcs;
-   const struct amdgpu_nbio_ras_funcs *ras_funcs;
+   struct amdgpu_nbio_ras  *ras;
 };
 
-int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev);
+int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, void *ras_info);
 void amdgpu_nbio_ras_fini(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fa0ebd484c7e..26bd9da31ffc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -969,10 +969,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
block_obj->hw_ops->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__PCIE_BIF:
-   if (adev->nbio.ras_funcs &&
-   adev->nbio.ras_funcs->query_ras_error_count)
-   adev->nbio.ras_funcs->query_ras_error_count(adev, 
_data);
-   break;

[PATCH V3 03/12] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the gfx block to fit the unified ras block data and ops.
2. Rename amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and remove the _funcs suffix
from the corresponding variable names.
3. Remove the const qualifier from the gfx ras variable so that the gfx ras
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the gfx ras block into
the amdgpu device ras block linked list.
5. Remove the gfx code in amdgpu_ras.c that becomes redundant once the unified
ras block is used.
6. Fill the unified ras block fields .name, .block, .ras_late_init and
.ras_fini for all gfx versions. If the selected gfx version defines
.ras_late_init and .ras_fini, those functions take effect; otherwise they
default to amdgpu_gfx_ras_late_init and amdgpu_gfx_ras_fini.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  8 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 17 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 61 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 65 -
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 24 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 25 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
 8 files changed, 121 insertions(+), 83 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 1795d448c700..18e4106aa03b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -622,7 +622,7 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, 
uint32_t *value)
return r;
 }
 
-int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev)
+int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
int r;
struct ras_fs_if fs_info = {
@@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device 
*adev,
 */
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-   if (adev->gfx.ras_funcs &&
-   adev->gfx.ras_funcs->query_ras_error_count)
-   adev->gfx.ras_funcs->query_ras_error_count(adev, 
err_data);
+   if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops &&
+   adev->gfx.ras->ras_block.hw_ops->query_ras_error_count)
+   
adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
amdgpu_ras_reset_gpu(adev);
}
return AMDGPU_RAS_SUCCESS;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 6b78b4a0e182..35f2a724616e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -31,6 +31,7 @@
 #include "amdgpu_ring.h"
 #include "amdgpu_rlc.h"
 #include "soc15.h"
+#include "amdgpu_ras.h"
 
 /* GFX current status */
 #define AMDGPU_GFX_NORMAL_MODE 0xL
@@ -213,16 +214,8 @@ struct amdgpu_cu_info {
uint32_t bitmap[4][4];
 };
 
-struct amdgpu_gfx_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   int (*ras_error_inject)(struct amdgpu_device *adev,
-   void *inject_if);
-   int (*query_ras_error_count)(struct amdgpu_device *adev,
-void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
-   void (*query_ras_error_status)(struct amdgpu_device *adev);
-   void (*reset_ras_error_status)(struct amdgpu_device *adev);
+struct amdgpu_gfx_ras {
+   struct amdgpu_ras_block_object  ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
 };
 
@@ -348,7 +341,7 @@ struct amdgpu_gfx {
 
/*ras */
struct ras_common_if*ras_if;
-   const struct amdgpu_gfx_ras_funcs   *ras_funcs;
+   struct amdgpu_gfx_ras   *ras;
 };
 
 #define amdgpu_gfx_get_gpu_clock_counter(adev) 
(adev)->gfx.funcs->get_gpu_clock_counter((adev))
@@ -410,7 +403,7 @@ bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device 
*adev, int me,
int pipe, int queue);
 void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable);
 int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value);
-int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev);
+int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info);
 void amdgpu_gfx_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 67a08629711c..8ca1f294c202 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ 

[PATCH V3 06/12] drm/amdgpu: Modify mmhub block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the mmhub block to fit the unified ras block data and ops.
2. Rename amdgpu_mmhub_ras_funcs to amdgpu_mmhub_ras, and drop the _funcs 
suffix from the corresponding variable name.
3. Remove the const qualifier from the mmhub ras variable so that the mmhub ras 
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the mmhub ras block into 
the amdgpu device ras block linked list.
5. Remove the redundant mmhub code in amdgpu_ras.c now that the unified ras 
block is used.
6. Fill in the unified ras block .name, .block, .ras_late_init and .ras_fini 
for all mmhub versions. If .ras_late_init and .ras_fini are already defined by 
the selected mmhub version, those functions take effect; otherwise they default 
to amdgpu_mmhub_ras_late_init and amdgpu_mmhub_ras_fini.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c| 10 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h  | 14 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 47 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 28 ++---
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c| 10 +++--
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.h|  2 +-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c| 10 +++--
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.h|  2 +-
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c| 10 +++--
 drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.h|  2 +-
 12 files changed, 74 insertions(+), 75 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0980396ee709..feb93880c63d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3377,9 +3377,9 @@ static void amdgpu_device_xgmi_reset_func(struct 
work_struct *__work)
if (adev->asic_reset_res)
goto fail;
 
-   if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->reset_ras_error_count)
-   adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
+   adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
+   
adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
} else {
 
task_barrier_full(>tb);
@@ -4705,9 +4705,9 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-   if (tmp_adev->mmhub.ras_funcs &&
-   tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
-   
tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
+   if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops &&
+   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
+   
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
}
 
amdgpu_ras_intr_cleared();
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index b8902ad7e375..b12da46c483a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -441,9 +441,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
return r;
}
 
-   if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->ras_late_init) {
-   r = adev->mmhub.ras_funcs->ras_late_init(adev);
+   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ras_late_init) {
+   r = adev->mmhub.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
@@ -495,9 +494,8 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
adev->umc.ras_funcs->ras_fini)
adev->umc.ras_funcs->ras_fini(adev);
 
-   if (adev->mmhub.ras_funcs &&
-   adev->mmhub.ras_funcs->ras_fini)
-   adev->mmhub.ras_funcs->ras_fini(adev);
+   if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ras_fini)
+   adev->mmhub.ras->ras_block.ras_fini(adev);
 
if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_fini)
adev->gmc.xgmi.ras->ras_block.ras_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
index ead3dc572ec5..d7e7708f0360 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
@@ -24,7 +24,7 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
-int 

[PATCH V3 05/12] drm/amdgpu: Modify hdp block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the hdp block to fit the unified ras block data and ops.
2. Rename amdgpu_hdp_ras_funcs to amdgpu_hdp_ras, and drop the _funcs suffix 
from the corresponding variable name.
3. Remove the const qualifier from the hdp ras variable so that the hdp ras 
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the hdp ras block into the 
amdgpu device ras block linked list.
5. Remove the redundant hdp code in amdgpu_ras.c now that the unified ras block 
is used.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  9 +
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   | 14 +++---
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.h   |  2 +-
 7 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index af65ec46f783..b8902ad7e375 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -459,9 +459,8 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
return r;
}
 
-   if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->ras_late_init) {
-   r = adev->hdp.ras_funcs->ras_late_init(adev);
+   if (adev->hdp.ras && adev->hdp.ras->ras_block.ras_late_init) {
+   r = adev->hdp.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
@@ -503,9 +502,8 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_fini)
adev->gmc.xgmi.ras->ras_block.ras_fini(adev);
 
-   if (adev->hdp.ras_funcs &&
-   adev->hdp.ras_funcs->ras_fini)
-   adev->hdp.ras_funcs->ras_fini(adev);
+   if (adev->hdp.ras && adev->hdp.ras->ras_block.ras_fini)
+   adev->hdp.ras->ras_block.ras_fini(adev);
 }
 
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
index 1d50d534d77c..5e6b57de3e1c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
@@ -24,7 +24,7 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
-int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev)
+int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info)
 {
int r;
struct ras_ih_if ih_info = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index 7ec99d591584..c94a4b3c8d6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -22,13 +22,10 @@
  */
 #ifndef __AMDGPU_HDP_H__
 #define __AMDGPU_HDP_H__
+#include "amdgpu_ras.h"
 
-struct amdgpu_hdp_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   void (*query_ras_error_count)(struct amdgpu_device *adev,
- void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+struct amdgpu_hdp_ras{
+   struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_hdp_funcs {
@@ -43,9 +40,9 @@ struct amdgpu_hdp_funcs {
 struct amdgpu_hdp {
struct ras_common_if*ras_if;
const struct amdgpu_hdp_funcs   *funcs;
-   const struct amdgpu_hdp_ras_funcs   *ras_funcs;
+   struct amdgpu_hdp_ras   *ras;
 };
 
-int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev);
+int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info);
 void amdgpu_hdp_ras_fini(struct amdgpu_device *adev);
 #endif /* __AMDGPU_HDP_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8b0d11bb8186..ecb35d9994ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -982,6 +982,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
adev->nbio.ras_funcs->query_ras_error_count(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
+   case AMDGPU_RAS_BLOCK__HDP:
if (!block_obj || !block_obj->hw_ops)   {
dev_info(adev->dev, "%s doesn't config ras function \n",
get_ras_block_str(>head));
@@ -990,11 +991,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
if (block_obj->hw_ops->query_ras_error_count)
block_obj->hw_ops->query_ras_error_count(adev, 
_data);
break;
-   case AMDGPU_RAS_BLOCK__HDP:
-   if (adev->hdp.ras_funcs &&
-   

[PATCH V3 02/12] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h

2021-12-28 Thread yipechai
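Fix the compilation failure that occurs when other ras blocks' header files 
include amdgpu_ras.h, by moving the inline helpers amdgpu_ras_is_supported and 
amdgpu_ras_reset_gpu out of amdgpu_ras.h and into amdgpu_ras.c.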

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 ---
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9dd698354e04..67a08629711c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2773,6 +2773,28 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
 }
 }
 #endif
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+   unsigned int block)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (block >= AMDGPU_RAS_BLOCK_COUNT)
+   return 0;
+   return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
+   schedule_work(>recovery_work);
+   return 0;
+}
+
+
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object* ras_block_obj)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 9dbe8d49b891..3d7a45ec4d9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -513,16 +513,6 @@ struct amdgpu_ras_block_hw_ops {
 #define amdgpu_ras_get_context(adev)   ((adev)->psp.ras_context.ras)
 #define amdgpu_ras_set_context(adev, ras_con)  ((adev)->psp.ras_context.ras = 
(ras_con))
 
-/* check if ras is supported on block, say, sdma, gfx */
-static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
-   unsigned int block)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (block >= AMDGPU_RAS_BLOCK_COUNT)
-   return 0;
-   return ras && (adev->ras_enabled & (1 << block));
-}
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 
@@ -539,15 +529,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 
 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
 
-static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (atomic_cmpxchg(>in_recovery, 0, 1) == 0)
-   schedule_work(>recovery_work);
-   return 0;
-}
-
 static inline enum ta_ras_block
 amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
switch (block) {
@@ -679,5 +660,9 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block);
 
 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
 
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,unsigned int 
block);
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
+
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct 
amdgpu_ras_block_object* ras_block_obj);
 #endif
-- 
2.25.1



[PATCH V3 04/12] drm/amdgpu: Modify xgmi block to fit for the unified ras block data and ops

2021-12-28 Thread yipechai
1. Modify the xgmi block (part of gmc) to fit the unified ras block data and 
ops.
2. Rename amdgpu_xgmi_ras_funcs to amdgpu_xgmi_ras, and drop the _funcs suffix 
from the corresponding variable name.
3. Remove the const qualifier from the xgmi ras variable so that the xgmi ras 
block can be inserted into the amdgpu device ras block linked list.
4. Invoke amdgpu_ras_register_ras_block to register the xgmi ras block into the 
amdgpu device ras block linked list.
5. Remove the redundant xgmi code in amdgpu_ras.c now that the unified ras 
block is used.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  | 16 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  | 11 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 10 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 26 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  4 ++--
 5 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 83f26bca7dac..af65ec46f783 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -448,12 +448,13 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
return r;
}
 
-   if (!adev->gmc.xgmi.connected_to_cpu)
-   adev->gmc.xgmi.ras_funcs = _ras_funcs;
+   if (!adev->gmc.xgmi.connected_to_cpu) {
+   adev->gmc.xgmi.ras = _ras;
+   amdgpu_ras_register_ras_block(adev, 
>gmc.xgmi.ras->ras_block);
+   }
 
-   if (adev->gmc.xgmi.ras_funcs &&
-   adev->gmc.xgmi.ras_funcs->ras_late_init) {
-   r = adev->gmc.xgmi.ras_funcs->ras_late_init(adev);
+   if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
+   r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
if (r)
return r;
}
@@ -499,9 +500,8 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev)
adev->mmhub.ras_funcs->ras_fini)
adev->mmhub.ras_funcs->ras_fini(adev);
 
-   if (adev->gmc.xgmi.ras_funcs &&
-   adev->gmc.xgmi.ras_funcs->ras_fini)
-   adev->gmc.xgmi.ras_funcs->ras_fini(adev);
+   if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_fini)
+   adev->gmc.xgmi.ras->ras_block.ras_fini(adev);
 
if (adev->hdp.ras_funcs &&
adev->hdp.ras_funcs->ras_fini)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index e55201134a01..923db5ff5859 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -29,6 +29,7 @@
 #include 
 
 #include "amdgpu_irq.h"
+#include "amdgpu_ras.h"
 
 /* VA hole for 48bit addresses on Vega10 */
 #define AMDGPU_GMC_HOLE_START  0x8000ULL
@@ -135,12 +136,8 @@ struct amdgpu_gmc_funcs {
unsigned int (*get_vbios_fb_size)(struct amdgpu_device *adev);
 };
 
-struct amdgpu_xgmi_ras_funcs {
-   int (*ras_late_init)(struct amdgpu_device *adev);
-   void (*ras_fini)(struct amdgpu_device *adev);
-   int (*query_ras_error_count)(struct amdgpu_device *adev,
-void *ras_error_status);
-   void (*reset_ras_error_count)(struct amdgpu_device *adev);
+struct amdgpu_xgmi_ras {
+   struct amdgpu_ras_block_object ras_block;
 };
 
 struct amdgpu_xgmi {
@@ -159,7 +156,7 @@ struct amdgpu_xgmi {
struct ras_common_if *ras_if;
bool connected_to_cpu;
bool pending_reset;
-   const struct amdgpu_xgmi_ras_funcs *ras_funcs;
+   struct amdgpu_xgmi_ras *ras;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8ca1f294c202..8b0d11bb8186 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -982,9 +982,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
adev->nbio.ras_funcs->query_ras_error_count(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-   if (adev->gmc.xgmi.ras_funcs &&
-   adev->gmc.xgmi.ras_funcs->query_ras_error_count)
-   adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, 
_data);
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n",
+   get_ras_block_str(>head));
+   return -EINVAL;
+   }
+   if (block_obj->hw_ops->query_ras_error_count)
+   block_obj->hw_ops->query_ras_error_count(adev, 
_data);
break;
case AMDGPU_RAS_BLOCK__HDP:
if (adev->hdp.ras_funcs &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 0d149f5f000e..e3c6898c5d13 100644

[PATCH V3 01/12] drm/amdgpu: Unify ras block interface for each ras block

2021-12-28 Thread yipechai
1. Define a unified ops interface for each ras block.
2. Add a ras_block_match function pointer to the ops interface, so that each 
ras block can provide a special match function to identify itself.
3. Add the new function amdgpu_ras_block_match_default. If a ras block doesn't 
define .ras_block_match, amdgpu_ras_block_match_default is executed by default 
to identify this ras block.
4. Define unified basic ras block data for each ras block.
5. Create a dedicated amdgpu device ras block linked list to manage all of the 
ras blocks.
6. Add the new function amdgpu_ras_register_ras_block, which each ras block 
uses to register itself with the ras controlling block.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 46 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 28 +
 4 files changed, 78 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index db1505455761..eddf230856e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1151,6 +1151,8 @@ struct amdgpu_device {
boolbarrier_has_auto_waitcnt;
 
struct amdgpu_reset_control *reset_cntl;
+
+   struct list_headras_list;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 73ec46140d68..0980396ee709 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3578,6 +3578,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
INIT_LIST_HEAD(>reset_list);
 
+   INIT_LIST_HEAD(>ras_list);
+
INIT_DELAYED_WORK(>delayed_init_work,
  amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(>gfx.gfx_off_delay_work,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 90f0db3b4f65..9dd698354e04 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -862,6 +862,40 @@ static int amdgpu_ras_enable_all_features(struct 
amdgpu_device *adev,
 }
 /* feature ctl end */
 
+int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object* block_obj, 
enum amdgpu_ras_block block)
+{
+   if(!block_obj)
+   return -EINVAL;
+
+   if (block_obj->block == block)
+   return 0;
+
+   return -EINVAL;
+}
+
+static struct amdgpu_ras_block_object* amdgpu_ras_get_ras_block(struct 
amdgpu_device *adev,
+   enum amdgpu_ras_block block, uint32_t 
sub_block_index)
+{
+   struct amdgpu_ras_block_object *obj, *tmp;
+
+   if (block >= AMDGPU_RAS_BLOCK__LAST)
+   return NULL;
+
+   if (!amdgpu_ras_is_supported(adev, block))
+   return NULL;
+
+   list_for_each_entry_safe(obj, tmp, >ras_list, node) {
+   if (obj->ras_block_match) {
+   if (obj->ras_block_match(obj, block, sub_block_index) 
== 0)
+   return obj;
+   } else {
+   if (amdgpu_ras_block_match_default(obj, block) == 0)
+   return obj;
+   }
+   }
+
+   return NULL;
+}
 
 void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
   struct ras_common_if *ras_block,
@@ -2739,3 +2773,15 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
 }
 }
 #endif
+/* Register each ip ras block into amdgpu ras */
+int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
+   struct amdgpu_ras_block_object* ras_block_obj)
+{
+   if (!adev || !ras_block_obj)
+   return -EINVAL;
+
+   INIT_LIST_HEAD(_block_obj->node);
+   list_add_tail(_block_obj->node, >ras_list);
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cdd0010a5389..9dbe8d49b891 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,6 +469,33 @@ struct ras_debug_if {
};
int op;
 };
+
+struct amdgpu_ras_block_object {
+   /* block name */
+   char name[32];
+
+   enum amdgpu_ras_block block;
+
+   uint32_t sub_block_index;
+
+   /* ras block link */
+   struct list_head node;
+
+   int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum 
amdgpu_ras_block block, uint32_t sub_block_index);
+   int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info);
+   void (*ras_fini)(struct amdgpu_device *adev);
+   const struct amdgpu_ras_block_hw_ops *hw_ops;
+};
+
+struct amdgpu_ras_block_hw_ops {
+   int  (*ras_error_inject)(struct amdgpu_device *adev, 

RE: [PATCH] drm/amdkfd: enable sdma ecc interrupt event can be handled by event_interrupt_wq_v9

2021-12-28 Thread Zhang, Hawking
[AMD Official Use Only]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of yipechai
Sent: Wednesday, December 29, 2021 10:39
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: [PATCH] drm/amdkfd: enable sdma ecc interrupt event can be handled by 
event_interrupt_wq_v9

Enable the sdma ecc interrupt event to be handled by event_interrupt_wq_v9.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index d15fe087ba55..1d92e1b7f8d4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -220,6 +220,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 */
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP ||
+   source_id == SOC15_INTSRC_SDMA_ECC ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
KFD_IRQ_IS_FENCE(client_id, source_id) ||
--
2.25.1



[PATCH] drm/amdkfd: enable sdma ecc interrupt event can be handled by event_interrupt_wq_v9

2021-12-28 Thread yipechai
Enable the sdma ecc interrupt event to be handled by event_interrupt_wq_v9.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index d15fe087ba55..1d92e1b7f8d4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -220,6 +220,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 */
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP ||
+   source_id == SOC15_INTSRC_SDMA_ECC ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
KFD_IRQ_IS_FENCE(client_id, source_id) ||
-- 
2.25.1