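Add UMC page retirement support for UMC v12_0 by implementing the
ecc_info_query_ras_error_count and ecc_info_query_ras_error_address
callbacks and hooking them into umc_v12_0_ras.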

V2:
  1. Change the umc page retirement check condition
     to call umc_v12_0_is_uncorrectable_error().
  2. Use memset to clear the contents of the umc
     error address structure.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 56 ++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |  4 ++
 2 files changed, 60 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 8430888760ba..7458a218e89d 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -26,6 +26,7 @@
 #include "amdgpu.h"
 #include "umc/umc_12_0_0_offset.h"
 #include "umc/umc_12_0_0_sh_mask.h"
+#include "mp/mp_13_0_6_sh_mask.h"
 
 const uint32_t
        umc_v12_0_channel_idx_tbl[]
@@ -370,6 +371,59 @@ static int umc_v12_0_err_cnt_init_per_channel(struct 
amdgpu_device *adev,
        return 0;
 }
 
+static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device 
*adev,
+                                       void *ras_error_status) /* forwarded as-is to both calls below */
+{      /* Calls amdgpu_mca_smu_log_ras_error() for the UMC block twice: CE type, then UE type. */
+       amdgpu_mca_smu_log_ras_error(adev,
+               AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, 
ras_error_status);
+       amdgpu_mca_smu_log_ras_error(adev,
+               AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, 
ras_error_status);
+}
+
+static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device 
*adev,
+                                       void *ras_error_status) /* read as a struct ras_err_data */
+{      /* Per node with an uncorrectable status: log it, convert its address, clear its record. */
+       struct ras_err_node *err_node;
+       uint64_t mc_umc_status;
+       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+
+       for_each_ras_error(err_node, err_data) {
+               mc_umc_status = err_node->err_info.err_addr.err_status;
+               if (!mc_umc_status)     /* zero status: nothing to process on this node */
+                       continue;
+               /* Only statuses accepted by the uncorrectable-error check are handled. */
+               if (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status)) {
+                       uint64_t mca_addr, err_addr, mca_ipid;
+                       uint32_t InstanceIdLo;  /* InstanceIdLo field of mca_ipid */
+                       struct amdgpu_smuio_mcm_config_info *mcm_info;
+
+                       mcm_info = &err_node->err_info.mcm_info;
+                       mca_addr = err_node->err_info.err_addr.err_addr;
+                       mca_ipid = err_node->err_info.err_addr.err_ipid;
+                       /* Pull the ErrorAddr and InstanceIdLo register fields out of the raw values. */
+                       err_addr =  REG_GET_FIELD(mca_addr, 
MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+                       InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, 
InstanceIdLo);
+                       /* UMC instance and channel are both decoded from InstanceIdLo. */
+                       dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, 
ch:%d, err_addr:0x%llx\n",
+                               mca_ipid,
+                               mcm_info->die_id,
+                               MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
+                               MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
+                               err_addr);
+
+                       umc_v12_0_convert_error_address(adev,
+                               err_data, err_addr,
+                               MCA_IPID_LO_2_UMC_CH(InstanceIdLo),     /* NOTE(review): channel precedes instance here, */
+                               MCA_IPID_LO_2_UMC_INST(InstanceIdLo),   /* unlike the log above - confirm callee order */
+                               mcm_info->die_id);
+                       /* Zero the whole err_addr record, err_status included; afterwards the */
+                       /* !mc_umc_status check above skips this node if it is walked again. */
+                       memset(&err_node->err_info.err_addr,
+                               0, sizeof(err_node->err_info.err_addr));
+               }       /* non-zero statuses that fail the check are left untouched */
+       }
+}
+
 static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
 {
        amdgpu_umc_loop_channels(adev,
@@ -396,4 +450,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
        },
        .err_cnt_init = umc_v12_0_err_cnt_init,
        .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
+       .ecc_info_query_ras_error_count = 
umc_v12_0_ecc_info_query_ras_error_count,
+       .ecc_info_query_ras_error_address = 
umc_v12_0_ecc_info_query_ras_error_address,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 17b4b52d6f13..e8de3a92251a 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -117,6 +117,10 @@
                (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << 
UMC_V12_0_PA_CH6_BIT); \
        } while (0)
 
+#define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \
+                       (((_ipid_lo) >> 12) & 0xF))     /* channel = bit 20 * 4 + bits [15:12] */
+#define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)    /* instance = bits [23:21] */
+
 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status);
 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status);
 
-- 
2.34.1

Reply via email to