Add ras helper function to query boot time gpu
errors.

Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..db44ec857a31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
        WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
+#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 39399d0f2ce5..5f302b7693b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
        return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_92    0x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+                                                u32 instance, u32 boot_error)
+{
+       u32 socket_id, aid_id, hbm_id;
+       u32 reg_data;
+       u64 reg_addr;
+
+       socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+       aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+       hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+       if (instance)
+               reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+                          AMDGPU_SMN_TARGET_AID(instance) +
+                          AMDGPU_SMN_CROSS_AID;
+       else
+               reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
+
+       reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+       dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+               socket_id, aid_id, reg_data);
+
+       if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+                        socket_id, aid_id, hbm_id);
+
+       if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+                        socket_id, aid_id);
+
+       if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+                        socket_id, aid_id);
+
+       if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+                        socket_id, aid_id);
+
+       if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+                        socket_id, aid_id);
+
+       if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+                        socket_id, aid_id);
+
+       if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+                        socket_id, aid_id, hbm_id);
+
+       if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+               dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+                        socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+                                            u32 instance, u32 *boot_error)
+{
+       u32 reg_addr;
+       u32 reg_data;
+       int retry_loop;
+
+       if (instance)
+               reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+                          AMDGPU_SMN_TARGET_AID(instance) +
+                          AMDGPU_SMN_CROSS_AID;
+       else
+               reg_addr = (mmMP0_SMN_C2PMSG_126 << 2);
+
+       for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+               reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+               if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+                       *boot_error = reg_data;
+                       return 0;
+               }
+               msleep(1);
+       }
+
+       *boot_error = reg_data;
+       return -ETIME;
+}
+
+void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 
num_instances)
+{
+       u32 boot_error = 0;
+       u32 i;
+
+       for (i = 0; i < num_instances; i++) {
+               if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
+                       amdgpu_ras_boot_time_error_reporting(adev, i, 
boot_error);
+       }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 76fb85628716..5785b705c692 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -32,6 +32,19 @@
 
 struct amdgpu_iv_entry;
 
+#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x)             AMDGPU_GET_REG_FIELD(x, 
0, 0)
+#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x)                  AMDGPU_GET_REG_FIELD(x, 
1, 1)
+#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x)       AMDGPU_GET_REG_FIELD(x, 
2, 2)
+#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x)       AMDGPU_GET_REG_FIELD(x, 
3, 3)
+#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x)     AMDGPU_GET_REG_FIELD(x, 
4, 4)
+#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x)     AMDGPU_GET_REG_FIELD(x, 
5, 5)
+#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x)             AMDGPU_GET_REG_FIELD(x, 
6, 6)
+#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)            AMDGPU_GET_REG_FIELD(x, 
7, 7)
+#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)                        
AMDGPU_GET_REG_FIELD(x, 10, 8)
+#define AMDGPU_RAS_GPU_ERR_AID_ID(x)                   AMDGPU_GET_REG_FIELD(x, 
12, 11)
+#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)                   AMDGPU_GET_REG_FIELD(x, 
13, 13)
+#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)              AMDGPU_GET_REG_FIELD(x, 
31, 31)
+
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS          (0x1 << 0)
 /* position of instance value in sub_block_index of
  * ta_ras_trigger_error_input, the sub block uses lower 12 bits
@@ -818,5 +831,5 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data 
*err_data,
 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
                struct amdgpu_smuio_mcm_config_info *mcm_info,
                struct ras_err_addr *err_addr, u64 count);
-
+void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 
num_instances);
 #endif
-- 
2.17.1

Reply via email to