RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

2024-04-24 Thread Chai, Thomas
[AMD Official Use Only - General]

umc_v12_0_convert_error_address calls amdgpu_umc_fill_error_record directly 
to prepare for page retirement. The new path needs to check whether the 
converted pages already exist before filling the error page, so 
umc_v12_0_convert_error_address is not suitable for the new requirement and 
I created a new interface.

-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Thursday, April 25, 2024 11:03 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

[AMD Official Use Only - General]

I might be missing some context here. Could you please elaborate on why we 
don't leverage the existing umc_v12_0_convert_error_address implementation?

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

Umc v12_0 converts error address.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 81435533c4a7..085dcfe16b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
}
 }

+static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
+   struct ta_ras_query_address_input *addr_in,
+   uint64_t *pfns, int len)
+{
+   uint32_t col, row, row_xor, bank, channel_index;
+   uint64_t soc_pa, retired_page, column, err_addr;
+   struct ta_ras_query_address_output addr_out;
+   uint32_t pos = 0;
+
+   err_addr = addr_in->ma.err_addr;
+   addr_in->addr_type = TA_RAS_MCA_TO_PA;
+   if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
+   dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
+   err_addr);
+   return 0;
+   }
+
+   soc_pa = addr_out.pa.pa;
+   bank = addr_out.pa.bank;
+   channel_index = addr_out.pa.channel_idx;
+
+   col = (err_addr >> 1) & 0x1fULL;
+   row = (err_addr >> 10) & 0x3fffULL;
+   row_xor = row ^ (0x1ULL << 13);
+   /* clear [C3 C2] in soc physical address */
+   soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+   /* clear [C4] in soc physical address */
+   soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+   /* loop for all possibilities of [C4 C3 C2] */
+   for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+   retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+   retired_page |= (((column & 0x4) >> 2) <<
+ UMC_V12_0_PA_C4_BIT);
+
+   if (pos >= len)
+   return 0;
+   pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+   /* include column bit 0 and 1 */
+   col &= 0x3;
+   col |= (column << 2);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row, col, bank, channel_index);
+
+   /* shift R13 bit */
+   retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+
+   if (pos >= len)
+   return 0;
+   pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row_xor, col, bank, channel_index);
+   }
+
+   return pos;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
uint32_t node_inst, uint32_t umc_inst,
uint32_t ch_inst, void *data)
@@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common
 static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr)
 {
-   uint16_t hwid, mcatype;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint16_t hwid, mcatype;
+   struct ta_ras_query_address_input addr_in;
+   uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PE

RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

2024-04-24 Thread Zhang, Hawking
[AMD Official Use Only - General]

I might be missing some context here. Could you please elaborate on why we 
don't leverage the existing umc_v12_0_convert_error_address implementation?

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

Umc v12_0 converts error address.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 81435533c4a7..085dcfe16b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
}
 }

+static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
+   struct ta_ras_query_address_input *addr_in,
+   uint64_t *pfns, int len)
+{
+   uint32_t col, row, row_xor, bank, channel_index;
+   uint64_t soc_pa, retired_page, column, err_addr;
+   struct ta_ras_query_address_output addr_out;
+   uint32_t pos = 0;
+
+   err_addr = addr_in->ma.err_addr;
+   addr_in->addr_type = TA_RAS_MCA_TO_PA;
+   if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
+   dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
+   err_addr);
+   return 0;
+   }
+
+   soc_pa = addr_out.pa.pa;
+   bank = addr_out.pa.bank;
+   channel_index = addr_out.pa.channel_idx;
+
+   col = (err_addr >> 1) & 0x1fULL;
+   row = (err_addr >> 10) & 0x3fffULL;
+   row_xor = row ^ (0x1ULL << 13);
+   /* clear [C3 C2] in soc physical address */
+   soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+   /* clear [C4] in soc physical address */
+   soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+   /* loop for all possibilities of [C4 C3 C2] */
+   for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+   retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+   retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+
+   if (pos >= len)
+   return 0;
+   pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+   /* include column bit 0 and 1 */
+   col &= 0x3;
+   col |= (column << 2);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row, col, bank, channel_index);
+
+   /* shift R13 bit */
+   retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+
+   if (pos >= len)
+   return 0;
+   pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row_xor, col, bank, channel_index);
+   }
+
+   return pos;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
uint32_t node_inst, uint32_t umc_inst,
uint32_t ch_inst, void *data)
@@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common
 static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr)
 {
-   uint16_t hwid, mcatype;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint16_t hwid, mcatype;
+   struct ta_ras_query_address_input addr_in;
+   uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
+   uint64_t err_addr;
+   int count;

hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -497,6 +561,34 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
if (!umc_v12_0_is_deferred_error(adev, status))
return 0;

+   err_addr = REG_GET_FIELD(addr,
+   MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
+
+   dev_info(adev->dev,
+   "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, 
err_addr:0x%llx\n",
+   ipid,
+   MCA_IPID_2_SOCKET_ID(ipid),
+   MCA_IPID_2_DIE_ID(ipid),
+   MCA_IPID_2_UMC_INST(ipid),
+   MCA_IPID_2_UMC_CH(ipid),
+   err_addr);
+
+