RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address
[AMD Official Use Only - General] amdgpu_umc_fill_error_record is called in umc_v12_0_convert_error_address directly to prepare for page retirement, The new path need to check if these converted pages already exist before filling the error page, umc_v12_0_convert_error_address is not suitable for new requirements, so I created a new interface. - Best Regards, Thomas -Original Message- From: Zhang, Hawking Sent: Thursday, April 25, 2024 11:03 AM To: Chai, Thomas ; amd-gfx@lists.freedesktop.org Cc: Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley Subject: RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address [AMD Official Use Only - General] I might lose some context here. Can you please elaborate why we don't leverage the existing umc_v12_0_convert_error_address implementation? Regards, Hawking -Original Message- From: Chai, Thomas Sent: Thursday, April 18, 2024 10:58 To: amd-gfx@lists.freedesktop.org Cc: Chai, Thomas ; Zhang, Hawking ; Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley ; Chai, Thomas Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address Umc v12_0 converts error address. Signed-off-by: YiPeng Chai --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 81435533c4a7..085dcfe16b5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, } } +static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev, + struct ta_ras_query_address_input *addr_in, + uint64_t *pfns, int len) { + uint32_t col, row, row_xor, bank, channel_index; + uint64_t soc_pa, retired_page, column, err_addr; + struct ta_ras_query_address_output addr_out; + uint32_t pos = 0; + + err_addr = addr_in->ma.err_addr; + addr_in->addr_type = TA_RAS_MCA_TO_PA; + if (psp_ras_query_address(>psp, addr_in, _out)) { + dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx", + err_addr); + return 0; + } + + soc_pa = addr_out.pa.pa; + bank = addr_out.pa.bank; + channel_index = addr_out.pa.channel_idx; + + col = (err_addr >> 1) & 0x1fULL; + row = (err_addr >> 10) & 0x3fffULL; + row_xor = row ^ (0x1ULL << 13); + /* clear [C3 C2] in soc physical address */ + soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); + /* clear [C4] in soc physical address */ + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + + /* loop for all possibilities of [C4 C3 C2] */ + for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { + retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); + retired_page |= (((column & 0x4) >> 2) << + UMC_V12_0_PA_C4_BIT); + + if (pos >= len) + return 0; + pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; + + /* include column bit 0 and 1 */ + col &= 0x3; + col |= (column << 2); + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row, col, bank, channel_index); + + /* shift R13 bit */ + retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); + + if (pos >= len) + return 0; + pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; + + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row_xor, col, bank, channel_index); + } + + return pos; +} + static int umc_v12_0_query_error_address(struct amdgpu_device *adev, uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, void *data) @@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, uint64_t status, uint64_t ipid, uint64_t addr) { - uint16_t hwid, mcatype; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + uint16_t hwid, mcatype; + struct ta_ras_query_address_input addr_in; + uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PE
RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address
[AMD Official Use Only - General] I might lose some context here. Can you please elaborate why we don't leverage the existing umc_v12_0_convert_error_address implementation? Regards, Hawking -Original Message- From: Chai, Thomas Sent: Thursday, April 18, 2024 10:58 To: amd-gfx@lists.freedesktop.org Cc: Chai, Thomas ; Zhang, Hawking ; Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley ; Chai, Thomas Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address Umc v12_0 converts error address. Signed-off-by: YiPeng Chai --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 81435533c4a7..085dcfe16b5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, } } +static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev, + struct ta_ras_query_address_input *addr_in, + uint64_t *pfns, int len) +{ + uint32_t col, row, row_xor, bank, channel_index; + uint64_t soc_pa, retired_page, column, err_addr; + struct ta_ras_query_address_output addr_out; + uint32_t pos = 0; + + err_addr = addr_in->ma.err_addr; + addr_in->addr_type = TA_RAS_MCA_TO_PA; + if (psp_ras_query_address(>psp, addr_in, _out)) { + dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx", + err_addr); + return 0; + } + + soc_pa = addr_out.pa.pa; + bank = addr_out.pa.bank; + channel_index = addr_out.pa.channel_idx; + + col = (err_addr >> 1) & 0x1fULL; + row = (err_addr >> 10) & 0x3fffULL; + row_xor = row ^ (0x1ULL << 13); + /* clear [C3 C2] in soc physical address */ + soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); + /* clear [C4] in soc physical address */ + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + + /* loop for all possibilities of [C4 C3 C2] */ + for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { + retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); + retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); + + if (pos >= len) + return 0; + pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; + + /* include column bit 0 and 1 */ + col &= 0x3; + col |= (column << 2); + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row, col, bank, channel_index); + + /* shift R13 bit */ + retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); + + if (pos >= len) + return 0; + pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; + + dev_info(adev->dev, + "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + retired_page, row_xor, col, bank, channel_index); + } + + return pos; +} + static int umc_v12_0_query_error_address(struct amdgpu_device *adev, uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, void *data) @@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, uint64_t status, uint64_t ipid, uint64_t addr) { - uint16_t hwid, mcatype; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + uint16_t hwid, mcatype; + struct ta_ras_query_address_input addr_in; + uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; + uint64_t err_addr; + int count; hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -497,6 +561,34 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, if (!umc_v12_0_is_deferred_error(adev, status)) return 0; + err_addr = REG_GET_FIELD(addr, + MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + + dev_info(adev->dev, + "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n", + ipid, + MCA_IPID_2_SOCKET_ID(ipid), + MCA_IPID_2_DIE_ID(ipid), + MCA_IPID_2_UMC_INST(ipid), + MCA_IPID_2_UMC_CH(ipid), + err_addr); + +