RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
[AMD Official Use Only - General] - Best Regards, Thomas _ From: Zhou1, Tao Sent: Thursday, January 18, 2024 11:24 AM To: Chai, Thomas ; Zhang, Hawking ; amd-gfx@lists.freedesktop.org Cc: Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning [AMD Official Use Only - General] _ From: Chai, Thomas mailto:yipeng.c...@amd.com>> Sent: Thursday, January 18, 2024 11:06 AM To: Zhang, Hawking mailto:hawking.zh...@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice mailto:candice...@amd.com>>; Wang, Yang(Kevin) mailto:kevinyang.w...@amd.com>>; Yang, Stanley mailto:stanley.y...@amd.com>> Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning [AMD Official Use Only - General] - Best Regards, Thomas _ From: Zhang, Hawking mailto:hawking.zh...@amd.com>> Sent: Wednesday, January 17, 2024 7:54 PM To: Chai, Thomas mailto:yipeng.c...@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice mailto:candice...@amd.com>>; Wang, Yang(Kevin) mailto:kevinyang.w...@amd.com>>; Yang, Stanley mailto:stanley.y...@amd.com>> Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning [AMD Official Use Only - General] Please check my comments inline Regards, Hawking -Original Message- From: Chai, Thomas mailto:yipeng.c...@amd.com>> Sent: Tuesday, January 16, 2024 16:21 To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice mailto:candice...@amd.com>>; Wang, Yang(Kevin) mailto:kevinyang.w...@amd.com>>; Yang, Stanley mailto:stanley.y...@amd.com>>; Chai, Thomas mailto:yipeng.c...@amd.com>> Subject: [PATCH 3/5] drm/amdgpu: Use 
asynchronous polling to handle umc_v12_0 poisoning Use asynchronous polling to handle umc_v12_0 poisoning. Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 + 3 files changed, 120 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 856206e95842..44929281840e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_read(&con->page_retirement_req_cnt)); atomic_dec(&con->page_retirement_req_cnt); + + amdgpu_umc_poison_retire_page_polling_timeout(adev, + false, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9d1cf41cf483..2dde29cb807d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -23,6 +23,7 @@ #include "amdgpu.h" #include "umc_v6_7.h" +#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint64_t err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, return ret; } -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, - void *ras_error_status, - struct amdgpu_iv_entry *entry, - bool reset) +static void amdgpu_umc_handle_bad_pages(struct amdgpu_devi
RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
[AMD Official Use Only - General] _ From: Chai, Thomas Sent: Thursday, January 18, 2024 11:06 AM To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org Cc: Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning [AMD Official Use Only - General] - Best Regards, Thomas _ From: Zhang, Hawking mailto:hawking.zh...@amd.com>> Sent: Wednesday, January 17, 2024 7:54 PM To: Chai, Thomas mailto:yipeng.c...@amd.com>>; amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> Cc: Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice mailto:candice...@amd.com>>; Wang, Yang(Kevin) mailto:kevinyang.w...@amd.com>>; Yang, Stanley mailto:stanley.y...@amd.com>> Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning [AMD Official Use Only - General] Please check my comments inline Regards, Hawking -Original Message- From: Chai, Thomas mailto:yipeng.c...@amd.com>> Sent: Tuesday, January 16, 2024 16:21 To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice mailto:candice...@amd.com>>; Wang, Yang(Kevin) mailto:kevinyang.w...@amd.com>>; Yang, Stanley mailto:stanley.y...@amd.com>>; Chai, Thomas mailto:yipeng.c...@amd.com>> Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning Use asynchronous polling to handle umc_v12_0 poisoning. 
Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 + 3 files changed, 120 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 856206e95842..44929281840e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_read(&con->page_retirement_req_cnt)); atomic_dec(&con->page_retirement_req_cnt); + + amdgpu_umc_poison_retire_page_polling_timeout(adev, + false, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9d1cf41cf483..2dde29cb807d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -23,6 +23,7 @@ #include "amdgpu.h" #include "umc_v6_7.h" +#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint64_t err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, return ret; } -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, - void *ras_error_status, - struct amdgpu_iv_entry *entry, - bool reset) +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, + void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct 
amdgpu_ras *con = amdgpu_ras_get_context(adev); int ret = 0; unsigned long err_count; - - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + mutex_lock(&con->page_retirement_lock); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); if (ret == -EOPNOTSUPP) { if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ -163,19 +161,86 @@ static int amdgpu_umc_do_page_retir
RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
[AMD Official Use Only - General] - Best Regards, Thomas _ From: Zhang, Hawking Sent: Wednesday, January 17, 2024 7:54 PM To: Chai, Thomas ; amd-gfx@lists.freedesktop.org Cc: Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning [AMD Official Use Only - General] Please check my comments inline Regards, Hawking -Original Message- From: Chai, Thomas mailto:yipeng.c...@amd.com>> Sent: Tuesday, January 16, 2024 16:21 To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> Cc: Chai, Thomas mailto:yipeng.c...@amd.com>>; Zhang, Hawking mailto:hawking.zh...@amd.com>>; Zhou1, Tao mailto:tao.zh...@amd.com>>; Li, Candice mailto:candice...@amd.com>>; Wang, Yang(Kevin) mailto:kevinyang.w...@amd.com>>; Yang, Stanley mailto:stanley.y...@amd.com>>; Chai, Thomas mailto:yipeng.c...@amd.com>> Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning Use asynchronous polling to handle umc_v12_0 poisoning. 
Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 + 3 files changed, 120 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 856206e95842..44929281840e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_read(&con->page_retirement_req_cnt)); atomic_dec(&con->page_retirement_req_cnt); + + amdgpu_umc_poison_retire_page_polling_timeout(adev, + false, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9d1cf41cf483..2dde29cb807d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -23,6 +23,7 @@ #include "amdgpu.h" #include "umc_v6_7.h" +#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint64_t err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, return ret; } -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, - void *ras_error_status, - struct amdgpu_iv_entry *entry, - bool reset) +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, + void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct 
amdgpu_ras *con = amdgpu_ras_get_context(adev); int ret = 0; unsigned long err_count; - - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + mutex_lock(&con->page_retirement_lock); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); if (ret == -EOPNOTSUPP) { if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ -163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, con->update_channel_flag = false; } } - - if (reset) { - /* use mode-2 reset for poison consumption */ - if (!entry) - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; - amdgpu_ras_reset_gpu(adev); - } } kfree(err_data->err_addr); + + mutex_unlock(&con->page_retirement_lock); +} + +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, + void *ras_error_status, + struct amdgpu_iv_entry *entry, + bool reset) +{ + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_umc_handle_bad_pages(ade
RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
[AMD Official Use Only - General] Please check my comments inline Regards, Hawking -Original Message- From: Chai, Thomas Sent: Tuesday, January 16, 2024 16:21 To: amd-gfx@lists.freedesktop.org Cc: Chai, Thomas ; Zhang, Hawking ; Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley ; Chai, Thomas Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning Use asynchronous polling to handle umc_v12_0 poisoning. Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 + 3 files changed, 120 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 856206e95842..44929281840e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_read(&con->page_retirement_req_cnt)); atomic_dec(&con->page_retirement_req_cnt); + + amdgpu_umc_poison_retire_page_polling_timeout(adev, + false, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9d1cf41cf483..2dde29cb807d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -23,6 +23,7 @@ #include "amdgpu.h" #include "umc_v6_7.h" +#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, struct ras_err_data 
*err_data, uint64_t err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, return ret; } -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, - void *ras_error_status, - struct amdgpu_iv_entry *entry, - bool reset) +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, + void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int ret = 0; unsigned long err_count; - - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + mutex_lock(&con->page_retirement_lock); ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); if (ret == -EOPNOTSUPP) { if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ -163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, con->update_channel_flag = false; } } - - if (reset) { - /* use mode-2 reset for poison consumption */ - if (!entry) - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; - amdgpu_ras_reset_gpu(adev); - } } kfree(err_data->err_addr); + + mutex_unlock(&con->page_retirement_lock); +} + +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, + void *ras_error_status, + struct amdgpu_iv_entry *entry, + bool reset) +{ + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_umc_handle_bad_pages(adev, ras_error_status); + + if (err_data->ue_count && reset) { + /* use mode-2 reset for poison consumption */ + if (!entry) + con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; [Hawking]: Shall we also check the con->poison_supported flag to decide whether to issue a mode-2 or a mode-1 reset? 
+ amdgpu_ras_reset_gpu(adev); + } + return AMDGPU_RAS_SUCCESS; } +int amdgpu_umc_poison_retire_page_polling_timeout(struct amdgpu_device *adev, + bool reset, uint32_t timeout_ms) [Hawking]: int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, bool reset, uint32_t timeout_ms) +{ + struct ras_err_data err_data; + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__UMC, + }