Read bad page data from the PMFW EEPROM when retirement is triggered, and use the timestamp read from the EEPROM.
Signed-off-by: Gangliang Xie <[email protected]> --- drivers/gpu/drm/amd/ras/rascore/ras_aca.c | 31 +++++++++----- .../gpu/drm/amd/ras/rascore/ras_eeprom_fw.c | 40 +++++++++++++++++++ .../gpu/drm/amd/ras/rascore/ras_eeprom_fw.h | 2 + .../gpu/drm/amd/ras/rascore/ras_umc_v12_0.c | 3 ++ 4 files changed, 66 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c index e433c70d2989..67a35409ff0e 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_aca.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c @@ -234,16 +234,27 @@ static int aca_log_bad_bank(struct ras_core_context *ras_core, bank_ecc->de_count) { struct ras_bank_ecc ras_ecc = {0}; - ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); - ras_ecc.addr = bank_ecc->bank_info.addr; - ras_ecc.ipid = bank_ecc->bank_info.ipid; - ras_ecc.status = bank_ecc->bank_info.status; - ras_ecc.seq_no = bank->seq_no; - - if (ras_core_gpu_in_reset(ras_core)) - ras_umc_log_bad_bank_pending(ras_core, &ras_ecc); - else - ras_umc_log_bad_bank(ras_core, &ras_ecc); + if (ras_fw_eeprom_supported(ras_core)) { + ret = ras_fw_eeprom_update_record(ras_core, &ras_ecc); + if (!ret) { + ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); + ras_ecc.status = bank_ecc->bank_info.status; + ras_ecc.seq_no = bank->seq_no; + } + } else { + ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); + ras_ecc.addr = bank_ecc->bank_info.addr; + ras_ecc.ipid = bank_ecc->bank_info.ipid; + ras_ecc.status = bank_ecc->bank_info.status; + ras_ecc.seq_no = bank->seq_no; + } + + if (!ret) { + if (ras_core_gpu_in_reset(ras_core)) + ras_umc_log_bad_bank_pending(ras_core, &ras_ecc); + else + ras_umc_log_bad_bank(ras_core, &ras_ecc); + } } aca_report_ecc_info(ras_core, diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c index 69e1aef67ab9..7a2c14f276b5 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c +++ 
b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c @@ -24,6 +24,8 @@ #include "ras.h" +#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */ + void ras_fw_init_feature_flags(struct ras_core_context *ras_core) { struct ras_mp1 *mp1 = &ras_core->ras_mp1; @@ -329,3 +331,41 @@ uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core) return ras_core->ras_fw_eeprom.ras_num_recs; } + +int ras_fw_eeprom_update_record(struct ras_core_context *ras_core, + struct ras_bank_ecc *ras_ecc) +{ + struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom; + int ret, retry = 20; + u32 recs_num_new = control->ras_num_recs; + + do { + /* 1000ms timeout is long enough, smu_get_badpage_count won't + * return -EBUSY before timeout. + */ + ret = ras_fw_get_badpage_count(ras_core, + &recs_num_new, RAS_SMU_MESSAGE_TIMEOUT_MS); + if (!ret && + (recs_num_new == control->ras_num_recs)) { + /* record number update in PMFW needs some time, + * smu_get_badpage_count may return immediately without + * count update, sleep for a while and retry again. 
+ */ + msleep(50); + retry--; + } else { + break; + } + } while (retry); + + if (ret) + return ret; + + if (recs_num_new > control->ras_num_recs) + ret = ras_fw_eeprom_read_idx(ras_core, 0, + ras_ecc, control->ras_num_recs, 1); + else + ret = -EINVAL; + + return ret; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h index 7daf903ad5aa..ed8ebdf22156 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h @@ -75,5 +75,7 @@ int ras_fw_eeprom_read_idx(struct ras_core_context *ras_core, struct ras_bank_ecc *ras_ecc, u32 rec_idx, const u32 num); uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core); +int ras_fw_eeprom_update_record(struct ras_core_context *ras_core, + struct ras_bank_ecc *ras_ecc); #endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c index 53dc59e4de0c..b809a2f21d73 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c @@ -373,6 +373,9 @@ static int umc_v12_0_bank_to_eeprom_record(struct ras_core_context *ras_core, ACA_ADDR_2_ERR_ADDR(bank->addr), ACA_IPID_2_UMC_INST(bank->ipid), &nps_addr, bank->nps, record); + if (ras_fw_eeprom_supported(ras_core) && bank->ts) + record->ts = bank->ts; + lookup_bad_pages_in_a_row(ras_core, record, bank->nps, NULL, 0, bank->seq_no, true); -- 2.34.1
