add eeprom data checksum check after data writing, after gpu reset, and before driver unload. reset eeprom and save correct data to eeprom when check failed
Signed-off-by: ganglxie <gangl...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +++++++++++++++++++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 1 + 6 files changed, 48 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 84fcaf84fead..2503f2300472 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6410,6 +6410,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; int r = 0; bool need_emergency_restart = false; + enum amd_reset_method tmp_reset_method; if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain)) return 0; @@ -6506,6 +6507,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!r) drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); + if (reset_context->method == AMD_RESET_METHOD_MODE1 || + reset_context->method == AMD_RESET_METHOD_MODE2) { + amdgpu_ras_eeprom_check_and_recover(adev); + } else { + if (reset_context->method == AMD_RESET_METHOD_NONE) { + tmp_reset_method = adev->asic_funcs->reset_method(adev); + if (tmp_reset_method == AMD_RESET_METHOD_MODE1 || + tmp_reset_method == AMD_RESET_METHOD_MODE2) + amdgpu_ras_eeprom_check_and_recover(adev); + } + } + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 571b70da4562..1009b60f6ae4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2560,6 +2560,7 @@ amdgpu_pci_remove(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); + amdgpu_ras_eeprom_check_and_recover(adev); amdgpu_xcp_dev_unplug(adev); amdgpu_gmc_prepare_nps_mode_change(adev); drm_dev_unplug(dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f8a8c8502013..e03550be45b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -196,6 +196,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre amdgpu_ras_add_bad_pages(adev, err_data.err_addr, err_data.err_addr_cnt, false); amdgpu_ras_save_bad_pages(adev, NULL); + amdgpu_ras_eeprom_check_and_recover(adev); } amdgpu_ras_error_data_fini(&err_data); @@ -3539,9 +3540,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) /* The format action is only applied to new ASICs */ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && control->tbl_hdr.version < RAS_TABLE_VER_V3) - if (!amdgpu_ras_eeprom_reset_table(control)) + if (!amdgpu_ras_eeprom_reset_table(control)) { if (amdgpu_ras_save_bad_pages(adev, NULL)) dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); + else + amdgpu_ras_eeprom_check_and_recover(adev); + } + } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 2af14c369bb9..2458c67526c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1522,3 +1522,28 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) return res < 0 ? res : 0; } + +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control; + int res = 0; + + if (!control->is_eeprom_valid) + return; + res = __verify_ras_table_checksum(control); + if (res) { + dev_warn(adev->dev, + "RAS table incorrect checksum or error:%d, try to recover\n", + res); + if (!amdgpu_ras_eeprom_reset_table(control)) + if (!amdgpu_ras_save_bad_pages(adev, NULL)) + if (!__verify_ras_table_checksum(control)) { + dev_info(adev->dev, "RAS table recovery succeed\n"); + return; + } + dev_err(adev->dev, "RAS table recovery failed\n"); + control->is_eeprom_valid = false; + } + return; +} \ No newline at end of file diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 35c69ac3dbeb..ebfca4cb5688 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -161,6 +161,8 @@ void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control); +void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev); + extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index bfc86f1e84e5..081f7fd29394 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -80,6 +80,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, amdgpu_ras_add_bad_pages(adev, err_data.err_addr, err_data.err_addr_cnt, false); amdgpu_ras_save_bad_pages(adev, NULL); + amdgpu_ras_eeprom_check_and_recover(adev); } out_free_err_addr: -- 2.34.1