amdgpu: flush all cached ras bad pages to eeprom

YiPeng Chai Wed, 03 Jul 2024 01:42:38 -0700

Before the GPU driver is uninstalled, flush all cached RAS
bad pages to the EEPROM.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)


diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 26098b03447b..1b6f5b26957b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
 
+#define MAX_FLUSH_RETIRE_DWORK_TIMES  200
+
 enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -3176,11 +3178,26 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data = con->eh_data;
+       int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
+       int ret;
 
        /* recovery_init failed to init it, fini is useless */
        if (!data)
                return 0;
 
+       /* Save all cached bad pages to eeprom */
+       do {
+               flush_delayed_work(&con->page_retirement_dwork);
+
+               mutex_lock(&con->umc_ecc_log.lock);
+               ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+                               UMC_ECC_NEW_DETECTED_TAG);
+               mutex_unlock(&con->umc_ecc_log.lock);
+
+               if (ret)
+                       schedule_delayed_work(&con->page_retirement_dwork, 0);
+       } while (ret && max_flush_timeout--);
+
        if (con->page_retirement_thread)
                kthread_stop(con->page_retirement_thread);
 
-- 
2.34.1

[PATCH 1/2] drm/amdgpu: flush all cached ras bad pages to eeprom

Reply via email to