amdgpu: move the call of ras recovery_init and bad page reserve to proper place

Chen, Guchun Thu, 05 Sep 2019 00:57:21 -0700

Except for one spelling typo (see the inline comment below), the series is:
Reviewed-by: Guchun Chen <guchun.c...@amd.com>


Regards,
Guchun

-----Original Message-----
From: Zhou1, Tao <tao.zh...@amd.com> 
Sent: Thursday, September 5, 2019 12:04 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey 
<andrey.grodzov...@amd.com>; Chen, Guchun <guchun.c...@amd.com>; Li, Dennis 
<dennis...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; Clements, John 
<john.cleme...@amd.com>
Cc: Zhou1, Tao <tao.zh...@amd.com>
Subject: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad 
page reserve to proper place

ras recovery_init should be called after ttm init, and bad page reserve
should be done before gpu reset, since i2c may be unstable during gpu reset.
Also add cleanup for recovery_init and recovery_fini.

v2: add more comment and print.
    remove cancel_work_sync in recovery_init.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
Reviewed-by: Guchun Chen <guchun.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 39 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c    | 12 +++++++
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e50861e16cf5..22cd3deab731 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3629,11 +3629,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                                break;
                                }
                        }
-
-                       list_for_each_entry(tmp_adev, device_list_handle,
-                                       gmc.xgmi.head) {
-                               amdgpu_ras_reserve_bad_pages(tmp_adev);
-                       }
                }
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e68f43d1cfea..c2b2b0e3515c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1491,16 +1491,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
        return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data = &con->eh_data;
        int ret;
 
-       *data = kmalloc(sizeof(**data),
-                       GFP_KERNEL|__GFP_ZERO);
-       if (!*data)
-               return -ENOMEM;
+       *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+       if (!*data) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        mutex_init(&con->recovery_lock);
        INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
@@ -1509,18 +1510,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
        ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
        if (ret)
-               return ret;
+               goto free;
 
        if (adev->psp.ras.ras->eeprom_control.num_recs) {
                ret = amdgpu_ras_load_bad_pages(adev);
                if (ret)
-                       return ret;
+                       goto free;
                ret = amdgpu_ras_reserve_bad_pages(adev);
                if (ret)
-                       return ret;
+                       goto release;
        }
 
        return 0;
+
+release:
+       amdgpu_ras_release_bad_pages(adev);
+free:
+       con->eh_data = NULL;
+       kfree((*data)->bps);
+       kfree((*data)->bps_bo);
+       kfree(*data);
+out:
+       DRM_WARN("Failed to initilaize ras recovery!\n");
[Guchun] Spelling typo: "initilaize" should be "initialize".
+
+       return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1528,12 +1541,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data = con->eh_data;
 
+       /* recovery_init failed to init it, fini is useless */
+       if (!data)
+               return 0;
+
        cancel_work_sync(&con->recovery_work);
        amdgpu_ras_release_bad_pages(adev);
 
        mutex_lock(&con->recovery_lock);
        con->eh_data = NULL;
        kfree(data->bps);
+       kfree(data->bps_bo);
        kfree(data);
        mutex_unlock(&con->recovery_lock);
 
@@ -1625,9 +1643,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                        return r;
        }
 
-       if (amdgpu_ras_recovery_init(adev))
-               goto recovery_out;
-
        amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
        if (amdgpu_ras_fs_init(adev))
@@ -1642,8 +1657,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                        con->hw_supported, con->supported);
        return 0;
 fs_out:
-       amdgpu_ras_recovery_fini(adev);
-recovery_out:
        amdgpu_ras_set_context(adev, NULL);
        kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 96210e18191e..012034d2ae06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
        return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
                unsigned int block);
 
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+       /* save bad page to eeprom before gpu reset,
+        * i2c may be unstable in gpu reset
+        */
+       amdgpu_ras_reserve_bad_pages(adev);
        if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
                schedule_work(&ras->recovery_work);
        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcd32d01a579..c05638cf3f3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -49,6 +49,7 @@
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
 #include "bif/bif_4_1_d.h"
 
 static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
@@ -1772,6 +1773,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
                                                adev->gmc.visible_vram_size);
 #endif
 
+       /*
+        * retired pages will be loaded from eeprom and reserved here,
+        * it should be called after ttm init since new bo may be created,
+        * recovery_init may fail, but it can free all resources allocated by
+        * itself and its failure should not stop amdgpu init process.
+        *
+        * Note: theoretically, this should be called before all vram allocations
+        * to protect retired page from abusing
+        */
+       amdgpu_ras_recovery_init(adev);
+
        /*
         *The reserved vram for firmware must be pinned to the specified
         *place on the VRAM, so reserve it early.
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place

Reply via email to