When a GPU in a hive is performing a RAS reset, the other
GPUs in the hive do not need to schedule recovery work
to reset the GPU.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 34226ae010c7..cbb4d6ccc420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2489,6 +2489,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
        struct amdgpu_device *adev = ras->adev;
        struct list_head device_list, *device_list_handle =  NULL;
        struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+       struct amdgpu_ras *tmp_ras;
 
        if (hive) {
                atomic_set(&hive->ras_recovery, 1);
@@ -2499,11 +2500,19 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                 * as part of recovery.
                 */
                list_for_each_entry(remote_adev, &hive->device_list,
-                                   gmc.xgmi.head)
+                                   gmc.xgmi.head) {
+                       tmp_ras = amdgpu_ras_get_context(remote_adev);
+                       /* When a gpu in hive is performing ras reset, other
+                        * gpus in hive do not need to schedule recovery work
+                        * to reset the gpu.
+                        */
+                       atomic_set(&tmp_ras->in_recovery, 1);
+
                        if (amdgpu_ras_get_fed_status(remote_adev)) {
                                amdgpu_ras_set_fed_all(adev, hive, true);
                                break;
                        }
+               }
        }
        if (!ras->disable_ras_err_cnt_harvest) {
 
@@ -2556,6 +2565,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 
                amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
        }
+
+       if (hive) {
+               list_for_each_entry(remote_adev, &hive->device_list,
+                                               gmc.xgmi.head) {
+                       tmp_ras = amdgpu_ras_get_context(remote_adev);
+                       atomic_set(&tmp_ras->in_recovery, 0);
+               }
+       }
+
        atomic_set(&ras->in_recovery, 0);
        if (hive) {
                atomic_set(&hive->ras_recovery, 0);
-- 
2.34.1

Reply via email to