amdgpu: Set fatal errror detected flag earlier

Lijo Lazar Wed, 27 Mar 2024 19:36:24 -0700

In case of fatal errors, set FED status when interrupt is received. Set
the flag on other devices in the hive before RAS recovery work.


Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
---
v2: Avoid accessing hive in interrupt handler as it may take mutex path (Kevin)

 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +++++++++++++++++--------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b8c7d0bf8fb1..352ce16a0963 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device 
*adev,
        return ret;
 }
 
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+                                  struct amdgpu_hive_info *hive, bool status)
+{
+       struct amdgpu_device *tmp_adev;
+
+       if (hive) {
+               list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+                       amdgpu_ras_set_fed(tmp_adev, status);
+       } else {
+               amdgpu_ras_set_fed(adev, status);
+       }
+}
+
 static void amdgpu_ras_do_recovery(struct work_struct *work)
 {
        struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
        struct list_head device_list, *device_list_handle =  NULL;
        struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-       if (hive)
+       if (hive) {
                atomic_set(&hive->ras_recovery, 1);
+
+               /* If any device which is part of the hive received RAS fatal
+                * error interrupt, set fatal error status on all. This
+                * condition will need a recovery, and flag will be cleared
+                * as part of recovery.
+                */
+               list_for_each_entry(remote_adev, &hive->device_list,
+                                   gmc.xgmi.head)
+                       if (amdgpu_ras_get_fed_status(remote_adev)) {
+                               amdgpu_ras_set_fed_all(adev, hive, true);
+                               break;
+                       }
+       }
        if (!ras->disable_ras_err_cnt_harvest) {
 
                /* Build list of devices to query RAS related errors */
@@ -2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
                                ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                                set_bit(AMDGPU_NEED_FULL_RESET, 
&reset_context.flags);
 
-                               /* For any RAS error that needs a full reset to
-                                * recover, set the fatal error status
-                                */
-                               if (hive) {
-                                       list_for_each_entry(remote_adev,
-                                                           &hive->device_list,
-                                                           gmc.xgmi.head)
-                                               amdgpu_ras_set_fed(remote_adev,
-                                                                  true);
-                               } else {
-                                       amdgpu_ras_set_fed(adev, true);
-                               }
                                psp_fatal_error_recovery_quirk(&adev->psp);
                        }
                }
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
                RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
                              "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
 
+               amdgpu_ras_set_fed(adev, true);
                ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                amdgpu_ras_reset_gpu(adev);
        }
-- 
2.25.1

[PATCH v2 1/4] drm/amdgpu: Set fatal errror detected flag earlier

Reply via email to