When GFX RAS poison consumption triggers a GPU reset on GFX v11_0_3,
the GPU reset sequence is "soft reset -> mode2 reset -> mode1 reset".
If a reset method in this sequence fails, fall back to the next one.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 ++++++++++++++++------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a5086be4d7dd..c8d2a281098f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4770,13 +4770,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
        if (job && job->vm)
                drm_sched_increase_karma(&job->base);
 
-       r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
-       /* If reset handler not implemented, continue; otherwise return */
-       if (r == -ENOSYS)
-               r = 0;
-       else
-               return r;
-
        /* Don't suspend on bare metal if we are not going to HW reset the ASIC 
*/
        if (!amdgpu_sriov_vf(adev)) {
 
@@ -4789,12 +4782,23 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
                        r = amdgpu_device_ip_soft_reset(adev);
                        amdgpu_device_ip_post_soft_reset(adev);
                        if (r || amdgpu_device_ip_check_soft_reset(adev)) {
-                               dev_info(adev->dev, "soft reset failed, will 
fallback to full reset!\n");
+                               struct amdgpu_ras *ras = 
amdgpu_ras_get_context(adev);
+
+                               if (ras->reset_by_gfx_poison) {
+                                       reset_context->method = 
AMD_RESET_METHOD_MODE2;
+                                       dev_info(adev->dev, "soft reset failed, 
will fallback to mode2 reset!\n");
+                               } else {
+                                       dev_info(adev->dev, "soft reset failed, 
will fallback to full reset!\n");
+                               }
                                need_full_reset = true;
                        }
                }
 
-               if (need_full_reset)
+               /* IP suspend will affect mode2 reset, so ip suspend is skipped
+                * when mode2 reset is enabled.
+                */
+               if (need_full_reset &&
+                   (reset_context->method != AMD_RESET_METHOD_MODE2))
                        r = amdgpu_device_ip_suspend(adev);
                if (need_full_reset)
                        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -4803,6 +4807,11 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
*adev,
                                  &reset_context->flags);
        }
 
+       r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
+       /* If reset handler not implemented, continue; otherwise return */
+       if (r == -ENOSYS)
+               r = 0;
+
        return r;
 }
 
@@ -4892,7 +4901,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
        /* If reset handler not implemented, continue; otherwise return */
        if (r == -ENOSYS)
                r = 0;
-       else
+       else if (!r) /* Mode2 reset successful, return */
                return r;
 
        /* Reset handler not implemented, use the default method */
@@ -4904,6 +4913,17 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
                test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) 
&&
                        test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
+       /* If mode2 reset is enabled, ip suspend is skipped in the previous
+        * amdgpu_device_pre_asic_reset function, but for mode1 reset,
+        * ip suspend must be called.
+        */
+       if (need_full_reset &&
+          (reset_context->method == AMD_RESET_METHOD_MODE2)) {
+               list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+                       amdgpu_device_ip_suspend(tmp_adev);
+               }
+       }
+
        /*
         * ASIC reset has to be done on all XGMI hive nodes ASAP
         * to allow proper links negotiation in FW (within 1 sec)
-- 
2.34.1

Reply via email to