If device suspend fails partway through, add a recovery flow that attempts
to unwind the steps already completed and get the device back up and running.

Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
Signed-off-by: Mario Limonciello (AMD) <[email protected]>
---
v5:
 * Add missing drm client notification (should fix console resume)
 * Add missing RAS notification
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 65 +++++++++++++++++++---
 1 file changed, 58 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5945f441d01e..d8a2dcf6100a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5274,7 +5274,7 @@ void amdgpu_device_complete(struct drm_device *dev)
 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 {
        struct amdgpu_device *adev = drm_to_adev(dev);
-       int r = 0;
+       int r, rec;
 
        if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
                return 0;
@@ -5290,8 +5290,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
                        return r;
        }
 
-       if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
-               dev_warn(adev->dev, "smart shift update failed\n");
+       r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
+       if (r)
+               goto unwind_sriov;
 
        if (notify_clients)
                drm_client_dev_suspend(adev_to_drm(adev), false);
@@ -5302,16 +5303,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 
        r = amdgpu_device_ip_suspend_phase1(adev);
        if (r)
-               return r;
+               goto unwind_smartshift;
 
        amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
        r = amdgpu_userq_suspend(adev);
        if (r)
-               return r;
+               goto unwind_ip_phase1;
 
        r = amdgpu_device_evict_resources(adev);
        if (r)
-               return r;
+               goto unwind_userq;
 
        amdgpu_ttm_set_buffer_funcs_status(adev, false);
 
@@ -5319,12 +5320,62 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 
        r = amdgpu_device_ip_suspend_phase2(adev);
        if (r)
-               return r;
+               goto unwind_evict;
 
        if (amdgpu_sriov_vf(adev))
                amdgpu_virt_release_full_gpu(adev, false);
 
        return 0;
+
+unwind_evict:
+       if (adev->mman.buffer_funcs_ring->sched.ready)
+               amdgpu_ttm_set_buffer_funcs_status(adev, true);
+       amdgpu_fence_driver_hw_init(adev);
+
+unwind_userq:
+       rec = amdgpu_userq_resume(adev);
+       if (rec) {
+               dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
+               return r;
+       }
+       rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+       if (rec) {
+               dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
+               return r;
+       }
+
+unwind_ip_phase1:
+       /* suspend phase 1 = resume phase 3 */
+       rec = amdgpu_device_ip_resume_phase3(adev);
+       if (rec) {
+               dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
+               return r;
+       }
+
+unwind_smartshift:
+       rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
+       if (rec) {
+               dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
+               return r;
+       }
+
+       if (notify_clients)
+               drm_client_dev_resume(adev_to_drm(adev), false);
+
+       amdgpu_ras_resume(adev);
+
+unwind_sriov:
+       if (amdgpu_sriov_vf(adev)) {
+               rec = amdgpu_virt_request_full_gpu(adev, true);
+               if (rec) {
+                       dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
+                       return r;
+               }
+       }
+
+       adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
+
+       return r;
 }
 
 static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
-- 
2.51.1

Reply via email to