If the GPU fails to suspend, the return code is passed up to the caller,
but the device is left in an inconsistent state.  This could lead to hangs
if userspace tries to access it.

Instead of leaving the device in this state, attempt to resume it using
amdgpu_device_resume().  IP resume functions check the HW status
and thus should only resume the IP blocks that were actually suspended
if a failure happened partway through.

Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
Signed-off-by: Mario Limonciello <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a99185ed0642..59672b880d75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5227,7 +5227,7 @@ void amdgpu_device_complete(struct drm_device *dev)
 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 {
        struct amdgpu_device *adev = drm_to_adev(dev);
-       int r = 0;
+       int r, rec;
 
        if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
                return 0;
@@ -5240,7 +5240,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
                amdgpu_virt_fini_data_exchange(adev);
                r = amdgpu_virt_request_full_gpu(adev, false);
                if (r)
-                       return r;
+                       goto resume;
        }
 
        if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
@@ -5255,16 +5255,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 
        r = amdgpu_device_ip_suspend_phase1(adev);
        if (r)
-               return r;
+               goto resume;
 
        amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
        r = amdgpu_userq_suspend(adev);
        if (r)
-               return r;
+               goto resume;
 
        r = amdgpu_device_evict_resources(adev);
        if (r)
-               return r;
+               goto resume;
 
        amdgpu_ttm_set_buffer_funcs_status(adev, false);
 
@@ -5272,16 +5272,22 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 
        r = amdgpu_device_ip_suspend_phase2(adev);
        if (r)
-               return r;
+               goto resume;
 
        if (amdgpu_sriov_vf(adev))
                amdgpu_virt_release_full_gpu(adev, false);
 
        r = amdgpu_dpm_notify_rlc_state(adev, false);
        if (r)
-               return r;
+               goto resume;
 
        return 0;
+resume:
+       rec = amdgpu_device_resume(dev, notify_clients);
+       if (rec)
+               dev_err(adev->dev, "amdgpu_device_resume failed: %d\n", rec);
+
+       return r;
 }
 
 static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
-- 
2.51.0

Reply via email to