If the GPU fails to suspend, the return code is passed up to the caller, but the device is left in an inconsistent state. This could lead to hangs if userspace then tries to access it.
Instead of leaving the device in this state, attempt to resume it using amdgpu_device_resume(). The IP resume functions check the HW status, so if the failure happened part way through suspend, only the IP blocks that were actually suspended should get resumed. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627 Signed-off-by: Mario Limonciello <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a99185ed0642..59672b880d75 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5227,7 +5227,7 @@ void amdgpu_device_complete(struct drm_device *dev) int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) { struct amdgpu_device *adev = drm_to_adev(dev); - int r = 0; + int r, rec; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -5240,7 +5240,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); if (r) - return r; + goto resume; } if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3)) @@ -5255,16 +5255,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) r = amdgpu_device_ip_suspend_phase1(adev); if (r) - return r; + goto resume; amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); r = amdgpu_userq_suspend(adev); if (r) - return r; + goto resume; r = amdgpu_device_evict_resources(adev); if (r) - return r; + goto resume; amdgpu_ttm_set_buffer_funcs_status(adev, false); @@ -5272,16 +5272,22 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) r = amdgpu_device_ip_suspend_phase2(adev); if (r) - return r; + goto resume; if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); r = amdgpu_dpm_notify_rlc_state(adev, false); if (r) - return r; + goto 
resume; return 0; +resume: + rec = amdgpu_device_resume(dev, notify_clients); + if (rec) + dev_err(adev->dev, "amdgpu_device_resume failed: %d\n", rec); + + return r; } static inline int amdgpu_virt_resume(struct amdgpu_device *adev) -- 2.51.0
