recover_vram_from_shadow() was found to sometimes execute in
parallel with the SDMA scheduler; all schedulers must therefore be
stopped before doing a GPU reset/recover.

Change-Id: Ibaef3e3c015f3cf88f84b2eaf95cda95ae1a64e3
Signed-off-by: Monk Liu <monk....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +++++++++++-------------------
 1 file changed, 15 insertions(+), 25 deletions(-)
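
For review convenience, here is the resulting control flow, reassembled
as a sketch from the hunks below (not the full function; r holds the
return value of the reset path, as in the existing code):

	/* 1) park every scheduler kthread first, so nothing (including the
	 *    SDMA scheduler) can run in parallel with the GPU reset and
	 *    recover_vram_from_shadow; only the timed-out ring, or every
	 *    ring when @job is NULL, gets its hw jobs reset
	 */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		kthread_park(ring->sched.thread);

		if (job && job->ring->idx != i)
			continue;

		drm_sched_hw_job_reset(&ring->sched, &job->base);
	}

	/* 2) ... the full GPU reset and VRAM recovery run here, with all
	 *    schedulers parked ...
	 */

	/* 3) if the reset succeeded, recover the relevant scheduler(s),
	 *    then unpark every scheduler kthread
	 */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		if ((!job || job->ring->idx == i) && !r)
			drm_sched_job_recovery(&ring->sched);

		kthread_unpark(ring->sched.thread);
	}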

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 75d1733..e9d81a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2649,22 +2649,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        /* block TTM */
        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
        /* store modesetting */
        if (amdgpu_device_has_dc_support(adev))
                state = drm_atomic_helper_suspend(adev->ddev);
 
-       /* block scheduler */
+       /* block all schedulers and reset the given job's ring */
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];
 
                if (!ring || !ring->sched.thread)
                        continue;
 
-               /* only focus on the ring hit timeout if &job not NULL */
+               kthread_park(ring->sched.thread);
+
                if (job && job->ring->idx != i)
                        continue;
 
-               kthread_park(ring->sched.thread);
                drm_sched_hw_job_reset(&ring->sched, &job->base);
 
                /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
@@ -2707,33 +2708,22 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        }
                        dma_fence_put(fence);
                }
+       }
 
-               for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-                       struct amdgpu_ring *ring = adev->rings[i];
-
-                       if (!ring || !ring->sched.thread)
-                               continue;
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
 
-                       /* only focus on the ring hit timeout if &job not NULL */
-                       if (job && job->ring->idx != i)
-                               continue;
+               if (!ring || !ring->sched.thread)
+                       continue;
 
+               /* only need to recover the scheduler of the given job's
+                * ring, or of all rings when @job is NULL,
+                * after the amdgpu reset above has completed
+                */
+               if ((!job || job->ring->idx == i) && !r)
                        drm_sched_job_recovery(&ring->sched);
-                       kthread_unpark(ring->sched.thread);
-               }
-       } else {
-               for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-                       struct amdgpu_ring *ring = adev->rings[i];
 
-                       if (!ring || !ring->sched.thread)
-                               continue;
-
-                       /* only focus on the ring hit timeout if &job not NULL */
-                       if (job && job->ring->idx != i)
-                               continue;
-
-                       kthread_unpark(adev->rings[i]->sched.thread);
-               }
+               kthread_unpark(ring->sched.thread);
        }
 
        if (amdgpu_device_has_dc_support(adev)) {
-- 
2.7.4
