[PATCH] drm/amdgpu: Fix repeatly flr issue

Emily . Deng Wed, 12 Aug 2020 02:20:42 -0700

From: jqdeng <emily.d...@amd.com>

Only for no job running test case need to do recover in
flr notification.
For having job in mirror list, then let guest driver to
hit job timeout, and then do recover.


Signed-off-by: jqdeng <emily.d...@amd.com>
Change-Id: Ic6234fce46fa1655ba81c4149235eeac75e75868
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 20 +++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 ++++++++++++++++++++--
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index fe31cbeccfe9..12fe5164aaf3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -238,6 +238,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
        struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
        struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
        int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
+       int i;
+       bool need_do_recover = true;
+       struct drm_sched_job *job;
 
        /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
         * otherwise the mailbox msg will be ruined/reseted by
@@ -258,10 +261,25 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
*work)
 
 flr_done:
        up_read(&adev->reset_sem);
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               spin_lock(&ring->sched.job_list_lock);
+               job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
+               struct drm_sched_job, node);
+               spin_unlock(&ring->sched.job_list_lock);
+               if (job) {
+                       need_do_recover = false;
+                       break;
+               }
+       }
 
        /* Trigger recovery for world switch failure if no TDR */
        if (amdgpu_device_should_recover_gpu(adev)
-               && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
+               && (need_do_recover || adev->sdma_timeout == 
MAX_SCHEDULE_TIMEOUT))
                amdgpu_device_gpu_recover(adev, NULL);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6f55172e8337..fc92c494df0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -259,6 +259,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
        struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
flr_work);
        struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
virt);
        int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
+       int i;
+       bool need_do_recover = true;
+       struct drm_sched_job *job;
 
        /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
         * otherwise the mailbox msg will be ruined/reseted by
@@ -279,10 +282,25 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
*work)
 
 flr_done:
        up_read(&adev->reset_sem);
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               spin_lock(&ring->sched.job_list_lock);
+               job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
+                                       struct drm_sched_job, node);
+               spin_unlock(&ring->sched.job_list_lock);
+               if (job) {
+                       need_do_recover = false;
+                       break;
+               }
+       }
 
        /* Trigger recovery for world switch failure if no TDR */
-       if (amdgpu_device_should_recover_gpu(adev)
-               && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+       if (amdgpu_device_should_recover_gpu(adev) && (need_do_recover ||
+               adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
                adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
                adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
                adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: Fix repeatly flr issue

Reply via email to