amdgpu: Avoid HW reset if guilty job already signaled.

Christian König Fri, 12 Apr 2019 00:39:44 -0700

Am 11.04.19 um 18:03 schrieb Andrey Grodzovsky:

Also reject TDRs if another one already running.


v2:
Stop all schedulers across device and entire XGMI hive before
force signaling HW fences.

Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 125 ++++++++++++++++++++---------
  1 file changed, 88 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index aabd043..ce9c494 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3327,7 +3327,8 @@ bool amdgpu_device_should_recover_gpu(struct 
amdgpu_device *adev)

static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,

                                        struct amdgpu_job *job,
-                                       bool *need_full_reset_arg)
+                                       bool *need_full_reset_arg,
+                                       bool job_signaled)
  {
        int i, r = 0;
        bool need_full_reset  = *need_full_reset_arg;
@@ -3339,8 +3340,6 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
                if (!ring || !ring->sched.thread)
                        continue;

- drm_sched_stop(&ring->sched, &job->base);

-
                /* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
                amdgpu_fence_driver_force_completion(ring);
        }
@@ -3358,7 +3357,8 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,

- if (!amdgpu_sriov_vf(adev)) {

+       /* Don't suspend on bare metal if we are not going to HW reset the ASIC 
*/
+       if (!amdgpu_sriov_vf(adev) && !job_signaled) {

if (!need_full_reset)

                        need_full_reset = 
amdgpu_device_ip_need_full_reset(adev);
@@ -3495,7 +3495,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
        return r;
  }

-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)

+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev, bool 
job_signaled)
  {
        int i;

@@ -3505,7 +3505,8 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)

                if (!ring || !ring->sched.thread)
                        continue;

- if (!adev->asic_reset_res)

+               /* No point to resubmit jobs if we didn't HW reset*/
+               if (!adev->asic_reset_res && !job_signaled)
                        drm_sched_resubmit_jobs(&ring->sched);

drm_sched_start(&ring->sched, !adev->asic_reset_res);

@@ -3518,14 +3519,21 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev)
        adev->asic_reset_res = 0;
  }

-static void amdgpu_device_lock_adev(struct amdgpu_device *adev)

+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
  {
-       mutex_lock(&adev->lock_reset);
+       if (trylock) {
+               if (!mutex_trylock(&adev->lock_reset))
+                       return false;
+       } else
+               mutex_lock(&adev->lock_reset);
+
        atomic_inc(&adev->gpu_reset_counter);
        adev->in_gpu_reset = 1;
        /* Block kfd: SRIOV would do it separately */
        if (!amdgpu_sriov_vf(adev))
                  amdgpu_amdkfd_pre_reset(adev);
+
+       return true;
  }

static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)

@@ -3553,38 +3561,43 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                              struct amdgpu_job *job)
  {
-       int r;
+       int r, i;

BTW: Usual kernel coding style is to use reverse xmas tree. E.g.variables like "int i, r" last in the declarations.

        struct amdgpu_hive_info *hive = NULL;
-       bool need_full_reset = false;
        struct amdgpu_device *tmp_adev = NULL;
        struct list_head device_list, *device_list_handle =  NULL;
+       bool xgmi_topology_present, need_full_reset, job_signaled;

+ need_full_reset = job_signaled = false;

        INIT_LIST_HEAD(&device_list);

dev_info(adev->dev, "GPU reset begin!\n");+ hive = amdgpu_get_xgmi_hive(adev, 0);


The second parameter should actually be a bool, so please use false here.

+       xgmi_topology_present = hive && adev->gmc.xgmi.num_physical_nodes > 1;


Why are we actually checking num_physical_nodes here?

+
        /*
-        * In case of XGMI hive disallow concurrent resets to be triggered
-        * by different nodes. No point also since the one node already 
executing
-        * reset will also reset all the other nodes in the hive.
+        * Here we trylock to avoid chain of resets executing from
+        * either trigger by jobs on different adevs in XGMI hive or jobs on
+        * different schedulers for same device while this tdr is running.

Can we please completely drop the name "tdr" and just write "timeouthandler"?

+        * We always reset all schedulers for device and all devices for XGMI
+        * hive so that should take care of them too.
         */
-       hive = amdgpu_get_xgmi_hive(adev, 0);
-       if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
-           !mutex_trylock(&hive->reset_lock))
+
+       if (xgmi_topology_present && !mutex_trylock(&hive->reset_lock)) {
+               DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another 
already in progress",
+                        job->base.id, hive->hive_id);
                return 0;
+       }

/* Start with adev pre asic reset first for soft reset check.*/

-       amdgpu_device_lock_adev(adev);
-       r = amdgpu_device_pre_asic_reset(adev,
-                                        job,
-                                        &need_full_reset);
-       if (r) {
-               /*TODO Should we stop ?*/
-               DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s 
",
-                         r, adev->ddev->unique);
-               adev->asic_reset_res = r;
+       if (!amdgpu_device_lock_adev(adev, !xgmi_topology_present)) {
+               DRM_INFO("Bailing on TDR for s_job:%llx, as another already in 
progress",
+                                        job->base.id);
+               return 0;
        }

+ need_full_reset = amdgpu_device_ip_need_full_reset(adev);

+
        /* Build list of devices to reset */
        if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {

We should probably unconditionally stop all nodes on all device firstand then try to figure out if we actually need a full reset.

                if (!hive) {
@@ -3603,16 +3616,52 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
*adev,
                device_list_handle = &device_list;
        }

+ /* block all schedulers and reset given job's ring */

+       list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+               for (i = 0; i < AMDGPU_MAX_RINGSBuild list of devices to reset
; ++i) {
+                       struct amdgpu_ring *ring = tmp_adev->rings[i];
+
+                       if (!ring || !ring->sched.thread)
+                               continue;
+
+                       drm_sched_stop(&ring->sched, &job->base);
+               }
+       }
+
+       /*
+        * Must check guilty signal here since after this point all old
+        * HW fences are force signaled.
+        *
+        * job->base holds a reference to parent fence
+        */
+       if (job && job->base.s_fence->parent &&
+           dma_fence_is_signaled(job->base.s_fence->parent))
+               job_signaled = true;

It would probably be better if we have a "goto abort;" or somethingsimilar here.


This also avoids giving the job_signaled variable to all the sub functions.

Christian.

+
+
+       /* Guilty job will be freed after this*/
+       r = amdgpu_device_pre_asic_reset(adev,
+                                        job,
+                                        &need_full_reset,
+                                        job_signaled);
+       if (r) {
+               /*TODO Should we stop ?*/
+               DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s 
",
+                         r, adev->ddev->unique);
+               adev->asic_reset_res = r;
+       }
+
  retry:        /* Rest of adevs pre asic reset from XGMI hive. */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

if (tmp_adev == adev)

                        continue;

- amdgpu_device_lock_adev(tmp_adev);

+               amdgpu_device_lock_adev(tmp_adev, false);
                r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                 NULL,
-                                                &need_full_reset);
+                                                &need_full_reset,
+                                                job_signaled);
                /*TODO Should we stop ?*/
                if (r) {
                        DRM_ERROR("GPU pre asic reset failed with err, %d for drm 
dev, %s ",
@@ -3623,19 +3672,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
*adev,

/* Actual ASIC resets if needed.*/

        /* TODO Implement XGMI hive reset logic for SRIOV */
-       if (amdgpu_sriov_vf(adev)) {
-               r = amdgpu_device_reset_sriov(adev, job ? false : true);
-               if (r)
-                       adev->asic_reset_res = r;
-       } else {
-               r  = amdgpu_do_asic_reset(hive, device_list_handle, 
&need_full_reset);
-               if (r && r == -EAGAIN)
-                       goto retry;
+       if (!job || !job_signaled) {
+               if (amdgpu_sriov_vf(adev)) {
+                       r = amdgpu_device_reset_sriov(adev, job ? false : true);
+                       if (r)
+                               adev->asic_reset_res = r;
+               } else {
+                       r  = amdgpu_do_asic_reset(hive, device_list_handle, 
&need_full_reset);
+                       if (r && r == -EAGAIN)
+                               goto retry;
+               }
        }

/* Post ASIC reset for all devs .*/

        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               amdgpu_device_post_asic_reset(tmp_adev);
+               amdgpu_device_post_asic_reset(tmp_adev, job_signaled);

if (r) {

                        /* bad news, how to tell it to userspace ? */
@@ -3648,7 +3699,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                amdgpu_device_unlock_adev(tmp_adev);
        }

- if (hive && adev->gmc.xgmi.num_physical_nodes > 1)

+       if (xgmi_topology_present)
                mutex_unlock(&hive->reset_lock);

if (r)


_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 3/4] drm/amdgpu: Avoid HW reset if guilty job already signaled.

Reply via email to