Since all GPU resets are now serialized, there is no need for this protection.

This patch also reverts 'drm/amdgpu: race issue when jobs on 2 ring timeout'.
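To illustrate why the cmpxchg guard becomes redundant (review context only, not part of the patch): with resets serialized, every reset request is funneled through a single ordered execution context, so no two resets can ever run concurrently. Below is a minimal userspace sketch of that serialization pattern, using pthreads and purely hypothetical names rather than the kernel's workqueue API:

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	/*
	 * Illustrative only: all "reset" requests are executed by one
	 * worker thread, so two resets never overlap and no
	 * cmpxchg-style "already in reset" guard is needed.
	 * All names here are hypothetical.
	 */
	static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t q_cond = PTHREAD_COND_INITIALIZER;
	static int pending; /* number of queued reset requests */

	static void do_gpu_reset(void)
	{
		/* Serialized by construction: only the worker calls this. */
		printf("reset running\n");
	}

	static void *reset_worker(void *arg)
	{
		(void)arg;
		for (;;) {
			pthread_mutex_lock(&q_lock);
			while (pending == 0)
				pthread_cond_wait(&q_cond, &q_lock);
			pending--;
			pthread_mutex_unlock(&q_lock);
			do_gpu_reset();
		}
		return NULL;
	}

	/* Callers (e.g. per-ring timeout handlers) only enqueue. */
	static void queue_gpu_reset(void)
	{
		pthread_mutex_lock(&q_lock);
		pending++;
		pthread_cond_signal(&q_cond);
		pthread_mutex_unlock(&q_lock);
	}

	int main(void)
	{
		pthread_t worker;

		pthread_create(&worker, NULL, reset_worker, NULL);
		queue_gpu_reset();
		queue_gpu_reset(); /* never runs concurrently with the first */
		sleep(1);
		return 0;
	}

Because only the single worker ever calls do_gpu_reset(), an "already in reset" test adds nothing; the same reasoning is what lets amdgpu_device_lock_adev() drop its boolean return value below.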
Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
Reviewed-by: Christian König <christian.koe...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 89 ++--------------------
 1 file changed, 7 insertions(+), 82 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7e92f2432087..e3c0ec684a85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4817,11 +4817,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
 				    struct amdgpu_hive_info *hive)
 {
-	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
-		return false;
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	if (hive) {
 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
@@ -4840,8 +4839,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
 		adev->mp1_state = PP_MP1_STATE_NONE;
 		break;
 	}
-
-	return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
@@ -4852,46 +4849,6 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	up_write(&adev->reset_sem);
 }
 
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
-{
-	struct amdgpu_device *tmp_adev = NULL;
-
-	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
-		if (!hive) {
-			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
-			return -ENODEV;
-		}
-		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-			if (!amdgpu_device_lock_adev(tmp_adev, hive))
-				goto roll_back;
-		}
-	} else if (!amdgpu_device_lock_adev(adev, hive))
-		return -EAGAIN;
-
-	return 0;
-roll_back:
-	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
-		/*
-		 * if the lockup iteration break in the middle of a hive,
-		 * it may means there may has a race issue,
-		 * or a hive device locked up independently.
-		 * we may be in trouble and may not, so will try to roll back
-		 * the lock and give out a warnning.
-		 */
-		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
-		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-			amdgpu_device_unlock_adev(tmp_adev);
-		}
-	}
-	return -EAGAIN;
-}
-
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
 	struct pci_dev *p = NULL;
@@ -5078,22 +5035,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	reset_context.hive = hive;
 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
-	/*
-	 * lock the device before we try to operate the linked list
-	 * if didn't get the device lock, don't touch the linked list since
-	 * others may iterating it.
-	 */
-	r = amdgpu_device_lock_hive_adev(adev, hive);
-	if (r) {
-		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-			 job ? job->base.id : -1);
-
-		/* even we skipped this reset, still need to set the job to guilty */
-		if (job && job->vm)
-			drm_sched_increase_karma(&job->base);
-		goto skip_recovery;
-	}
-
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5113,6 +5054,9 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+		amdgpu_device_lock_adev(tmp_adev, hive);
+
 		/*
 		 * Try to put the audio codec into suspend state
 		 * before gpu reset started.
@@ -5264,13 +5208,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
-skip_recovery:
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
 		amdgpu_put_xgmi_hive(hive);
 	}
 
-	if (r && r != -EAGAIN)
+	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 	return r;
 }
@@ -5493,20 +5436,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 	return 0;
 }
 
-static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
-{
-	int i;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		cancel_delayed_work_sync(&ring->sched.work_tdr);
-	}
-}
-
 /**
  * amdgpu_pci_error_detected - Called when a PCI error is detected.
  * @pdev: PCI device struct
@@ -5537,14 +5466,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 	/* Fatal error, prepare for slot reset */
 	case pci_channel_io_frozen:
 		/*
-		 * Cancel and wait for all TDRs in progress if failing to
-		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
-		 *
 		 * Locking adev->reset_sem will prevent any external access
 		 * to GPU during PCI error recovery
 		 */
-		while (!amdgpu_device_lock_adev(adev, NULL))
-			amdgpu_cancel_all_tdr(adev);
+		amdgpu_device_lock_adev(adev, NULL);
 
 		/*
 		 * Block any work scheduling as we do for regular GPU reset
-- 
2.25.1