Make sure all fences dependent on HW presence are force signaled
when handling device removal. This helps later to scope all HW
accessing code such as IOCTLs in drm_dev_enter/exit and use
drm_dev_unplug as a synchronization point past which we know HW
will not be accessed anymore outside of the PCI remove driver callback.

Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 ++++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 12 +--
 4 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0db0ba4fba89..df6c5ed676b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1374,6 +1374,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev);
 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev);
 bool amdgpu_device_load_pci_state(struct pci_dev *pdev);
 
+void amdgpu_finilize_device_fences(struct drm_device *dev);
+
 #include "amdgpu_object.h"
 
 static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 33e8e9e1d1fe..55afc11c17e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3692,15 +3692,12 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
                amdgpu_virt_fini_data_exchange(adev);
        }
 
-       /* disable all interrupts */
-       amdgpu_irq_disable_all(adev);
        if (adev->mode_info.mode_config_initialized){
                if (!amdgpu_device_has_dc_support(adev))
                        drm_helper_force_disable_all(adev_to_drm(adev));
                else
                        drm_atomic_helper_shutdown(adev_to_drm(adev));
        }
-       amdgpu_fence_driver_fini_hw(adev);
 
        if (adev->pm_sysfs_en)
                amdgpu_pm_sysfs_fini(adev);
@@ -4567,14 +4564,19 @@ static bool amdgpu_device_lock_adev(struct 
amdgpu_device *adev,
        return true;
 }
 
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unlock_adev_imp(struct amdgpu_device *adev, bool 
skip_in_gpu_reset)
 {
        amdgpu_vf_error_trans_all(adev);
        adev->mp1_state = PP_MP1_STATE_NONE;
-       atomic_set(&adev->in_gpu_reset, 0);
+       !skip_in_gpu_reset ? atomic_set(&adev->in_gpu_reset, 0) : 0;
        up_write(&adev->reset_sem);
 }
 
+static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+{
+       amdgpu_device_unlock_adev_imp(adev, false);
+}
+
 /*
  * to lockup a list of amdgpu devices in a hive safely, if not a hive
  * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
@@ -5321,3 +5323,89 @@ bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
 }
 
 
+static void amdgpu_finilize_schedulded_fences(struct amdgpu_ctx_mgr *mgr)
+{
+       struct amdgpu_ctx *ctx;
+       struct idr *idp;
+       uint32_t id, i, j;
+
+       idp = &mgr->ctx_handles;
+
+       idr_for_each_entry(idp, ctx, id) {
+               for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+                       for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+                               struct drm_sched_entity *entity;
+
+                               if (!ctx->entities[i][j])
+                                       continue;
+
+                               entity = &ctx->entities[i][j]->entity;
+                               drm_sched_entity_kill_jobs(entity);
+                       }
+               }
+       }
+}
+
+/**
+ * amdgpu_finilize_device_fences() - Finalize all device fences
+ * @dev: pointer to DRM device
+ *
+ * Will disable and finalize ISRs and will signal all fences
+ * that might hang if HW is gone
+ */
+void amdgpu_finilize_device_fences(struct drm_device *dev)
+{
+       struct amdgpu_device *adev = drm_to_adev(dev);
+       struct drm_file *file;
+
+       /*
+        *  Block TDRs from further execution by setting adev->in_gpu_reset
+        *  instead of holding full reset lock in order to not deadlock
+        *  further ahead against any thread locking the reset lock when we
+        *  wait for its completion
+        */
+       while (!amdgpu_device_lock_adev(adev, NULL))
+               amdgpu_cancel_all_tdr(adev);
+       amdgpu_device_unlock_adev_imp(adev, true);
+
+
+       /* disable all HW interrupts */
+       amdgpu_irq_disable_all(adev);
+
+       /* stop and flush all in flight HW interrupts handlers */
+       disable_irq(pci_irq_vector(adev->pdev, 0));
+
+       /*
+        * Stop SW GPU schedulers and force completion on all HW fences. Since
+        * in the previous step all ISRs were disabled and have completed, the
+        * HW fence array is idle (no insertions or extractions) and so it's
+        * safe to iterate it below.
+        * After this step all HW fences in the system are signaled. As a result,
+        * all the scheduler 'finished' fences are signaled as well.
+        */
+       amdgpu_fence_driver_fini_hw(adev);
+
+       /*
+        * Reject any further jobs to any scheduler entity queue. After this
+        * step there are no new insertions and, because the schedulers are
+        * stopped, also no new extractions.
+        */
+       down_read(&adev->sched_fence_completion_sem);
+       adev->stop_job_submissions = true;
+       up_read(&adev->sched_fence_completion_sem);
+
+       /*
+        * Complete all scheduler 'scheduled' fences currently pending.
+        * It's OK if new contexts and sched entities are concurrently
+        * still created as they will fail in pushing jobs to SW queues
+        * and their 'scheduled' fences will be signaled with an error.
+        */
+       mutex_lock(&adev->ddev.filelist_mutex);
+       list_for_each_entry(file, &adev->ddev.filelist, lhead) {
+               struct amdgpu_fpriv *fpriv = file->driver_priv;
+               amdgpu_finilize_schedulded_fences(&fpriv->ctx_mgr);
+       }
+       mutex_unlock(&adev->ddev.filelist_mutex);
+}
+
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f799c40d7e72..8a19b8dd02ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1249,6 +1249,12 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 {
        struct drm_device *dev = pci_get_drvdata(pdev);
 
+       /*
+        * Force completion of all device related fences that might hang us when
+        * synchronizing SRCU in the following step.
+        */
+       amdgpu_finilize_device_fences(dev);
+
        drm_dev_unplug(dev);
        amdgpu_driver_unload_kms(dev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2670201e78d3..af592b28cd35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -526,7 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
  */
 void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
 {
-       int i, r;
+       int i;
 
        for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
                struct amdgpu_ring *ring = adev->rings[i];
@@ -535,18 +535,10 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device 
*adev)
                        continue;
 
                /* Stop any new job submissions from sched before flushing the 
ring */
-               /* TODO Handle amdgpu_job_submit_direct and 
amdgpu_amdkfd_submit_ib */
                if (!ring->no_scheduler)
                        drm_sched_fini(&ring->sched);
 
-               /* You can't wait for HW to signal if it's gone */
-               if (!drm_dev_is_unplugged(&adev->ddev))
-                       r = amdgpu_fence_wait_empty(ring);
-               else
-                       r = -ENODEV;
-               /* no need to trigger GPU reset as we are unloading */
-               if (r)
-                       amdgpu_fence_driver_force_completion(ring);
+               amdgpu_fence_driver_force_completion(ring);
 
                if (ring->fence_drv.irq_src)
                        amdgpu_irq_put(adev, ring->fence_drv.irq_src,
-- 
2.25.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to