Which method is used to flush the TLB does not depend on whether a
reset is in progress. If the GPU is about to be reset we should skip
the flush altogether, so put both paths under the reset_domain read
lock.
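
For reference, a minimal sketch of the locking pattern this change
adopts (the helper name is hypothetical; the actual change is in
amdgpu_gmc_flush_gpu_tlb_pasid below):

    /* Hypothetical helper showing the trylock-and-skip pattern. */
    static int example_flush_with_reset_check(struct amdgpu_device *adev)
    {
            int r = 0;

            /* If a reset holds the write side of reset_domain->sem,
             * skip the flush: the reset flushes all TLBs anyway.
             */
            if (!down_read_trylock(&adev->reset_domain->sem))
                    return 0;

            /* ... issue the flush here, via either the MMIO or the
             * KIQ path; both now run under the read lock ...
             */

            up_read(&adev->reset_domain->sem);
            return r;
    }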

Signed-off-by: Yunxiang Li <yunxiang...@amd.com>
Reviewed-by: Christian König <christian.koe...@amd.com>
Cc: sta...@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 603c0738fd03..4edd10b10a92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -684,12 +684,17 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
        struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
        struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
        unsigned int ndw;
-       signed long r;
+       int r;
        uint32_t seq;
 
-       if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
-           !down_read_trylock(&adev->reset_domain->sem)) {
+       /*
+        * A GPU reset should flush all TLBs anyway, so no need to do
+        * this while one is ongoing.
+        */
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return 0;
 
+       if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready) {
                if (adev->gmc.flush_tlb_needs_extra_type_2)
                        adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
                                                                 2, all_hub,
@@ -703,43 +708,40 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
                adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
                                                         flush_type, all_hub,
                                                         inst);
-               return 0;
-       }
+               r = 0;
+       } else {
+               /* 2 dwords flush + 8 dwords fence */
+               ndw = kiq->pmf->invalidate_tlbs_size + 8;
 
-       /* 2 dwords flush + 8 dwords fence */
-       ndw = kiq->pmf->invalidate_tlbs_size + 8;
+               if (adev->gmc.flush_tlb_needs_extra_type_2)
+                       ndw += kiq->pmf->invalidate_tlbs_size;
 
-       if (adev->gmc.flush_tlb_needs_extra_type_2)
-               ndw += kiq->pmf->invalidate_tlbs_size;
+               if (adev->gmc.flush_tlb_needs_extra_type_0)
+                       ndw += kiq->pmf->invalidate_tlbs_size;
 
-       if (adev->gmc.flush_tlb_needs_extra_type_0)
-               ndw += kiq->pmf->invalidate_tlbs_size;
+               spin_lock(&adev->gfx.kiq[inst].ring_lock);
+               amdgpu_ring_alloc(ring, ndw);
+               if (adev->gmc.flush_tlb_needs_extra_type_2)
+                       kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
 
-       spin_lock(&adev->gfx.kiq[inst].ring_lock);
-       amdgpu_ring_alloc(ring, ndw);
-       if (adev->gmc.flush_tlb_needs_extra_type_2)
-               kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
+               if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
+                       kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
 
-       if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
-               kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
+               kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
+               r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
+               if (r) {
+                       amdgpu_ring_undo(ring);
+                       spin_unlock(&adev->gfx.kiq[inst].ring_lock);
+                       goto error_unlock_reset;
+               }
 
-       kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
-       r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
-       if (r) {
-               amdgpu_ring_undo(ring);
+               amdgpu_ring_commit(ring);
                spin_unlock(&adev->gfx.kiq[inst].ring_lock);
-               goto error_unlock_reset;
-       }
-
-       amdgpu_ring_commit(ring);
-       spin_unlock(&adev->gfx.kiq[inst].ring_lock);
-       r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
-       if (r < 1) {
-               dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
-               r = -ETIME;
-               goto error_unlock_reset;
+               if (amdgpu_fence_wait_polling(ring, seq, usec_timeout) < 1) {
+                       dev_err(adev->dev, "timeout waiting for kiq fence\n");
+                       r = -ETIME;
+               }
        }
-       r = 0;
 
 error_unlock_reset:
        up_read(&adev->reset_domain->sem);
-- 
2.34.1
