The MES firmware expects synchronous operation with the
driver.  For this to work asynchronously, each caller
would need to provide its own fence location and sequence
number.

For now, add a mutex lock to serialize the MES submission.
For the SR-IOV long-wait case, split the long wait into
several shorter waits so that it does not block the reset
sequence.

Signed-off-by: Horace Chen <horace.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  1 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 18 ++++++++++++++----
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 78e4f88f5134..8896be95b2c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -137,6 +137,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
        spin_lock_init(&adev->mes.queue_id_lock);
        spin_lock_init(&adev->mes.ring_lock);
        mutex_init(&adev->mes.mutex_hidden);
+       mutex_init(&adev->mes.submission_lock);
 
        adev->mes.total_max_queue = AMDGPU_FENCE_MES_QUEUE_ID_MASK;
        adev->mes.vmid_mask_mmhub = 0xffffff00;
@@ -221,6 +222,7 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
        idr_destroy(&adev->mes.queue_id_idr);
        ida_destroy(&adev->mes.doorbell_ida);
        mutex_destroy(&adev->mes.mutex_hidden);
+       mutex_destroy(&adev->mes.submission_lock);
        return r;
 }
 
@@ -240,6 +242,7 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
        idr_destroy(&adev->mes.queue_id_idr);
        ida_destroy(&adev->mes.doorbell_ida);
        mutex_destroy(&adev->mes.mutex_hidden);
+       mutex_destroy(&adev->mes.submission_lock);
 }
 
 static void amdgpu_mes_queue_free_mqd(struct amdgpu_mes_queue *q)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 6b3e1844eac5..90af935cc889 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -85,6 +85,7 @@ struct amdgpu_mes {
 
        struct amdgpu_ring              ring;
        spinlock_t                      ring_lock;
+       struct mutex                    submission_lock;
 
        const struct firmware           *fw[AMDGPU_MAX_MES_PIPES];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index e40d00afd4f5..0a609a5b8835 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -162,6 +162,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        struct amdgpu_ring *ring = &mes->ring;
        unsigned long flags;
        signed long timeout = adev->usec_timeout;
+       signed long retry_count = 1;
        const char *op_str, *misc_op_str;
 
        if (x_pkt->header.opcode >= MES_SCH_API_MAX)
@@ -169,15 +170,19 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 
        if (amdgpu_emu_mode) {
                timeout *= 100;
-       } else if (amdgpu_sriov_vf(adev)) {
+       }
+
+       if (amdgpu_sriov_vf(adev) && timeout > 0) {
                /* Worst case in sriov where all other 15 VF timeout, each VF needs about 600ms */
-               timeout = 15 * 600 * 1000;
+               retry_count = (15 * 600 * 1000) / timeout;
        }
        BUG_ON(size % 4 != 0);
 
+       mutex_lock(&mes->submission_lock);
        spin_lock_irqsave(&mes->ring_lock, flags);
        if (amdgpu_ring_alloc(ring, ndw)) {
                spin_unlock_irqrestore(&mes->ring_lock, flags);
+               mutex_unlock(&mes->submission_lock);
                return -ENOMEM;
        }
 
@@ -199,8 +204,13 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
        else
                dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
 
-       r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
-                     timeout);
+       do {
+               r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
+                               timeout);
+               retry_count--;
+       } while (retry_count > 0 && !amdgpu_in_reset(adev));
+
+       mutex_unlock(&mes->submission_lock);
        if (r < 1) {
 
                if (misc_op_str)
-- 
2.34.1

Reply via email to