Implement the compute pipe reset; the driver will fall back to a
pipe reset when the queue reset fails.

Signed-off-by: Prike Liang <prike.li...@amd.com>
---
v2: Convert the GC logical instance to the physical instance when
    accessing registers, and use the dev_* print helpers to identify
    the failing device.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153 ++++++++++++++++++++----
 2 files changed, 138 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index e28c1ebfa98f..d4d74ba2bc27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
                                   uint32_t queue_type, uint32_t me_id,
                                   uint32_t pipe_id, uint32_t queue_id,
                                   uint32_t xcc_id, uint32_t vmid);
+       int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
+                                  uint32_t queue_type, uint32_t me,
+                                  uint32_t pipe, uint32_t queue,
+                                  uint32_t xcc_id);
+
        /* Packet sizes */
        int set_resources_size;
        int map_queues_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 2067f26d3a9d..ab9d5adbbfe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device 
*adev,
                                struct amdgpu_cu_info *cu_info);
 static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int 
xcc_id);
 static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int 
xcc_id);
+static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
+                                       uint32_t queue_type, uint32_t me,
+                                       uint32_t pipe, uint32_t queue,
+                                       uint32_t xcc_id);
 
 static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
                                uint64_t queue_mask)
@@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs 
= {
        .kiq_query_status = gfx_v9_4_3_kiq_query_status,
        .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
        .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
+       .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
        .set_resources_size = 8,
        .map_queues_size = 7,
        .unmap_queues_size = 6,
@@ -3466,6 +3471,115 @@ static void gfx_v9_4_3_emit_wave_limit(struct 
amdgpu_ring *ring, bool enable)
        }
 }
 
+/*
+ * Wait for the hardware queue selected by me/pipe/queue on the given XCC
+ * to report inactive.
+ *
+ * Bit 0 of CP_HQD_ACTIVE is polled once per microsecond, for at most
+ * adev->usec_timeout iterations.  The poll runs between the
+ * gfx_v9_4_3_xcc_set_safe_mode()/gfx_v9_4_3_xcc_unset_safe_mode() pair and
+ * with adev->srbm_mutex held while the GRBM selection points at the queue.
+ *
+ * Returns 0 once the queue reads inactive, -ETIMEDOUT otherwise.
+ */
+static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
+                               uint32_t pipe, uint32_t queue,
+                               uint32_t xcc_id)
+{
+       int ret = -ETIMEDOUT;
+       int retry;
+
+       gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
+       mutex_lock(&adev->srbm_mutex);
+       soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
+
+       /* make sure dequeue is complete */
+       for (retry = 0; retry < adev->usec_timeout; retry++) {
+               if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1)) {
+                       ret = 0;
+                       break;
+               }
+               udelay(1);
+       }
+
+       /* put the GRBM selection back to 0/0/0 before dropping the lock */
+       soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
+       mutex_unlock(&adev->srbm_mutex);
+       gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
+
+       return ret;
+}
+
+/*
+ * Report whether the loaded MEC firmware is new enough for compute pipe
+ * reset.
+ *
+ * Firmware older than version 0x9b (155) is rejected.  In that case a
+ * one-time warning is printed through dev_warn_once() so the message names
+ * the affected device.
+ *
+ * Returns true when pipe reset may be used, false otherwise.
+ */
+static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
+{
+       if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) {
+               dev_warn_once(adev->dev,
+                             "MEC firmware version too old, please use FW no older than 155!\n");
+               return false;
+       }
+
+       return true;
+}
+
+/*
+ * Reset one compute (MEC) pipe by pulsing its reset bit in CP_MEC_CNTL,
+ * then wait for the selected hardware queue to report inactive.
+ *
+ * @kiq_ring:   KIQ ring; only used here to reach the amdgpu device
+ * @queue_type: not used by this implementation; kept for the
+ *              kiq_pm4_funcs callback signature
+ * @me:         micro engine owning the pipe: 1 (pipes 0-3) or 2 (pipes 0-1)
+ * @pipe:       pipe to reset
+ * @queue:      queue used for the GRBM selection and the inactive wait
+ * @xcc_id:     logical XCC instance, converted with GET_INST() for
+ *              register access
+ *
+ * Returns -EINVAL when the MEC firmware is too old or when me/pipe has no
+ * reset bit handled below, otherwise the result of
+ * gfx_v9_4_3_unmap_done() (0 or -ETIMEDOUT).
+ */
+static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
+                                       uint32_t queue_type, uint32_t me,
+                                       uint32_t pipe, uint32_t queue,
+                                       uint32_t xcc_id)
+{
+       struct amdgpu_device *adev = kiq_ring->adev;
+       uint32_t reset_pipe, clean_pipe;
+
+       if (!gfx_v9_4_3_pipe_reset_support(adev))
+               return -EINVAL;
+
+       /*
+        * Reject unknown me/pipe combinations before touching hardware.
+        * Otherwise CP_MEC_CNTL would be rewritten unchanged (no reset at
+        * all) or the reset bit of a different pipe would be pulsed.
+        */
+       if (!((me == 1 && pipe < 4) || (me == 2 && pipe < 2))) {
+               dev_err(adev->dev, "invalid compute pipe for reset: me %u pipe %u\n",
+                       me, pipe);
+               return -EINVAL;
+       }
+
+       gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
+       mutex_lock(&adev->srbm_mutex);
+       soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
+
+       /*
+        * Build two values from the current register content: one with the
+        * pipe's reset bit set and one with it cleared.
+        */
+       reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
+       clean_pipe = reset_pipe;
+
+       if (me == 1) {
+               switch (pipe) {
+               case 0:
+                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE0_RESET, 1);
+                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE0_RESET, 0);
+                       break;
+               case 1:
+                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE1_RESET, 1);
+                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE1_RESET, 0);
+                       break;
+               case 2:
+                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE2_RESET, 1);
+                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE2_RESET, 0);
+                       break;
+               case 3:
+                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE3_RESET, 1);
+                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+                                                  MEC_ME1_PIPE3_RESET, 0);
+                       break;
+               default:
+                       /* not reachable: pipe was range-checked above */
+                       break;
+               }
+       } else {
+               /* me == 2 here, and pipe is 0 or 1 */
+               if (pipe) {
+                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+                                                  MEC_ME2_PIPE1_RESET, 1);
+                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+                                                  MEC_ME2_PIPE1_RESET, 0);
+               } else {
+                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
+                                                  MEC_ME2_PIPE0_RESET, 1);
+                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
+                                                  MEC_ME2_PIPE0_RESET, 0);
+               }
+       }
+
+       /* pulse the reset bit: write it set, then write it cleared */
+       WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
+       WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);
+       soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
+       mutex_unlock(&adev->srbm_mutex);
+       gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
+
+       return gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
+}
+
 static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
                                unsigned int vmid)
 {
@@ -3473,7 +3587,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
        struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
        struct amdgpu_ring *kiq_ring = &kiq->ring;
        unsigned long flags;
-       int r, i;
+       int r;
 
        if (amdgpu_sriov_vf(adev))
                return -EINVAL;
@@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring 
*ring,
        spin_unlock_irqrestore(&kiq->ring_lock, flags);
 
        r = amdgpu_ring_test_ring(kiq_ring);
-       if (r)
-               return r;
-
-       /* make sure dequeue is complete*/
-       amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
-       mutex_lock(&adev->srbm_mutex);
-       soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, 
GET_INST(GC, ring->xcc_id));
-       for (i = 0; i < adev->usec_timeout; i++) {
-               if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
-                       break;
-               udelay(1);
-       }
-       if (i >= adev->usec_timeout)
-               r = -ETIMEDOUT;
-       soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
-       mutex_unlock(&adev->srbm_mutex);
-       amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
        if (r) {
-               dev_err(adev->dev, "fail to wait on hqd deactive\n");
-               return r;
+               DRM_ERROR("kiq ring test failed after ring: %s queue reset\n",
+                               ring->name);
+               goto pipe_reset;
+       }
+
+       r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, 
ring->xcc_id);
+       if (r)
+               dev_err(adev->dev,"fail to wait on hqd deactive and will try 
pipe reset\n");
+
+pipe_reset:
+       if(r) {
+               r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
+                                               ring->me, ring->pipe,
+                                               ring->queue, ring->xcc_id);
+               DRM_INFO("ring: %s pipe reset :%s\n", ring->name,
+                               r ? "failed" : "successfully");
+               if (r)
+                       return r;
        }
 
        r = amdgpu_bo_reserve(ring->mqd_obj, false);
-- 
2.34.1

Reply via email to