Keep the MEC pipe reset asserted, walk every queue on that (me, pipe), and
clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST via
gfx_v11_0_clear_hqds_on_mec_pipe() before deasserting the reset. This avoids
releasing the pipe reset while HQDs may still be active.

Legacy (non-RS64) path: read CP_MEC_CNTL for the reset mask instead of
reusing CP_MEC_RS64_CNTL state.

v2: Stop the schedulers for all queues on the pipe, mark the fences with an
error, and make sure to re-enable and test all of the queues after the
reset (Alex)

Suggested-by: Manu Rastogi <[email protected]>
Suggested-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 133 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  11 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 157 +++++++++++++++---------
 3 files changed, 243 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 2956e45c9254..a30a21163d2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -69,6 +69,139 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct 
amdgpu_device *adev, int bit,
 
 }
 
+static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me, u32 
pipe)
+{
+       if (!ring || !ring->funcs || ring->funcs->type != 
AMDGPU_RING_TYPE_COMPUTE)
+               return false;
+       if (ring->no_scheduler)
+               return false;
+
+       return ring->me == me && ring->pipe == pipe;
+}
+
+/* Same layout as amdgpu_gfx_run_cleaner_shader(): block of num_compute_rings 
per XCC. */
+static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct amdgpu_device 
*adev,
+                                                        u32 xcc_id)
+{
+       int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+
+       if (num_xcc <= 1)
+               return 0;
+       return xcc_id * adev->gfx.num_compute_rings;
+}
+
+/**
+ * amdgpu_gfx_mec_pipe_reset_prepare - stop schedulers before MEC pipe reset HW
+ *
+ * Backs up ring state for sibling KCQs on the same (me, pipe), stops their DRM
+ * schedulers, and stops KFD scheduling for the node. The guilty ring is 
already
+ * backed up by amdgpu_ring_reset_helper_begin().
+ */
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+                                      struct amdgpu_ring *guilty)
+{
+       struct amdgpu_ring *r;
+       unsigned int j, base;
+
+       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+               r = &adev->gfx.compute_ring[base + j];
+               if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+                       continue;
+               if (r != guilty)
+                       amdgpu_ring_backup_unprocessed_commands(r, NULL);
+               if (amdgpu_ring_sched_ready(r))
+                       drm_sched_wqueue_stop(&r->sched);
+       }
+
+       if (adev->kfd.init_complete)
+               amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
+}
+
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+                                           u32 me, u32 pipe, u32 xcc_id)
+{
+       struct amdgpu_ring *r;
+       unsigned int j, base;
+
+       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, xcc_id);
+       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+               r = &adev->gfx.compute_ring[base + j];
+               if (!amdgpu_gfx_ring_on_mec_pipe(r, me, pipe))
+                       continue;
+               if (amdgpu_ring_sched_ready(r))
+                       drm_sched_wqueue_start(&r->sched);
+       }
+
+       if (adev->kfd.init_complete)
+               amdgpu_amdkfd_start_sched(adev, xcc_id);
+}
+
+/**
+ * amdgpu_gfx_mec_pipe_reset_recover_queues - re-init KCQs after MEC pipe reset
+ *
+ * Re-inits and remaps every kernel compute queue on the guilty ring's MEC 
pipe,
+ * restarts schedulers, then for each queue calls 
amdgpu_ring_reset_helper_end()
+ * (ring test + fence error / reemit). Sibling queues use a synthetic fence
+ * context so collateral work is reemitted.
+ * @timedout_fence: timeout fence for the guilty ring; must be non-NULL.
+ * @kcq_init: IP hook (e.g. gfx_v11_0_kcq_init_queue).
+ */
+int amdgpu_gfx_mec_pipe_reset_recover_queues(struct amdgpu_device *adev,
+                                            struct amdgpu_ring *guilty,
+                                            struct amdgpu_fence 
*timedout_fence,
+                                            amdgpu_gfx_kcq_init_queue_t 
kcq_init)
+{
+       struct amdgpu_fence collateral_reemit = {};
+       struct amdgpu_ring *r;
+       unsigned int j, base;
+       int err = 0;
+
+       if (!timedout_fence)
+               return -EINVAL;
+
+       collateral_reemit.context = (u64)-1;
+
+       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+               r = &adev->gfx.compute_ring[base + j];
+               if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+                       continue;
+
+               err = kcq_init(r, true);
+               if (err)
+                       goto err_sched;
+               err = amdgpu_mes_map_legacy_queue(adev, r, 0);
+               if (err)
+                       goto err_sched;
+       }
+
+       amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+                                              guilty->xcc_id);
+
+       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+               r = &adev->gfx.compute_ring[base + j];
+               if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+                       continue;
+
+               err = amdgpu_ring_reset_helper_end(
+                       r, r == guilty ? timedout_fence : &collateral_reemit);
+               if (err) {
+                       dev_err(adev->dev,
+                               "ring %s failed recover after MEC pipe reset 
(%d)\n",
+                               r->name, err);
+                       return err;
+               }
+       }
+
+       return 0;
+
+err_sched:
+       amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+                                              guilty->xcc_id);
+       return err;
+}
+
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
                                     int xcc_id, int mec, int pipe, int queue)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index a0cf0a3b41da..a4a31fcd2d47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -603,6 +603,17 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device 
*adev, int mec,
                                int pipe, int queue);
 void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
                                 int *mec, int *pipe, int *queue);
+
+typedef int (*amdgpu_gfx_kcq_init_queue_t)(struct amdgpu_ring *ring, bool 
clear);
+
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+                                      struct amdgpu_ring *guilty);
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+                                           u32 me, u32 pipe, u32 xcc_id);
+int amdgpu_gfx_mec_pipe_reset_recover_queues(
+       struct amdgpu_device *adev, struct amdgpu_ring *guilty,
+       struct amdgpu_fence *timedout_fence,
+       amdgpu_gfx_kcq_init_queue_t kcq_init);
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
                                     int mec, int pipe, int queue);
 bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index ae39b9e1f7d6..a25fc25279b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6906,11 +6906,40 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
        return amdgpu_ring_reset_helper_end(ring, timedout_fence);
 }
 
+/*
+ * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST 
for
+ * every queue on (me, pipe). HQDs must be torn down while pipe reset stays
+ * asserted; only then clear the pipe reset bit.
+ * Caller must hold adev->srbm_mutex.
+ */
+static void gfx_v11_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 
me,
+                                            u32 pipe)
+{
+       unsigned int q;
+       int j;
+
+       for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+               soc21_grbm_select(adev, me, pipe, q, 0);
+               /* Start from a clean HQD dequeue state before forcing HQD 
inactive. */
+               WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0);
+               if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) {
+                       WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 1);
+                       for (j = 0; j < adev->usec_timeout; j++) {
+                               if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 
1))
+                                       break;
+                               udelay(1);
+                       }
+               }
+
+               WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0);
+       }
+}
+
 static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
 {
 
        struct amdgpu_device *adev = ring->adev;
-       uint32_t reset_pipe = 0, clean_pipe = 0;
+       uint32_t reset_val, clean_val;
        int r;
 
        if (!gfx_v11_pipe_reset_support(adev))
@@ -6920,69 +6949,73 @@ static int gfx_v11_0_reset_compute_pipe(struct 
amdgpu_ring *ring)
        mutex_lock(&adev->srbm_mutex);
        soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
 
-       reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
-       clean_pipe = reset_pipe;
-
        if (adev->gfx.rs64_enable) {
+               reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
+               clean_val = reset_val;
 
                switch (ring->pipe) {
                case 0:
-                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE0_RESET, 1);
-                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE0_RESET, 0);
+                       reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE0_RESET, 1);
+                       clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE0_RESET, 0);
                        break;
                case 1:
-                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE1_RESET, 1);
-                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE1_RESET, 0);
+                       reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE1_RESET, 1);
+                       clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE1_RESET, 0);
                        break;
                case 2:
-                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE2_RESET, 1);
-                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE2_RESET, 0);
+                       reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE2_RESET, 1);
+                       clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE2_RESET, 0);
                        break;
                case 3:
-                       reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE3_RESET, 1);
-                       clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-                                                  MEC_PIPE3_RESET, 0);
+                       reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE3_RESET, 1);
+                       clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+                                                 MEC_PIPE3_RESET, 0);
                        break;
                default:
                        break;
                }
-               WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
-               WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
+               WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val);
+               gfx_v11_0_clear_hqds_on_mec_pipe(adev, ring->me, ring->pipe);
+               soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+               WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val);
                r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
                                        RS64_FW_UC_START_ADDR_LO;
        } else {
+               reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
+               clean_val = reset_val;
+
                if (ring->me == 1) {
                        switch (ring->pipe) {
                        case 0:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE0_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE0_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE0_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE0_RESET, 
0);
                                break;
                        case 1:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE1_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE1_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE1_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE1_RESET, 
0);
                                break;
                        case 2:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE2_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE2_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE2_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE2_RESET, 
0);
                                break;
                        case 3:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE3_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME1_PIPE3_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE3_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME1_PIPE3_RESET, 
0);
                                break;
                        default:
                                break;
@@ -6991,36 +7024,38 @@ static int gfx_v11_0_reset_compute_pipe(struct 
amdgpu_ring *ring)
                } else {
                        switch (ring->pipe) {
                        case 0:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE0_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE0_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE0_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE0_RESET, 
0);
                                break;
                        case 1:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE1_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE1_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE1_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE1_RESET, 
0);
                                break;
                        case 2:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE2_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE2_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE2_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE2_RESET, 
0);
                                break;
                        case 3:
-                               reset_pipe = REG_SET_FIELD(reset_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE3_RESET, 
1);
-                               clean_pipe = REG_SET_FIELD(clean_pipe, 
CP_MEC_CNTL,
-                                                          MEC_ME2_PIPE3_RESET, 
0);
+                               reset_val = REG_SET_FIELD(reset_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE3_RESET, 
1);
+                               clean_val = REG_SET_FIELD(clean_val, 
CP_MEC_CNTL,
+                                                         MEC_ME2_PIPE3_RESET, 
0);
                                break;
                        default:
                                break;
                        }
                        /* mec2 fw pc: CP:CP_MEC2_INSTR_PNTR */
                }
-               WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe);
-               WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe);
+               WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val);
+               gfx_v11_0_clear_hqds_on_mec_pipe(adev, ring->me, ring->pipe);
+               soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+               WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val);
                r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR));
        }
 
@@ -7028,7 +7063,7 @@ static int gfx_v11_0_reset_compute_pipe(struct 
amdgpu_ring *ring)
        mutex_unlock(&adev->srbm_mutex);
        gfx_v11_0_unset_safe_mode(adev, 0);
 
-       dev_info(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", 
ring->name,
+       dev_dbg(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", 
ring->name,
                        r == 0 ? "successfully" : "failed");
        /*FIXME:Sometimes driver can't cache the MEC firmware start PC 
correctly, so the pipe
         * reset status relies on the compute ring test result.
@@ -7048,9 +7083,15 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
        r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0);
        if (r) {
                dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe 
reset\n", r);
+               amdgpu_gfx_mec_pipe_reset_prepare(adev, ring);
                r = gfx_v11_0_reset_compute_pipe(ring);
-               if (r)
+               if (r) {
+                       amdgpu_gfx_mec_pipe_restart_schedulers(adev, ring->me, 
ring->pipe,
+                                                              ring->xcc_id);
                        return r;
+               }
+               return amdgpu_gfx_mec_pipe_reset_recover_queues(adev, ring, 
timedout_fence,
+                                                               
gfx_v11_0_kcq_init_queue);
        }
 
        r = gfx_v11_0_kcq_init_queue(ring, true);
-- 
2.49.0

Reply via email to