Hold the MEC pipe reset asserted, walk every queue on that (me, pipe) and
tear down CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST via
gfx_v11_0_clear_hqds_on_mec_pipe(), then deassert the reset. This avoids
releasing the pipe reset while HQDs may still be active.
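For reference, the kcq reset fallback in gfx_v11_0_reset_kcq() then runs
roughly in this order (sketch only, using the helpers this patch adds;
error handling trimmed):

	/* back up sibling queues, stop their DRM schedulers and KFD scheduling */
	amdgpu_gfx_mec_pipe_reset_prepare(adev, ring);
	/* assert pipe reset, clear the HQDs, deassert pipe reset */
	r = gfx_v11_0_reset_compute_pipe(ring);
	if (r)
		/* pipe reset failed: just let the queues run again */
		amdgpu_gfx_mec_pipe_restart_schedulers(adev, ring->me, ring->pipe,
						       ring->xcc_id);
	else
		/* re-init + remap every KCQ on the pipe, restart, test, re-emit */
		r = amdgpu_gfx_mec_pipe_reset_recover_queues(adev, ring, timedout_fence,
							     gfx_v11_0_kcq_init_queue);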
Legacy (non-RS64) path: read CP_MEC_CNTL for the reset mask instead of
reusing CP_MEC_RS64_CNTL state.

V2: stop the schedulers for all queues on the pipe, mark the fences with
an error, and then re-enable and test all of the queues after the reset
(Alex)

Suggested-by: Manu Rastogi <[email protected]>
Suggested-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 133 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  11 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 157 +++++++++++++++---------
 3 files changed, 243 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 2956e45c9254..a30a21163d2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -69,6 +69,139 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
 
 }
 
+static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me, u32 pipe)
+{
+	if (!ring || !ring->funcs || ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
+		return false;
+	if (ring->no_scheduler)
+		return false;
+
+	return ring->me == me && ring->pipe == pipe;
+}
+
+/* Same layout as amdgpu_gfx_run_cleaner_shader(): block of num_compute_rings per XCC. */
+static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct amdgpu_device *adev,
+							   u32 xcc_id)
+{
+	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+
+	if (num_xcc <= 1)
+		return 0;
+	return xcc_id * adev->gfx.num_compute_rings;
+}
+
+/**
+ * amdgpu_gfx_mec_pipe_reset_prepare - stop schedulers before MEC pipe reset HW
+ *
+ * Backs up ring state for sibling KCQs on the same (me, pipe), stops their DRM
+ * schedulers, and stops KFD scheduling for the node. The guilty ring is already
+ * backed up by amdgpu_ring_reset_helper_begin().
+ */
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+				       struct amdgpu_ring *guilty)
+{
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+		if (r != guilty)
+			amdgpu_ring_backup_unprocessed_commands(r, NULL);
+		if (amdgpu_ring_sched_ready(r))
+			drm_sched_wqueue_stop(&r->sched);
+	}
+
+	if (adev->kfd.init_complete)
+		amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
+}
+
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+					    u32 me, u32 pipe, u32 xcc_id)
+{
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, me, pipe))
+			continue;
+		if (amdgpu_ring_sched_ready(r))
+			drm_sched_wqueue_start(&r->sched);
+	}
+
+	if (adev->kfd.init_complete)
+		amdgpu_amdkfd_start_sched(adev, xcc_id);
+}
+
+/**
+ * amdgpu_gfx_mec_pipe_reset_recover_queues - re-init KCQs after MEC pipe reset
+ *
+ * Re-inits and remaps every kernel compute queue on the guilty ring's MEC pipe,
+ * restarts schedulers, then for each queue calls amdgpu_ring_reset_helper_end()
+ * (ring test + fence error / reemit). Sibling queues use a synthetic fence
+ * context so collateral work is reemitted.
+ * @timedout_fence: timeout fence for the guilty ring; must be non-NULL.
+ * @kcq_init: IP hook (e.g. gfx_v11_0_kcq_init_queue).
+ */
+int amdgpu_gfx_mec_pipe_reset_recover_queues(struct amdgpu_device *adev,
+					     struct amdgpu_ring *guilty,
+					     struct amdgpu_fence *timedout_fence,
+					     amdgpu_gfx_kcq_init_queue_t kcq_init)
+{
+	struct amdgpu_fence collateral_reemit = {};
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+	int err = 0;
+
+	if (!timedout_fence)
+		return -EINVAL;
+
+	collateral_reemit.context = (u64)-1;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+
+		err = kcq_init(r, true);
+		if (err)
+			goto err_sched;
+		err = amdgpu_mes_map_legacy_queue(adev, r, 0);
+		if (err)
+			goto err_sched;
+	}
+
+	amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+					       guilty->xcc_id);
+
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+
+		err = amdgpu_ring_reset_helper_end(
+			r, r == guilty ? timedout_fence : &collateral_reemit);
+		if (err) {
+			dev_err(adev->dev,
+				"ring %s failed recover after MEC pipe reset (%d)\n",
+				r->name, err);
+			return err;
+		}
+	}
+
+	return 0;
+
+err_sched:
+	amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+					       guilty->xcc_id);
+	return err;
+}
+
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
				     int xcc_id, int mec, int pipe, int queue)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index a0cf0a3b41da..a4a31fcd2d47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -603,6 +603,17 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device *adev, int mec,
 				int pipe, int queue);
 void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
 					int *mec, int *pipe, int *queue);
+
+typedef int (*amdgpu_gfx_kcq_init_queue_t)(struct amdgpu_ring *ring, bool clear);
+
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+				       struct amdgpu_ring *guilty);
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+					    u32 me, u32 pipe, u32 xcc_id);
+int amdgpu_gfx_mec_pipe_reset_recover_queues(
+	struct amdgpu_device *adev, struct amdgpu_ring *guilty,
+	struct amdgpu_fence *timedout_fence,
+	amdgpu_gfx_kcq_init_queue_t kcq_init);
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
 				     int mec, int pipe, int queue);
 bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index ae39b9e1f7d6..a25fc25279b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6906,11 +6906,40 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
 	return amdgpu_ring_reset_helper_end(ring, timedout_fence);
 }
 
+/*
+ * With MEC pipe reset asserted, clear CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST for
+ * every queue on (me, pipe). HQDs must be torn down while pipe reset stays
+ * asserted; only then clear the pipe reset bit.
+ * Caller must hold adev->srbm_mutex.
+ */
+static void gfx_v11_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32 me,
+					     u32 pipe)
+{
+	unsigned int q;
+	int j;
+
+	for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+		soc21_grbm_select(adev, me, pipe, q, 0);
+		/* Start from a clean HQD dequeue state before forcing HQD inactive. */
+		WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0);
+		if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) {
+			WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 1);
+			for (j = 0; j < adev->usec_timeout; j++) {
+				if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
+					break;
+				udelay(1);
+			}
+		}
+
+		WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0);
+	}
+}
+
 static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
-	uint32_t reset_pipe = 0, clean_pipe = 0;
+	uint32_t reset_val, clean_val;
 	int r;
 
 	if (!gfx_v11_pipe_reset_support(adev))
@@ -6920,69 +6949,73 @@ static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
 	mutex_lock(&adev->srbm_mutex);
 	soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
 
-	reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
-	clean_pipe = reset_pipe;
-
 	if (adev->gfx.rs64_enable) {
+		reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
+		clean_val = reset_val;
 		switch (ring->pipe) {
 		case 0:
-			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE0_RESET, 1);
-			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE0_RESET, 0);
+			reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE0_RESET, 1);
+			clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE0_RESET, 0);
 			break;
 		case 1:
-			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE1_RESET, 1);
-			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE1_RESET, 0);
+			reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE1_RESET, 1);
+			clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE1_RESET, 0);
 			break;
 		case 2:
-			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE2_RESET, 1);
-			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE2_RESET, 0);
+			reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE2_RESET, 1);
+			clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE2_RESET, 0);
 			break;
 		case 3:
-			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE3_RESET, 1);
-			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_RS64_CNTL,
-						   MEC_PIPE3_RESET, 0);
+			reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE3_RESET, 1);
+			clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
+						  MEC_PIPE3_RESET, 0);
 			break;
 		default:
 			break;
 		}
-		WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
-		WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
+		WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val);
+		gfx_v11_0_clear_hqds_on_mec_pipe(adev, ring->me, ring->pipe);
+		soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+		WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val);
 		r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
 		    RS64_FW_UC_START_ADDR_LO;
 	} else {
+		reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
+		clean_val = reset_val;
+
 		if (ring->me == 1) {
 			switch (ring->pipe) {
 			case 0:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE0_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE0_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE0_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE0_RESET, 0);
 				break;
 			case 1:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE1_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE1_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE1_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE1_RESET, 0);
 				break;
 			case 2:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE2_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE2_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE2_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE2_RESET, 0);
 				break;
 			case 3:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE3_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME1_PIPE3_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE3_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME1_PIPE3_RESET, 0);
 				break;
 			default:
 				break;
 			}
@@ -6991,36 +7024,38 @@ static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
 		} else {
 			switch (ring->pipe) {
 			case 0:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE0_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE0_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE0_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE0_RESET, 0);
 				break;
 			case 1:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE1_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE1_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE1_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE1_RESET, 0);
 				break;
 			case 2:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE2_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE2_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE2_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE2_RESET, 0);
 				break;
 			case 3:
-				reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE3_RESET, 1);
-				clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
-							   MEC_ME2_PIPE3_RESET, 0);
+				reset_val = REG_SET_FIELD(reset_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE3_RESET, 1);
+				clean_val = REG_SET_FIELD(clean_val, CP_MEC_CNTL,
+							  MEC_ME2_PIPE3_RESET, 0);
 				break;
 			default:
 				break;
 			}
 			/* mec2 fw pc: CP:CP_MEC2_INSTR_PNTR */
 		}
-		WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe);
-		WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe);
+		WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val);
+		gfx_v11_0_clear_hqds_on_mec_pipe(adev, ring->me, ring->pipe);
+		soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+		WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val);
 		r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR));
 	}
 
@@ -7028,7 +7063,7 @@ static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
 
 	mutex_unlock(&adev->srbm_mutex);
 	gfx_v11_0_unset_safe_mode(adev, 0);
-	dev_info(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", ring->name,
+	dev_dbg(adev->dev, "The ring %s pipe resets to MEC FW start PC: %s\n", ring->name,
 			r == 0 ? "successfully" : "failed");
 	/*FIXME:Sometimes driver can't cache the MEC firmware start PC correctly, so the pipe
 	 * reset status relies on the compute ring test result.
@@ -7048,9 +7083,15 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
 	r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true, 0);
 	if (r) {
 		dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
+		amdgpu_gfx_mec_pipe_reset_prepare(adev, ring);
 		r = gfx_v11_0_reset_compute_pipe(ring);
-		if (r)
+		if (r) {
+			amdgpu_gfx_mec_pipe_restart_schedulers(adev, ring->me, ring->pipe,
+							       ring->xcc_id);
 			return r;
+		}
+		return amdgpu_gfx_mec_pipe_reset_recover_queues(adev, ring, timedout_fence,
+								gfx_v11_0_kcq_init_queue);
 	}
 
 	r = gfx_v11_0_kcq_init_queue(ring, true);
-- 
2.49.0
