On Tue, May 12, 2026 at 4:59 AM Jesse Zhang <[email protected]> wrote:
>
> Introduce helpers to prepare for and recover from a MEC pipe
> reset. The pre‑reset handler stops the KFD scheduler if the KFD is
> initialised, preventing new submissions while the pipe is being
> reset. The post‑reset handler iterates over all compute rings
> sharing the same MEC pipe (on the affected XCC) and marks any
> non‑guilty ring’s scheduler as faulted via drm_sched_fault().
>
> v2: drop the DRM scheduler stop; have a worker thread which schedules a call to
> drm_sched_fault() for all of the affected queues (Alex)
>
> Suggested-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 45 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 +++
> 2 files changed, 50 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 515cc4a2aeb4..a9fd639e4cd6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -69,6 +69,51 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct
> amdgpu_device *adev, int bit,
>
> }
>
> +static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me,
> u32 pipe)
> +{
> + if (!ring || !ring->funcs || ring->funcs->type !=
> AMDGPU_RING_TYPE_COMPUTE)
> + return false;
> +
> + return ring->me == me && ring->pipe == pipe;
> +}
> +
> +static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct
> amdgpu_device *adev,
> + u32 xcc_id)
> +{
> + int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
> +
> + if (num_xcc <= 1)
> + return 0;
> + return xcc_id * adev->gfx.num_compute_rings;
> +}
> +
> +void amdgpu_gfx_mec_pre_pipe_reset(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty)
> +{
> + if (adev->kfd.init_complete)
> + amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
> +}
> +
> +void amdgpu_gfx_mec_post_pipe_reset(struct amdgpu_device *adev, struct
> amdgpu_ring *guilty)
> +{
> + struct amdgpu_ring *ring;
> + unsigned int j, base;
> +
> + base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
> + for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> + ring = &adev->gfx.compute_ring[base + j];
> + if (!amdgpu_gfx_ring_on_mec_pipe(ring, guilty->me,
> guilty->pipe))
> + continue;
> +
> + if (ring != guilty)
> + drm_sched_fault(&ring->sched);
You can't call drm_sched_fault() from the reset handler, since the handler
is already running in the work queue that drm_sched_fault() modifies.
-- Alex
> + }
> +
> + if (adev->kfd.init_complete)
> + amdgpu_amdkfd_start_sched(adev, guilty->xcc_id);
> +}
> +
> bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
> int xcc_id, int mec, int pipe, int queue)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 77050f9884f2..1deb82836f02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -603,6 +603,11 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device
> *adev, int mec,
> int pipe, int queue);
> void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
> int *mec, int *pipe, int *queue);
> +
> +void amdgpu_gfx_mec_pre_pipe_reset(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty);
> +void amdgpu_gfx_mec_post_pipe_reset(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty);
> bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
> int mec, int pipe, int queue);
> bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
> --
> 2.49.0
>