[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Alex Deucher <[email protected]>
> Sent: Thursday, January 15, 2026 2:14 AM
> To: Lazar, Lijo <[email protected]>
> Cc: Zhang, Jesse(Jie) <[email protected]>; [email protected];
> Deucher, Alexander <[email protected]>; Koenig, Christian
> <[email protected]>
> Subject: Re: [PATCH 1/5] drm/amdgpu/vcn4.0.3: rework reset handling
>
> On Wed, Jan 14, 2026 at 6:27 AM Lazar, Lijo <[email protected]> wrote:
> >
> >
> >
> > On 14-Jan-26 2:29 PM, Jesse.Zhang wrote:
> > > From: "Alex Deucher" <[email protected]>
> > >
> > > Resetting VCN resets the entire tile, including jpeg.
> > > When we reset VCN, we also need to handle the jpeg queues.
> > > Add a helper to handle recovering the jpeg queues when VCN is reset.
> > >
> > > Signed-off-by: Alex Deucher <[email protected]>
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 11 +++++--
> > >   drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c  | 42
> ++++++++++++++++++++++--
> > >   2 files changed, 49 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > index aae7328973d1..1a32dadf8c5d 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > @@ -1145,13 +1145,20 @@ static int jpeg_v4_0_3_ring_reset(struct
> amdgpu_ring *ring,
> > >                                 unsigned int vmid,
> > >                                 struct amdgpu_fence *timedout_fence)
> > >   {
> > > +     struct amdgpu_device *adev = ring->adev;
> > > +     struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me];
> > > +     int r;
> > > +
> > >       if (amdgpu_sriov_vf(ring->adev))
> > >               return -EOPNOTSUPP;
> > > -
> > > +     /* take the vcn reset mutex here because resetting VCN will reset 
> > > jpeg as
> well */
> > > +     mutex_lock(&vinst->engine_reset_mutex);
> > >       amdgpu_ring_reset_helper_begin(ring, timedout_fence);
> > >       jpeg_v4_0_3_core_stall_reset(ring);
> > >       jpeg_v4_0_3_start_jrbc(ring);
> > > -     return amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > +     r = amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > +     mutex_unlock(&vinst->engine_reset_mutex);
> > > +     return r;
> > >   }
> > >
> > >   static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = { diff
> > > --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > index cb7123ec1a5d..31d93c10dfb1 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > @@ -1596,6 +1596,32 @@ static void vcn_v4_0_3_unified_ring_set_wptr(struct
> amdgpu_ring *ring)
> > >       }
> > >   }
> > >
> > > +static int vcn_v4_0_3_reset_jpeg_helper(struct amdgpu_device *adev,
> > > +                                     int inst) {
> > > +     struct amdgpu_ring *ring;
> > > +     int i, r;
> > > +
> > > +     for (i = 0; i < adev->jpeg.num_jpeg_rings; ++i) {
> > > +             ring = &adev->jpeg.inst[inst].ring_dec[i];
> > > +             drm_sched_wqueue_stop(&ring->sched);
> > > +             amdgpu_fence_driver_force_completion(ring);
> > > +             if (ring->use_doorbell)
> > > +                     WREG32_SOC15_OFFSET(
> > > +                             VCN, GET_INST(VCN, inst),
> > > +                             regVCN_JPEG_DB_CTRL,
> > > +                             (ring->pipe ? (ring->pipe - 0x15) : 0),
> > > +                             ring->doorbell_index
> > > +                             << VCN_JPEG_DB_CTRL__OFFSET__SHIFT |
> > > +                             VCN_JPEG_DB_CTRL__EN_MASK);
> > > +             r = amdgpu_ring_test_helper(ring);
> > > +             if (r)
> > > +                     return r;
> > > +             drm_sched_wqueue_start(&ring->sched);
> > > +     }
> > > +     return 0;
> > > +}
> > > +
> > >   static int vcn_v4_0_3_ring_reset(struct amdgpu_ring *ring,
> > >                                unsigned int vmid,
> > >                                struct amdgpu_fence *timedout_fence)
> > > @@ -1605,6 +1631,9 @@ static int vcn_v4_0_3_ring_reset(struct amdgpu_ring
> *ring,
> > >       struct amdgpu_device *adev = ring->adev;
> > >       struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me];
> > >
> > > +     /* take the vcn reset mutex here because resetting VCN will reset 
> > > jpeg as
> well */
> > > +     mutex_lock(&vinst->engine_reset_mutex);
> > > +
> > >       amdgpu_ring_reset_helper_begin(ring, timedout_fence);
> > >
> > >       vcn_inst = GET_INST(VCN, ring->me); @@ -1612,7 +1641,7 @@
> > > static int vcn_v4_0_3_ring_reset(struct amdgpu_ring *ring,
> > >
> > >       if (r) {
> > >               DRM_DEV_ERROR(adev->dev, "VCN reset fail : %d\n", r);
> > > -             return r;
> > > +             goto unlock;
> > >       }
> > >
> > >       /* This flag is not set for VF, assumed to be disabled always
> > > */ @@ -1621,7 +1650,16 @@ static int vcn_v4_0_3_ring_reset(struct
> amdgpu_ring *ring,
> > >       vcn_v4_0_3_hw_init_inst(vinst);
> > >       vcn_v4_0_3_start_dpg_mode(vinst,
> > > adev->vcn.inst[ring->me].indirect_sram);
> > >
> > > -     return amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > +     r = amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > +     if (r)
> > > +             goto unlock;
> > > +
> > > +     r = vcn_v4_0_3_reset_jpeg_helper(adev, ring->me);
> >
> > This doesn't seem to handle any ongoing jpeg activity before doing a
> > vcn reset. Is that fine?
>
> We could split the helper in two, in the top helper we can stop the sched 
> workqueues
> and attempt to wait for any outstanding fences. Then in the bottom helper, we 
> can
> force completion, re-init the rings, and restart the sched workqueues.  With 
> the patch
> as is, any outstanding job fences would be marked with an error, and 
> signalled so
> userspace would see their jobs lost.
[Zhang, Jesse(Jie)]   Is it possible to back up these commands in the top
helper? This would avoid waiting for any unfinished fence synchronization,
because those fences might time out.
static int vcn_v4_0_3_reset_jpeg_pre_helper(struct amdgpu_device *adev, int 
inst)
{
        ...
        for (i = 0; i < adev->jpeg.num_jpeg_rings; ++i) {
                ring = &adev->jpeg.inst[inst].ring_dec[i];

                drm_sched_wqueue_stop(&ring->sched);
                /* back up the non-guilty commands */
                amdgpu_ring_backup_unprocessed_commands(ring, guilty_fence);

       }
}
Thanks
Jesse.

>
> Alex
>
> >
> > Thanks,
> > Lijo
> >
> > > +
> > > +unlock:
> > > +     mutex_unlock(&vinst->engine_reset_mutex);
> > > +
> > > +     return r;
> > >   }
> > >
> > >   static const struct amdgpu_ring_funcs
> > > vcn_v4_0_3_unified_ring_vm_funcs = {
> >

Reply via email to