vcn4.0.3: rework reset handling

Alex Deucher Fri, 16 Jan 2026 05:50:00 -0800

On Thu, Jan 15, 2026 at 10:10 AM Zhang, Jesse(Jie) <[email protected]> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: Alex Deucher <[email protected]>
> > Sent: Thursday, January 15, 2026 2:14 AM
> > To: Lazar, Lijo <[email protected]>
> > Cc: Zhang, Jesse(Jie) <[email protected]>; [email protected];
> > Deucher, Alexander <[email protected]>; Koenig, Christian
> > <[email protected]>
> > Subject: Re: [PATCH 1/5] drm/amdgpu/vcn4.0.3: rework reset handling
> >
> > On Wed, Jan 14, 2026 at 6:27 AM Lazar, Lijo <[email protected]> wrote:
> > >
> > >
> > >
> > > On 14-Jan-26 2:29 PM, Jesse.Zhang wrote:
> > > > From: "Alex Deucher" <[email protected]>
> > > >
> > > > Resetting VCN resets the entire tile, including jpeg.
> > > > When we reset VCN, we also need to handle the jpeg queues.
> > > > Add a helper to handle recovering the jpeg queues when VCN is reset.
> > > >
> > > > Signed-off-by: Alex Deucher <[email protected]>
> > > > ---
> > > >   drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 11 +++++--
> > > >   drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c  | 42
> > ++++++++++++++++++++++--
> > > >   2 files changed, 49 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > > b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > > index aae7328973d1..1a32dadf8c5d 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> > > > @@ -1145,13 +1145,20 @@ static int jpeg_v4_0_3_ring_reset(struct
> > amdgpu_ring *ring,
> > > >                                 unsigned int vmid,
> > > >                                 struct amdgpu_fence *timedout_fence)
> > > >   {
> > > > +     struct amdgpu_device *adev = ring->adev;
> > > > +     struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me];
> > > > +     int r;
> > > > +
> > > >       if (amdgpu_sriov_vf(ring->adev))
> > > >               return -EOPNOTSUPP;
> > > > -
> > > > +     /* take the vcn reset mutex here because resetting VCN will reset 
> > > > jpeg as
> > well */
> > > > +     mutex_lock(&vinst->engine_reset_mutex);
> > > >       amdgpu_ring_reset_helper_begin(ring, timedout_fence);
> > > >       jpeg_v4_0_3_core_stall_reset(ring);
> > > >       jpeg_v4_0_3_start_jrbc(ring);
> > > > -     return amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > > +     r = amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > > +     mutex_unlock(&vinst->engine_reset_mutex);
> > > > +     return r;
> > > >   }
> > > >
> > > >   static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = { diff
> > > > --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > > b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > > index cb7123ec1a5d..31d93c10dfb1 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> > > > @@ -1596,6 +1596,32 @@ static void 
> > > > vcn_v4_0_3_unified_ring_set_wptr(struct
> > amdgpu_ring *ring)
> > > >       }
> > > >   }
> > > >
> > > > +static int vcn_v4_0_3_reset_jpeg_helper(struct amdgpu_device *adev,
> > > > +                                     int inst) {
> > > > +     struct amdgpu_ring *ring;
> > > > +     int i, r;
> > > > +
> > > > +     for (i = 0; i < adev->jpeg.num_jpeg_rings; ++i) {
> > > > +             ring = &adev->jpeg.inst[inst].ring_dec[i];
> > > > +             drm_sched_wqueue_stop(&ring->sched);
> > > > +             amdgpu_fence_driver_force_completion(ring);
> > > > +             if (ring->use_doorbell)
> > > > +                     WREG32_SOC15_OFFSET(
> > > > +                             VCN, GET_INST(VCN, inst),
> > > > +                             regVCN_JPEG_DB_CTRL,
> > > > +                             (ring->pipe ? (ring->pipe - 0x15) : 0),
> > > > +                             ring->doorbell_index
> > > > +                             << VCN_JPEG_DB_CTRL__OFFSET__SHIFT |
> > > > +                             VCN_JPEG_DB_CTRL__EN_MASK);
> > > > +             r = amdgpu_ring_test_helper(ring);
> > > > +             if (r)
> > > > +                     return r;
> > > > +             drm_sched_wqueue_start(&ring->sched);
> > > > +     }
> > > > +     return 0;
> > > > +}
> > > > +
> > > >   static int vcn_v4_0_3_ring_reset(struct amdgpu_ring *ring,
> > > >                                unsigned int vmid,
> > > >                                struct amdgpu_fence *timedout_fence)
> > > > @@ -1605,6 +1631,9 @@ static int vcn_v4_0_3_ring_reset(struct 
> > > > amdgpu_ring
> > *ring,
> > > >       struct amdgpu_device *adev = ring->adev;
> > > >       struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[ring->me];
> > > >
> > > > +     /* take the vcn reset mutex here because resetting VCN will reset 
> > > > jpeg as
> > well */
> > > > +     mutex_lock(&vinst->engine_reset_mutex);
> > > > +
> > > >       amdgpu_ring_reset_helper_begin(ring, timedout_fence);
> > > >
> > > >       vcn_inst = GET_INST(VCN, ring->me); @@ -1612,7 +1641,7 @@
> > > > static int vcn_v4_0_3_ring_reset(struct amdgpu_ring *ring,
> > > >
> > > >       if (r) {
> > > >               DRM_DEV_ERROR(adev->dev, "VCN reset fail : %d\n", r);
> > > > -             return r;
> > > > +             goto unlock;
> > > >       }
> > > >
> > > >       /* This flag is not set for VF, assumed to be disabled always
> > > > */ @@ -1621,7 +1650,16 @@ static int vcn_v4_0_3_ring_reset(struct
> > amdgpu_ring *ring,
> > > >       vcn_v4_0_3_hw_init_inst(vinst);
> > > >       vcn_v4_0_3_start_dpg_mode(vinst,
> > > > adev->vcn.inst[ring->me].indirect_sram);
> > > >
> > > > -     return amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > > +     r = amdgpu_ring_reset_helper_end(ring, timedout_fence);
> > > > +     if (r)
> > > > +             goto unlock;
> > > > +
> > > > +     r = vcn_v4_0_3_reset_jpeg_helper(adev, ring->me);
> > >
> > > This doesn't seem to handle any ongoing jpeg activity before doing a
> > > vcn reset. Is that fine?
> >
> > We could split the helper in two, in the top helper we can stop the sched 
> > workqueues
> > and attempt to wait for any outstanding fences. Then in the bottom helper, 
> > we can
> > force completion, re-init the rings, and restart the sched workqueues.  
> > With the patch
> > as is, any outstanding job fences would be marked with an error, and 
> > signalled so
> > userspace would see their jobs lost.
> [Zhang, Jesse(Jie)]   Is it possible to back up these commands in the top 
> helper ?
> This would avoid waiting for any unfinished fence synchronization.
> Because these commands might time out.


Not really if you want to salvage any currently active jobs.  Any jobs
that are already running either need to wait for completion or you
need to stop the queue and set error on the fence.  You could add a
function to try and salvage the unprocessed jobs, but the one(s)
already on the hardware would have to be lost.

Alex

> static int vcn_v4_0_3_reset_jpeg_pre_helper(struct amdgpu_device *adev, int 
> inst)
> {
>         ...
>         for (i = 0; i < adev->jpeg.num_jpeg_rings; ++i) {
>                 ring = &adev->jpeg.inst[inst].ring_dec[i];
>
>                 drm_sched_wqueue_stop(&ring->sched);
>                 /* back up the non-guilty commands */
>                 amdgpu_ring_backup_unprocessed_commands(ring, guilty_fence);
>
>        }
> }
> Thanks
> Jesse.
>
> >
> > Alex
> >
> > >
> > > Thanks,
> > > Lijo
> > >
> > > > +
> > > > +unlock:
> > > > +     mutex_unlock(&vinst->engine_reset_mutex);
> > > > +
> > > > +     return r;
> > > >   }
> > > >
> > > >   static const struct amdgpu_ring_funcs
> > > > vcn_v4_0_3_unified_ring_vm_funcs = {
> > >

Re: [PATCH 1/5] drm/amdgpu/vcn4.0.3: rework reset handling

Reply via email to