Hi Alex,

Would it be good to have this logic in the reset call back itself ?

Adding common vinst->reset stops the flexibility of having separate reset functionality for enc rings and decode rings, can selectively handle the drm_sched_wqueue_start/stop and re-emit of guilty/non-guilty for enc and dec separately.

And the usual vcn_stop() followed by vcn_start() isn't helping in reset of the engine for vcn3.

I tried a workaround to pause_dpg and enable static clockgate and powergate, and then stop()/start() the engine
which is working consistently so far.

Regards,
Sathish

On 6/17/2025 8:38 AM, Alex Deucher wrote:
With engine resets we reset all queues on the engine rather
than just a single queue.  Add a framework to handle this
similar to SDMA.

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 64 +++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  6 ++-
  2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index c8885c3d54b33..075740ed275eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -134,6 +134,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
        mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
+       mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
        atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
        INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, 
amdgpu_vcn_idle_work_handler);
        atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
@@ -1451,3 +1452,66 @@ int vcn_set_powergating_state(struct amdgpu_ip_block 
*ip_block,
return ret;
  }
+
+/**
+ * amdgpu_vcn_reset_engine - Reset a specific VCN engine
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: VCN engine instance to reset
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
+                                  uint32_t instance_id)
+{
+       struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
+       int r, i;
+
+       mutex_lock(&vinst->engine_reset_mutex);
+       /* Stop the scheduler's work queue for the dec and enc rings if they 
are running.
+        * This ensures that no new tasks are submitted to the queues while
+        * the reset is in progress.
+        */
+       drm_sched_wqueue_stop(&vinst->ring_dec.sched);
+       for (i = 0; i < vinst->num_enc_rings; i++)
+               drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
+
+       /* Perform the VCN reset for the specified instance */
+       r = vinst->reset(vinst);
+       if (r) {
+               dev_err(adev->dev, "Failed to reset VCN instance %u\n", 
instance_id);
+       } else {
+               /* Restart the scheduler's work queue for the dec and enc rings
+                * if they were stopped by this function. This allows new tasks
+                * to be submitted to the queues after the reset is complete.
+                */
+               drm_sched_wqueue_start(&vinst->ring_dec.sched);
+               for (i = 0; i < vinst->num_enc_rings; i++)
+                       drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
+       }
+       mutex_unlock(&vinst->engine_reset_mutex);
+
+       return r;
+}
+
+/**
+ * amdgpu_vcn_ring_reset - Reset a VCN ring
+ * @ring: ring to reset
+ * @vmid: vmid of guilty job
+ * @guilty_fence: guilty fence
+ *
+ * This helper is for VCN blocks without unified queues because
+ * resetting the engine resets all queues in that case.  With
+ * unified queues we have one queue per engine.
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
+                         unsigned int vmid,
+                         struct amdgpu_fence *guilty_fence)
+{
+       struct amdgpu_device *adev = ring->adev;
+
+       if (adev->vcn.inst[ring->me].using_unified_queue)
+               return -EINVAL;
+
+       return amdgpu_vcn_reset_engine(adev, ring->me);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 83adf81defc71..0bc0a94d7cf0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -330,7 +330,9 @@ struct amdgpu_vcn_inst {
                              struct dpg_pause_state *new_state);
        int (*set_pg_state)(struct amdgpu_vcn_inst *vinst,
                            enum amd_powergating_state state);
+       int (*reset)(struct amdgpu_vcn_inst *vinst);
        bool using_unified_queue;
+       struct mutex            engine_reset_mutex;
  };
struct amdgpu_vcn_ras {
@@ -552,5 +554,7 @@ void amdgpu_debugfs_vcn_sched_mask_init(struct 
amdgpu_device *adev);
int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
                              enum amd_powergating_state state);
-
+int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
+                         unsigned int vmid,
+                         struct amdgpu_fence *guilty_fence);
  #endif

Reply via email to