Please ignore my previous comments here; the new helper additions for vcn non-unified queues look good.

But one concern: the vinst->reset(vinst) callback should take a ring pointer so it can distinguish guilty from non-guilty work for the re-emit part; otherwise the guilty ring
has to be tracked within the ring structure or identified by some query inside reset.
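
Roughly what I have in mind, as a sketch only (the guilty_ring parameter and the plumbing below are illustrative, not part of this patch):

    /* hypothetical: pass the guilty ring down so the backend can decide
     * what to re-emit after the engine reset
     */
    int (*reset)(struct amdgpu_vcn_inst *vinst,
                 struct amdgpu_ring *guilty_ring);

    static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
                                       struct amdgpu_ring *guilty_ring)
    {
        struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[guilty_ring->me];
        int r;

        mutex_lock(&vinst->engine_reset_mutex);
        /* ... drm_sched_wqueue_stop() on the dec/enc rings as in the patch ... */
        r = vinst->reset(vinst, guilty_ring);
        /* ... re-emit / drm_sched_wqueue_start() handling ... */
        mutex_unlock(&vinst->engine_reset_mutex);

        return r;
    }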

Regards,
Sathish


On 6/17/2025 10:00 AM, Sundararaju, Sathishkumar wrote:
Hi Alex,

Would it be better to have this logic in the reset callback itself?

Adding a common vinst->reset removes the flexibility of having separate reset functionality for enc rings and dec rings; a per-IP callback could selectively handle the drm_sched_wqueue_start/stop and the guilty/non-guilty re-emit for enc and dec separately, as in the rough sketch below.
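
As an illustration only (hand-wavy, names assumed, not tested), a vcn3-specific callback could own the scheduler handling so dec and enc are treated independently:

    /* illustrative sketch: a vcn3-specific reset that handles dec and
     * enc queues separately (function name and layout assumed)
     */
    static int vcn_v3_0_reset(struct amdgpu_vcn_inst *vinst)
    {
        int i;

        /* stop the dec and enc schedulers individually so each queue
         * type can be handled on its own terms
         */
        drm_sched_wqueue_stop(&vinst->ring_dec.sched);
        for (i = 0; i < vinst->num_enc_rings; i++)
            drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);

        /* vcn3-specific engine reset sequence would go here */

        /* re-emit non-guilty work and restart the schedulers,
         * again per queue type as needed
         */
        drm_sched_wqueue_start(&vinst->ring_dec.sched);
        for (i = 0; i < vinst->num_enc_rings; i++)
            drm_sched_wqueue_start(&vinst->ring_enc[i].sched);

        return 0;
    }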

Also, the usual vcn_stop() followed by vcn_start() does not actually reset the engine on vcn3.

I tried a workaround: pause_dpg, enable static clockgating and powergating, and then stop()/start() the engine.
That has been working consistently so far.
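
Roughly the sequence I tried (from memory; the wrapper name, helper names and arguments below are approximate, not the actual change):

    /* hypothetical workaround sketch for vcn3, not a tested patch */
    static int vcn_v3_0_reset_workaround(struct amdgpu_vcn_inst *vinst,
                                         struct amdgpu_ip_block *ip_block)
    {
        struct dpg_pause_state new_state = { .fw_based = VCN_DPG_STATE__PAUSE };
        int r;

        /* pause DPG before touching the gating state */
        vinst->pause_dpg_mode(vinst, &new_state);

        /* force static clock- and power-gating for the instance
         * (helper names approximate)
         */
        vcn_v3_0_set_clockgating_state(ip_block, AMD_CG_STATE_GATE);
        vcn_v3_0_set_powergating_state(ip_block, AMD_PG_STATE_GATE);

        /* then a full stop/start of the engine */
        r = vcn_v3_0_stop(vinst);
        if (!r)
            r = vcn_v3_0_start(vinst);

        return r;
    }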

Regards,
Sathish

On 6/17/2025 8:38 AM, Alex Deucher wrote:
With engine resets we reset all queues on the engine rather
than just a single queue.  Add a framework to handle this
similar to SDMA.

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 64 +++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  6 ++-
  2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index c8885c3d54b33..075740ed275eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -134,6 +134,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
      mutex_init(&adev->vcn.inst[i].vcn1_jpeg1_workaround);
      mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
+    mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
      atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
      INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_idle_work_handler);
      atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
@@ -1451,3 +1452,66 @@ int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
        return ret;
  }
+
+/**
+ * amdgpu_vcn_reset_engine - Reset a specific VCN engine
+ * @adev: Pointer to the AMDGPU device
+ * @instance_id: VCN engine instance to reset
+ *
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+static int amdgpu_vcn_reset_engine(struct amdgpu_device *adev,
+                   uint32_t instance_id)
+{
+    struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[instance_id];
+    int r, i;
+
+    mutex_lock(&vinst->engine_reset_mutex);
+    /* Stop the scheduler's work queue for the dec and enc rings if they are running.
+     * This ensures that no new tasks are submitted to the queues while
+     * the reset is in progress.
+     */
+    drm_sched_wqueue_stop(&vinst->ring_dec.sched);
+    for (i = 0; i < vinst->num_enc_rings; i++)
+        drm_sched_wqueue_stop(&vinst->ring_enc[i].sched);
+
+    /* Perform the VCN reset for the specified instance */
+    r = vinst->reset(vinst);
+    if (r) {
+        dev_err(adev->dev, "Failed to reset VCN instance %u\n", instance_id);
+    } else {
+        /* Restart the scheduler's work queue for the dec and enc rings
+         * if they were stopped by this function. This allows new tasks
+         * to be submitted to the queues after the reset is complete.
+         */
+        drm_sched_wqueue_start(&vinst->ring_dec.sched);
+        for (i = 0; i < vinst->num_enc_rings; i++)
+        drm_sched_wqueue_start(&vinst->ring_enc[i].sched);
+    }
+    mutex_unlock(&vinst->engine_reset_mutex);
+
+    return r;
+}
+
+/**
+ * amdgpu_vcn_ring_reset - Reset a VCN ring
+ * @ring: ring to reset
+ * @vmid: vmid of guilty job
+ * @guilty_fence: guilty fence
+ *
+ * This helper is for VCN blocks without unified queues because
+ * resetting the engine resets all queues in that case.  With
+ * unified queues we have one queue per engine.
+ * Returns: 0 on success, or a negative error code on failure.
+ */
+int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
+              unsigned int vmid,
+              struct amdgpu_fence *guilty_fence)
+{
+    struct amdgpu_device *adev = ring->adev;
+
+    if (adev->vcn.inst[ring->me].using_unified_queue)
+        return -EINVAL;
+
+    return amdgpu_vcn_reset_engine(adev, ring->me);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 83adf81defc71..0bc0a94d7cf0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -330,7 +330,9 @@ struct amdgpu_vcn_inst {
                    struct dpg_pause_state *new_state);
      int (*set_pg_state)(struct amdgpu_vcn_inst *vinst,
                  enum amd_powergating_state state);
+    int (*reset)(struct amdgpu_vcn_inst *vinst);
      bool using_unified_queue;
+    struct mutex        engine_reset_mutex;
  };
    struct amdgpu_vcn_ras {
@@ -552,5 +554,7 @@ void amdgpu_debugfs_vcn_sched_mask_init(struct amdgpu_device *adev);
    int vcn_set_powergating_state(struct amdgpu_ip_block *ip_block,
                    enum amd_powergating_state state);
-
+int amdgpu_vcn_ring_reset(struct amdgpu_ring *ring,
+              unsigned int vmid,
+              struct amdgpu_fence *guilty_fence);
  #endif

