[Public]

>-----Original Message-----
>From: amd-gfx <[email protected]> On Behalf Of Victor
>Zhao
>Sent: Thursday, October 9, 2025 8:18 AM
>To: [email protected]
>Cc: Chang, HaiJun <[email protected]>; Zhao, Victor
><[email protected]>
>Subject: [PATCH 2/2] drm/amdgpu: use GPU_HDP_FLUSH for sriov
>
>Currently SRIOV runtime will use kiq to write HDP_MEM_FLUSH_CNTL for hdp
>flush. This register needs to be written from the CPU for nbif to be aware
>of it; otherwise it will not work.
>
[lijo]
I think this may be related to routing in multi-XCC SoCs. You may keep the
original path as a fallback for SoCs without multi-XCC.

>Implement amdgpu_kiq_hdp_flush and use kiq to do gpu hdp flush during
>sriov runtime.
>
>Signed-off-by: Victor Zhao <[email protected]>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 73
>++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  1 +
> 3 files changed, 76 insertions(+)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 7a899fb4de29..eff43757f983 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -7281,6 +7281,8 @@ void amdgpu_device_flush_hdp(struct
>amdgpu_device *adev,
>
>       if (ring && ring->funcs->emit_hdp_flush)
>               amdgpu_ring_emit_hdp_flush(ring);
>+      else if (!ring && amdgpu_sriov_runtime(adev))
>+              amdgpu_kiq_hdp_flush(adev, 0);
[lijo]

If there is no use case for passing a different xcc_id, the extra parameter may
be removed, and it can always be assumed that logical xcc 0 will be used.

>       else
>               amdgpu_asic_flush_hdp(adev, ring);
> }
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>index 7f02e36ccc1e..ecd7908590de 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>@@ -1194,6 +1194,78 @@ void amdgpu_kiq_wreg(struct amdgpu_device
>*adev, uint32_t reg, uint32_t v, uint3
>       dev_err(adev->dev, "failed to write reg:%x\n", reg);  }
>
>+void amdgpu_kiq_hdp_flush(struct amdgpu_device *adev, uint32_t xcc_id)
>+{
>+      signed long r, cnt = 0;
>+      unsigned long flags;
>+      uint32_t seq;
>+      uint32_t hdp_flush_req_offset, hdp_flush_done_offset,
>ref_and_mask;
>+      struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
>+      struct amdgpu_ring *ring = &kiq->ring;
>+
>+      BUG_ON(!ring->funcs->emit_hdp_flush);
[lijo]

This may be kept as a warning. It's not necessary to bring down the system.

>+
>+      if (amdgpu_device_skip_hw_access(adev))
>+              return;
>+
>+      if (adev->enable_mes_kiq && adev->mes.ring[0].sched.ready) {
>+              hdp_flush_req_offset = adev->nbio.funcs-
>>get_hdp_flush_req_offset(adev);
>+              hdp_flush_done_offset = adev->nbio.funcs-
>>get_hdp_flush_done_offset(adev);
>+              ref_and_mask = adev->nbio.hdp_flush_reg-
>>ref_and_mask_cp0; /* Use CP0
>+for KIQ */
>+
>+              amdgpu_mes_reg_write_reg_wait(adev, hdp_flush_req_offset,
>hdp_flush_done_offset,
>+                                            ref_and_mask, ref_and_mask);
>+              return;
>+      }
[lijo]

This part may be kept as amdgpu_mes_hdp_flush and moved into amdgpu_mes.c.

Thanks,
Lijo

>+
>+      spin_lock_irqsave(&kiq->ring_lock, flags);
>+      r = amdgpu_ring_alloc(ring, 32);
>+      if (r)
>+              goto failed_unlock;
>+
>+      amdgpu_ring_emit_hdp_flush(ring);
>+      r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
>+      if (r)
>+              goto failed_undo;
>+
>+      amdgpu_ring_commit(ring);
>+      spin_unlock_irqrestore(&kiq->ring_lock, flags);
>+
>+      r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>+
>+      /* don't wait anymore for gpu reset case because this way may
>+       * block gpu_recover() routine forever, e.g. this virt_kiq_rreg
>+       * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will
>+       * never return if we keep waiting in virt_kiq_rreg, which cause
>+       * gpu_recover() hang there.
>+       *
>+       * also don't wait anymore for IRQ context
>+       * */
>+      if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
>+              goto failed_kiq_hdp_flush;
>+
>+      might_sleep();
>+      while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
>+              if (amdgpu_in_reset(adev))
>+                      goto failed_kiq_hdp_flush;
>+
>+              msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
>+              r = amdgpu_fence_wait_polling(ring, seq,
>MAX_KIQ_REG_WAIT);
>+      }
>+
>+      if (cnt > MAX_KIQ_REG_TRY)
>+              goto failed_kiq_hdp_flush;
>+
>+      return;
>+
>+failed_undo:
>+      amdgpu_ring_undo(ring);
>+failed_unlock:
>+      spin_unlock_irqrestore(&kiq->ring_lock, flags);
>+failed_kiq_hdp_flush:
>+      dev_err(adev->dev, "failed to flush HDP via KIQ\n"); }
>+
> int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev)  {
>       if (amdgpu_num_kcq == -1) {
>@@ -2484,3 +2556,4 @@ void
>amdgpu_debugfs_compute_sched_mask_init(struct amdgpu_device *adev)
>                           &amdgpu_debugfs_compute_sched_mask_fops);
> #endif
> }
>+
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>index fb5f7a0ee029..5bccd2cc9518 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>@@ -615,6 +615,7 @@ int amdgpu_gfx_cp_ecc_error_irq(struct
>amdgpu_device *adev,
>                                 struct amdgpu_iv_entry *entry);
> uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg,
>uint32_t xcc_id);  void amdgpu_kiq_wreg(struct amdgpu_device *adev,
>uint32_t reg, uint32_t v, uint32_t xcc_id);
>+void amdgpu_kiq_hdp_flush(struct amdgpu_device *adev, uint32_t xcc_id);
> int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);  void
>amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t
>ucode_id);
>
>--
>2.25.1

Reply via email to