During GPU reset, the application could still run CPU page table updates. Each 
commit called
amdgpu_device_flush_hdp(), which on SR-IOV sends work through the KIQ ring.
That can advance sync_seq while the GPU is being reset,
leaving fence writeback out of sync and causing amdgpu_fence_emit_polling()
to time out on later KIQ use.

Fix:
amdgpu_vm_cpu_commit():
  Take reset_domain->sem with down_read_trylock() before 
amdgpu_device_flush_hdp().
  If the reset path holds the write lock, skip the HDP flush so no HDP-related 
HW
  access (including KIQ) runs during reset; state is re-established after reset.

amdgpu_virt_pre_reset():
  After stopping the data exchange thread and setting MP1 FLR state, call
  amdgpu_fence_driver_force_completion() on each initialized KIQ ring so pending
  fences are signalled and writeback is aligned before reset proceeds.

Signed-off-by: Chenglei Xie <[email protected]>
Change-Id: I938bce0cab93a794dbdb02fe3ca9e041f9ac1424
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 11 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 16 +++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 6974b1c5b56c2..0127b0d6c7277 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -1188,9 +1188,20 @@ enum amdgpu_sriov_vf_mode 
amdgpu_virt_get_sriov_vf_mode(struct amdgpu_device *ad
 
 void amdgpu_virt_pre_reset(struct amdgpu_device *adev)
 {
+       int i;
+
        /* stop the data exchange thread */
        amdgpu_virt_fini_data_exchange(adev);
        amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_FLR);
+
+       /* Force completion on KIQ ring fences so pending fences are signalled. 
*/
+       for (i = 0; i < AMDGPU_MAX_GC_INSTANCES; i++) {
+               struct amdgpu_ring *ring = &adev->gfx.kiq[i].ring;
+
+               if (!ring->fence_drv.initialized)
+                       continue;
+               amdgpu_fence_driver_force_completion(ring);
+       }
 }
 
 void amdgpu_virt_post_reset(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
index 22e2e5b473415..a9e33b7e87406 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
@@ -21,6 +21,8 @@
  */
 
 #include "amdgpu_vm.h"
+#include "amdgpu.h"
+#include "amdgpu_reset.h"
 #include "amdgpu_object.h"
 #include "amdgpu_trace.h"
 
@@ -108,11 +110,23 @@ static int amdgpu_vm_cpu_update(struct 
amdgpu_vm_update_params *p,
 static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
                                struct dma_fence **fence)
 {
+       struct amdgpu_device *adev = p->adev;
+
        if (p->needs_flush)
                atomic64_inc(&p->vm->tlb_seq);
 
        mb();
-       amdgpu_device_flush_hdp(p->adev, NULL);
+       /*
+        * While GPU reset holds reset_domain write lock, skip HDP flush 
entirely so
+        * no HDP-related HW access runs during reset;
+        * reset re-establishes consistent state afterward.
+        */
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return 0;
+
+       amdgpu_device_flush_hdp(adev, NULL);
+       up_read(&adev->reset_domain->sem);
+
        return 0;
 }
 
-- 
2.34.1

Reply via email to