amdgpu:fix driver unloading bug

Monk Liu Sun, 17 Sep 2017 23:19:10 -0700

[SWDEV-126631] - fix hypervisor save_vf failure that occurred
after the driver was removed:
1. Because the KIQ and KCQ were not unmapped, save_vf will fail if the driver
freed the mqd of KIQ and KCQ.
2. KIQ can't be unmapped since RLCV always needs it, so the bo_free on KIQ
should be skipped.
3. KCQ can be unmapped, and should be unmapped during hw_fini.
4. RLCV still needs to access other mc addresses from some hw even after the
driver is unloaded,
   so we should not unbind gart for VF.


Change-Id: I320487a9a848f41484c5f8cc11be34aca807b424
Signed-off-by: Horace Chen <horace.c...@amd.com>
Signed-off-by: Monk Liu <monk....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  5 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 60 +++++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index f437008..2fee071 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -394,7 +394,8 @@ int amdgpu_gart_init(struct amdgpu_device *adev)
  */
 void amdgpu_gart_fini(struct amdgpu_device *adev)
 {
-       if (adev->gart.ready) {
+       /* gart is still used by other hw under SRIOV, don't unbind it */
+       if (adev->gart.ready && !amdgpu_sriov_vf(adev)) {
                /* unbind pages */
                amdgpu_gart_unbind(adev, 0, adev->gart.num_cpu_pages);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 4f6c68f..bf6656f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -309,6 +309,11 @@ void amdgpu_gfx_compute_mqd_sw_fini(struct amdgpu_device 
*adev)
                                      &ring->mqd_ptr);
        }
 
+       /* don't deallocate KIQ mqd because the bo is still used by RLCV even
+       the guest VM is shutdown */
+       if (amdgpu_sriov_vf(adev))
+               return;
+
        ring = &adev->gfx.kiq.ring;
        kfree(adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS]);
        amdgpu_bo_free_kernel(&ring->mqd_obj,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 44960b3..a577bbc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2892,14 +2892,72 @@ static int gfx_v9_0_hw_init(void *handle)
        return r;
 }
 
+static int gfx_v9_0_kcq_disable(struct amdgpu_ring *kiq_ring,struct 
amdgpu_ring *ring)
+{
+       struct amdgpu_device *adev = kiq_ring->adev;
+       uint32_t scratch, tmp = 0;
+       int r, i;
+
+       r = amdgpu_gfx_scratch_get(adev, &scratch);
+       if (r) {
+               DRM_ERROR("Failed to get scratch reg (%d).\n", r);
+               return r;
+       }
+       WREG32(scratch, 0xCAFEDEAD);
+
+       r = amdgpu_ring_alloc(kiq_ring, 10);
+       if (r) {
+               DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+               amdgpu_gfx_scratch_free(adev, scratch);
+               return r;
+       }
+
+       /* unmap queues */
+       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
+       amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
+                                               PACKET3_UNMAP_QUEUES_ACTION(1) 
| /* RESET_QUEUES */
+                                               
PACKET3_UNMAP_QUEUES_QUEUE_SEL(0) |
+                                               
PACKET3_UNMAP_QUEUES_ENGINE_SEL(0) |
+                                               
PACKET3_UNMAP_QUEUES_NUM_QUEUES(1));
+       amdgpu_ring_write(kiq_ring, 
PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
+       amdgpu_ring_write(kiq_ring, 0);
+       amdgpu_ring_write(kiq_ring, 0);
+       amdgpu_ring_write(kiq_ring, 0);
+       /* write to scratch for completion */
+       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
+       amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
+       amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
+       amdgpu_ring_commit(kiq_ring);
+
+       for (i = 0; i < adev->usec_timeout; i++) {
+               tmp = RREG32(scratch);
+               if (tmp == 0xDEADBEEF)
+                       break;
+               DRM_UDELAY(1);
+       }
+       if (i >= adev->usec_timeout) {
+               DRM_ERROR("KCQ disabled failed (scratch(0x%04X)=0x%08X)\n", 
scratch, tmp);
+               r = -EINVAL;
+       }
+       amdgpu_gfx_scratch_free(adev, scratch);
+       return r;
+}
+
+
 static int gfx_v9_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int i, r;
 
        amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
        if (amdgpu_sriov_vf(adev)) {
-               pr_debug("For SRIOV client, shouldn't do anything.\n");
+               /* disable KCQ to avoid CPC touch memory not valid anymore */
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       r = gfx_v9_0_kcq_disable(&adev->gfx.kiq.ring, 
&adev->gfx.compute_ring[i]);
+                       if (r)
+                               return r;
+               }
                return 0;
        }
        gfx_v9_0_cp_enable(adev, false);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 13/18] drm/amdgpu:fix driver unloading bug

Reply via email to