[WHY]
 Function "amdgpu_irq_update()", called by "amdgpu_device_ip_late_init()", runs
in atomic context.
 We shouldn't access registers through KIQ there, since "amdgpu_kiq_rreg()" may
call "msleep()", and sleeping is not allowed in atomic context.

[HOW]
 Move the call to "amdgpu_virt_release_full_gpu()" after the call to
"amdgpu_device_ip_late_init()",
 to ensure that registers are accessed through RLCG instead of KIQ.

Call Trace:
  <TASK>
  show_stack+0x52/0x69
  dump_stack_lvl+0x49/0x6d
  dump_stack+0x10/0x18
  __schedule_bug.cold+0x4f/0x6b
  __schedule+0x473/0x5d0
  ? __wake_up_klogd.part.0+0x40/0x70
  ? vprintk_emit+0xbe/0x1f0
  schedule+0x68/0x110
  schedule_timeout+0x87/0x160
  ? timer_migration_handler+0xa0/0xa0
  msleep+0x2d/0x50
  amdgpu_kiq_rreg+0x18d/0x1f0 [amdgpu]
  amdgpu_device_rreg.part.0+0x59/0xd0 [amdgpu]
  amdgpu_device_rreg+0x3a/0x50 [amdgpu]
  amdgpu_sriov_rreg+0x3c/0xb0 [amdgpu]
  gfx_v10_0_set_gfx_eop_interrupt_state.constprop.0+0x16c/0x190 [amdgpu]
  gfx_v10_0_set_eop_interrupt_state+0xa5/0xb0 [amdgpu]
  amdgpu_irq_update+0x53/0x80 [amdgpu]
  amdgpu_irq_get+0x7c/0xb0 [amdgpu]
  amdgpu_fence_driver_hw_init+0x58/0x90 [amdgpu]
  amdgpu_device_init.cold+0x16b7/0x2022 [amdgpu]

Signed-off-by: Chong Li <chong...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++----------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 051b9e231cf4..ee21a99ab4d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2538,8 +2538,6 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
        amdgpu_fru_get_product_info(adev);
 
 init_failed:
-       if (amdgpu_sriov_vf(adev))
-               amdgpu_virt_release_full_gpu(adev, true);
 
        return r;
 }
@@ -3856,18 +3854,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
        r = amdgpu_device_ip_init(adev);
        if (r) {
-               /* failed in exclusive mode due to timeout */
-               if (amdgpu_sriov_vf(adev) &&
-                   !amdgpu_sriov_runtime(adev) &&
-                   amdgpu_virt_mmio_blocked(adev) &&
-                   !amdgpu_virt_wait_reset(adev)) {
-                       dev_err(adev->dev, "VF exclusive mode timeout\n");
-                       /* Don't send request since VF is inactive. */
-                       adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
-                       adev->virt.ops = NULL;
-                       r = -EAGAIN;
-                       goto release_ras_con;
-               }
                dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
                amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 
0);
                goto release_ras_con;
@@ -3936,8 +3922,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                                   msecs_to_jiffies(AMDGPU_RESUME_MS));
        }
 
-       if (amdgpu_sriov_vf(adev))
+       if (amdgpu_sriov_vf(adev)) {
+               amdgpu_virt_release_full_gpu(adev, true);
                flush_delayed_work(&adev->delayed_init_work);
+       }
 
        r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
        if (r)
@@ -3977,6 +3965,20 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        return 0;
 
 release_ras_con:
+       if (amdgpu_sriov_vf(adev))
+               amdgpu_virt_release_full_gpu(adev, true);
+
+       /* failed in exclusive mode due to timeout */
+       if (amdgpu_sriov_vf(adev) &&
+               !amdgpu_sriov_runtime(adev) &&
+               amdgpu_virt_mmio_blocked(adev) &&
+               !amdgpu_virt_wait_reset(adev)) {
+               dev_err(adev->dev, "VF exclusive mode timeout\n");
+               /* Don't send request since VF is inactive. */
+               adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+               adev->virt.ops = NULL;
+               r = -EAGAIN;
+       }
        amdgpu_release_ras_context(adev);
 
 failed:
-- 
2.34.1

Reply via email to