drm/amdgpu: Implement concurrent asic reset for XGMI.

Andrey Grodzovsky Thu, 29 Nov 2018 12:38:22 -0800

Use a per-hive workqueue to send the reset commands to all nodes in
the hive concurrently, instead of resetting the nodes one after another.


Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 +++++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8ad6bf..6fc023b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -910,7 +910,9 @@ struct amdgpu_device {
        bool                            in_gpu_reset;
        struct mutex  lock_reset;
        struct amdgpu_doorbell_index doorbell_index;
+
        int asic_reset_res;
+       struct work_struct              xgmi_reset_work;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8eaa40e..0c8e6a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1861,6 +1861,9 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 {
        int i, r;
 
+       if (adev->gmc.xgmi.num_physical_nodes > 1)
+               amdgpu_xgmi_remove_device(adev);
+
        amdgpu_amdkfd_device_fini(adev);
 
        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
@@ -2350,6 +2353,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
        return amdgpu_device_asic_has_dc_support(adev->asic_type);
 }
 
+/* Work item: reset the owning ASIC, store the result in asic_reset_res. */
+static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
+{
+       struct amdgpu_device *adev =
+               container_of(__work, struct amdgpu_device, xgmi_reset_work);
+
+       adev->asic_reset_res =  amdgpu_asic_reset(adev);
+       if (adev->asic_reset_res)
+               DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
+                        adev->asic_reset_res, adev->ddev->unique);
+}
+
+
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -2448,6 +2464,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
                          amdgpu_device_delay_enable_gfx_off);
 
+       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+
        adev->gfx.gfx_off_req_count = 1;
        adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : 
false;
 
@@ -3325,11 +3343,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
         */
        if (need_full_reset) {
                list_for_each_entry(tmp_adev, device_list_handle, 
gmc.xgmi.head) {
-                       r = amdgpu_asic_reset(tmp_adev);
+
+                       /* For XGMI run all resets in parallel to speed up the 
process */
+                       if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+                               queue_work(hive->reset_queue, 
&tmp_adev->xgmi_reset_work);
+                       else
+                               r = amdgpu_asic_reset(tmp_adev);
                        if (r)
                                DRM_WARN("ASIC reset failed with err r, %d for 
drm dev, %s",
                                         r, tmp_adev->ddev->unique);
                }
+
+               /* For XGMI wait for all PSP resets to complete before proceed 
*/
+               if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                       flush_workqueue(hive->reset_queue);
+                       r = tmp_adev->asic_reset_res;
+               }
        }
 
 
@@ -3515,8 +3544,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                if (tmp_adev == adev)
                        continue;
 
-               dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", 
adev->ddev->unique);
-
                amdgpu_device_lock_adev(tmp_adev);
                r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                 NULL,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 2/2] drm/amdgpu: Implement concurrent asic reset for XGMI.

Reply via email to