I think it's reasonable to use the hive  specific lock for hive  specific 
functions. 
The changes is acked-by  Shaoyun.liu < shaoyun....@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of StDenis, Tom
Sent: Monday, January 7, 2019 10:16 AM
To: amd-gfx@lists.freedesktop.org
Cc: StDenis, Tom <tom.stde...@amd.com>
Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

v2: Move locks around in other functions so that this function can stand on its 
own.  Also only hold the hive specific lock for add/remove device instead of 
the driver global lock so you can't add/remove devices in parallel from one 
hive.

Signed-off-by: Tom St Denis <tom.stde...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..13d8e2ad2f7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         * by different nodes. No point also since the one node already 
executing
         * reset will also reset all the other nodes in the hive.
         */
-       hive = amdgpu_get_xgmi_hive(adev);
+       hive = amdgpu_get_xgmi_hive(adev, 0);
        if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
            !mutex_trylock(&hive->hive_lock))
                return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..ebf50809485f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info 
*hive)
        return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device 
+*adev, int lock)
 {
        int i;
        struct amdgpu_hive_info *tmp;
 
        if (!adev->gmc.xgmi.hive_id)
                return NULL;
+
+       mutex_lock(&xgmi_mutex);
+
        for (i = 0 ; i < hive_count; ++i) {
                tmp = &xgmi_hives[i];
-               if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+               if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+                       if (lock)
+                               mutex_lock(&tmp->hive_lock);
+                       mutex_unlock(&xgmi_mutex);
                        return tmp;
+               }
        }
-       if (i >= AMDGPU_MAX_XGMI_HIVE)
+       if (i >= AMDGPU_MAX_XGMI_HIVE) {
+               mutex_unlock(&xgmi_mutex);
                return NULL;
+       }
 
        /* initialize new hive if not exist */
        tmp = &xgmi_hives[hive_count++];
        tmp->hive_id = adev->gmc.xgmi.hive_id;
        INIT_LIST_HEAD(&tmp->device_list);
        mutex_init(&tmp->hive_lock);
+       if (lock)
+               mutex_lock(&tmp->hive_lock);
+
+       mutex_unlock(&xgmi_mutex);
 
        return tmp;
 }
@@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
                return ret;
        }
 
-       mutex_lock(&xgmi_mutex);
-       hive = amdgpu_get_xgmi_hive(adev);
+       /* find hive and take lock */
+       hive = amdgpu_get_xgmi_hive(adev, 1);
        if (!hive)
                goto exit;
 
@@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
                        break;
        }
 
+       mutex_unlock(&hive->hive_lock);
 exit:
-       mutex_unlock(&xgmi_mutex);
        return ret;
 }
 
@@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
        if (!adev->gmc.xgmi.supported)
                return;
 
-       mutex_lock(&xgmi_mutex);
-
-       hive = amdgpu_get_xgmi_hive(adev);
+       hive = amdgpu_get_xgmi_hive(adev, 1);
        if (!hive)
-               goto exit;
+               return;
 
        if (!(hive->number_devices--))
                mutex_destroy(&hive->hive_lock);
-
-exit:
-       mutex_unlock(&xgmi_mutex);
+       else
+               mutex_unlock(&hive->hive_lock);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..8d7984844174 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -32,7 +32,7 @@ struct amdgpu_hive_info {
        struct mutex hive_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device 
+*adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct 
amdgpu_device *adev);  int amdgpu_xgmi_add_device(struct amdgpu_device *adev);  
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
--
2.17.2

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to