Release profiler_lock before calling kfd_ptl_disable_request
and kfd_ptl_disable_release to resolve potential deadlock
Introduce a dedicated adev->psp.ptl_mutex to protect PTL state
instead of relying on the KFD process mutex. This optimization
ensures thread safety independent of the process lock hierarchy.

Signed-off-by: Perry Yuan <[email protected]>
Reviewed-by: Yifan Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 16 +++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   | 12 ++++++++----
 4 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6e422da88b7e..aae2f850b044 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4442,6 +4442,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        mutex_init(&adev->virt.vf_errors.lock);
        hash_init(adev->mn_hash);
        mutex_init(&adev->psp.mutex);
+       mutex_init(&adev->psp.ptl_mutex);
        mutex_init(&adev->notifier_lock);
        mutex_init(&adev->pm.stable_pstate_ctx_lock);
        mutex_init(&adev->benchmark_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 2b197cdefe31..682a0e4adafd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1260,6 +1260,15 @@ int psp_performance_monitor_hw(struct psp_context *psp, 
u32 req_code,
                        psp_ptl_fmt_verify(psp, *fmt2, &ptl_fmt2))
                return -EINVAL;
 
+       /*
+        * Add check to skip if state and formats are identical to current ones
+        */
+       if (req_code == PSP_PTL_PERF_MON_SET &&
+                       psp->ptl_enabled == *ptl_state &&
+                       psp->ptl_fmt1 == ptl_fmt1 &&
+                       psp->ptl_fmt2 == ptl_fmt2)
+               return 0;
+
        cmd = acquire_psp_cmd_buf(psp);
 
        cmd->cmd_id                     = GFX_CMD_ID_PERF_HW;
@@ -1334,19 +1343,24 @@ static ssize_t ptl_enable_store(struct device *dev,
        else
                return -EINVAL;
 
+       mutex_lock(&psp->ptl_mutex);
        fmt1 = psp->ptl_fmt1;
        fmt2 = psp->ptl_fmt2;
        ptl_state = enable ? 1 : 0;
 
        cur_enabled = READ_ONCE(psp->ptl_enabled);
-       if (cur_enabled == enable)
+       if (cur_enabled == enable) {
+               mutex_unlock(&psp->ptl_mutex);
                return count;
+       }
 
        ret = psp_performance_monitor_hw(psp, PSP_PTL_PERF_MON_SET, &ptl_state, 
&fmt1, &fmt2);
        if (ret) {
                dev_err(adev->dev, "Failed to set PTL err = %d\n", ret);
+               mutex_unlock(&psp->ptl_mutex);
                return ret;
        }
+       mutex_unlock(&psp->ptl_mutex);
 
        return count;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 1af641ae9a02..1ab7255718df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -478,6 +478,7 @@ struct psp_context {
        bool                            ptl_hw_supported;
        /* PTL disable reference counting */
        atomic_t                        ptl_disable_ref;
+       struct mutex                    ptl_mutex;
 };
 
 struct amdgpu_psp_funcs {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 5fda0efe5469..bccb857c81c4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1802,7 +1802,7 @@ int kfd_ptl_disable_request(struct kfd_process_device 
*pdd,
                return -ENODEV;
 
        adev = pdd->dev->adev;
-       mutex_lock(&p->mutex);
+       mutex_lock(&adev->psp.ptl_mutex);
 
        if (pdd->ptl_disable_req)
                goto out;
@@ -1819,7 +1819,7 @@ int kfd_ptl_disable_request(struct kfd_process_device 
*pdd,
        pdd->ptl_disable_req = true;
 
 out:
-       mutex_unlock(&p->mutex);
+       mutex_unlock(&adev->psp.ptl_mutex);
        return ret;
 }
 
@@ -1833,7 +1833,7 @@ int kfd_ptl_disable_release(struct kfd_process_device 
*pdd,
                return -ENODEV;
 
        adev = pdd->dev->adev;
-       mutex_lock(&p->mutex);
+       mutex_lock(&adev->psp.ptl_mutex);
        if (!pdd->ptl_disable_req)
                goto out;
 
@@ -1850,7 +1850,7 @@ int kfd_ptl_disable_release(struct kfd_process_device 
*pdd,
        pdd->ptl_disable_req = false;
 
 out:
-       mutex_unlock(&p->mutex);
+       mutex_unlock(&adev->psp.ptl_mutex);
        return ret;
 }
 
@@ -3340,6 +3340,7 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
                if (!kfd->profiler_process) {
                        kfd->profiler_process = p;
                        status = 0;
+                       mutex_unlock(&kfd->profiler_lock);
                        if (pdd->dev->adev->psp.ptl_hw_supported) {
                                status = kfd_ptl_disable_request(pdd, p);
                                if (status != 0)
@@ -3347,6 +3348,7 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
                                                "Failed to lock device %d for 
profiling, error %d\n",
                                                gpu_id, status);
                        }
+                       return status;
                } else if (kfd->profiler_process == p) {
                        status = -EALREADY;
                } else {
@@ -3355,6 +3357,7 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
        } else if (op == 0 && kfd->profiler_process == p) {
                kfd->profiler_process = NULL;
                status = 0;
+               mutex_unlock(&kfd->profiler_lock);
 
                if (pdd->dev->adev->psp.ptl_hw_supported) {
                        status = kfd_ptl_disable_release(pdd, p);
@@ -3363,6 +3366,7 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
                                                "Failed to unlock device %d for 
profiling, error %d\n",
                                                gpu_id, status);
                }
+               return status;
        }
        mutex_unlock(&kfd->profiler_lock);
 
-- 
2.34.1

Reply via email to