Stop the scheduler before releasing the PTL disable request to ensure
the GPU is quiescent during the PTL state transition. This prevents
potential queue preemption failures and GPU resets caused by modifying
PTL state while waves are executing.

v1->v2:
Only stop/start the scheduler when the PTL state actually needs to
transition (Yifan).

Signed-off-by: Perry Yuan <[email protected]>
Reviewed-by: Yifan Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 9a23621542fa..d699d0354bda 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1780,10 +1780,17 @@ static int kfd_ptl_control(struct kfd_process_device 
*pdd, bool enable)
        if (!pdd->dev->kfd2kgd || !pdd->dev->kfd2kgd->ptl_ctrl)
                return -EOPNOTSUPP;
 
+       if (adev->kfd.init_complete)
+               amdgpu_amdkfd_stop_sched(adev, pdd->dev->node_id);
+
        ret = pdd->dev->kfd2kgd->ptl_ctrl(adev, PSP_PTL_PERF_MON_SET,
                                          &ptl_state,
                                          &pref_format1,
                                          &pref_format2);
+
+       if (adev->kfd.init_complete)
+               amdgpu_amdkfd_start_sched(adev, pdd->dev->node_id);
+
        return ret;
 }
 
@@ -3310,6 +3317,7 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
        struct kfd_process_device *pdd;
        struct kfd_dev *kfd;
        int status = -EINVAL;
+       struct amdgpu_ptl *ptl;
 
        if (!p)
                return -EINVAL;
@@ -3322,13 +3330,22 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
                return -EINVAL;
 
        kfd = pdd->dev->kfd;
+       ptl = &pdd->dev->adev->psp.ptl;
 
        mutex_lock(&kfd->profiler_lock);
        if (op == 1) {
                if (!kfd->profiler_process) {
                        kfd->profiler_process = p;
                        status = 0;
-                       kfd_ptl_disable_request(pdd, p);
+                       mutex_unlock(&kfd->profiler_lock);
+                       if (ptl->hw_supported) {
+                               status = kfd_ptl_disable_request(pdd, p);
+                               if (status != 0)
+                                       dev_err(kfd_device,
+                                               "Failed to lock device %d for 
profiling, error %d\n",
+                                               gpu_id, status);
+                       }
+                       return status;
                } else if (kfd->profiler_process == p) {
                        status = -EALREADY;
                } else {
@@ -3337,7 +3354,16 @@ static inline uint32_t profile_lock_device(struct 
kfd_process *p,
        } else if (op == 0 && kfd->profiler_process == p) {
                kfd->profiler_process = NULL;
                status = 0;
-               kfd_ptl_disable_release(pdd, p);
+               mutex_unlock(&kfd->profiler_lock);
+
+               if (ptl->hw_supported) {
+                       status = kfd_ptl_disable_release(pdd, p);
+                       if (status)
+                               dev_err(kfd_device,
+                                               "Failed to unlock device %d for 
profiling, error %d\n",
+                                               gpu_id, status);
+               }
+               return status;
        }
        mutex_unlock(&kfd->profiler_lock);
 
-- 
2.34.1

Reply via email to