Re: [PATCH 1/1] drm/amdkfd: Process notifier release callback don't take mutex

2022-07-11 Thread Felix Kuehling

On 2022-07-07 17:23, Philip Yang wrote:

Move process queues cleanup to deferred work kfd_process_wq_release, to
avoid potential deadlock circular locking warning:

  WARNING: possible circular locking dependency detected
the existing dependency chain (in reverse order) is:
   -> #2
 ((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}:
 __flush_work+0x343/0x4a0
 svm_range_list_lock_and_flush_work+0x39/0xc0
 svm_range_set_attr+0xe8/0x1080 [amdgpu]
 kfd_ioctl+0x19b/0x600 [amdgpu]
 __x64_sys_ioctl+0x81/0xb0
 do_syscall_64+0x34/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

   -> #1 (&info->lock#2){+.+.}-{3:3}:
 __mutex_lock+0xa4/0x940
 amdgpu_amdkfd_gpuvm_acquire_process_vm+0x2e3/0x590
 kfd_process_device_init_vm+0x61/0x200 [amdgpu]
 kfd_ioctl_acquire_vm+0x83/0xb0 [amdgpu]
 kfd_ioctl+0x19b/0x600 [amdgpu]
 __x64_sys_ioctl+0x81/0xb0
 do_syscall_64+0x34/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae

   -> #0 (&process->mutex){+.+.}-{3:3}:
 __lock_acquire+0x1365/0x23d0
 lock_acquire+0xc9/0x2e0
 __mutex_lock+0xa4/0x940
 kfd_process_notifier_release+0x96/0xe0 [amdgpu]
 __mmu_notifier_release+0x94/0x210
 exit_mmap+0x35/0x1f0
 mmput+0x63/0x120
 svm_range_deferred_list_work+0x177/0x2c0 [amdgpu]
 process_one_work+0x2a4/0x600
 worker_thread+0x39/0x3e0
 kthread+0x16d/0x1a0

   Possible unsafe locking scenario:

   CPU0CPU1
 
lock((work_completion)(&svms->deferred_list_work));
 lock(&info->lock#2);
  lock((work_completion)(&svms->deferred_list_work));
lock(&process->mutex);

Signed-off-by: Philip Yang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 21 +
  1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index fc38a4d81420..3c9cf9bdb63f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1115,6 +1115,15 @@ static void kfd_process_wq_release(struct work_struct 
*work)
struct kfd_process *p = container_of(work, struct kfd_process,
 release_work);
  
+	kfd_process_dequeue_from_all_devices(p);

+   pqm_uninit(&p->pqm);
+
+   /* Signal the eviction fence after user mode queues are
+* destroyed. This allows any BOs to be freed without
+* triggering pointless evictions or waiting for fences.
+*/
+   dma_fence_signal(p->ef);
+
kfd_process_remove_sysfs(p);
kfd_iommu_unbind_process(p);
  
@@ -1179,20 +1188,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,

cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
  
-	mutex_lock(&p->mutex);

-
-   kfd_process_dequeue_from_all_devices(p);
-   pqm_uninit(&p->pqm);
-
/* Indicate to other users that MM is no longer valid */
p->mm = NULL;
-   /* Signal the eviction fence after user mode queues are
-* destroyed. This allows any BOs to be freed without
-* triggering pointless evictions or waiting for fences.
-*/
-   dma_fence_signal(p->ef);
-
-   mutex_unlock(&p->mutex);
  
  	mmu_notifier_put(&p->mmu_notifier);

  }


Re: [PATCH 1/1] drm/amdkfd: Process notifier release callback don't take mutex

2022-07-08 Thread Felix Kuehling
I think this could also be fixed by not taking the process_info lock in 
svm_range_restore_work and svm_range_set_attr. I'm not even sure why 
we're taking this lock in the SVM code. I think that was copied from the 
restore workers in amdgpu_amdkfd_gpuvm.c because there it's used to 
protect the BO lists. But I think in the case of SVM, the equivalent 
lists are protected by the svms->lock.


Regards,
  Felix


On 2022-07-07 17:23, Philip Yang wrote:

Move process queues cleanup to deferred work kfd_process_wq_release, to
avoid potential deadlock circular locking warning:

  WARNING: possible circular locking dependency detected
the existing dependency chain (in reverse order) is:
   -> #2
 ((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}:
 __flush_work+0x343/0x4a0
 svm_range_list_lock_and_flush_work+0x39/0xc0
 svm_range_set_attr+0xe8/0x1080 [amdgpu]
 kfd_ioctl+0x19b/0x600 [amdgpu]
 __x64_sys_ioctl+0x81/0xb0
 do_syscall_64+0x34/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

   -> #1 (&info->lock#2){+.+.}-{3:3}:
 __mutex_lock+0xa4/0x940
 amdgpu_amdkfd_gpuvm_acquire_process_vm+0x2e3/0x590
 kfd_process_device_init_vm+0x61/0x200 [amdgpu]
 kfd_ioctl_acquire_vm+0x83/0xb0 [amdgpu]
 kfd_ioctl+0x19b/0x600 [amdgpu]
 __x64_sys_ioctl+0x81/0xb0
 do_syscall_64+0x34/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae

   -> #0 (&process->mutex){+.+.}-{3:3}:
 __lock_acquire+0x1365/0x23d0
 lock_acquire+0xc9/0x2e0
 __mutex_lock+0xa4/0x940
 kfd_process_notifier_release+0x96/0xe0 [amdgpu]
 __mmu_notifier_release+0x94/0x210
 exit_mmap+0x35/0x1f0
 mmput+0x63/0x120
 svm_range_deferred_list_work+0x177/0x2c0 [amdgpu]
 process_one_work+0x2a4/0x600
 worker_thread+0x39/0x3e0
 kthread+0x16d/0x1a0

   Possible unsafe locking scenario:

   CPU0CPU1
 
lock((work_completion)(&svms->deferred_list_work));
 lock(&info->lock#2);
  lock((work_completion)(&svms->deferred_list_work));
lock(&process->mutex);

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 21 +
  1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index fc38a4d81420..3c9cf9bdb63f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1115,6 +1115,15 @@ static void kfd_process_wq_release(struct work_struct 
*work)
struct kfd_process *p = container_of(work, struct kfd_process,
 release_work);
  
+	kfd_process_dequeue_from_all_devices(p);

+   pqm_uninit(&p->pqm);
+
+   /* Signal the eviction fence after user mode queues are
+* destroyed. This allows any BOs to be freed without
+* triggering pointless evictions or waiting for fences.
+*/
+   dma_fence_signal(p->ef);
+
kfd_process_remove_sysfs(p);
kfd_iommu_unbind_process(p);
  
@@ -1179,20 +1188,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,

cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
  
-	mutex_lock(&p->mutex);

-
-   kfd_process_dequeue_from_all_devices(p);
-   pqm_uninit(&p->pqm);
-
/* Indicate to other users that MM is no longer valid */
p->mm = NULL;
-   /* Signal the eviction fence after user mode queues are
-* destroyed. This allows any BOs to be freed without
-* triggering pointless evictions or waiting for fences.
-*/
-   dma_fence_signal(p->ef);
-
-   mutex_unlock(&p->mutex);
  
  	mmu_notifier_put(&p->mmu_notifier);

  }


[PATCH 1/1] drm/amdkfd: Process notifier release callback don't take mutex

2022-07-07 Thread Philip Yang
Move process queues cleanup to deferred work kfd_process_wq_release, to
avoid potential deadlock circular locking warning:

 WARNING: possible circular locking dependency detected
   the existing dependency chain (in reverse order) is:
  -> #2
((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}:
__flush_work+0x343/0x4a0
svm_range_list_lock_and_flush_work+0x39/0xc0
svm_range_set_attr+0xe8/0x1080 [amdgpu]
kfd_ioctl+0x19b/0x600 [amdgpu]
__x64_sys_ioctl+0x81/0xb0
do_syscall_64+0x34/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae

  -> #1 (&info->lock#2){+.+.}-{3:3}:
__mutex_lock+0xa4/0x940
amdgpu_amdkfd_gpuvm_acquire_process_vm+0x2e3/0x590
kfd_process_device_init_vm+0x61/0x200 [amdgpu]
kfd_ioctl_acquire_vm+0x83/0xb0 [amdgpu]
kfd_ioctl+0x19b/0x600 [amdgpu]
__x64_sys_ioctl+0x81/0xb0
do_syscall_64+0x34/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xae

  -> #0 (&process->mutex){+.+.}-{3:3}:
__lock_acquire+0x1365/0x23d0
lock_acquire+0xc9/0x2e0
__mutex_lock+0xa4/0x940
kfd_process_notifier_release+0x96/0xe0 [amdgpu]
__mmu_notifier_release+0x94/0x210
exit_mmap+0x35/0x1f0
mmput+0x63/0x120
svm_range_deferred_list_work+0x177/0x2c0 [amdgpu]
process_one_work+0x2a4/0x600
worker_thread+0x39/0x3e0
kthread+0x16d/0x1a0

  Possible unsafe locking scenario:

  CPU0CPU1

   lock((work_completion)(&svms->deferred_list_work));
lock(&info->lock#2);
 lock((work_completion)(&svms->deferred_list_work));
   lock(&process->mutex);

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index fc38a4d81420..3c9cf9bdb63f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1115,6 +1115,15 @@ static void kfd_process_wq_release(struct work_struct 
*work)
struct kfd_process *p = container_of(work, struct kfd_process,
 release_work);
 
+   kfd_process_dequeue_from_all_devices(p);
+   pqm_uninit(&p->pqm);
+
+   /* Signal the eviction fence after user mode queues are
+* destroyed. This allows any BOs to be freed without
+* triggering pointless evictions or waiting for fences.
+*/
+   dma_fence_signal(p->ef);
+
kfd_process_remove_sysfs(p);
kfd_iommu_unbind_process(p);
 
@@ -1179,20 +1188,8 @@ static void kfd_process_notifier_release(struct 
mmu_notifier *mn,
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
 
-   mutex_lock(&p->mutex);
-
-   kfd_process_dequeue_from_all_devices(p);
-   pqm_uninit(&p->pqm);
-
/* Indicate to other users that MM is no longer valid */
p->mm = NULL;
-   /* Signal the eviction fence after user mode queues are
-* destroyed. This allows any BOs to be freed without
-* triggering pointless evictions or waiting for fences.
-*/
-   dma_fence_signal(p->ef);
-
-   mutex_unlock(&p->mutex);
 
mmu_notifier_put(&p->mmu_notifier);
 }
-- 
2.35.1