Re: [PATCH v2 6/8] drm/amdkfd: Add user queue eviction restore SMI event

2022-01-27 Thread Felix Kuehling



Am 2022-01-20 um 18:13 schrieb Philip Yang:

Output user queue eviction and restore event. User queue eviction may be
triggered by migration, MMU notifier, TTM eviction or device suspend.

User queue restore may be rescheduled if eviction happens again while
restore.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  7 +++-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 11 --
  drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 37 +--
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 34 +
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  4 ++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 16 ++--
  8 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ac841ae8f5cc..bd3301e2c682 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -309,6 +309,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device 
*adev,
   */
  void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
  void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
+void kfd_process_smi_event_restore_rescheduled(struct mm_struct *mm);
  #else
  static inline
  void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
@@ -325,9 +326,13 @@ static inline
  void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
  {
  }
+
+static inline void kfd_process_smi_event_restore_rescheduled(struct mm_struct 
*mm)
+{
+}
  #endif
  /* KGD2KFD callbacks */
-int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
  int kgd2kfd_resume_mm(struct mm_struct *mm);
  int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5df387c4d7fb..c44e8dc0d869 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2066,7 +2066,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
evicted_bos = atomic_inc_return(&process_info->evicted_bos);
if (evicted_bos == 1) {
/* First eviction, stop the queues */
-   r = kgd2kfd_quiesce_mm(mm);
+   r = kgd2kfd_quiesce_mm(mm, USERPTR_EVICTION);
if (r)
pr_err("Failed to quiesce KFD\n");
schedule_delayed_work(&process_info->restore_userptr_work,
@@ -2340,13 +2340,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
  
  unlock_out:

mutex_unlock(&process_info->lock);
-   mmput(mm);
-   put_task_struct(usertask);
  
  	/* If validation failed, reschedule another attempt */

-   if (evicted_bos)
+   if (evicted_bos) {
schedule_delayed_work(&process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+   kfd_process_smi_event_restore_rescheduled(mm);
+   }
+   mmput(mm);
+   put_task_struct(usertask);
  }
  
  /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 5a47f437b455..ffaa80447d9c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -783,7 +783,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
*ih_ring_entry)
spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
  }
  
-int kgd2kfd_quiesce_mm(struct mm_struct *mm)

+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
  {
struct kfd_process *p;
int r;
@@ -797,7 +797,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
return -ESRCH;
  
  	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);

-   r = kfd_process_evict_queues(p);
+   r = kfd_process_evict_queues(p, trigger);
  
  	kfd_unref_process(p);

return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ea68f3b3a4e9..39519084df78 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -906,7 +906,7 @@ static inline struct kfd_process_device 
*kfd_process_device_from_gpuidx(
  }
  
  void kfd_unref_process(struct kfd_process *p);

-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
  int kfd_process_restore_queues(struct kfd_process *p);
  void kfd_suspend_all_processes(void);
  int kfd_resume_all_processes(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_

[PATCH v2 6/8] drm/amdkfd: Add user queue eviction restore SMI event

2022-01-20 Thread Philip Yang
Output user queue eviction and restore event. User queue eviction may be
triggered by migration, MMU notifier, TTM eviction or device suspend.

User queue restore may be rescheduled if eviction happens again while
restore.

Signed-off-by: Philip Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  7 +++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 11 --
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 37 +--
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 34 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  4 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c  | 16 ++--
 8 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ac841ae8f5cc..bd3301e2c682 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -309,6 +309,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device 
*adev,
  */
 void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
 void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
+void kfd_process_smi_event_restore_rescheduled(struct mm_struct *mm);
 #else
 static inline
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
@@ -325,9 +326,13 @@ static inline
 void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
 {
 }
+
+static inline void kfd_process_smi_event_restore_rescheduled(struct mm_struct 
*mm)
+{
+}
 #endif
 /* KGD2KFD callbacks */
-int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
 int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5df387c4d7fb..c44e8dc0d869 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2066,7 +2066,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
evicted_bos = atomic_inc_return(&process_info->evicted_bos);
if (evicted_bos == 1) {
/* First eviction, stop the queues */
-   r = kgd2kfd_quiesce_mm(mm);
+   r = kgd2kfd_quiesce_mm(mm, USERPTR_EVICTION);
if (r)
pr_err("Failed to quiesce KFD\n");
schedule_delayed_work(&process_info->restore_userptr_work,
@@ -2340,13 +2340,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
 
 unlock_out:
mutex_unlock(&process_info->lock);
-   mmput(mm);
-   put_task_struct(usertask);
 
/* If validation failed, reschedule another attempt */
-   if (evicted_bos)
+   if (evicted_bos) {
schedule_delayed_work(&process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+   kfd_process_smi_event_restore_rescheduled(mm);
+   }
+   mmput(mm);
+   put_task_struct(usertask);
 }
 
 /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 5a47f437b455..ffaa80447d9c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -783,7 +783,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
*ih_ring_entry)
spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
 }
 
-int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
 {
struct kfd_process *p;
int r;
@@ -797,7 +797,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
return -ESRCH;
 
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
-   r = kfd_process_evict_queues(p);
+   r = kfd_process_evict_queues(p, trigger);
 
kfd_unref_process(p);
return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ea68f3b3a4e9..39519084df78 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -906,7 +906,7 @@ static inline struct kfd_process_device 
*kfd_process_device_from_gpuidx(
 }
 
 void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
 int kfd_process_restore_queues(struct kfd_process *p);
 void kfd_suspend_all_processes(void);
 int kfd_resume_all_processes(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 74f162887d3b..e4ba4d537b3c 100644
--- a/drivers/gpu/drm/amd/amd