Am 2022-01-20 um 18:13 schrieb Philip Yang:
Output user queue eviction and restore events. User queue eviction may be
triggered by migration, MMU notifier, TTM eviction or device suspend.

User queue restore may be rescheduled if another eviction happens while the
restore is in progress.

Signed-off-by: Philip Yang <philip.y...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  7 +++-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 11 ++++--
  drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 37 +++++++++++++++++--
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 34 +++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  4 ++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c          | 16 ++++++--
  8 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ac841ae8f5cc..bd3301e2c682 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -309,6 +309,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device 
*adev,
   */
  void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);
  void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
+void kfd_process_smi_event_restore_rescheduled(struct mm_struct *mm);
  #else
  static inline
  void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
@@ -325,9 +326,13 @@ static inline
  void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
  {
  }
+
+static inline void kfd_process_smi_event_restore_rescheduled(struct mm_struct 
*mm)
+{
+}
  #endif
  /* KGD2KFD callbacks */
-int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
  int kgd2kfd_resume_mm(struct mm_struct *mm);
  int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                                struct dma_fence *fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5df387c4d7fb..c44e8dc0d869 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2066,7 +2066,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
        evicted_bos = atomic_inc_return(&process_info->evicted_bos);
        if (evicted_bos == 1) {
                /* First eviction, stop the queues */
-               r = kgd2kfd_quiesce_mm(mm);
+               r = kgd2kfd_quiesce_mm(mm, USERPTR_EVICTION);
                if (r)
                        pr_err("Failed to quiesce KFD\n");
                schedule_delayed_work(&process_info->restore_userptr_work,
@@ -2340,13 +2340,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
unlock_out:
        mutex_unlock(&process_info->lock);
-       mmput(mm);
-       put_task_struct(usertask);
/* If validation failed, reschedule another attempt */
-       if (evicted_bos)
+       if (evicted_bos) {
                schedule_delayed_work(&process_info->restore_userptr_work,
                        msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+               kfd_process_smi_event_restore_rescheduled(mm);
+       }
+       mmput(mm);
+       put_task_struct(usertask);
  }
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 5a47f437b455..ffaa80447d9c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -783,7 +783,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void 
*ih_ring_entry)
        spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
  }
-int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
  {
        struct kfd_process *p;
        int r;
@@ -797,7 +797,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
                return -ESRCH;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
-       r = kfd_process_evict_queues(p);
+       r = kfd_process_evict_queues(p, trigger);
kfd_unref_process(p);
        return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ea68f3b3a4e9..39519084df78 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -906,7 +906,7 @@ static inline struct kfd_process_device 
*kfd_process_device_from_gpuidx(
  }
void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
  int kfd_process_restore_queues(struct kfd_process *p);
  void kfd_suspend_all_processes(void);
  int kfd_resume_all_processes(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 74f162887d3b..e4ba4d537b3c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -43,6 +43,7 @@ struct mm_struct;
  #include "kfd_dbgmgr.h"
  #include "kfd_iommu.h"
  #include "kfd_svm.h"
+#include "kfd_smi_events.h"
/*
   * List of struct kfd_process (field kfd_process).
@@ -1701,7 +1702,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct 
mm_struct *mm)
   * Eviction is reference-counted per process-device. This means multiple
   * evictions from different sources can be nested safely.
   */
-int kfd_process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
  {
        int r = 0;
        int i;
@@ -1710,6 +1711,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
+ kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
+                                            trigger);
+

If we're always evicting queues on all GPUs, we probably only need one event per process, not one event per GPU.


                r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
                                                            &pdd->qpd);
                /* evict return -EIO if HWS is hang or asic is resetting, in 
this case
@@ -1734,6 +1738,11 @@ int kfd_process_evict_queues(struct kfd_process *p)
if (n_evicted == 0)
                        break;
+
+               kfd_smi_event_queue_eviction_restore(pdd->dev,
+                                                    p->lead_thread->pid,
+                                                    false);
+
                if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
                                                              &pdd->qpd))
                        pr_err("Failed to restore queues\n");
@@ -1753,6 +1762,10 @@ int kfd_process_restore_queues(struct kfd_process *p)
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
+ kfd_smi_event_queue_eviction_restore(pdd->dev,
+                                                    p->lead_thread->pid,
+                                                    false);
+
                r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
                                                              &pdd->qpd);
                if (r) {
@@ -1765,6 +1778,24 @@ int kfd_process_restore_queues(struct kfd_process *p)
        return ret;
  }
+void kfd_process_smi_event_restore_rescheduled(struct mm_struct *mm)
+{
+       struct kfd_process *p;
+       int i;
+
+       p = kfd_lookup_process_by_mm(mm);
+       if (!p)
+               return;
+
+       for (i = 0; i < p->n_pdds; i++) {
+               struct kfd_process_device *pdd = p->pdds[i];
+
+               kfd_smi_event_queue_eviction_restore(pdd->dev,
+                                                    p->lead_thread->pid, true);

Same as above. One event per process should be enough.


+       }
+       kfd_unref_process(p);
+}
+
  int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t gpu_id)
  {
        int i;
@@ -1814,7 +1845,7 @@ static void evict_process_worker(struct work_struct *work)
        flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, TTM_EVICTION);
        if (!ret) {
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
@@ -1881,7 +1912,7 @@ void kfd_suspend_all_processes(void)
                cancel_delayed_work_sync(&p->eviction_work);
                cancel_delayed_work_sync(&p->restore_work);
- if (kfd_process_evict_queues(p))
+               if (kfd_process_evict_queues(p, SUSPEND_EVICTION))
                        pr_err("Failed to suspend process 0x%x\n", p->pasid);
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 97393f4f3549..facc8d7627d8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -343,6 +343,40 @@ void kfd_smi_event_migration(struct kfd_dev *dev, uint16_t 
pasid,
        add_event_to_kfifo(pid, dev, KFD_SMI_EVENT_MIGRATION, fifo_in, len);
  }
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+                                 uint32_t trigger)
+{
+       char fifo_in[64];
+       int len;
+
+       if (list_empty(&dev->smi_clients))
+               return;
+
+       len = snprintf(fifo_in, sizeof(fifo_in), "%x %lld -%d @%x %d\n",
+                      KFD_SMI_EVENT_QUEUE_EVICTION, ktime_get_boottime_ns(),
+                      pid, dev->id, trigger);
+
+       add_event_to_kfifo(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION, fifo_in, 
len);
+}
+
+void kfd_smi_event_queue_eviction_restore(struct kfd_dev *dev, pid_t pid,
+                                         bool rescheduled)
+{
+       char fifo_in[64];
+       int len;
+
+       if (list_empty(&dev->smi_clients))
+               return;
+
+       len = snprintf(fifo_in, sizeof(fifo_in), "%x %lld -%d @%x %c\n",
+                      KFD_SMI_EVENT_QUEUE_EVICTION_RESTORE,
+                      ktime_get_boottime_ns(), pid, dev->id,
+                      rescheduled ? 'r' : ' ');
+
+       add_event_to_kfifo(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION_RESTORE,
+                          fifo_in, len);
+}
+
  int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
  {
        struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 5788728f2879..d85300b5af23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -39,4 +39,8 @@ void kfd_smi_event_migration(struct kfd_dev *dev, uint16_t 
pasid,
                             uint32_t from, uint32_t to,
                             uint32_t prefetch_loc, uint32_t preferred_loc,
                             uint32_t trigger, uint64_t ts);
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+                                 uint32_t trigger);
+void kfd_smi_event_queue_eviction_restore(struct kfd_dev *dev, pid_t pid,
+                                         bool rescheduled);
  #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7dbc724364e6..30aaa9764067 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1712,7 +1712,10 @@ static void svm_range_restore_work(struct work_struct 
*work)
                pr_debug("reschedule to restore svm range\n");
                schedule_delayed_work(&svms->restore_work,
                        msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+
+               kfd_process_smi_event_restore_rescheduled(mm);
        }
+       mmput(mm);

Where is the matching mmget for this? If this is a fix for an old reference-leak bug in the code, it deserves its own patch.

Regards,
  Felix


  }
/**
@@ -1732,15 +1735,22 @@ static void svm_range_restore_work(struct work_struct 
*work)
   */
  static int
  svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
-               unsigned long start, unsigned long last)
+               unsigned long start, unsigned long last,
+               enum mmu_notifier_event event)
  {
        struct svm_range_list *svms = prange->svms;
        struct svm_range *pchild;
        struct kfd_process *p;
+       uint32_t trigger;
        int r = 0;
p = container_of(svms, struct kfd_process, svms); + if (event == MMU_NOTIFY_MIGRATE)
+               trigger = SVM_RANGE_MIGRATION;
+       else
+               trigger = SVM_RANGE_EVICTION;
+
        pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
                 svms, prange->start, prange->last, start, last);
@@ -1768,7 +1778,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
                         prange->svms, prange->start, prange->last);
/* First eviction, stop the queues */
-               r = kgd2kfd_quiesce_mm(mm);
+               r = kgd2kfd_quiesce_mm(mm, trigger);
                if (r)
                        pr_debug("failed to quiesce KFD\n");
@@ -2303,7 +2313,7 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
                svm_range_unmap_from_cpu(mni->mm, prange, start, last);
                break;
        default:
-               svm_range_evict(prange, mni->mm, start, last);
+               svm_range_evict(prange, mni->mm, start, last, range->event);
                break;
        }

Reply via email to