Am 2021-08-20 um 1:32 a.m. schrieb Joseph Greathouse:
> Give every process at most one queue from each SDMA engine.
> Previously, we allocated all SDMA engines and queues on a first-
> come-first-served basis. This meant that it was possible for two
> processes racing on their allocation requests to each end up with
> two queues on the same SDMA engine. That could serialize the
> performance of commands to different queues.
>
> This new mechanism allows each process at most a single queue
> on each SDMA engine. Processes will check for queue availability on
> SDMA engine 0, then 1, then 2, etc. and only allocate on that
> engine if there is an available queue slot. If there are no
> queue slots available (or if this process has already allocated
> a queue on this engine), it moves on and tries the next engine.
>
> The Aldebaran chip has a small quirk that SDMA0 should only be
> used by the very first allocation from each process. It is OK to
> use any other SDMA engine at any time, but after the first SDMA
> allocation request from a process, the resulting engine must
> be from SDMA1 or above. This patch handles this case as well.
>
> Signed-off-by: Joseph Greathouse <joseph.greatho...@amd.com>

One final nit-pick inline. With that fixed, the series is

Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>


> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 135 +++++++++++++-----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   3 +
>  3 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f8fce9d05f50..86bdb765f350 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -52,12 +52,14 @@ static int unmap_queues_cpsch(struct device_queue_manager 
> *dqm,
>  static int map_queues_cpsch(struct device_queue_manager *dqm);
>  
>  static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +                             struct qcm_process_device *qpd,
>                               struct queue *q);
>  
>  static inline void deallocate_hqd(struct device_queue_manager *dqm,
>                               struct queue *q);
>  static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
>  static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +                             struct qcm_process_device *qpd,
>                               struct queue *q);
>  static void kfd_process_hw_exception(struct work_struct *work);
>  
> @@ -349,7 +351,7 @@ static int create_queue_nocpsch(struct 
> device_queue_manager *dqm,
>                       q->pipe, q->queue);
>       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -             retval = allocate_sdma_queue(dqm, q);
> +             retval = allocate_sdma_queue(dqm, qpd, q);
>               if (retval)
>                       goto deallocate_vmid;
>               dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> @@ -410,7 +412,7 @@ static int create_queue_nocpsch(struct 
> device_queue_manager *dqm,
>               deallocate_hqd(dqm, q);
>       else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -             deallocate_sdma_queue(dqm, q);
> +             deallocate_sdma_queue(dqm, qpd, q);
>  deallocate_vmid:
>       if (list_empty(&qpd->queues_list))
>               deallocate_vmid(dqm, qpd, q);
> @@ -475,9 +477,9 @@ static int destroy_queue_nocpsch_locked(struct 
> device_queue_manager *dqm,
>       if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
>               deallocate_hqd(dqm, q);
>       else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -             deallocate_sdma_queue(dqm, q);
> +             deallocate_sdma_queue(dqm, qpd, q);
>       else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -             deallocate_sdma_queue(dqm, q);
> +             deallocate_sdma_queue(dqm, qpd, q);
>       else {
>               pr_debug("q->properties.type %d is invalid\n",
>                               q->properties.type);
> @@ -1039,42 +1041,93 @@ static void pre_reset(struct device_queue_manager 
> *dqm)
>       dqm_unlock(dqm);
>  }
>  
> +static uint64_t sdma_engine_mask(int engine, int num_engines)
> +{
> +     uint64_t mask = 0;
> +
> +     engine %= num_engines;
> +     while (engine < (sizeof(uint64_t)*8)) {
> +             mask |= 1ULL << engine;
> +             engine += num_engines;
> +     }
> +     return mask;
> +}
> +
>  static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +                             struct qcm_process_device *qpd,
>                               struct queue *q)
>  {
> -     int bit;
> +     uint64_t available_queue_bitmap;
> +     unsigned int bit, engine, num_engines;
> +     bool found_sdma = false;
>  
>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -             if (dqm->sdma_bitmap == 0) {
> +             num_engines = get_num_sdma_engines(dqm);
> +             for_each_set_bit(engine, &(qpd->sdma_engine_bitmap), 
> num_engines) {
> +                     /* Do not reuse SDMA0 for any subsequent SDMA queue
> +                      * requests on Aldebaran. If SDMA0's queues are all
> +                      * full, then this process should never use SDMA0
> +                      * for any further requests
> +                      */
> +                     if (dqm->dev->device_info->asic_family == 
> CHIP_ALDEBARAN &&
> +                                     engine == 0)
> +                             qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +
> +                     available_queue_bitmap = sdma_engine_mask(engine, 
> num_engines);
> +                     available_queue_bitmap &= dqm->sdma_bitmap;
> +
> +                     if (!available_queue_bitmap)
> +                             continue;
> +                     /* Take the selected engine off the list so we will not
> +                      * allocate two queues onto the same engine
> +                      */
> +                     qpd->sdma_engine_bitmap &= ~(1ULL << engine);
> +                     found_sdma = true;
> +
> +                     bit = __ffs64(available_queue_bitmap);
> +                     dqm->sdma_bitmap &= ~(1ULL << bit);
> +                     q->sdma_id = bit;
> +                     q->properties.sdma_engine_id = q->sdma_id % num_engines;
> +                     q->properties.sdma_queue_id = q->sdma_id / num_engines;
> +                     break;
> +             }
> +             if (!found_sdma) {
>                       pr_err("No more SDMA queue to allocate\n");
>                       return -ENOMEM;
>               }
> -
> -             bit = __ffs64(dqm->sdma_bitmap);
> -             dqm->sdma_bitmap &= ~(1ULL << bit);
> -             q->sdma_id = bit;
> -             q->properties.sdma_engine_id = q->sdma_id %
> -                             get_num_sdma_engines(dqm);
> -             q->properties.sdma_queue_id = q->sdma_id /
> -                             get_num_sdma_engines(dqm);
>       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -             if (dqm->xgmi_sdma_bitmap == 0) {
> +             num_engines = get_num_xgmi_sdma_engines(dqm);
> +             for_each_set_bit(engine, &(qpd->xgmi_sdma_engine_bitmap), 
> num_engines) {
> +                     available_queue_bitmap = sdma_engine_mask(engine, 
> num_engines);
> +                     available_queue_bitmap &= dqm->xgmi_sdma_bitmap;
> +
> +                     if (!available_queue_bitmap)
> +                             continue;
> +                     /* Take the selected engine off the list so we will not
> +                      * allocate two queues onto the same engine
> +                      */
> +                     qpd->xgmi_sdma_engine_bitmap &= ~(1ULL << engine);
> +                     found_sdma = true;
> +
> +                     bit = __ffs64(available_queue_bitmap);
> +                     dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> +                     q->sdma_id = bit;
> +                     /* sdma_engine_id is sdma id including
> +                      * both PCIe-optimized SDMAs and XGMI-
> +                      * optimized SDMAs. The calculation below
> +                      * assumes the first N engines are always
> +                      * PCIe-optimized ones
> +                      */
> +                     q->properties.sdma_engine_id = 
> get_num_sdma_engines(dqm) +
> +                             q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> +                     q->properties.sdma_queue_id = q->sdma_id /
> +                             get_num_xgmi_sdma_engines(dqm);

You could use "num_engines" here instead of get_num_xgmi_sdma_engines(dqm).

Regards,
  Felix


> +                     break;
> +             }
> +             if (!found_sdma) {
>                       pr_err("No more XGMI SDMA queue to allocate\n");
>                       return -ENOMEM;
>               }
> -             bit = __ffs64(dqm->xgmi_sdma_bitmap);
> -             dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> -             q->sdma_id = bit;
> -             /* sdma_engine_id is sdma id including
> -              * both PCIe-optimized SDMAs and XGMI-
> -              * optimized SDMAs. The calculation below
> -              * assumes the first N engines are always
> -              * PCIe-optimized ones
> -              */
> -             q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
> -                             q->sdma_id % get_num_xgmi_sdma_engines(dqm);
> -             q->properties.sdma_queue_id = q->sdma_id /
> -                             get_num_xgmi_sdma_engines(dqm);
>       }
>  
>       pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> @@ -1084,16 +1137,32 @@ static int allocate_sdma_queue(struct 
> device_queue_manager *dqm,
>  }
>  
>  static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +                             struct qcm_process_device *qpd,
>                               struct queue *q)
>  {
> +     uint32_t engine = q->properties.sdma_engine_id;
> +
>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>               if (q->sdma_id >= get_num_sdma_queues(dqm))
>                       return;
>               dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +             /* Don't give SDMA0 back to be reallocated on Aldebaran.
> +              * It is only OK to use this engine from the first allocation
> +              * within a process.
> +              */
> +             if (!(dqm->dev->device_info->asic_family == CHIP_ALDEBARAN &&
> +                                     engine == 0))
> +                     qpd->sdma_engine_bitmap |= (1ULL << engine);
> +
>       } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>               if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>                       return;
>               dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +             /* engine ID in the queue properties is the global engine ID.
> +              * The XGMI engine bitmap ignores the PCIe-optimized engines.
> +              */
> +             engine -= get_num_sdma_engines(dqm);
> +             qpd->xgmi_sdma_engine_bitmap |= (1ULL << engine);
>       }
>  }
>  
> @@ -1303,7 +1372,7 @@ static int create_queue_cpsch(struct 
> device_queue_manager *dqm, struct queue *q,
>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>               dqm_lock(dqm);
> -             retval = allocate_sdma_queue(dqm, q);
> +             retval = allocate_sdma_queue(dqm, qpd, q);
>               dqm_unlock(dqm);
>               if (retval)
>                       goto out;
> @@ -1365,7 +1434,7 @@ static int create_queue_cpsch(struct 
> device_queue_manager *dqm, struct queue *q,
>       if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>               dqm_lock(dqm);
> -             deallocate_sdma_queue(dqm, q);
> +             deallocate_sdma_queue(dqm, qpd, q);
>               dqm_unlock(dqm);
>       }
>  out:
> @@ -1536,7 +1605,7 @@ static int destroy_queue_cpsch(struct 
> device_queue_manager *dqm,
>  
>       if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>           (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -             deallocate_sdma_queue(dqm, q);
> +             deallocate_sdma_queue(dqm, qpd, q);
>               pdd->sdma_past_activity_counter += sdma_val;
>       }
>  
> @@ -1751,9 +1820,9 @@ static int process_termination_cpsch(struct 
> device_queue_manager *dqm,
>       /* Clear all user mode queues */
>       list_for_each_entry(q, &qpd->queues_list, list) {
>               if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> -                     deallocate_sdma_queue(dqm, q);
> +                     deallocate_sdma_queue(dqm, qpd, q);
>               else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -                     deallocate_sdma_queue(dqm, q);
> +                     deallocate_sdma_queue(dqm, qpd, q);
>  
>               if (q->properties.is_active) {
>                       decrement_queue_count(dqm, q->properties.type);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ab83b0de6b22..c38eebc9db4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -576,6 +576,8 @@ struct qcm_process_device {
>       struct list_head priv_queue_list;
>  
>       unsigned int queue_count;
> +     unsigned long sdma_engine_bitmap;
> +     unsigned long xgmi_sdma_engine_bitmap;
>       unsigned int vmid;
>       bool is_debug;
>       unsigned int evicted; /* eviction counter, 0=active */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 21ec8a18cad2..13c85624bf7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1422,6 +1422,7 @@ struct kfd_process_device 
> *kfd_create_process_device_data(struct kfd_dev *dev,
>                                                       struct kfd_process *p)
>  {
>       struct kfd_process_device *pdd = NULL;
> +     const struct kfd_device_info *dev_info = dev->dqm->dev->device_info;
>  
>       if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>               return NULL;
> @@ -1446,6 +1447,8 @@ struct kfd_process_device 
> *kfd_create_process_device_data(struct kfd_dev *dev,
>       pdd->qpd.pqm = &p->pqm;
>       pdd->qpd.evicted = 0;
>       pdd->qpd.mapped_gws_queue = false;
> +     pdd->qpd.sdma_engine_bitmap = BIT_ULL(dev_info->num_sdma_engines) - 1;
> +     pdd->qpd.xgmi_sdma_engine_bitmap = 
> BIT_ULL(dev_info->num_xgmi_sdma_engines) - 1;
>       pdd->process = p;
>       pdd->bound = PDD_UNBOUND;
>       pdd->already_dequeued = false;

Reply via email to