amdkfd: enable cooperative groups for gfx11

Felix Kuehling Tue, 18 Jul 2023 14:17:05 -0700


Am 2023-07-14 um 05:37 schrieb Jonathan Kim:

MES can concurrently schedule queues on the device that require
exclusive device access if marked exclusively_scheduled without the
requirement of GWS.  Similar to the F32 HWS, MES will manage
quality of service for these queues.
Use this for cooperative groups since cooperative groups are device
occupancy limited.

Since some GFX11 devices can only be debugged with partial CUs, do not
allow the debugging of cooperative groups on these devices as the CU
occupancy limit will change on attach.

In addition, zero initialize the MES add queue submission vector for MES
initialization tests as we do not want these to be cooperative
dispatches.

NOTE: FIXME MES FW enablement checks are a placeholder at the moment and
will be updated when the binary revision number is finalized.

Signed-off-by: Jonathan Kim <jonathan....@amd.com>

Some nit-picks inline. Looks good to me otherwise.

---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c               |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h               |  1 +
  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c                |  2 ++
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c              |  3 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c                |  3 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_device.c               |  6 +++++-
  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  9 ++++-----
  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c    | 11 +++++++----
  drivers/gpu/drm/amd/include/mes_v11_api_def.h         |  4 +++-
  9 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e9091ebfe230..8d13623389d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -638,7 +638,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int 
gang_id,
  {
        struct amdgpu_mes_queue *queue;
        struct amdgpu_mes_gang *gang;
-       struct mes_add_queue_input queue_input;
+       struct mes_add_queue_input queue_input = {0};

Generally, it is preferred to use memset to initialize structures onthe stack because that also initializes padding.

        unsigned long flags;
        int r;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

index 2d6ac30b7135..2053954a235c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -224,6 +224,7 @@ struct mes_add_queue_input {
        uint32_t        is_kfd_process;
        uint32_t        is_aql_queue;
        uint32_t        queue_size;
+       uint32_t        exclusively_scheduled;
  };

struct mes_remove_queue_input {

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 1bdaa00c0b46..8e67e965f7ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -214,6 +214,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
        mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
        mes_add_queue_pkt.gds_size = input->queue_size;

+ mes_add_queue_pkt.exclusively_scheduled = input->exclusively_scheduled;

+
        return mes_v11_0_submit_pkt_and_poll_completion(mes,
                        &mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
                        offsetof(union MESAPI__ADD_QUEUE, api_status));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 40ac093b5035..e18401811956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1487,7 +1487,8 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
                goto out_unlock;
        }

- if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {

+       if (p->debug_trap_enabled && (!kfd_dbg_has_gws_support(dev) ||
+                                      kfd_dbg_has_cwsr_workaround(dev))) {

Indentation looks off. kfd_dbg_has_cwsr_workaround should be indentedone less space. Otherwise you may be incorrectly implying that the !applies to it.

                retval = -EBUSY;
                goto out_unlock;
        }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index ccfc81f085ce..895e7f690fd0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -753,7 +753,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, 
uint32_t fd,
                if (!KFD_IS_SOC15(pdd->dev))
                        return -ENODEV;

- if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)

+               if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
+                                         
kfd_dbg_has_cwsr_workaround(pdd->dev)))


Same as above.

                        return -EBUSY;
        }

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c

index 0b3dc754e06b..9f4f75fd2fb2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -508,6 +508,7 @@ static int kfd_gws_init(struct kfd_node *node)
  {
        int ret = 0;
        struct kfd_dev *kfd = node->kfd;
+       uint32_t mes_rev = node->adev->mes.sched_version & 
AMDGPU_MES_VERSION_MASK;

if (node->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)

                return 0;
@@ -524,7 +525,10 @@ static int kfd_gws_init(struct kfd_node *node)
                (KFD_GC_VERSION(node) == IP_VERSION(9, 4, 3)) ||
                (KFD_GC_VERSION(node) >= IP_VERSION(10, 3, 0)
                        && KFD_GC_VERSION(node) < IP_VERSION(11, 0, 0)
-                       && kfd->mec2_fw_version >= 0x6b))))
+                       && kfd->mec2_fw_version >= 0x6b) ||
+               (KFD_GC_VERSION(node) >= IP_VERSION(11, 0, 0)
+                       && KFD_GC_VERSION(node) < IP_VERSION(12, 0, 0)
+                       && mes_rev >= 68)))) /* FIXME: Placeholder version */

Can this comment be removed? You should know the correct MES versionbefore submitting this patch.

                ret = amdgpu_amdkfd_alloc_gws(node->adev,
                                node->adev->gds.gws_size, &node->gws);

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index 761963ad6154..7e8bc7328a79 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -237,10 +237,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
        }
        queue_input.queue_type = (uint32_t)queue_type;

- if (q->gws) {

-               queue_input.gws_base = 0;
-               queue_input.gws_size = qpd->num_gws;
-       }
+       queue_input.exclusively_scheduled = q->properties.is_gws;
+       if (q->properties.is_gws && dqm->gws_queue_count > 1)
+               pr_warn("Runlist is getting oversubscibed.  Expect reduced ROCm 
performance.\n");

I'm not sure this message makes sense for MES. There isn't a "runlist"in the same sense as with the old HWS. And the scheduling algorithm ofMES should perform better, so maybe we don't need to warn about poorperformance.

amdgpu_mes_lock(&adev->mes);

        r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
@@ -250,7 +249,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
                        q->properties.doorbell_off);
                pr_err("MES might be in unrecoverable state, issue a GPU 
reset\n");
                kfd_hws_hang(dqm);
-}
+       }

return r;

  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index ba9d69054119..139c6b58bb7e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -123,7 +123,7 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned 
int qid,
        if (!gws && pdd->qpd.num_gws == 0)
                return -EINVAL;

- if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3)) {

+       if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 3) && 
!dev->kfd->shared_resources.enable_mes) {
                if (gws)
                        ret = 
amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
                                gws, &mem);
@@ -136,7 +136,8 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned 
int qid,
        } else {
                /*
                 * Intentionally set GWS to a non-NULL value
-                * for GFX 9.4.3.
+                * for devices that do not require GWS but require the formality
+                * of setting GWS for cooperative groups.

Change "devices that do not require GWS" to "devices that do not use GWSfor global wave synchronization".

                 */
                pqn->q->gws = gws ? ERR_PTR(-ENOMEM) : NULL;
        }
@@ -173,7 +174,8 @@ void pqm_uninit(struct process_queue_manager *pqm)

list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {

                if (pqn->q && pqn->q->gws &&
-                   KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3))
+                   KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) &&
+                               
!pqn->q->device->kfd->shared_resources.enable_mes)


The indentation looks off here.

                        
amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
                                pqn->q->gws);
                kfd_procfs_del_queue(pqn->q);
@@ -455,7 +457,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
unsigned int qid)
                }

if (pqn->q->gws) {

-                       if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 
3))
+                       if (KFD_GC_VERSION(pqn->q->device) != IP_VERSION(9, 4, 3) 
&&
+                                       !dev->kfd->shared_resources.enable_mes)


Indentation.

Regards,
  Felix

                                amdgpu_amdkfd_remove_gws_from_process(
                                                pqm->process->kgd_process_info,
                                                pqn->q->gws);
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h 
b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index 0997e999416a..b1db2b190187 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -275,7 +275,9 @@ union MESAPI__ADD_QUEUE {
                        uint32_t trap_en                : 1;
                        uint32_t is_aql_queue           : 1;
                        uint32_t skip_process_ctx_clear : 1;
-                       uint32_t reserved               : 19;
+                       uint32_t map_legacy_kq          : 1;
+                       uint32_t exclusively_scheduled  : 1;
+                       uint32_t reserved               : 17;
                };
                struct MES_API_STATUS           api_status;
                uint64_t                        tma_addr;

Re: [PATCH 2/2] drm/amdkfd: enable cooperative groups for gfx11

Reply via email to