Add support for dynamic Compute Unit (CU) mask modification to
AMDGPU user queues via a new MODIFY_CU_MASK operation. This enables
userspace to update CU allocation for existing queues at runtime.

v2: add a new op for AMDGPU_USERQ: AMDGPU_USERQ_OP_MODIFY_CU_MASK

Suggested-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 111 ++++++++++++++++++++++
 include/uapi/drm/amdgpu_drm.h             |  13 +++
 2 files changed, 124 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 256ceca6d429..4d7841f47dd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -901,6 +901,113 @@ bool amdgpu_userq_enabled(struct drm_device *dev)
        return false;
 }
 
+/**
+ * amdgpu_userq_update_queue - re-write an existing user queue's MQD
+ * @queue: user queue whose MQD properties should be pushed to the hardware
+ *
+ * Only compute queues are supported.  A queue that is currently mapped or
+ * preempted is unmapped first so the MQD update happens on a quiescent
+ * queue, and remapped afterwards.  Unmapped or hung queues are updated in
+ * place without the unmap/remap round trip.
+ *
+ * Returns 0 on success or a negative error code.  A failure of mqd_update()
+ * is reported even when the subsequent remap succeeds.
+ */
+static int amdgpu_userq_update_queue(struct amdgpu_usermode_queue *queue)
+{
+       struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+       struct amdgpu_device *adev = uq_mgr->adev;
+       const struct amdgpu_userq_funcs *uq_funcs;
+       bool unmap_queue = false;
+       int r;
+
+       uq_funcs = adev->userq_funcs[queue->queue_type];
+       if (!uq_funcs || (queue->queue_type != AMDGPU_HW_IP_COMPUTE))
+               return -EOPNOTSUPP;
+
+       /*
+        * Unmap the queue if it's mapped or preempted to ensure a clean update.
+        * If the queue is already unmapped or hung, we skip this step.
+        */
+       if (queue->state == AMDGPU_USERQ_STATE_MAPPED ||
+           queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
+               r = amdgpu_userq_unmap_helper(queue);
+               if (r)
+                       return r;
+               unmap_queue = true;
+       }
+
+       r = uq_funcs->mqd_update(queue);
+
+       if (unmap_queue) {
+               int map_r = amdgpu_userq_map_helper(queue);
+
+               if (map_r) {
+                       drm_file_err(uq_mgr->file,
+                                    "Failed to remap queue %llu after update\n",
+                                    queue->doorbell_index);
+                       /* Don't let a remap failure mask the update error. */
+                       if (!r)
+                               r = map_r;
+               }
+       }
+
+       return r;
+}
+
+/**
+ * amdgpu_userq_set_cu_mask - apply a userspace-supplied CU mask to a queue
+ * @filp: DRM file the queue belongs to
+ * @args: ioctl arguments; uses in.queue_id, in.cu_mask_ptr and in.cu_mask_count
+ *
+ * Copies the CU bitmask from userspace, validates it (the bit count must be
+ * a non-zero multiple of 32 and is capped at 1024 bits; ASICs with WGPs
+ * require CUs to be enabled in adjacent pairs) and re-writes the queue's
+ * MQD with the new mask via amdgpu_userq_update_queue().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int amdgpu_userq_set_cu_mask(struct drm_file *filp, union drm_amdgpu_userq *args)
+{
+       struct amdgpu_fpriv *fpriv = filp->driver_priv;
+       struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
+       struct amdgpu_device *adev = uq_mgr->adev;
+       struct amdgpu_usermode_queue *queue;
+       struct amdgpu_mqd_prop *props;
+       const uint32_t max_num_cus = 1024;
+       size_t cu_mask_size;
+       uint32_t count;
+       uint32_t *ptr;
+       int r;
+
+       mutex_lock(&uq_mgr->userq_mutex);
+       queue = amdgpu_userq_find(uq_mgr, args->in.queue_id);
+       if (!queue) {
+               mutex_unlock(&uq_mgr->userq_mutex);
+               return -EINVAL;
+       }
+       props = queue->userq_prop;
+
+       /* The mask is supplied in whole 32-bit words. */
+       if (args->in.cu_mask_count == 0 || args->in.cu_mask_count % 32) {
+               r = -EINVAL;
+               goto unlock;
+       }
+
+       /*
+        * To prevent an unreasonably large CU mask size, set an arbitrary
+        * limit of max_num_cus bits.  Any CU mask bits past max_num_cus are
+        * dropped and only the first max_num_cus bits are used.
+        */
+       count = args->in.cu_mask_count;
+       if (count > max_num_cus) {
+               drm_file_err(uq_mgr->file, "CU mask cannot be greater than 1024 bits");
+               count = max_num_cus;
+       }
+       cu_mask_size = sizeof(uint32_t) * (count / 32);
+
+       /* memdup_user() returns an ERR_PTR() on failure, never NULL. */
+       ptr = memdup_user(u64_to_user_ptr(args->in.cu_mask_ptr), cu_mask_size);
+       if (IS_ERR(ptr)) {
+               r = PTR_ERR(ptr);
+               goto unlock;
+       }
+
+       /* ASICs that have WGPs must enforce pairwise enabled mask checks. */
+       if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(10, 0, 0)) {
+               uint32_t i;
+
+               for (i = 0; i < count; i += 2) {
+                       uint32_t cu_pair = (ptr[i / 32] >> (i % 32)) & 0x3;
+
+                       if (cu_pair && cu_pair != 0x3) {
+                               drm_file_err(uq_mgr->file, "CUs must be adjacent pairwise enabled.\n");
+                               r = -EINVAL;
+                               goto free_mask;
+                       }
+               }
+       }
+
+       props->cu_mask = ptr;
+       props->cu_mask_count = count;
+       r = amdgpu_userq_update_queue(queue);
+
+       /*
+        * The mask is consumed while re-writing the MQD; clear the reference
+        * before freeing so props->cu_mask never dangles.  NOTE(review):
+        * confirm mqd_update() does not retain the pointer past this call.
+        */
+       props->cu_mask = NULL;
+       props->cu_mask_count = 0;
+free_mask:
+       kfree(ptr);
+unlock:
+       mutex_unlock(&uq_mgr->userq_mutex);
+
+       return r;
+}
+
 int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
                       struct drm_file *filp)
 {
@@ -920,6 +1027,10 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
                        drm_file_err(filp, "Failed to create usermode queue\n");
                break;
 
+       case AMDGPU_USERQ_OP_MODIFY_CU_MASK:
+               /* Propagate the result; sibling ops all assign r. */
+               r = amdgpu_userq_set_cu_mask(filp, args);
+               if (r)
+                       drm_file_err(filp, "Failed to modify queue CU mask\n");
+               break;
+
        case AMDGPU_USERQ_OP_FREE:
                r = amdgpu_userq_destroy(filp, args->in.queue_id);
                if (r)
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index ab2bf47553e1..cfc3a9313229 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -330,6 +330,7 @@ union drm_amdgpu_ctx {
 /* user queue IOCTL operations */
 #define AMDGPU_USERQ_OP_CREATE 1
 #define AMDGPU_USERQ_OP_FREE   2
+#define AMDGPU_USERQ_OP_MODIFY_CU_MASK 3
 
 /* queue priority levels */
 /* low < normal low < normal high < high */
@@ -410,6 +411,18 @@ struct drm_amdgpu_userq_in {
         * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11).
         */
        __u64 mqd_size;
+       /**
+        * @cu_mask_ptr: User-space pointer to the CU (Compute Unit) mask.
+        * Points to an array of __u32 bitmask words; each bit selects one
+        * CU/WGP (0 = disabled, 1 = enabled).
+        */
+       __u64 cu_mask_ptr;
+       /**
+        * @cu_mask_count: Size of the CU mask in bits.
+        * Must be a non-zero multiple of 32 (each __u32 word pointed to by
+        * @cu_mask_ptr covers 32 CUs/WGPs).  The kernel uses at most the
+        * first 1024 bits; anything beyond that is ignored.
+        */
+       __u32 cu_mask_count;
 };
 
 /* The structure to carry output of userqueue ops */
-- 
2.49.0

Reply via email to