Add support for dynamic Compute Unit (CU) mask modification to AMDGPU user queues via a new MODIFY_CU_MASK operation. This enables userspace to update CU allocation for existing queues at runtime.
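For reference, userspace would drive the new op roughly as follows (a
minimal sketch, assuming an open DRM render-node fd and a queue_id
returned by a prior AMDGPU_USERQ_OP_CREATE; the helper name is
illustrative and error handling is trimmed):

  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <drm/amdgpu_drm.h>

  static int set_queue_cu_mask(int fd, uint32_t queue_id)
  {
      /* Enable the first 32 CUs. On WGP-based ASICs (gfx10+) CUs must
       * be enabled in adjacent pairs, which an all-ones word satisfies;
       * the all-zero second word is legal padding.
       */
      uint32_t cu_mask[2] = { 0xffffffff, 0x0 };
      union drm_amdgpu_userq args;

      memset(&args, 0, sizeof(args));
      args.in.op = AMDGPU_USERQ_OP_MODIFY_CU_MASK;
      args.in.queue_id = queue_id;
      args.in.cu_mask_ptr = (uintptr_t)cu_mask;
      args.in.cu_mask_count = 64; /* bits; must be a non-zero multiple of 32 */

      return ioctl(fd, DRM_IOCTL_AMDGPU_USERQ, &args);
  }

The kernel clamps masks larger than 1024 bits and, on gfx10 and newer,
rejects masks that enable only one CU of a WGP pair.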
v2: add a new op for AMDGPU_USERQ (AMDGPU_USERQ_OP_MODIFY_CU_MASK).

Suggested-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 117 ++++++++++++++++++++++
 include/uapi/drm/amdgpu_drm.h             |  13 +++
 2 files changed, 130 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 256ceca6d429..4d7841f47dd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -901,6 +901,119 @@ bool amdgpu_userq_enabled(struct drm_device *dev)
 	return false;
 }
 
+static int amdgpu_userq_update_queue(struct amdgpu_usermode_queue *queue)
+{
+	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const struct amdgpu_userq_funcs *uq_funcs;
+	bool unmap_queue = false;
+	int r;
+
+	uq_funcs = adev->userq_funcs[queue->queue_type];
+	if (!uq_funcs || (queue->queue_type != AMDGPU_HW_IP_COMPUTE))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Unmap the queue if it is mapped or preempted to ensure a clean
+	 * update. If the queue is already unmapped or hung, skip this step.
+	 */
+	if (queue->state == AMDGPU_USERQ_STATE_MAPPED ||
+	    queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
+		r = amdgpu_userq_unmap_helper(queue);
+		if (r)
+			return r;
+		unmap_queue = true;
+	}
+
+	r = uq_funcs->mqd_update(queue);
+
+	if (unmap_queue) {
+		int ret = amdgpu_userq_map_helper(queue);
+
+		if (ret) {
+			drm_file_err(uq_mgr->file, "Failed to remap queue %llu after update\n",
+				     queue->doorbell_index);
+			/* Don't let a remap failure mask an update error. */
+			if (!r)
+				r = ret;
+		}
+	}
+
+	return r;
+}
+
+static int amdgpu_userq_set_cu_mask(struct drm_file *filp, union drm_amdgpu_userq *args)
+{
+	struct amdgpu_fpriv *fpriv = filp->driver_priv;
+	struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
+	struct amdgpu_device *adev = uq_mgr->adev;
+	struct amdgpu_usermode_queue *queue;
+	struct amdgpu_mqd_prop *props;
+	const int max_num_cus = 1024;
+	size_t cu_mask_size;
+	uint32_t count;
+	uint32_t *ptr;
+	int r;
+
+	mutex_lock(&uq_mgr->userq_mutex);
+	queue = amdgpu_userq_find(uq_mgr, args->in.queue_id);
+	if (!queue) {
+		mutex_unlock(&uq_mgr->userq_mutex);
+		return -EINVAL;
+	}
+	props = queue->userq_prop;
+
+	if (args->in.cu_mask_count == 0 || args->in.cu_mask_count % 32) {
+		r = -EINVAL;
+		goto unlock;
+	}
+
+	count = args->in.cu_mask_count;
+	/*
+	 * To prevent an unreasonably large CU mask size, set an arbitrary
+	 * limit of max_num_cus bits; any bits beyond it are dropped and
+	 * only the first max_num_cus bits are used.
+	 */
+	if (count > max_num_cus) {
+		drm_file_err(uq_mgr->file, "CU mask cannot be greater than 1024 bits");
+		count = max_num_cus;
+	}
+	cu_mask_size = sizeof(uint32_t) * (count / 32);
+
+	ptr = memdup_user(u64_to_user_ptr(args->in.cu_mask_ptr), cu_mask_size);
+	if (IS_ERR(ptr)) {
+		r = PTR_ERR(ptr);
+		goto unlock;
+	}
+
+	/* ASICs that have WGPs must enforce pairwise enabled mask checks. */
+	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(10, 0, 0)) {
+		for (int i = 0; i < count; i += 2) {
+			uint32_t cu_pair = (ptr[i / 32] >> (i % 32)) & 0x3;
+
+			if (cu_pair && cu_pair != 0x3) {
+				drm_file_err(uq_mgr->file, "CUs must be enabled in adjacent pairs\n");
+				kfree(ptr);
+				r = -EINVAL;
+				goto unlock;
+			}
+		}
+	}
+
+	props->cu_mask = ptr;
+	props->cu_mask_count = count;
+	r = amdgpu_userq_update_queue(queue);
+
+	/* The MQD update has consumed the mask; don't leave a stale pointer. */
+	props->cu_mask = NULL;
+	props->cu_mask_count = 0;
+	kfree(ptr);
+unlock:
+	mutex_unlock(&uq_mgr->userq_mutex);
+
+	return r;
+}
+
 int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *filp)
 {
@@ -920,6 +1033,10 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
 			drm_file_err(filp, "Failed to create usermode queue\n");
 		break;
 
+	case AMDGPU_USERQ_OP_MODIFY_CU_MASK:
+		r = amdgpu_userq_set_cu_mask(filp, args);
+		break;
+
 	case AMDGPU_USERQ_OP_FREE:
 		r = amdgpu_userq_destroy(filp, args->in.queue_id);
 		if (r)
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index ab2bf47553e1..cfc3a9313229 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -330,6 +330,7 @@ union drm_amdgpu_ctx {
 /* user queue IOCTL operations */
 #define AMDGPU_USERQ_OP_CREATE	1
 #define AMDGPU_USERQ_OP_FREE	2
+#define AMDGPU_USERQ_OP_MODIFY_CU_MASK	3
 
 /* queue priority levels */
 /* low < normal low < normal high < high */
@@ -410,6 +411,18 @@ struct drm_amdgpu_userq_in {
 	 * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11).
 	 */
 	__u64 mqd_size;
+	/**
+	 * @cu_mask_ptr: User-space pointer to a CU (Compute Unit) mask
+	 * array: __u32 values whose bits select which CUs are enabled
+	 * for this queue (0 = disabled, 1 = enabled, one bit per CU).
+	 */
+	__u64 cu_mask_ptr;
+	/**
+	 * @cu_mask_count: Size of the CU mask in bits; must be a non-zero
+	 * multiple of 32. Each __u32 element of the cu_mask_ptr array
+	 * covers 32 CUs.
+	 */
+	__u32 cu_mask_count;
 };
 
 /* The structure to carry output of userqueue ops */
-- 
2.49.0
