On Wed, Sep 10, 2025 at 4:16 AM Jesse.Zhang <[email protected]> wrote:
>
> Use the suspend and resume API rather than the remove queue
> and add queue API. The former just preempts the queue,
> while the latter removes it from the scheduler completely.
> There is no need to do that; we only need preemption
> in this case.
>
> v2: replace queue_active with queue state
> v3: set the suspend_fence_addr
> v4: allocate another per-queue buffer for the suspend fence, set the
> sequence number, and wait for the suspend fence (Alex)
> v5: use a wb slot (Alex)
> v6: change the timeout to the MES default of 2100000 us (2100 ms) (Alex)
>
> Reviewed-by: Alex Deucher <[email protected]>
> Signed-off-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
Feel free to apply patches 1, 2.
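For anyone wiring the new hooks up from the eviction path, the caller
side would look roughly like the sketch below. This is illustrative
only: amdgpu_userq_preempt_one() is a made-up name, and the
adev->userq_funcs[] lookup and the PREEMPTED state transition are
assumptions about the surrounding userq code, not something this patch
adds.

/* Illustrative sketch only, not part of this patch. */
static int amdgpu_userq_preempt_one(struct amdgpu_userq_mgr *uq_mgr,
                                    struct amdgpu_usermode_queue *queue)
{
        /* Per-IP hook table; MES-managed queues use userq_mes_funcs. */
        const struct amdgpu_userq_funcs *funcs =
                uq_mgr->adev->userq_funcs[queue->queue_type];
        int r;

        r = funcs->preempt(uq_mgr, queue);
        if (!r)
                /* The queue stays on the scheduler; only its state changes. */
                queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
        return r;
}

Restoring would be symmetric: call funcs->restore() and move the state
back to MAPPED, with no remove/add round trip through MES.
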
Alex
> ---
> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 76 ++++++++++++++++++++++
> 1 file changed, 76 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> index 3a4fd6de08ce..8e771ca67d43 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
> @@ -404,10 +404,86 @@ mes_userq_mqd_destroy(struct amdgpu_userq_mgr *uq_mgr,
> amdgpu_userq_destroy_object(uq_mgr, &queue->mqd);
> }
>
> +static int mes_userq_preempt(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + struct mes_suspend_gang_input queue_input;
> + struct amdgpu_userq_obj *ctx = &queue->fw_obj;
> + signed long timeout = 2100000; /* 2100 ms */
> + u64 fence_gpu_addr;
> + u32 fence_offset;
> + u64 *fence_ptr;
> + int i, r;
> +
> + if (queue->state != AMDGPU_USERQ_STATE_MAPPED)
> + return 0;
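> +
> + /* Use a wb slot as the suspend fence MES writes once preemption completes. */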
> + r = amdgpu_device_wb_get(adev, &fence_offset);
> + if (r)
> + return r;
> +
> + fence_gpu_addr = adev->wb.gpu_addr + (fence_offset * 4);
> + fence_ptr = (u64 *)&adev->wb.wb[fence_offset];
> + *fence_ptr = 0;
> +
> + memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input));
> + queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
> + queue_input.suspend_fence_addr = fence_gpu_addr;
> + queue_input.suspend_fence_value = 1;
> + amdgpu_mes_lock(&adev->mes);
> + r = adev->mes.funcs->suspend_gang(&adev->mes, &queue_input);
> + amdgpu_mes_unlock(&adev->mes);
> + if (r) {
> + dev_err(adev->dev, "Failed to suspend gang: %d\n", r);
> + goto out;
> + }
> +
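> + /* Busy-wait for MES to signal the suspend fence. */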
> + for (i = 0; i < timeout; i++) {
> + if (*fence_ptr == 1)
> + goto out;
> + udelay(1);
> + }
> + r = -ETIMEDOUT;
> +
> +out:
> + amdgpu_device_wb_free(adev, fence_offset);
> + return r;
> +}
> +
> +static int mes_userq_restore(struct amdgpu_userq_mgr *uq_mgr,
> + struct amdgpu_usermode_queue *queue)
> +{
> + struct amdgpu_device *adev = uq_mgr->adev;
> + struct mes_resume_gang_input queue_input;
> + struct amdgpu_userq_obj *ctx = &queue->fw_obj;
> + int r;
> +
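> + /* A hung queue cannot be safely resumed; it needs a reset. */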
> + if (queue->state == AMDGPU_USERQ_STATE_HUNG)
> + return -EINVAL;
> + if (queue->state != AMDGPU_USERQ_STATE_PREEMPTED)
> + return 0;
> +
> + memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input));
> + queue_input.gang_context_addr = ctx->gpu_addr + AMDGPU_USERQ_PROC_CTX_SZ;
> +
> + amdgpu_mes_lock(&adev->mes);
> + r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input);
> + amdgpu_mes_unlock(&adev->mes);
> + if (r)
> + dev_err(adev->dev, "Failed to resume queue, err (%d)\n", r);
> + return r;
> +}
> +
> const struct amdgpu_userq_funcs userq_mes_funcs = {
> .mqd_create = mes_userq_mqd_create,
> .mqd_destroy = mes_userq_mqd_destroy,
> .unmap = mes_userq_unmap,
> .map = mes_userq_map,
> .detect_and_reset = mes_userq_detect_and_reset,
> + .preempt = mes_userq_preempt,
> + .restore = mes_userq_restore,
> };
> --
> 2.49.0
>