amdxdna: Add command execution

Lizhi Hou Fri, 19 Jul 2024 10:51:40 -0700

Add interfaces for user applications to submit commands and wait for
their completion.


Co-developed-by: Min Ma <min...@amd.com>
Signed-off-by: Min Ma <min...@amd.com>
Signed-off-by: Lizhi Hou <lizhi....@amd.com>
---
 drivers/accel/amdxdna/aie2_ctx.c     | 618 ++++++++++++++++++++++++++-
 drivers/accel/amdxdna/aie2_message.c | 341 +++++++++++++++
 drivers/accel/amdxdna/aie2_pci.c     |   3 +
 drivers/accel/amdxdna/aie2_pci.h     |  34 ++
 drivers/accel/amdxdna/amdxdna_ctx.c  | 332 +++++++++++++-
 drivers/accel/amdxdna/amdxdna_ctx.h  | 153 +++++++
 drivers/accel/amdxdna/amdxdna_drm.c  |   5 +
 drivers/accel/amdxdna/amdxdna_drm.h  |   4 +
 include/trace/events/amdxdna.h       |  41 ++
 include/uapi/drm/amdxdna_accel.h     |  59 +++
 10 files changed, 1581 insertions(+), 9 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 435af66b2dd5..bec3ed9756f0 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -3,9 +3,367 @@
  * Copyright (C) 2024, Advanced Micro Devices, Inc.
  */
 
+#include <trace/events/amdxdna.h>
+
+#include "aie2_msg_priv.h"
 #include "aie2_pci.h"
 #include "aie2_solver.h"
 
+bool force_cmdlist; /* NOTE(review): not static; confirm whether another file needs this symbol */
+module_param(force_cmdlist, bool, 0600);
+MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
+
+#define HWCTX_MAX_TIMEOUT      60000 /* milliseconds */
+
+static inline int /* Park @job in the pending ring slot for the next sequence number */
+aie2_hwctx_add_job(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job)
+{
+       struct amdxdna_sched_job *other;
+       int idx;
+
+       idx = get_job_idx(hwctx->priv->seq);
+       /* When the pending ring is full, hwctx->priv->seq maps to the oldest entry */
+       other = hwctx->priv->pending[idx];
+       if (other && other->fence) /* previous occupant's fence has not been cleared yet */
+               return -EAGAIN;
+
+       if (other) { /* stale occupant: drop its out_fence and job references */
+               dma_fence_put(other->out_fence);
+               amdxdna_job_put(other);
+       }
+
+       hwctx->priv->pending[idx] = job;
+       job->seq = hwctx->priv->seq++;
+       kref_get(&job->refcnt); /* reference held on behalf of the pending[] slot */
+
+       return 0;
+}
+
+static inline struct amdxdna_sched_job * /* Look up the pending[] entry for @seq */
+aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
+{
+       int idx;
+
+       /* Special sequence number: use the slot at priv->seq (oldest entry, if any) */
+       if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
+               idx = get_job_idx(hwctx->priv->seq);
+               goto out;
+       }
+
+       if (seq >= hwctx->priv->seq) /* sequence number not handed out yet */
+               return ERR_PTR(-EINVAL);
+
+       if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq) /* slot has since been reused */
+               return NULL;
+
+       idx = get_job_idx(seq);
+
+out:
+       return hwctx->priv->pending[idx]; /* NULL when the slot was never filled */
+}
+
+/* Stop the hwctx scheduler, then destroy its context. bad_job is the timed-out job when called from aie2_sched_job_timedout(); pass NULL otherwise */
+static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx 
*hwctx,
+                           struct drm_sched_job *bad_job)
+{
+       drm_sched_stop(&hwctx->priv->sched, bad_job);
+       aie2_destroy_context(xdna->dev_handle, hwctx); /* return value is ignored */
+}
+
+static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx 
*hwctx)
+{ /* Re-create the context, re-map the heap, re-apply CU config, then restart the scheduler */
+       struct amdxdna_gem_obj *heap = hwctx->priv->heap;
+       int ret;
+
+       ret = aie2_create_context(xdna->dev_handle, hwctx);
+       if (ret) {
+               XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
+               goto out;
+       }
+
+       ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
+                               heap->mem.userptr, heap->mem.size);
+       if (ret) {
+               XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
+               goto out;
+       }
+
+       if (hwctx->status != HWCTX_STAT_READY) { /* skip CU config; ret is 0 at this point */
+               XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
+               goto out;
+       }
+
+       ret = aie2_config_cu(hwctx);
+       if (ret) {
+               XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
+               goto out;
+       }
+
+out:
+       drm_sched_start(&hwctx->priv->sched, true); /* reached on success and on every failure path */
+       XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
+       return ret;
+}
+
+void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
+{ /* Stop every hwctx of @client whose column map intersects @col_map */
+       struct amdxdna_dev *xdna = client->xdna;
+       struct amdxdna_hwctx *hwctx;
+       int next = 0;
+
+       drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); /* warns if dev_lock is not held */
+       mutex_lock(&client->hwctx_lock);
+       idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
+               /* check if the HW context uses the error column */
+               if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
+                       continue;
+
+               aie2_hwctx_stop(xdna, hwctx, NULL);
+               hwctx->old_status = hwctx->status; /* saved; aie2_restart_ctx() restores it */
+               hwctx->status = HWCTX_STAT_STOP;
+               XDNA_DBG(xdna, "Stop %s", hwctx->name);
+       }
+       mutex_unlock(&client->hwctx_lock);
+}
+
+void aie2_restart_ctx(struct amdxdna_client *client)
+{ /* Restart every hwctx of @client currently marked HWCTX_STAT_STOP */
+       struct amdxdna_dev *xdna = client->xdna;
+       struct amdxdna_hwctx *hwctx;
+       int next = 0;
+
+       drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); /* warns if dev_lock is not held */
+       mutex_lock(&client->hwctx_lock);
+       idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
+               if (hwctx->status != HWCTX_STAT_STOP)
+                       continue;
+
+               hwctx->status = hwctx->old_status; /* restored first: aie2_hwctx_restart() tests status */
+               XDNA_DBG(xdna, "Resetting %s", hwctx->name);
+               aie2_hwctx_restart(xdna, hwctx); /* return value is ignored */
+       }
+       mutex_unlock(&client->hwctx_lock);
+}
+
+static int aie2_hwctx_wait_for_idle(struct amdxdna_hwctx *hwctx)
+{ /* Block until the most recently submitted job has had its fence cleared; always returns 0 */
+       struct amdxdna_sched_job *job;
+
+       mutex_lock(&hwctx->priv->io_lock);
+       if (!hwctx->priv->seq) { /* no job was ever submitted */
+               mutex_unlock(&hwctx->priv->io_lock);
+               return 0;
+       }
+
+       job = aie2_hwctx_get_job(hwctx, hwctx->priv->seq - 1); /* newest job */
+       if (IS_ERR_OR_NULL(job)) {
+               mutex_unlock(&hwctx->priv->io_lock);
+               XDNA_WARN(hwctx->client->xdna, "Corrupted pending list");
+               return 0;
+       }
+       mutex_unlock(&hwctx->priv->io_lock);
+
+       wait_event(hwctx->priv->job_free_wq, !job->fence); /* fence is set to NULL in aie2_sched_job_free() */
+
+       return 0;
+}
+
+static inline void /* Complete @job: signal its fence and drop what aie2_sched_job_run() acquired */
+aie2_sched_notify(struct amdxdna_sched_job *job)
+{
+       struct dma_fence *fence = job->fence;
+
+       job->hwctx->priv->completed++;
+       dma_fence_signal(fence);
+       trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", 
job->seq);
+       dma_fence_put(fence); /* pairs with dma_fence_get() in aie2_sched_job_run() */
+       mmput(job->mm); /* pairs with mmget_not_zero() in aie2_sched_job_run() */
+       amdxdna_job_put(job); /* pairs with kref_get() in aie2_sched_job_run() */
+}
+
+static int /* Mailbox callback for a single execbuf: record the result in the command BO, then complete the job */
+aie2_sched_resp_handler(void *handle, const u32 *data, size_t size)
+{
+       struct amdxdna_sched_job *job = handle;
+       struct amdxdna_gem_obj *cmd_abo;
+       int ret = 0; /* signed: carries a negative errno through the int return type */
+       u32 status;
+
+       cmd_abo = job->cmd_bo;
+
+       if (unlikely(!data)) /* no response payload: complete the job without touching its state */
+               goto out;
+
+       if (unlikely(size != sizeof(u32))) { /* response must be exactly one status word */
+               amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       status = *data;
+       XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
+       if (status == AIE2_STATUS_SUCCESS)
+               amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
+       else
+               amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR);
+
+out:
+       aie2_sched_notify(job); /* runs on every path, including malformed responses */
+       return ret;
+}
+
+static int /* Mailbox callback for jobs without a command BO: log the status and complete the job */
+aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size)
+{
+       struct amdxdna_sched_job *job = handle;
+       int ret = 0; /* signed: carries a negative errno through the int return type */
+       u32 status;
+
+       if (unlikely(!data)) /* no response payload: just complete the job */
+               goto out;
+
+       if (unlikely(size != sizeof(u32))) { /* response must be exactly one status word */
+               ret = -EINVAL;
+               goto out;
+       }
+
+       status = *data; /* only used for the debug message below */
+       XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
+
+out:
+       aie2_sched_notify(job); /* runs on every path, including malformed responses */
+       return ret;
+}
+
+static int /* Mailbox callback for command-list jobs: record status and failing index, then complete the job */
+aie2_sched_cmdlist_resp_handler(void *handle, const u32 *data, size_t size)
+{
+       struct amdxdna_sched_job *job = handle;
+       struct amdxdna_gem_obj *cmd_abo;
+       struct cmd_chain_resp *resp;
+       struct amdxdna_dev *xdna;
+       u32 fail_cmd_status;
+       u32 fail_cmd_idx;
+       int ret = 0; /* signed: carries a negative errno through the int return type */
+
+       cmd_abo = job->cmd_bo;
+       if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) { /* response must be three u32 words */
+               amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       resp = (struct cmd_chain_resp *)data;
+       xdna = job->hwctx->client->xdna;
+       XDNA_DBG(xdna, "Status 0x%x", resp->status);
+       if (resp->status == AIE2_STATUS_SUCCESS) {
+               amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
+               goto out;
+       }
+
+       /* Slow path to handle error, read from ringbuf on BAR */
+       fail_cmd_idx = resp->fail_cmd_idx;
+       fail_cmd_status = resp->fail_cmd_status;
+       XDNA_DBG(xdna, "Failed cmd idx %d, status 0x%x",
+                fail_cmd_idx, fail_cmd_status);
+
+       if (fail_cmd_status == AIE2_STATUS_SUCCESS) { /* overall failure but sub-command reports success */
+               amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT);
+               ret = -EINVAL;
+               goto out;
+       }
+       amdxdna_cmd_set_state(cmd_abo, fail_cmd_status); /* NOTE(review): firmware status used directly as command state */
+
+       if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) {
+               struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL);
+
+               cc->error_index = fail_cmd_idx;
+               if (cc->error_index >= cc->command_count) /* out-of-range index falls back to 0 */
+                       cc->error_index = 0;
+       }
+out:
+       aie2_sched_notify(job); /* runs on every path, including malformed responses */
+       return ret;
+}
+
+static struct dma_fence * /* drm_sched run_job hook: send the job's command to the device */
+aie2_sched_job_run(struct drm_sched_job *sched_job)
+{
+       struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
+       struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+       struct amdxdna_hwctx *hwctx = job->hwctx;
+       struct dma_fence *fence;
+       int ret;
+
+       if (!mmget_not_zero(job->mm)) /* submitter's mm is already gone */
+               return ERR_PTR(-ESRCH);
+
+       kref_get(&job->refcnt); /* job, fence and mm references are released by aie2_sched_notify() */
+       fence = dma_fence_get(job->fence);
+
+       if (unlikely(!cmd_abo)) { /* no command BO: this job is a BO sync request */
+               ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler);
+               goto out;
+       }
+
+       amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_NEW);
+
+       if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN)
+               ret = aie2_cmdlist_multi_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
+       else if (force_cmdlist)
+               ret = aie2_cmdlist_single_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler);
+       else
+               ret = aie2_execbuf(hwctx, job, aie2_sched_resp_handler);
+
+out:
+       if (ret) { /* send failed: no response handler will run, so undo the gets above here */
+               dma_fence_put(fence);
+               mmput(job->mm); /* released before the job reference so job is not touched after its put */
+               amdxdna_job_put(job);
+               fence = ERR_PTR(ret);
+       }
+       trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq);
+
+       return fence;
+}
+
+static void aie2_sched_job_free(struct drm_sched_job *sched_job)
+{ /* drm_sched free_job hook: clean up the scheduler job and wake waiters on job_free_wq */
+       struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
+       struct amdxdna_hwctx *hwctx = job->hwctx;
+
+       trace_xdna_job(sched_job, hwctx->name, "job free", job->seq);
+       drm_sched_job_cleanup(sched_job);
+       job->fence = NULL; /* tested by aie2_hwctx_add_job() and aie2_hwctx_wait_for_idle() */
+       amdxdna_job_put(job);
+
+       wake_up(&hwctx->priv->job_free_wq);
+}
+
+static enum drm_gpu_sched_stat /* drm_sched timedout_job hook: stop and restart the hwctx under dev_lock */
+aie2_sched_job_timedout(struct drm_sched_job *sched_job)
+{
+       struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
+       struct amdxdna_hwctx *hwctx = job->hwctx;
+       struct amdxdna_dev *xdna;
+
+       xdna = hwctx->client->xdna;
+       trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
+       mutex_lock(&xdna->dev_lock);
+       aie2_hwctx_stop(xdna, hwctx, sched_job); /* the timed-out job is passed as bad_job */
+
+       aie2_hwctx_restart(xdna, hwctx); /* return value is ignored */
+       mutex_unlock(&xdna->dev_lock);
+
+       return DRM_GPU_SCHED_STAT_NOMINAL;
+}
+
+const struct drm_sched_backend_ops sched_ops = { /* NOTE(review): non-static with a generic name; confirm it must be global */
+       .run_job = aie2_sched_job_run,
+       .free_job = aie2_sched_job_free,
+       .timedout_job = aie2_sched_job_timedout,
+};
+
 static int aie2_hwctx_col_list(struct amdxdna_hwctx *hwctx)
 {
        struct amdxdna_dev *xdna = hwctx->client->xdna;
@@ -119,9 +477,10 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
 {
        struct amdxdna_client *client = hwctx->client;
        struct amdxdna_dev *xdna = client->xdna;
+       struct drm_gpu_scheduler *sched;
        struct amdxdna_hwctx_priv *priv;
        struct amdxdna_gem_obj *heap;
-       int ret;
+       int i, ret;
 
        priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL);
        if (!priv)
@@ -146,10 +505,48 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
                goto put_heap;
        }
 
+       for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
+               struct amdxdna_gem_obj *abo;
+               struct amdxdna_drm_create_bo args = {
+                       .flags = 0,
+                       .type = AMDXDNA_BO_DEV,
+                       .vaddr = 0,
+                       .size = MAX_CHAIN_CMDBUF_SIZE,
+               };
+
+               abo = amdxdna_drm_alloc_dev_bo(&xdna->ddev, &args, 
client->filp, true);
+               if (IS_ERR(abo)) {
+                       ret = PTR_ERR(abo);
+                       goto free_cmd_bufs;
+               }
+
+               XDNA_DBG(xdna, "Command buf %d addr 0x%llx size 0x%lx",
+                        i, abo->mem.dev_addr, abo->mem.size);
+               priv->cmd_buf[i] = abo;
+       }
+
+       sched = &priv->sched;
+       mutex_init(&priv->io_lock);
+       ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT,
+                            HWCTX_MAX_CMDS, 0, 
msecs_to_jiffies(HWCTX_MAX_TIMEOUT),
+                            NULL, NULL, hwctx->name, xdna->ddev.dev);
+       if (ret) {
+               XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret);
+               goto free_cmd_bufs;
+       }
+
+       ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL,
+                                   &sched, 1, NULL);
+       if (ret) {
+               XDNA_ERR(xdna, "Failed to initial sched entiry. ret %d", ret);
+               goto free_sched;
+       }
+       init_waitqueue_head(&priv->job_free_wq);
+
        ret = aie2_hwctx_col_list(hwctx);
        if (ret) {
                XDNA_ERR(xdna, "Create col list failed, ret %d", ret);
-               goto unpin;
+               goto free_entity;
        }
 
        ret = aie2_alloc_resource(hwctx);
@@ -174,7 +571,16 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
        aie2_release_resource(hwctx);
 free_col_list:
        kfree(hwctx->col_list);
-unpin:
+free_entity:
+       drm_sched_entity_destroy(&priv->entity);
+free_sched:
+       drm_sched_fini(&priv->sched);
+free_cmd_bufs:
+       for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
+               if (!priv->cmd_buf[i])
+                       continue;
+               drm_gem_object_put(to_gobj(priv->cmd_buf[i]));
+       }
        amdxdna_gem_unpin(heap);
 put_heap:
        drm_gem_object_put(to_gobj(heap));
@@ -185,14 +591,43 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
 
 void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx)
 {
+       struct amdxdna_sched_job *job;
        struct amdxdna_dev *xdna;
+       int idx;
 
        xdna = hwctx->client->xdna;
+       drm_sched_wqueue_stop(&hwctx->priv->sched);
+
+       /* Now, scheduler will not send command to device. */
        aie2_release_resource(hwctx);
 
+       /*
+        * All submitted commands are aborted.
+        * Restart scheduler queues to cleanup jobs. The amdxdna_sched_job_run()
+        * will return NODEV if it is called.
+        */
+       drm_sched_wqueue_start(&hwctx->priv->sched);
+
+       aie2_hwctx_wait_for_idle(hwctx);
+       drm_sched_entity_destroy(&hwctx->priv->entity);
+       drm_sched_fini(&hwctx->priv->sched);
+
+       for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) {
+               job = hwctx->priv->pending[idx];
+               if (!job)
+                       continue;
+
+               dma_fence_put(job->out_fence);
+               amdxdna_job_put(job);
+       }
+       XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, 
hwctx->priv->seq);
+
+       for (idx = 0; idx < ARRAY_SIZE(hwctx->priv->cmd_buf); idx++)
+               drm_gem_object_put(to_gobj(hwctx->priv->cmd_buf[idx]));
        amdxdna_gem_unpin(hwctx->priv->heap);
        drm_gem_object_put(to_gobj(hwctx->priv->heap));
 
+       mutex_destroy(&hwctx->priv->io_lock);
        kfree(hwctx->col_list);
        kfree(hwctx->priv);
        kfree(hwctx->cus);
@@ -259,3 +694,180 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 
type, u64 value, void *bu
                return -EOPNOTSUPP;
        }
 }
+
+static int aie2_populate_range(struct amdxdna_gem_obj *abo)
+{ /* Fault in @abo's user pages with hmm_range_fault() and clear mem.map_invalid on success */
+       struct mm_struct *mm = abo->mem.notifier.mm;
+       struct hmm_range range = { 0 };
+       unsigned long timeout;
+       int ret;
+
+       range.notifier = &abo->mem.notifier;
+       range.start = abo->mem.userptr;
+       range.end = abo->mem.userptr + abo->mem.size;
+       range.hmm_pfns = abo->mem.pfns;
+       range.default_flags = HMM_PFN_REQ_FAULT;
+
+       if (!mmget_not_zero(mm)) /* owning mm is already gone */
+               return -EFAULT;
+
+       timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+again:
+       range.notifier_seq = mmu_interval_read_begin(&abo->mem.notifier);
+       mmap_read_lock(mm);
+       ret = hmm_range_fault(&range);
+       mmap_read_unlock(mm);
+       if (ret) {
+               if (time_after(jiffies, timeout)) { /* past the deadline any error is reported as -ETIME */
+                       ret = -ETIME;
+                       goto put_mm;
+               }
+
+               if (ret == -EBUSY) /* only -EBUSY is retried */
+                       goto again;
+
+               goto put_mm;
+       }
+
+       dma_resv_lock(to_gobj(abo)->resv, NULL);
+       if (mmu_interval_read_retry(&abo->mem.notifier, range.notifier_seq)) {
+               dma_resv_unlock(to_gobj(abo)->resv);
+               goto again; /* NOTE(review): this retry does not check the deadline */
+       }
+       abo->mem.map_invalid = false; /* cleared under the reservation lock, ret is 0 here */
+       dma_resv_unlock(to_gobj(abo)->resv);
+
+put_mm:
+       mmput(mm);
+       return ret;
+}
+
+int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
+{ /* Arm @job, attach its fence to every BO, queue it on the hwctx; on success *seq is its sequence number */
+       struct amdxdna_dev *xdna = hwctx->client->xdna;
+       struct ww_acquire_ctx acquire_ctx;
+       struct amdxdna_gem_obj *abo;
+       unsigned long timeout = 0; /* 0 means the re-populate deadline has not been started */
+       int ret, i;
+
+       ret = drm_sched_job_init(&job->base, &hwctx->priv->entity, 1, hwctx);
+       if (ret) {
+               XDNA_ERR(xdna, "DRM job init failed, ret %d", ret);
+               return ret;
+       }
+
+       drm_sched_job_arm(&job->base);
+       job->out_fence = dma_fence_get(&job->base.s_fence->finished);
+
+retry:
+       ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+       if (ret) {
+               XDNA_WARN(xdna, "Failed to lock BOs, ret %d", ret);
+               goto put_fence;
+       }
+
+       for (i = 0; i < job->bo_cnt; i++) {
+               abo = to_xdna_obj(job->bos[i]);
+               if (abo->mem.map_invalid) { /* mapping invalidated: unlock, re-populate, start over */
+                       drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+                       if (!timeout) {
+                               timeout = jiffies +
+                                       msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+                       } else if (time_after(jiffies, timeout)) {
+                               ret = -ETIME;
+                               goto put_fence;
+                       }
+
+                       ret = aie2_populate_range(abo);
+                       if (ret)
+                               goto put_fence;
+                       goto retry;
+               }
+
+               ret = dma_resv_reserve_fences(job->bos[i]->resv, 1);
+               if (ret) {
+                       XDNA_WARN(xdna, "Failed to reserve fences %d", ret);
+                       drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+                       goto put_fence;
+               }
+       }
+
+       for (i = 0; i < job->bo_cnt; i++)
+               dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
+       drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+
+       mutex_lock(&hwctx->priv->io_lock); /* held across seq assignment and push to keep them in order */
+       ret = aie2_hwctx_add_job(hwctx, job);
+       if (ret) {
+               mutex_unlock(&hwctx->priv->io_lock);
+               goto signal_fence;
+       }
+
+       *seq = job->seq;
+       drm_sched_entity_push_job(&job->base);
+       mutex_unlock(&hwctx->priv->io_lock);
+
+       return 0;
+
+signal_fence:
+       dma_fence_signal(job->out_fence); /* fence was already added to the BOs, so signal it before dropping it */
+put_fence:
+       dma_fence_put(job->out_fence);
+       drm_sched_job_cleanup(&job->base);
+       return ret;
+}
+
+int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout)
+{ /* Wait for job @seq's out_fence; @timeout is in milliseconds, 0 waits with MAX_SCHEDULE_TIMEOUT */
+       signed long remaining = MAX_SCHEDULE_TIMEOUT;
+       struct amdxdna_sched_job *job;
+       struct dma_fence *out_fence;
+       long ret;
+
+       mutex_lock(&hwctx->priv->io_lock);
+       job = aie2_hwctx_get_job(hwctx, seq);
+       if (IS_ERR(job)) { /* @seq has not been assigned yet */
+               mutex_unlock(&hwctx->priv->io_lock);
+               ret = PTR_ERR(job);
+               goto out;
+       }
+
+       if (unlikely(!job)) { /* no entry left for @seq: report success */
+               mutex_unlock(&hwctx->priv->io_lock);
+               ret = 0;
+               goto out;
+       }
+       out_fence = dma_fence_get(job->out_fence); /* own reference, taken under io_lock */
+       mutex_unlock(&hwctx->priv->io_lock);
+
+       if (timeout)
+               remaining = msecs_to_jiffies(timeout);
+
+       ret = dma_fence_wait_timeout(out_fence, true, remaining);
+       if (!ret) /* 0 is mapped to -ETIME */
+               ret = -ETIME;
+       else if (ret > 0) /* positive results collapse to 0; negative errors pass through */
+               ret = 0;
+
+       dma_fence_put(out_fence);
+out:
+       return ret;
+}
+
+void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo,
+                        unsigned long cur_seq)
+{ /* Mark @abo's mapping invalid, record @cur_seq, and wait for the fences on its reservation object */
+       struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
+       struct drm_gem_object *gobj = to_gobj(abo);
+       long ret;
+
+       dma_resv_lock(gobj->resv, NULL);
+       abo->mem.map_invalid = true; /* checked by aie2_cmd_submit(), cleared by aie2_populate_range() */
+       mmu_interval_set_seq(&abo->mem.notifier, cur_seq);
+       ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP,
+                                   true, MAX_SCHEDULE_TIMEOUT); /* waits while the reservation lock is held */
+       dma_resv_unlock(gobj->resv);
+
+       if (!ret || ret == -ERESTARTSYS) /* failure is only logged; nothing is returned */
+               XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret);
+}
diff --git a/drivers/accel/amdxdna/aie2_message.c 
b/drivers/accel/amdxdna/aie2_message.c
index c99c8df937cc..fd9650db524f 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -348,3 +348,344 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx)
                 msg.opcode, resp.status, ret);
        return ret;
 }
+
+int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+                int (*notify_cb)(void *, const u32 *, size_t))
+{ /* Build an execute-buffer or DPU request from job->cmd_bo and send it on the hwctx mailbox channel */
+       struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+       struct amdxdna_dev *xdna = hwctx->client->xdna;
+       struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+       union {
+               struct execute_buffer_req ebuf;
+               struct exec_dpu_req dpu;
+       } req; /* NOTE(review): not zero-initialised; only some fields are written below */
+       struct xdna_mailbox_msg msg;
+       u32 payload_len;
+       void *payload;
+       int cu_idx;
+       int ret;
+       u32 op;
+
+       if (!chann) /* no mailbox channel on this hwctx */
+               return -ENODEV;
+
+       payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
+       if (!payload) {
+               XDNA_ERR(xdna, "Invalid command, cannot get payload");
+               return -EINVAL;
+       }
+
+       cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo);
+       if (cu_idx < 0) {
+               XDNA_DBG(xdna, "Invalid cu idx");
+               return -EINVAL;
+       }
+
+       op = amdxdna_cmd_get_op(cmd_abo);
+       switch (op) {
+       case ERT_START_CU:
+               if (unlikely(payload_len > sizeof(req.ebuf.payload))) /* NOTE(review): only logged, request is still sent */
+                       XDNA_DBG(xdna, "Invalid ebuf payload len: %d", 
payload_len);
+               req.ebuf.cu_idx = cu_idx;
+               memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload)); /* NOTE(review): copies the full field size regardless of payload_len */
+               msg.send_size = sizeof(req.ebuf);
+               msg.opcode = MSG_OP_EXECUTE_BUFFER_CF;
+               break;
+       case ERT_START_NPU: {
+               struct amdxdna_cmd_start_npu *sn = payload;
+
+               if (unlikely(payload_len - sizeof(*sn) > 
sizeof(req.dpu.payload))) /* NOTE(review): unsigned subtraction wraps if payload_len < sizeof(*sn); only logged */
+                       XDNA_DBG(xdna, "Invalid dpu payload len: %d", 
payload_len);
+               req.dpu.inst_buf_addr = sn->buffer;
+               req.dpu.inst_size = sn->buffer_size;
+               req.dpu.inst_prop_cnt = sn->prop_count;
+               req.dpu.cu_idx = cu_idx;
+               memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload)); /* NOTE(review): copies the full field size regardless of payload_len */
+               msg.send_size = sizeof(req.dpu);
+               msg.opcode = MSG_OP_EXEC_DPU;
+               break;
+       }
+       default:
+               XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op);
+               return -EINVAL;
+       }
+       msg.handle = job; /* handed back to notify_cb as its first argument — TODO confirm in mailbox code */
+       msg.notify_cb = notify_cb;
+       msg.send_data = (u8 *)&req;
+       print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req,
+                            0x40, false); /* NOTE(review): fixed 0x40-byte dump; confirm sizeof(req) >= 0x40 */
+
+       ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+       if (ret) {
+               XDNA_ERR(xdna, "Send message failed");
+               return ret;
+       }
+
+       return 0;
+}
+
+static inline int /* Write one execbuf-CF slot at @cmd_buf + @offset from command BO @abo; *size gets the slot size */
+aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset,
+                             struct amdxdna_gem_obj *abo, u32 *size)
+{
+       struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset;
+       int cu_idx = amdxdna_cmd_get_cu_idx(abo);
+       u32 payload_len;
+       void *payload;
+
+       if (cu_idx < 0)
+               return -EINVAL;
+
+       payload = amdxdna_cmd_get_payload(abo, &payload_len);
+       if (!payload)
+               return -EINVAL;
+
+       if (!slot_cf_has_space(offset, payload_len)) /* checked before anything is written to the slot */
+               return -ENOSPC;
+
+       buf->cu_idx = cu_idx;
+       buf->arg_cnt = payload_len / sizeof(u32); /* argument count in 32-bit words */
+       memcpy(buf->args, payload, payload_len);
+       /* Accurate buf size to hint firmware to do necessary copy */
+       *size = sizeof(*buf) + payload_len; /* assigned, not accumulated; *size is untouched on error */
+       return 0;
+}
+
+static inline int /* Write one DPU slot at @cmd_buf + @offset from command BO @abo; *size gets the slot size */
+aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset,
+                              struct amdxdna_gem_obj *abo, u32 *size)
+{
+       struct cmd_chain_slot_dpu *buf = cmd_buf + offset;
+       int cu_idx = amdxdna_cmd_get_cu_idx(abo);
+       struct amdxdna_cmd_start_npu *sn;
+       u32 payload_len;
+       void *payload;
+       u32 arg_sz;
+
+       if (cu_idx < 0)
+               return -EINVAL;
+
+       payload = amdxdna_cmd_get_payload(abo, &payload_len);
+       if (!payload)
+               return -EINVAL;
+       sn = payload;
+       arg_sz = payload_len - sizeof(*sn); /* may wrap; the payload_len test below rejects that case */
+       if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE)
+               return -EINVAL;
+
+       if (!slot_dpu_has_space(offset, arg_sz)) /* checked before anything is written to the slot */
+               return -ENOSPC;
+
+       buf->inst_buf_addr = sn->buffer;
+       buf->inst_size = sn->buffer_size;
+       buf->inst_prop_cnt = sn->prop_count;
+       buf->cu_idx = cu_idx;
+       buf->arg_cnt = arg_sz / sizeof(u32); /* argument count in 32-bit words */
+       memcpy(buf->args, sn->prop_args, arg_sz);
+
+       /* Accurate buf size to hint firmware to do necessary copy */
+       *size = sizeof(*buf) + arg_sz; /* assign like the CF variant; callers pass an uninitialised u32 */
+       return 0;
+}
+
+static inline int /* Fill one command-list slot, dispatching on @op; @abo's own op must equal @op */
+aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 
offset,
+                          struct amdxdna_gem_obj *abo, u32 *size)
+{
+       u32 this_op = amdxdna_cmd_get_op(abo);
+       void *cmd_buf = cmdbuf_abo->mem.kva;
+       int ret;
+
+       if (this_op != op) { /* sub-command's op differs from the expected one */
+               ret = -EINVAL;
+               goto done;
+       }
+
+       switch (op) {
+       case ERT_START_CU:
+               ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size);
+               break;
+       case ERT_START_NPU:
+               ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, 
size);
+               break;
+       default:
+               ret = -EOPNOTSUPP;
+       }
+
+done:
+       if (ret) { /* every failure is logged here, once */
+               XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret 
%d",
+                        op, ret);
+       }
+       return ret;
+}
+
+static inline struct amdxdna_gem_obj * /* Return the hwctx command buffer for @job's ring slot */
+aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
+{
+       int idx = get_job_idx(job->seq); /* same index as the job's pending[] slot */
+
+       return job->hwctx->priv->cmd_buf[idx];
+}
+
+static inline void /* Fill @req with the command buffer address/size/count and flush the buffer from CPU caches */
+aie2_cmdlist_prepare_request(struct cmd_chain_req *req,
+                            struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 
cnt)
+{
+       req->buf_addr = cmdbuf_abo->mem.dev_addr;
+       req->buf_size = size;
+       req->count = cnt;
+       drm_clflush_virt_range(cmdbuf_abo->mem.kva, size); /* flush only the @size bytes that were filled */
+       XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x 
count %d",
+                req->buf_addr, size, cnt);
+}
+
+static inline u32 /* Map an ERT command op to its command-chain mailbox opcode */
+aie2_cmd_op_to_msg_op(u32 op)
+{
+       switch (op) {
+       case ERT_START_CU:
+               return MSG_OP_CHAIN_EXEC_BUFFER_CF;
+       case ERT_START_NPU:
+               return MSG_OP_CHAIN_EXEC_DPU;
+       default:
+               return MSG_OP_MAX_OPCODE; /* callers treat this value as "unsupported" */
+       }
+}
+
+int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
+                              struct amdxdna_sched_job *job,
+                              int (*notify_cb)(void *, const u32 *, size_t))
+{ /* Pack every sub-command of an ERT_CMD_CHAIN job into the slot's command buffer and send one chain request */
+       struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
+       struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+       struct amdxdna_client *client = hwctx->client;
+       struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+       struct amdxdna_cmd_chain *payload;
+       struct xdna_mailbox_msg msg;
+       struct cmd_chain_req req;
+       u32 payload_len;
+       u32 offset = 0;
+       int ret;
+       u32 op;
+       u32 i;
+
+       op = amdxdna_cmd_get_op(cmd_abo);
+       payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len);
+       if (op != ERT_CMD_CHAIN || !payload ||
+           payload_len < struct_size(payload, data, payload->command_count))
+               return -EINVAL;
+
+       for (i = 0; i < payload->command_count; i++) {
+               u32 boh = (u32)(payload->data[i]);
+               struct amdxdna_gem_obj *abo;
+               u32 size = 0; /* per-slot size, zeroed so a callee can never read it uninitialised */
+
+               abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD);
+               if (!abo) {
+                       XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh);
+                       return -ENOENT;
+               }
+
+               /* All sub-cmd should have same op, use the first one. */
+               if (i == 0)
+                       op = amdxdna_cmd_get_op(abo);
+
+               ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size);
+               amdxdna_gem_put_obj(abo);
+               if (ret)
+                       return -EINVAL;
+
+               offset += size;
+       }
+
+       /* The offset is the accumulated total size of the cmd buffer */
+       aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count);
+
+       msg.opcode = aie2_cmd_op_to_msg_op(op);
+       if (msg.opcode == MSG_OP_MAX_OPCODE) /* also reached when command_count is 0 */
+               return -EOPNOTSUPP;
+       msg.handle = job;
+       msg.notify_cb = notify_cb;
+       msg.send_data = (u8 *)&req;
+       msg.send_size = sizeof(req);
+       ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+       if (ret) {
+               XDNA_ERR(hwctx->client->xdna, "Send message failed");
+               return ret;
+       }
+
+       return 0;
+}
+
+int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
+                               struct amdxdna_sched_job *job,
+                               int (*notify_cb)(void *, const u32 *, size_t))
+{ /* Send a single command through the command-list path as a one-entry chain */
+       struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job);
+       struct mailbox_channel *chann = hwctx->priv->mbox_chann;
+       struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+       struct xdna_mailbox_msg msg;
+       struct cmd_chain_req req;
+       u32 size = 0; /* zeroed so a callee can never read it uninitialised */
+       int ret;
+       u32 op;
+
+       op = amdxdna_cmd_get_op(cmd_abo);
+       ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size);
+       if (ret)
+               return ret;
+
+       aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1);
+
+       msg.opcode = aie2_cmd_op_to_msg_op(op);
+       if (msg.opcode == MSG_OP_MAX_OPCODE)
+               return -EOPNOTSUPP;
+       msg.handle = job;
+       msg.notify_cb = notify_cb;
+       msg.send_data = (u8 *)&req;
+       msg.send_size = sizeof(req);
+       ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+       if (ret) {
+               XDNA_ERR(hwctx->client->xdna, "Send message failed");
+               return ret;
+       }
+
+       return 0;
+}
+
+int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+                int (*notify_cb)(void *, const u32 *, size_t))
+{ /* Send a device-to-host SYNC_BO request for the job's first BO */
+       struct mailbox_channel *chann = hwctx->priv->mbox_chann; /* NOTE(review): not NULL-checked, unlike aie2_execbuf() */
+       struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]); /* only bos[0] is used */
+       struct amdxdna_dev *xdna = hwctx->client->xdna;
+       struct xdna_mailbox_msg msg;
+       struct sync_bo_req req;
+       int ret = 0;
+
+       req.src_addr = 0;
+       req.dst_addr = abo->mem.dev_addr - 
hwctx->client->dev_heap->mem.dev_addr; /* BO address relative to the client's device heap */
+       req.size = abo->mem.size;
+
+       /* Device to Host */
+       req.type = FIELD_PREP(AIE2_MSG_SYNC_BO_SRC_TYPE, SYNC_BO_DEV_MEM) |
+               FIELD_PREP(AIE2_MSG_SYNC_BO_DST_TYPE, SYNC_BO_HOST_MEM);
+
+       XDNA_DBG(xdna, "sync %d bytes src(0x%llx) to dst(0x%llx) completed",
+                req.size, req.src_addr, req.dst_addr); /* NOTE(review): logs "completed" before the request is sent */
+
+       msg.handle = job;
+       msg.notify_cb = notify_cb;
+       msg.send_data = (u8 *)&req;
+       msg.send_size = sizeof(req);
+       msg.opcode = MSG_OP_SYNC_BO;
+
+       ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
+       if (ret) {
+               XDNA_ERR(xdna, "Send message failed");
+               return ret;
+       }
+
+       return 0;
+}
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index 5fe0db3462d4..8efff1ba6319 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -493,4 +493,7 @@ const struct amdxdna_dev_ops aie2_ops = {
        .hwctx_init     = aie2_hwctx_init,
        .hwctx_fini     = aie2_hwctx_fini,
        .hwctx_config   = aie2_hwctx_config,
+       .cmd_submit     = aie2_cmd_submit,
+       .cmd_wait       = aie2_cmd_wait,
+       .hmm_invalidate = aie2_hmm_invalidate,
 };
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 0f9f7517c831..26d271a1040b 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -128,9 +128,28 @@ struct rt_config {
        u32     value;
 };
 
+/*
+ * Define the maximum number of pending commands in a hardware context.
+ * Must be power of 2!
+ */
+#define HWCTX_MAX_CMDS         4
+#define get_job_idx(seq) ((seq) & (HWCTX_MAX_CMDS - 1))
 struct amdxdna_hwctx_priv {
        struct amdxdna_gem_obj          *heap;
        void                            *mbox_chann;
+
+       struct drm_gpu_scheduler        sched;
+       struct drm_sched_entity         entity;
+
+       struct mutex                    io_lock; /* protect seq and cmd order */
+       struct wait_queue_head          job_free_wq;
+       struct amdxdna_sched_job        *pending[HWCTX_MAX_CMDS];
+       u32                             num_pending;
+       u64                             seq;
+       /* Completed job counter */
+       u64                             completed;
+
+       struct amdxdna_gem_obj          *cmd_buf[HWCTX_MAX_CMDS];
 };
 
 struct amdxdna_dev_hdl {
@@ -209,10 +228,25 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, 
struct amdxdna_hwctx *hwct
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx 
*hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, 
u64 size);
 int aie2_config_cu(struct amdxdna_hwctx *hwctx);
+int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+                int (*notify_cb)(void *, const u32 *, size_t));
+int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx,
+                               struct amdxdna_sched_job *job,
+                               int (*notify_cb)(void *, const u32 *, size_t));
+int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx,
+                              struct amdxdna_sched_job *job,
+                              int (*notify_cb)(void *, const u32 *, size_t));
+int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
+                int (*notify_cb)(void *, const u32 *, size_t));
 
 /* aie2_hwctx.c */
 int aie2_hwctx_init(struct amdxdna_hwctx *hwctx);
 void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx);
 int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void 
*buf, u32 size);
+int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job 
*job, u64 *seq);
+int aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
+void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq);
+void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map);
+void aie2_restart_ctx(struct amdxdna_client *client);
 
 #endif /* _AIE2_PCI_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c 
b/drivers/accel/amdxdna/amdxdna_ctx.c
index 3daf7065acf4..399f9c359ac7 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -2,14 +2,59 @@
 /*
  * Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
  */
+#include <trace/events/amdxdna.h>
+
 #include "amdxdna_drm.h"
 
 #define MAX_HWCTX_ID           255
+#define MAX_ARG_COUNT          4095
+
+/* Driver fence object: a dma_fence together with the spinlock it is initialised with. */
+struct amdxdna_fence {
+       struct dma_fence        base;
+       spinlock_t              lock; /* for base */
+       /*
+        * Context whose name is reported as the timeline name.
+        * NOTE(review): stored without taking a reference — confirm the
+        * fence cannot be queried after the context is released.
+        */
+       struct amdxdna_hwctx    *hwctx;
+};
+
+/* dma_fence_ops.get_driver_name callback: always reports the module name. */
+static const char *amdxdna_fence_get_driver_name(struct dma_fence *fence)
+{
+       return KBUILD_MODNAME;
+}
 
-static void amdxdna_hwctx_destroy(struct amdxdna_hwctx *hwctx)
+/*
+ * dma_fence_ops.get_timeline_name callback: reports the name of the hardware
+ * context recorded in the enclosing amdxdna_fence.
+ */
+static const char *amdxdna_fence_get_timeline_name(struct dma_fence *fence)
+{
+       return container_of(fence, struct amdxdna_fence, base)->hwctx->name;
+}
+
+/* Fence operations for amdxdna_fence; only the two name callbacks are set. */
+static const struct dma_fence_ops fence_ops = {
+       .get_driver_name = amdxdna_fence_get_driver_name,
+       .get_timeline_name = amdxdna_fence_get_timeline_name,
+};
+
+/*
+ * Allocate and initialise a fence tied to @hwctx.
+ *
+ * The fence is initialised with hwctx->id as its context and 0 as its seqno.
+ * NOTE(review): every fence of a context therefore shares the same
+ * (context, seqno) pair — confirm nothing relies on fence ordering here.
+ *
+ * Return: pointer to the embedded dma_fence, or NULL if allocation fails.
+ */
+static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx)
+{
+       struct amdxdna_fence *xfence = kzalloc(sizeof(*xfence), GFP_KERNEL);
+
+       if (xfence) {
+               xfence->hwctx = hwctx;
+               spin_lock_init(&xfence->lock);
+               dma_fence_init(&xfence->base, &fence_ops, &xfence->lock,
+                              hwctx->id, 0);
+               return &xfence->base;
+       }
+
+       return NULL;
+}
+
+static void amdxdna_hwctx_destroy_rcu(struct amdxdna_hwctx *hwctx,
+                                     struct srcu_struct *ss)
 {
        struct amdxdna_dev *xdna = hwctx->client->xdna;
 
+       synchronize_srcu(ss);
+
        /* At this point, user is not able to submit new commands */
        mutex_lock(&xdna->dev_lock);
        xdna->dev_info->ops->hwctx_fini(hwctx);
@@ -35,7 +80,7 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client)
                         client->pid, hwctx->id);
                idr_remove(&client->hwctx_idr, hwctx->id);
                mutex_unlock(&client->hwctx_lock);
-               amdxdna_hwctx_destroy(hwctx);
+               amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
                mutex_lock(&client->hwctx_lock);
        }
        mutex_unlock(&client->hwctx_lock);
@@ -126,6 +171,12 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device 
*dev, void *data, struct d
        if (!drm_dev_enter(dev, &idx))
                return -ENODEV;
 
+       /*
+        * Use hwctx_lock to achieve exclusion with other hwctx writers,
+        * SRCU to synchronize with exec/wait command ioctls.
+        *
+        * The pushed jobs are handled by DRM scheduler during destroy.
+        */
        mutex_lock(&client->hwctx_lock);
        hwctx = idr_find(&client->hwctx_idr, args->handle);
        if (!hwctx) {
@@ -138,7 +189,7 @@ int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, 
void *data, struct d
        idr_remove(&client->hwctx_idr, hwctx->id);
        mutex_unlock(&client->hwctx_lock);
 
-       amdxdna_hwctx_destroy(hwctx);
+       amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu);
 
        XDNA_DBG(xdna, "PID %d destroyed HW context %d", client->pid, 
args->handle);
 out:
@@ -152,10 +203,10 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device 
*dev, void *data, struct dr
        struct amdxdna_drm_config_hwctx *args = data;
        struct amdxdna_dev *xdna = to_xdna_dev(dev);
        struct amdxdna_hwctx *hwctx;
+       int ret, idx;
        u32 buf_size;
        void *buf;
        u64 val;
-       int ret;
 
        if (!xdna->dev_info->ops->hwctx_config)
                return -EOPNOTSUPP;
@@ -194,17 +245,286 @@ int amdxdna_drm_config_hwctx_ioctl(struct drm_device 
*dev, void *data, struct dr
        }
 
        mutex_lock(&xdna->dev_lock);
+       idx = srcu_read_lock(&client->hwctx_srcu);
        hwctx = idr_find(&client->hwctx_idr, args->handle);
        if (!hwctx) {
                XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, 
args->handle);
                ret = -EINVAL;
-               goto unlock;
+               goto unlock_srcu;
        }
 
        ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, 
buf, buf_size);
 
-unlock:
+unlock_srcu:
+       srcu_read_unlock(&client->hwctx_srcu, idx);
        mutex_unlock(&xdna->dev_lock);
        kfree(buf);
        return ret;
 }
+
+/*
+ * Drop the GEM object reference held in each populated slot of job->bos[].
+ * Walking stops at the first NULL slot or after job->bo_cnt entries,
+ * whichever comes first, so a partially filled array is handled too.
+ */
+static inline void
+amdxdna_arg_bos_put(struct amdxdna_sched_job *job)
+{
+       int i = 0;
+
+       while (i < job->bo_cnt && job->bos[i]) {
+               drm_gem_object_put(job->bos[i]);
+               i++;
+       }
+}
+
+/*
+ * Resolve @bo_cnt GEM handles from @bo_hdls into job->bos[], pinning every
+ * BO that is not already marked pinned.
+ *
+ * job->bo_cnt is set to @bo_cnt before any slot is filled. On failure all
+ * references stored so far are dropped again via amdxdna_arg_bos_put().
+ *
+ * Return: 0 on success, -ENOENT if a handle cannot be looked up, or the
+ * error returned by amdxdna_gem_pin_nolock().
+ */
+static inline int
+amdxdna_arg_bos_lookup(struct amdxdna_client *client,
+                      struct amdxdna_sched_job *job,
+                      u32 *bo_hdls, u32 bo_cnt)
+{
+       int i, ret;
+
+       job->bo_cnt = bo_cnt;
+       for (i = 0; i < job->bo_cnt; i++) {
+               struct drm_gem_object *gobj;
+               struct amdxdna_gem_obj *abo;
+
+               gobj = drm_gem_object_lookup(client->filp, bo_hdls[i]);
+               if (!gobj) {
+                       ret = -ENOENT;
+                       goto put_shmem_bo;
+               }
+               abo = to_xdna_obj(gobj);
+
+               /* Pin at most once; abo->lock serialises the pinned flag. */
+               mutex_lock(&abo->lock);
+               ret = 0;
+               if (!abo->pinned) {
+                       ret = amdxdna_gem_pin_nolock(abo);
+                       if (!ret)
+                               abo->pinned = true;
+               }
+               mutex_unlock(&abo->lock);
+
+               if (ret) {
+                       /* This slot was never stored, so drop its reference here. */
+                       drm_gem_object_put(gobj);
+                       goto put_shmem_bo;
+               }
+
+               job->bos[i] = gobj;
+       }
+
+       return 0;
+
+put_shmem_bo:
+       amdxdna_arg_bos_put(job);
+       return ret;
+}
+
+/*
+ * kref release callback for a job: emits a debug trace point, drops the
+ * argument BO references and the command BO, then frees the job itself.
+ */
+static void amdxdna_sched_job_release(struct kref *ref)
+{
+       struct amdxdna_sched_job *job =
+               container_of(ref, struct amdxdna_sched_job, refcnt);
+
+       trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release");
+
+       amdxdna_arg_bos_put(job);
+       amdxdna_gem_put_obj(job->cmd_bo);
+
+       kfree(job);
+}
+
+/*
+ * Drop one reference on @job; the final put releases it through
+ * amdxdna_sched_job_release().
+ */
+void amdxdna_job_put(struct amdxdna_sched_job *job)
+{
+       kref_put(&job->refcnt, amdxdna_sched_job_release);
+}
+
+/*
+ * Create a job for hardware context @hwctx_hdl from command BO @cmd_bo_hdl
+ * and the @arg_bo_cnt argument BOs listed in @arg_bo_hdls, then pass it to
+ * the device layer's cmd_submit op together with @seq.
+ *
+ * @cmd_bo_hdl may be AMDXDNA_INVALID_BO_HANDLE, in which case the job carries
+ * no command BO.
+ *
+ * Return: 0 on success. -EOPNOTSUPP if the device has no cmd_submit op,
+ * -ENOMEM on allocation failure, -EINVAL for an unknown command BO, an
+ * unknown hardware context or a context that is not ready, otherwise the
+ * error from the argument BO lookup or from the cmd_submit op.
+ */
+int amdxdna_cmd_submit(struct amdxdna_client *client,
+                      u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt,
+                      u32 hwctx_hdl, u64 *seq)
+{
+       struct amdxdna_dev *xdna = client->xdna;
+       struct amdxdna_sched_job *job;
+       struct amdxdna_hwctx *hwctx;
+       int ret, idx;
+
+       /* Optional op, checked the same way as cmd_wait and hwctx_config. */
+       if (!xdna->dev_info->ops->cmd_submit)
+               return -EOPNOTSUPP;
+
+       XDNA_DBG(xdna, "Command BO hdl %d, Arg BO count %d",
+                cmd_bo_hdl, arg_bo_cnt);
+       job = kzalloc(struct_size(job, bos, arg_bo_cnt), GFP_KERNEL);
+       if (!job)
+               return -ENOMEM;
+
+       /* job->cmd_bo stays NULL (from kzalloc) when no command BO is given. */
+       if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) {
+               job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD);
+               if (!job->cmd_bo) {
+                       XDNA_ERR(xdna, "Failed to get cmd bo from %d", cmd_bo_hdl);
+                       ret = -EINVAL;
+                       goto free_job;
+               }
+       }
+
+       ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt);
+       if (ret) {
+               XDNA_ERR(xdna, "Argument BOs lookup failed, ret %d", ret);
+               goto cmd_put;
+       }
+
+       idx = srcu_read_lock(&client->hwctx_srcu);
+       hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
+       if (!hwctx) {
+               XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
+                        client->pid, hwctx_hdl);
+               ret = -EINVAL;
+               goto unlock_srcu;
+       }
+
+       if (hwctx->status != HWCTX_STAT_READY) {
+               XDNA_ERR(xdna, "HW Context is not ready");
+               ret = -EINVAL;
+               goto unlock_srcu;
+       }
+
+       job->hwctx = hwctx;
+       job->mm = current->mm;
+
+       job->fence = amdxdna_fence_create(hwctx);
+       if (!job->fence) {
+               XDNA_ERR(xdna, "Failed to create fence");
+               ret = -ENOMEM;
+               goto unlock_srcu;
+       }
+       kref_init(&job->refcnt);
+
+       ret = xdna->dev_info->ops->cmd_submit(hwctx, job, seq);
+       if (ret)
+               goto put_fence;
+
+       /*
+        * Trace while still inside the SRCU read section: once it is left,
+        * hwctx (and hwctx->name) may be released at any time.
+        */
+       trace_amdxdna_debug_point(hwctx->name, *seq, "job pushed");
+
+       /*
+        * The amdxdna_hwctx_destroy_rcu() will release hwctx and associated
+        * resource after synchronize_srcu(). The submitted jobs should be
+        * handled by the queue, for example DRM scheduler, in device layer.
+        * For here we can unlock SRCU.
+        */
+       srcu_read_unlock(&client->hwctx_srcu, idx);
+
+       return 0;
+
+put_fence:
+       dma_fence_put(job->fence);
+unlock_srcu:
+       srcu_read_unlock(&client->hwctx_srcu, idx);
+       amdxdna_arg_bos_put(job);
+cmd_put:
+       amdxdna_gem_put_obj(job->cmd_bo);
+free_job:
+       kfree(job);
+       return ret;
+}
+
+/*
+ * Handle AMDXDNA_CMD_SUBMIT_EXEC_BUF: copy the argument BO handle array from
+ * user space and submit it together with a single command BO.
+ *
+ * Exactly one command is accepted for now; its handle is taken by value from
+ * args->cmd_handles. &args->seq is handed to amdxdna_cmd_submit() so the
+ * sequence number can be used with the wait command ioctl.
+ *
+ * NOTE(review): args->cmd_handles is a u64 that is truncated to u32 here;
+ * non-zero upper bits are silently ignored — confirm this is intended.
+ */
+static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client,
+                                     struct amdxdna_drm_exec_cmd *args)
+{
+       struct amdxdna_dev *xdna = client->xdna;
+       u32 *hdls;
+       int ret;
+
+       if (!args->arg_count || args->arg_count > MAX_ARG_COUNT) {
+               XDNA_ERR(xdna, "Invalid arg bo count %d", args->arg_count);
+               return -EINVAL;
+       }
+
+       /* Only support single command for now. */
+       if (args->cmd_count != 1) {
+               XDNA_ERR(xdna, "Invalid cmd bo count %d", args->cmd_count);
+               return -EINVAL;
+       }
+
+       hdls = kcalloc(args->arg_count, sizeof(u32), GFP_KERNEL);
+       if (!hdls)
+               return -ENOMEM;
+
+       if (copy_from_user(hdls, u64_to_user_ptr(args->args),
+                          args->arg_count * sizeof(u32))) {
+               kfree(hdls);
+               return -EFAULT;
+       }
+
+       ret = amdxdna_cmd_submit(client, (u32)args->cmd_handles, hdls,
+                                args->arg_count, args->hwctx, &args->seq);
+       if (ret)
+               XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret);
+
+       kfree(hdls);
+       if (!ret)
+               XDNA_DBG(xdna, "Pushed cmd %lld to scheduler", args->seq);
+
+       return ret;
+}
+
+/*
+ * DRM_IOCTL_AMDXDNA_EXEC_CMD handler: validate the request and dispatch on
+ * args->type. Only AMDXDNA_CMD_SUBMIT_EXEC_BUF is handled.
+ *
+ * Return: result of the type-specific handler, or -EINVAL when a
+ * must-be-zero field is set or the command type is not handled.
+ */
+int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+{
+       struct amdxdna_client *client = filp->driver_priv;
+       struct amdxdna_drm_exec_cmd *args = data;
+
+       /*
+        * Both ext and ext_flags are documented as MBZ in the uAPI; reject
+        * either so the fields stay usable for future extensions.
+        */
+       if (args->ext || args->ext_flags)
+               return -EINVAL;
+
+       switch (args->type) {
+       case AMDXDNA_CMD_SUBMIT_EXEC_BUF:
+               return amdxdna_drm_submit_execbuf(client, args);
+       default:
+               break;
+       }
+
+       XDNA_ERR(client->xdna, "Invalid command type %d", args->type);
+       return -EINVAL;
+}
+
+/*
+ * Wait on command @seq of hardware context @hwctx_hdl through the device
+ * layer's cmd_wait op; @timeout is passed through unchanged.
+ *
+ * Return: -EOPNOTSUPP if the device has no cmd_wait op, -EINVAL for an
+ * unknown hardware context, otherwise whatever the cmd_wait op returns.
+ */
+int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
+                    u64 seq, u32 timeout)
+{
+       struct amdxdna_dev *xdna = client->xdna;
+       struct amdxdna_hwctx *hwctx;
+       int srcu_idx;
+       int ret;
+
+       if (!xdna->dev_info->ops->cmd_wait)
+               return -EOPNOTSUPP;
+
+       /*
+        * The lookup and the wait run inside the SRCU read section; for the
+        * locking scheme see the SRCU comment in amdxdna_cmd_submit().
+        */
+       srcu_idx = srcu_read_lock(&client->hwctx_srcu);
+       hwctx = idr_find(&client->hwctx_idr, hwctx_hdl);
+       if (hwctx) {
+               ret = xdna->dev_info->ops->cmd_wait(hwctx, seq, timeout);
+       } else {
+               XDNA_DBG(xdna, "PID %d failed to get hwctx %d",
+                        client->pid, hwctx_hdl);
+               ret = -EINVAL;
+       }
+       srcu_read_unlock(&client->hwctx_srcu, srcu_idx);
+
+       return ret;
+}
+
+/*
+ * DRM_IOCTL_AMDXDNA_WAIT_CMD handler: forwards hwctx, seq and timeout from
+ * the request to amdxdna_cmd_wait() and returns its result, with a debug
+ * message before and after the wait.
+ */
+int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct 
drm_file *filp)
+{
+       struct amdxdna_client *client = filp->driver_priv;
+       struct amdxdna_dev *xdna = to_xdna_dev(dev);
+       struct amdxdna_drm_wait_cmd *args = data;
+       int ret;
+
+       XDNA_DBG(xdna, "PID %d hwctx %d timeout set %d ms for cmd %lld",
+                client->pid, args->hwctx, args->timeout, args->seq);
+
+       ret = amdxdna_cmd_wait(client, args->hwctx, args->seq, args->timeout);
+
+       XDNA_DBG(xdna, "PID %d hwctx %d cmd %lld wait finished, ret %d",
+                client->pid, args->hwctx, args->seq, ret);
+
+       return ret;
+}
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h 
b/drivers/accel/amdxdna/amdxdna_ctx.h
index deb8ad3e0884..c0761527e0b2 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -7,8 +7,55 @@
 #define _AMDXDNA_CTX_H_
 
 #include <drm/drm_drv.h>
+#include <drm/gpu_scheduler.h>
 #include <linux/kref.h>
 
+#include "amdxdna_gem.h"
+
+struct amdxdna_hwctx_priv;
+
+enum ert_cmd_opcode {
+       ERT_START_CU      = 0,
+       ERT_CMD_CHAIN     = 19,
+       ERT_START_NPU     = 20,
+};
+
+enum ert_cmd_state {
+       ERT_CMD_STATE_INVALID,
+       ERT_CMD_STATE_NEW,
+       ERT_CMD_STATE_QUEUED,
+       ERT_CMD_STATE_RUNNING,
+       ERT_CMD_STATE_COMPLETED,
+       ERT_CMD_STATE_ERROR,
+       ERT_CMD_STATE_ABORT,
+       ERT_CMD_STATE_SUBMITTED,
+       ERT_CMD_STATE_TIMEOUT,
+       ERT_CMD_STATE_NORESPONSE,
+};
+
+/*
+ * Interpretation of the beginning of data payload for ERT_START_NPU in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args.
+ */
+struct amdxdna_cmd_start_npu {
+       u64 buffer;       /* instruction buffer address */
+       u32 buffer_size;  /* size of buffer in bytes */
+       u32 prop_count;   /* properties count */
+       u32 prop_args[];  /* properties and regular kernel arguments */
+};
+
+/*
+ * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
+ */
+struct amdxdna_cmd_chain {
+       u32 command_count;
+       u32 submit_index;
+       u32 error_index;
+       u32 reserved[3];
+       u64 data[] __counted_by(command_count);
+};
+
 /* Exec buffer command header format */
 #define AMDXDNA_CMD_STATE              GENMASK(3, 0)
 #define AMDXDNA_CMD_EXTRA_CU_MASK      GENMASK(11, 10)
@@ -43,9 +90,115 @@ struct amdxdna_hwctx {
        struct amdxdna_hwctx_param_config_cu *cus;
 };
 
+#define drm_job_to_xdna_job(j) \
+       container_of(j, struct amdxdna_sched_job, base)
+
+/*
+ * One submitted command. Allocated with room for bo_cnt argument BO pointers
+ * and reference counted through refcnt (see amdxdna_job_put()).
+ */
+struct amdxdna_sched_job {
+       struct drm_sched_job    base;
+       struct kref             refcnt;
+       /* Hardware context the job was submitted to */
+       struct amdxdna_hwctx    *hwctx;
+       /* current->mm of the submitting task, recorded at submit time */
+       struct mm_struct        *mm;
+       /* The fence to notify DRM scheduler that job is done by hardware */
+       struct dma_fence        *fence;
+       /* user can wait on this fence */
+       struct dma_fence        *out_fence;
+       u64                     seq;
+       /* Command BO; NULL when submitted with AMDXDNA_INVALID_BO_HANDLE */
+       struct amdxdna_gem_obj  *cmd_bo;
+       size_t                  bo_cnt;
+       /* Argument BOs; populated slots each hold a GEM reference */
+       struct drm_gem_object   *bos[] __counted_by(bo_cnt);
+};
+
+/* Return the AMDXDNA_CMD_OPCODE field of the command header stored in @abo. */
+static inline u32
+amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
+{
+       struct amdxdna_cmd *cmd = abo->mem.kva;
+
+       return FIELD_GET(AMDXDNA_CMD_OPCODE, cmd->header);
+}
+
+/*
+ * Replace the AMDXDNA_CMD_STATE field of the command header stored in @abo
+ * with @s, leaving the other header bits untouched.
+ */
+static inline void
+amdxdna_cmd_set_state(struct amdxdna_gem_obj *abo, enum ert_cmd_state s)
+{
+       struct amdxdna_cmd *cmd = abo->mem.kva;
+
+       /* Clear the old state first, then OR in the new one. */
+       cmd->header &= ~AMDXDNA_CMD_STATE;
+       cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, s);
+}
+
+/* Return the AMDXDNA_CMD_STATE field of the command header stored in @abo. */
+static inline enum ert_cmd_state
+amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo)
+{
+       struct amdxdna_cmd *cmd = abo->mem.kva;
+
+       return FIELD_GET(AMDXDNA_CMD_STATE, cmd->header);
+}
+
+/*
+ * Locate the payload inside the command stored in @abo.
+ *
+ * The payload starts after the CU mask words in cmd->data[]: none for
+ * ERT_CMD_CHAIN, otherwise one plus the AMDXDNA_CMD_EXTRA_CU_MASK field.
+ *
+ * If @size is non-NULL it receives the payload size in bytes, derived from
+ * the AMDXDNA_CMD_COUNT field. When that count does not exceed the number of
+ * mask words, *size is set to 0 and NULL is returned. With a NULL @size no
+ * such check is made.
+ *
+ * Return: pointer to the first payload word, or NULL as described above.
+ */
+static inline void *
+amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size)
+{
+       struct amdxdna_cmd *cmd = abo->mem.kva;
+       u32 num_masks = 0;
+
+       if (amdxdna_cmd_get_op(abo) != ERT_CMD_CHAIN)
+               num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
+
+       if (size) {
+               u32 count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header);
+
+               if (unlikely(count <= num_masks)) {
+                       *size = 0;
+                       return NULL;
+               }
+               *size = (count - num_masks) * sizeof(u32);
+       }
+
+       return &cmd->data[num_masks];
+}
+
+/*
+ * Return the bit position of the lowest set bit in the first non-zero CU
+ * mask word of the command stored in @abo.
+ *
+ * The position is relative to the word it was found in. Returns -1 for
+ * ERT_CMD_CHAIN commands and when every mask word is zero.
+ */
+static inline int
+amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
+{
+       struct amdxdna_cmd *cmd = abo->mem.kva;
+       u32 *cu_mask = cmd->data;
+       u32 num_masks, i;
+       int cu_idx = -1;
+
+       if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN)
+               return -1;
+
+       num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header);
+
+       /* ffs() yields 0 for an empty word, keeping cu_idx at -1. */
+       for (i = 0; i < num_masks && cu_idx < 0; i++)
+               cu_idx = ffs(cu_mask[i]) - 1;
+
+       return cu_idx;
+}
+
+/*
+ * Return a bitmask with bits start_col through start_col + num_col - 1 of
+ * @hwctx set.
+ * NOTE(review): assumes num_col is non-zero — confirm against callers.
+ */
+static inline u32 amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx)
+{
+       return GENMASK(hwctx->start_col + hwctx->num_col - 1,
+                      hwctx->start_col);
+}
+
+void amdxdna_job_put(struct amdxdna_sched_job *job);
+
 void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
+
+int amdxdna_cmd_submit(struct amdxdna_client *client,
+                      u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt,
+                      u32 hwctx_hdl, u64 *seq);
+
+int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl,
+                    u64 seq, u32 timeout);
+
 int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct 
drm_file *filp);
 int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct 
drm_file *filp);
 int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct 
drm_file *filp);
+int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct 
drm_file *filp);
+int amdxdna_drm_wait_cmd_ioctl(struct drm_device *dev, void *data, struct 
drm_file *filp);
 
 #endif /* _AMDXDNA_CTX_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_drm.c 
b/drivers/accel/amdxdna/amdxdna_drm.c
index 7b500db83795..3b15eae65e6d 100644
--- a/drivers/accel/amdxdna/amdxdna_drm.c
+++ b/drivers/accel/amdxdna/amdxdna_drm.c
@@ -35,6 +35,7 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct 
drm_file *filp)
                goto unbind_sva;
        }
        mutex_init(&client->hwctx_lock);
+       init_srcu_struct(&client->hwctx_srcu);
        idr_init_base(&client->hwctx_idr, AMDXDNA_INVALID_CTX_HANDLE + 1);
        mutex_init(&client->mm_lock);
 
@@ -64,6 +65,7 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct 
drm_file *filp)
        XDNA_DBG(xdna, "closing pid %d", client->pid);
 
        idr_destroy(&client->hwctx_idr);
+       cleanup_srcu_struct(&client->hwctx_srcu);
        mutex_destroy(&client->hwctx_lock);
        mutex_destroy(&client->mm_lock);
        if (client->dev_heap)
@@ -104,6 +106,9 @@ static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = {
        DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_BO, amdxdna_drm_create_bo_ioctl, 0),
        DRM_IOCTL_DEF_DRV(AMDXDNA_GET_BO_INFO, amdxdna_drm_get_bo_info_ioctl, 
0),
        DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0),
+       /* Execution */
+       DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0),
+       DRM_IOCTL_DEF_DRV(AMDXDNA_WAIT_CMD, amdxdna_drm_wait_cmd_ioctl, 0),
 };
 
 static const struct file_operations amdxdna_fops = {
diff --git a/drivers/accel/amdxdna/amdxdna_drm.h 
b/drivers/accel/amdxdna/amdxdna_drm.h
index 49e9cd3468ee..6f1eaed46382 100644
--- a/drivers/accel/amdxdna/amdxdna_drm.h
+++ b/drivers/accel/amdxdna/amdxdna_drm.h
@@ -42,6 +42,8 @@ struct amdxdna_dev_ops {
        void (*hwctx_fini)(struct amdxdna_hwctx *hwctx);
        int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, 
void *buf, u32 size);
        void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long 
cur_seq);
+       int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job 
*job, u64 *seq);
+       int (*cmd_wait)(struct amdxdna_hwctx *hwctx, u64 seq, u32 timeout);
 };
 
 /*
@@ -90,6 +92,8 @@ struct amdxdna_client {
        struct list_head                node;
        pid_t                           pid;
        struct mutex                    hwctx_lock; /* protect hwctx */
+       /* Do NOT wait on this SRCU while hwctx_lock is held */
+       struct srcu_struct              hwctx_srcu;
        struct idr                      hwctx_idr;
        struct amdxdna_dev              *xdna;
        struct drm_file                 *filp;
diff --git a/include/trace/events/amdxdna.h b/include/trace/events/amdxdna.h
index 33343d8f0622..c6cb2da7b706 100644
--- a/include/trace/events/amdxdna.h
+++ b/include/trace/events/amdxdna.h
@@ -9,8 +9,49 @@
 #if !defined(_TRACE_AMDXDNA_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_AMDXDNA_H
 
+#include <drm/gpu_scheduler.h>
 #include <linux/tracepoint.h>
 
+TRACE_EVENT(amdxdna_debug_point,
+           TP_PROTO(const char *name, u64 number, const char *str),
+
+           TP_ARGS(name, number, str),
+
+           TP_STRUCT__entry(__string(name, name)
+                            __field(u64, number)
+                            __string(str, str)),
+
+           TP_fast_assign(__assign_str(name);
+                          __entry->number = number;
+                          __assign_str(str);),
+
+           TP_printk("%s:%llu %s", __get_str(name), __entry->number,
+                     __get_str(str))
+);
+
+TRACE_EVENT(xdna_job,
+           TP_PROTO(struct drm_sched_job *sched_job, const char *name, const 
char *str, u64 seq),
+
+           TP_ARGS(sched_job, name, str, seq),
+
+           TP_STRUCT__entry(__string(name, name)
+                            __string(str, str)
+                            __field(u64, fence_context)
+                            __field(u64, fence_seqno)
+                            __field(u64, seq)),
+
+           TP_fast_assign(__assign_str(name);
+                          __assign_str(str);
+                          __entry->fence_context = 
sched_job->s_fence->finished.context;
+                          __entry->fence_seqno = 
sched_job->s_fence->finished.seqno;
+                          __entry->seq = seq;),
+
+           TP_printk("fence=(context:%llu, seqno:%lld), %s seq#:%lld %s",
+                     __entry->fence_context, __entry->fence_seqno,
+                     __get_str(name), __entry->seq,
+                     __get_str(str))
+);
+
 DECLARE_EVENT_CLASS(xdna_mbox_msg,
                    TP_PROTO(char *name, u8 chann_id, u32 opcode, u32 msg_id),
 
diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
index 2af4f80dff57..920cdd2a190d 100644
--- a/include/uapi/drm/amdxdna_accel.h
+++ b/include/uapi/drm/amdxdna_accel.h
@@ -16,6 +16,7 @@ extern "C" {
 #define AMDXDNA_DRIVER_MAJOR   1
 #define AMDXDNA_DRIVER_MINOR   0
 
+/* Sentinel for the 64-bit seq field; ULL keeps it 64 bits wide in 32-bit user space too. */
+#define AMDXDNA_INVALID_CMD_HANDLE     (~0ULL)
 #define AMDXDNA_INVALID_ADDR           (~0UL)
 #define AMDXDNA_INVALID_CTX_HANDLE     0
 #define AMDXDNA_INVALID_BO_HANDLE      0
@@ -32,6 +33,8 @@ enum amdxdna_drm_ioctl_id {
        DRM_AMDXDNA_CREATE_BO,
        DRM_AMDXDNA_GET_BO_INFO,
        DRM_AMDXDNA_SYNC_BO,
+       DRM_AMDXDNA_EXEC_CMD,
+       DRM_AMDXDNA_WAIT_CMD,
 };
 
 /**
@@ -201,6 +204,54 @@ struct amdxdna_drm_sync_bo {
        __u64 size;
 };
 
+enum amdxdna_cmd_type {
+       AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
+       AMDXDNA_CMD_SUBMIT_DEPENDENCY,
+       AMDXDNA_CMD_SUBMIT_SIGNAL,
+};
+
+/**
+ * struct amdxdna_drm_exec_cmd - Execute command.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @hwctx: Hardware context handle.
+ * @type: One of command type in enum amdxdna_cmd_type.
+ * @cmd_handles: Array of command handles or the command handle itself
+ *               in case of just one.
+ * @args: Array of arguments for all command handles.
+ * @cmd_count: Number of command handles in the cmd_handles array.
+ * @arg_count: Number of arguments in the args array.
+ * @seq: Returned sequence number for this command.
+ */
+struct amdxdna_drm_exec_cmd {
+       __u64 ext;
+       __u64 ext_flags;
+       __u32 hwctx;
+       __u32 type;
+       __u64 cmd_handles;
+       __u64 args;
+       __u32 cmd_count;
+       __u32 arg_count;
+       __u64 seq;
+};
+
+/**
+ * struct amdxdna_drm_wait_cmd - Wait for command execution.
+ *
+ * @hwctx: hardware context handle.
+ * @timeout: timeout in ms, 0 implies infinite wait.
+ * @seq: sequence number of the command returned by execute command.
+ *
+ * Wait for the command specified by seq to complete.
+ * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait until there is a free
+ * slot to submit a new command.
+ */
+struct amdxdna_drm_wait_cmd {
+       __u32 hwctx;
+       __u32 timeout;
+       __u64 seq;
+};
+
 #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
        DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
                 struct amdxdna_drm_create_hwctx)
@@ -225,6 +276,14 @@ struct amdxdna_drm_sync_bo {
        DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, \
                 struct amdxdna_drm_sync_bo)
 
+#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
+       DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, \
+                struct amdxdna_drm_exec_cmd)
+
+#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
+       DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \
+                struct amdxdna_drm_wait_cmd)
+
 #if defined(__cplusplus)
 } /* extern c end */
 #endif
-- 
2.34.1

[PATCH V1 06/10] accel/amdxdna: Add command execution

Reply via email to