A CPU job is a type of job that performs operations that require CPU
intervention. An indirect CSD job is a job that, when executed in the
queue, will map the indirect buffer, read the dispatch parameters, and
submit a regular dispatch. Therefore, it is a job that needs CPU
intervention.

So, create a user extension for the CPU job that enables the creation
of an indirect CSD. This user extension will allow the creation of a CSD
job linked to a CPU job. The CPU job will wait for the indirect CSD job
dependencies and, once they are signaled, it will update the CSD job
parameters.

Co-developed-by: Melissa Wen <m...@igalia.com>
Signed-off-by: Melissa Wen <m...@igalia.com>
Signed-off-by: Maíra Canal <mca...@igalia.com>
---
 drivers/gpu/drm/v3d/v3d_drv.h    |  31 ++++++++-
 drivers/gpu/drm/v3d/v3d_sched.c  |  41 +++++++++++-
 drivers/gpu/drm/v3d/v3d_submit.c | 104 ++++++++++++++++++++++++++++++-
 include/uapi/drm/v3d_drm.h       |  43 ++++++++++++-
 4 files changed, 213 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index 39d62915cdd6..202c0d4b04a5 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -316,12 +316,41 @@ struct v3d_csd_job {
        struct drm_v3d_submit_csd args;
 };
 
-enum v3d_cpu_job_type {};
+enum v3d_cpu_job_type {
+       V3D_CPU_JOB_TYPE_INDIRECT_CSD = 1, /* starts at 1: the extension parser treats job_type == 0 as "not set yet" */
+};
+
+struct v3d_indirect_csd_info {
+       /* Indirect CSD job. The CPU job rewrites cfg[0..2] and cfg[4] of its
+        * submit args from the workgroup counts before the CSD job runs.
+        */
+       struct v3d_csd_job *job;
+
+       /* Clean cache job associated to the Indirect CSD job */
+       struct v3d_job *clean_job;
+
+       /* Offset within the BO where the workgroup counts are stored. It is
+        * added to the BO mapping address, which is then read as three u32
+        * values.
+        */
+       u32 offset;
+
+       /* Workgroup size; DIV_ROUND_UP(wg_size, 16) is used when the CPU job
+        * computes cfg[4].
+        */
+       u32 wg_size;
+
+       /* Indices of the uniforms with the workgroup dispatch counts
+        * in the uniform stream. 0xffffffff means that the uniform for that
+        * dimension is not rewritten.
+        */
+       u32 wg_uniform_offsets[3];
+
+       /* Indirect BO, looked up from the handle passed by userspace. Its
+        * mapping is indexed as u32 by wg_uniform_offsets[] to store the
+        * workgroup counts.
+        */
+       struct drm_gem_object *indirect;
+
+       /* Context of the Indirect CSD job: passed to
+        * v3d_setup_csd_jobs_and_bos() and later used to unlock the
+        * reservations of the clean job's BOs.
+        */
+       struct ww_acquire_ctx acquire_ctx;
+};
 
 struct v3d_cpu_job {
        struct v3d_job base;
 
        enum v3d_cpu_job_type job_type;
+
+       struct v3d_indirect_csd_info indirect_csd; /* filled in for V3D_CPU_JOB_TYPE_INDIRECT_CSD */
 };
 
 typedef void (*v3d_cpu_job_fn)(struct v3d_cpu_job *);
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index ebbd00840a73..597e4ec3d28d 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -25,6 +25,8 @@
 #include "v3d_regs.h"
 #include "v3d_trace.h"
 
+#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
+
 static struct v3d_job *
 to_v3d_job(struct drm_sched_job *sched_job)
 {
@@ -268,7 +270,44 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
        return fence;
 }
 
-static const v3d_cpu_job_fn cpu_job_function[] = { };
+/*
+ * CPU job handler for V3D_CPU_JOB_TYPE_INDIRECT_CSD.
+ *
+ * Reads the three workgroup counts from the first BO of the CPU job (at
+ * indirect_csd.offset) and uses them to rewrite cfg[0..2] and cfg[4] of the
+ * linked CSD job's submit args. Each count is also stored into the indirect
+ * BO at the uniform index given by wg_uniform_offsets[], unless that index
+ * is 0xffffffff.
+ *
+ * If any of the counts is zero, the CSD job arguments and the uniforms are
+ * left exactly as they were submitted.
+ *
+ * NOTE(review): offset and wg_uniform_offsets[] are copied from userspace and
+ * are not range-checked against the BO sizes in this function - confirm
+ * whether they are validated elsewhere.
+ */
+static void
+v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job)
+{
+       struct v3d_indirect_csd_info *indirect_csd = &job->indirect_csd;
+       struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
+       struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect);
+       struct drm_v3d_submit_csd *args = &indirect_csd->job->args;
+       u32 *wg_counts;
+
+       v3d_get_bo_vaddr(bo);
+       v3d_get_bo_vaddr(indirect);
+
+       wg_counts = (uint32_t *) (bo->vaddr + indirect_csd->offset);
+
+       /* Nothing to rewrite, but leave through the common exit so that the
+        * v3d_get_bo_vaddr() calls above stay balanced by v3d_put_bo_vaddr(),
+        * exactly as on the normal path.
+        */
+       if (wg_counts[0] == 0 || wg_counts[1] == 0 || wg_counts[2] == 0)
+               goto out;
+
+       args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+       args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+       args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+       args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
+                      (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+
+       for (int i = 0; i < 3; i++) {
+               /* 0xffffffff indicates that the uniform rewrite is not needed */
+               if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) {
+                       u32 uniform_idx = indirect_csd->wg_uniform_offsets[i];
+                       ((uint32_t *) indirect->vaddr)[uniform_idx] = wg_counts[i];
+               }
+       }
+
+out:
+       v3d_put_bo_vaddr(indirect);
+       v3d_put_bo_vaddr(bo);
+}
+
+/* CPU job handlers, indexed by enum v3d_cpu_job_type. The only enumerator,
+ * V3D_CPU_JOB_TYPE_INDIRECT_CSD, is 1, so slot 0 of this table is implicitly
+ * NULL.
+ */
+static const v3d_cpu_job_fn cpu_job_function[] = {
+       [V3D_CPU_JOB_TYPE_INDIRECT_CSD] = v3d_rewrite_csd_job_wg_counts_from_indirect,
+};
 
 static struct dma_fence *
 v3d_cpu_job_run(struct drm_sched_job *sched_job)
diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
index eb26fe1e27e3..0320695b941b 100644
--- a/drivers/gpu/drm/v3d/v3d_submit.c
+++ b/drivers/gpu/drm/v3d/v3d_submit.c
@@ -391,6 +391,48 @@ v3d_get_multisync_submit_deps(struct drm_file *file_priv,
        return 0;
 }
 
+/*
+ * Get data for the indirect CSD job submission.
+ *
+ * Copies a struct drm_v3d_indirect_csd from userspace, marks the CPU job as
+ * V3D_CPU_JOB_TYPE_INDIRECT_CSD, stores the dispatch parameters in
+ * job->indirect_csd, looks up the indirect BO and sets up the linked CSD and
+ * clean jobs.
+ *
+ * Returns 0 on success; -EINVAL if @job is NULL, if @job already has a job
+ * type, or if the hardware has no CSD; -EFAULT if the copy from userspace
+ * fails; -ENOENT if the indirect BO handle cannot be looked up; otherwise the
+ * result of v3d_setup_csd_jobs_and_bos().
+ */
+static int
+v3d_get_cpu_indirect_csd_params(struct drm_file *file_priv,
+                               struct drm_v3d_extension __user *ext,
+                               struct v3d_cpu_job *job)
+{
+       struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
+       struct v3d_dev *v3d = v3d_priv->v3d;
+       struct drm_v3d_indirect_csd indirect_csd;
+       struct v3d_indirect_csd_info *info;
+
+       if (!job) {
+               DRM_DEBUG("CPU job extension was attached to a GPU job.\n");
+               return -EINVAL;
+       }
+
+       /* A non-zero job_type means an extension already claimed this job. */
+       if (job->job_type) {
+               DRM_DEBUG("Two CPU job extensions were added to the same CPU job.\n");
+               return -EINVAL;
+       }
+
+       if (copy_from_user(&indirect_csd, ext, sizeof(indirect_csd)))
+               return -EFAULT;
+
+       if (!v3d_has_csd(v3d)) {
+               DRM_DEBUG("Attempting CSD submit on non-CSD hardware.\n");
+               return -EINVAL;
+       }
+
+       /* Only form a pointer into @job once it is known to be non-NULL. */
+       info = &job->indirect_csd;
+
+       job->job_type = V3D_CPU_JOB_TYPE_INDIRECT_CSD;
+       info->offset = indirect_csd.offset;
+       info->wg_size = indirect_csd.wg_size;
+       memcpy(&info->wg_uniform_offsets, &indirect_csd.wg_uniform_offsets,
+              sizeof(indirect_csd.wg_uniform_offsets));
+
+       /* The CPU job handler dereferences this BO unconditionally, so an
+        * invalid handle must be rejected at submission time.
+        */
+       info->indirect = drm_gem_object_lookup(file_priv, indirect_csd.indirect);
+       if (!info->indirect) {
+               DRM_DEBUG("Failed to look up GEM BO %d\n", indirect_csd.indirect);
+               return -ENOENT;
+       }
+
+       return v3d_setup_csd_jobs_and_bos(file_priv, v3d, &indirect_csd.submit,
+                                         &info->job, &info->clean_job,
+                                         NULL, &info->acquire_ctx);
+}
+
 /* Whenever userspace sets ioctl extensions, v3d_get_extensions parses data
  * according to the extension id (name).
  */
@@ -416,6 +458,9 @@ v3d_get_extensions(struct drm_file *file_priv,
                case DRM_V3D_EXT_ID_MULTI_SYNC:
                        ret = v3d_get_multisync_submit_deps(file_priv, 
user_ext, se);
                        break;
+               case DRM_V3D_EXT_ID_CPU_INDIRECT_CSD:
+                       ret = v3d_get_cpu_indirect_csd_params(file_priv, 
user_ext, job);
+                       break;
                default:
                        DRM_DEBUG_DRIVER("Unknown extension id: %d\n", ext.id);
                        return -EINVAL;
@@ -790,7 +835,9 @@ v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
        return ret;
 }
 
-static const unsigned int cpu_job_bo_handle_count[] = { };
+static const unsigned int cpu_job_bo_handle_count[] = {
+       [V3D_CPU_JOB_TYPE_INDIRECT_CSD] = 1, /* the BO that contains the workgroup counts */
+};
 
 /**
  * v3d_submit_cpu_ioctl() - Submits a CPU job to the V3D.
@@ -808,7 +855,10 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data,
        struct v3d_dev *v3d = to_v3d_dev(dev);
        struct drm_v3d_submit_cpu *args = data;
        struct v3d_submit_ext se = {0};
+       struct v3d_submit_ext *out_se = NULL;
        struct v3d_cpu_job *cpu_job = NULL;
+       struct v3d_csd_job *csd_job = NULL;
+       struct v3d_job *clean_job = NULL;
        struct ww_acquire_ctx acquire_ctx;
        int ret;
 
@@ -847,6 +897,9 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data,
        if (ret)
                goto fail;
 
+       clean_job = cpu_job->indirect_csd.clean_job;
+       csd_job = cpu_job->indirect_csd.job;
+
        if (args->bo_handle_count) {
                ret = v3d_lookup_bos(dev, file_priv, &cpu_job->base,
                                     args->bo_handles, args->bo_handle_count);
@@ -860,19 +913,66 @@ v3d_submit_cpu_ioctl(struct drm_device *dev, void *data,
 
        mutex_lock(&v3d->sched_lock);
        v3d_push_job(&cpu_job->base);
+
+       switch (cpu_job->job_type) {
+       case V3D_CPU_JOB_TYPE_INDIRECT_CSD:
+               ret = drm_sched_job_add_dependency(&csd_job->base.base,
+                                                  
dma_fence_get(cpu_job->base.done_fence));
+               if (ret)
+                       goto fail_unreserve;
+
+               v3d_push_job(&csd_job->base);
+
+               ret = drm_sched_job_add_dependency(&clean_job->base,
+                                                  
dma_fence_get(csd_job->base.done_fence));
+               if (ret)
+                       goto fail_unreserve;
+
+               v3d_push_job(clean_job);
+
+               break;
+       default:
+               break;
+       }
        mutex_unlock(&v3d->sched_lock);
 
+       out_se = (cpu_job->job_type == V3D_CPU_JOB_TYPE_INDIRECT_CSD) ? NULL : 
&se;
+
        v3d_attach_fences_and_unlock_reservation(file_priv,
                                                 &cpu_job->base,
                                                 &acquire_ctx, 0,
-                                                NULL, 
cpu_job->base.done_fence);
+                                                out_se, 
cpu_job->base.done_fence);
+
+       switch (cpu_job->job_type) {
+       case V3D_CPU_JOB_TYPE_INDIRECT_CSD:
+               v3d_attach_fences_and_unlock_reservation(file_priv,
+                                                        clean_job,
+                                                        
&cpu_job->indirect_csd.acquire_ctx,
+                                                        0, &se, 
clean_job->done_fence);
+               break;
+       default:
+               break;
+       }
 
        v3d_job_put(&cpu_job->base);
+       v3d_job_put(&csd_job->base);
+       v3d_job_put(clean_job);
 
        return 0;
 
+fail_unreserve:
+       mutex_unlock(&v3d->sched_lock);
+
+       drm_gem_unlock_reservations(cpu_job->base.bo, cpu_job->base.bo_count,
+                                   &acquire_ctx);
+
+       drm_gem_unlock_reservations(clean_job->bo, clean_job->bo_count,
+                                   &cpu_job->indirect_csd.acquire_ctx);
+
 fail:
        v3d_job_cleanup((void *)cpu_job);
+       v3d_job_cleanup((void *)csd_job);
+       v3d_job_cleanup(clean_job);
        v3d_put_multisync_post_deps(&se);
 
        return ret;
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index 00abef9d0db7..0c0f47782528 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -71,7 +71,8 @@ extern "C" {
 struct drm_v3d_extension {
        __u64 next;
        __u32 id;
-#define DRM_V3D_EXT_ID_MULTI_SYNC              0x01
+#define DRM_V3D_EXT_ID_MULTI_SYNC                      0x01
+#define DRM_V3D_EXT_ID_CPU_INDIRECT_CSD                0x02
        __u32 flags; /* mbz */
 };
 
@@ -365,8 +366,46 @@ struct drm_v3d_submit_csd {
        __u32 pad;
 };
 
+/**
+ * struct drm_v3d_indirect_csd - ioctl extension for the CPU job to create an
+ * indirect CSD
+ *
+ * When an extension of DRM_V3D_EXT_ID_CPU_INDIRECT_CSD id is defined, it
+ * points to this extension to define an indirect CSD submission. It creates a
+ * CPU job linked to a CSD job. The CPU job waits for the indirect CSD
+ * dependencies and, once they are signaled, it updates the CSD job config
+ * before allowing the CSD job execution.
+ */
+struct drm_v3d_indirect_csd {
+       struct drm_v3d_extension base;
+
+       /* Indirect CSD */
+       struct drm_v3d_submit_csd submit;
+
+       /* Handle of the indirect BO, that should be also attached to the
+        * indirect CSD.
+        */
+       __u32 indirect;
+
+       /* Offset within the BO where the workgroup counts are stored. Three
+        * consecutive 32-bit counts are read starting at this offset.
+        */
+       __u32 offset;
+
+       /* Workgroup size */
+       __u32 wg_size;
+
+       /* Indices of the uniforms with the workgroup dispatch counts
+        * in the uniform stream. If the uniform rewrite is not needed,
+        * the offset must be 0xffffffff.
+        */
+       __u32 wg_uniform_offsets[3];
+};
+
 struct drm_v3d_submit_cpu {
-       /* Pointer to a u32 array of the BOs that are referenced by the job. */
+       /* Pointer to a u32 array of the BOs that are referenced by the job.
+        *
+        * For DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, it must contain only one BO,
+        * that contains the workgroup counts.
+        */
        __u64 bo_handles;
 
        /* Number of BO handles passed in (size is that times 4). */
-- 
2.42.0

Reply via email to