Hi Tvrtko,

On 06/03/26 09:04, Tvrtko Ursulin wrote:

On 05/02/2026 21:31, Maíra Canal wrote:
Implement per-file descriptor seqno tracking using an xarray, allowing
userspace to wait on specific job completions via vc4_wait_seqno_ioctl.

While this interface should ideally be deprecated in favor of syncobjs,
it has long been exposed to userspace and therefore must continue to be
supported.

Lay the groundwork to replace the existing `finished_seqno` logic with
DMA fence-based tracking. Each allocated seqno is associated with the
job's done_fence in a per-fd xarray. This allows vc4_wait_seqno_ioctl()
to simply look up the corresponding fence and wait on it.

This changes seqno semantics from global to per-file descriptor.
However, this doesn't affect userspace because a client can only wait
on seqnos returned from its own submissions.

Why is this better ie. why is the complication of a per file seqnos worth it?

The old seqno logic relied on a global monotonic counter with the
invariant that finished_seqno >= N implies all jobs with seqno <= N are
complete.

With the DRM sched, job ordering is only guaranteed within a scheduler
entity, which is inside the `struct vc4_file` (per-fd). Jobs from
different fds can complete out of order, so a global seqno would break
that invariant. The problem is that this would break userspace
expectations. The IOCTL `vc4_wait_seqno_ioctl` relies on this invariant.

Considering that userspace sometimes uses `finished_seqno >= N` just
like a syncobj, using per-fd seqnos is the strategy I thought to avoid
breaking user-space, as the seqno monotonicity contract holds. Since
userspace can only wait on seqnos returned from its own submissions, the
change from global to per-fd scope doesn't affect behavior.



This design is inspired by the user fence handling in the Etnaviv and
msm drivers.

Signed-off-by: Maíra Canal <[email protected]>
---
  drivers/gpu/drm/vc4/vc4_drv.c    |  2 ++
  drivers/gpu/drm/vc4/vc4_drv.h    | 11 ++++++++
  drivers/gpu/drm/vc4/vc4_submit.c | 60 ++++++++++++++++++++++++++++++++++++++++
  3 files changed, 73 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index c18178f6c8cea0a182dc67c2b8a992d127c87fec..4d5760192ba090a58d1ec870c37c3459f2cac468 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -166,6 +166,7 @@ static int vc4_open(struct drm_device *dev, struct drm_file *file)
                        1, NULL);
      }
+    xa_init_flags(&vc4file->seqno_xa, XA_FLAGS_ALLOC1);
      vc4_perfmon_open_file(vc4file);
      file->driver_priv = vc4file;
      return 0;
@@ -186,6 +187,7 @@ static void vc4_close(struct drm_device *dev, struct drm_file *file)
      for (q = 0; q < VC4_MAX_QUEUES; q++)
          drm_sched_entity_destroy(&vc4file->sched_entity[q]);
+    xa_destroy(&vc4file->seqno_xa);
      vc4_perfmon_close_file(vc4file);
      kfree(vc4file);
  }
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index dc04803efad9fdf9179a98701b31b9e360e9ab15..ff6dc9aea52105359b72700ba367d390ddf84262 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -10,6 +10,7 @@
  #include <linux/of.h>
  #include <linux/refcount.h>
  #include <linux/uaccess.h>
+#include <linux/xarray.h>
  #include <drm/drm_atomic.h>
  #include <drm/drm_debugfs.h>
@@ -772,6 +773,10 @@ struct vc4_render_job {
       * Must remain allocated until the render job completes.
       */
      uint32_t bin_slots;
+
+    /* For userspace fence tracking. */
+    struct vc4_file *file;

Can jobs outlive clients?

Ack.


+    u32 seqno;
  };
  struct vc4_exec_info {
@@ -904,6 +909,12 @@ struct vc4_file {
      struct drm_sched_entity sched_entity[VC4_MAX_QUEUES];
+    /* Mapping of seqno to dma_fence for job completion tracking.
+     * Allows userspace to wait on specific submissions.
+     */
+    struct xarray seqno_xa;
+    u32 next_seqno;
+
      bool bin_bo_used;
  };
diff --git a/drivers/gpu/drm/vc4/vc4_submit.c b/drivers/gpu/drm/vc4/vc4_submit.c
index d6c684a14e6f9d9f7f456ad7fc985dbf631a7fb4..8d82fcb320a7ac16497c3f49e5106cf6b44ee706 100644
--- a/drivers/gpu/drm/vc4/vc4_submit.c
+++ b/drivers/gpu/drm/vc4/vc4_submit.c
@@ -203,6 +203,50 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
      return ret;
  }
+int
+vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
+             struct drm_file *file_priv)
+{
+    struct vc4_file *vc4_priv = file_priv->driver_priv;
+    struct vc4_dev *vc4 = to_vc4_dev(dev);
+    struct drm_vc4_wait_seqno *args = data;
+    unsigned long timeout_jiffies = nsecs_to_jiffies(args->timeout_ns);
+    unsigned long start = jiffies;
+    struct dma_fence *fence;
+    long ret;
+
+    if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
+        return -ENODEV;
+
+    rcu_read_lock();
+    fence = xa_load(&vc4_priv->seqno_xa, args->seqno);
+    if (fence)
+        fence = dma_fence_get_rcu(fence);

The RCU protection is for the drm_sched_fence? Comment maybe?

Yeah. While RCU guarantees the xarray entry won't be freed during the
lookup, it does not prevent the fence's refcount from being concurrently
dropped to zero from the irq context. I'll add a comment.

Best regards,
- Maíra


Regards,

Tvrtko

+    rcu_read_unlock();
+
+    if (!fence)
+        return 0;
+
+    trace_vc4_wait_for_seqno_begin(dev, args->seqno, args->timeout_ns);
+    ret = dma_fence_wait_timeout(fence, true, timeout_jiffies);
+    trace_vc4_wait_for_seqno_end(dev, args->seqno);
+
+    dma_fence_put(fence);
+
+    if (ret == -ERESTARTSYS) {
+        u64 delta = jiffies_to_nsecs(jiffies - start);
+
+        if (args->timeout_ns >= delta)
+            args->timeout_ns -= delta;
+        else
+            args->timeout_ns = 0;
+
+        return ret;
+    }
+
+    return ret > 0 ? 0 : -ETIME;
+}
+
  static void
  vc4_job_free(struct kref *ref)
  {
@@ -262,6 +306,9 @@ vc4_render_job_free(struct kref *ref)
      vc4->bin_alloc_used &= ~job->bin_slots;
      spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+    if (job->seqno)
+        xa_erase(&job->file->seqno_xa, job->seqno);
+
      vc4_job_free(ref);
  }
@@ -433,6 +480,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
          return ret;
      exec.render = render;
+    render->file = vc4_priv;
      INIT_LIST_HEAD(&render->unref_list);
      ret = vc4_lookup_bos(dev, file_priv, render, args->bo_handles,
@@ -488,10 +536,22 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
      vc4_push_job(&render->base);
      mutex_unlock(&vc4->sched_lock);
+    ret = xa_alloc_cyclic(&vc4_priv->seqno_xa, &render->seqno,
+                  render->base.done_fence,
+                  xa_limit_32b, &vc4_priv->next_seqno, GFP_KERNEL);
+    if (ret < 0) {
+        /* Jobs are already queued, just skip seqno tracking */
+        drm_err(dev, "Seqno allocation failed\n");
+        render->seqno = 0;
+    }
+
      vc4_attach_fences_and_unlock_reservation(file_priv, render,
                           &exec_ctx, args->out_sync,
                           render->base.done_fence);
+    /* Return the seqno for our job. */
+    args->seqno = render->seqno;
+
      vc4_job_put((void *)bin);
      vc4_job_put((void *)render);



Reply via email to