On 20/01/2026 20:52, Marco Pagani wrote:
Add a new test suite to simulate concurrent job submissions from userspace.
The suite includes a basic test case where each worker submits a single
job, and a more advanced case involving the submission of multiple jobs.

New test coverage is welcome!

But as Philipp has said, some more context would be beneficial. For example, are you trying to hit a bug, or extend this later with something which will hit a bug, after which you will propose improvements? Or are you simply improving the coverage?

If it is about some race, I would maybe consider putting this into a new tests_races.c. I actually have this file locally with some unfinished test cases already, although it is unclear when I will be happy enough with them to post. But if the test is simply about adding coverage, it is fine for it to live in tests_basic.c.

Signed-off-by: Marco Pagani <[email protected]>
---
  drivers/gpu/drm/scheduler/tests/tests_basic.c | 175 ++++++++++++++++++
  1 file changed, 175 insertions(+)

diff --git a/drivers/gpu/drm/scheduler/tests/tests_basic.c 
b/drivers/gpu/drm/scheduler/tests/tests_basic.c
index 82a41a456b0a..7c25bcbbe7c9 100644
--- a/drivers/gpu/drm/scheduler/tests/tests_basic.c
+++ b/drivers/gpu/drm/scheduler/tests/tests_basic.c
@@ -2,6 +2,7 @@
  /* Copyright (c) 2025 Valve Corporation */
#include <linux/delay.h>
+#include <linux/completion.h>
#include "sched_tests.h" @@ -235,6 +236,179 @@ static void drm_sched_basic_cancel(struct kunit *test)
        KUNIT_ASSERT_EQ(test, job->hw_fence.error, -ECANCELED);
  }
+struct sched_concurrent_test_context {
+       struct drm_mock_scheduler *sched;
+       struct workqueue_struct *sub_wq;
+       struct completion wait_go;
+};
+
+KUNIT_DEFINE_ACTION_WRAPPER(destroy_workqueue_wrap, destroy_workqueue,
+                           struct workqueue_struct *);
+
+KUNIT_DEFINE_ACTION_WRAPPER(drm_mock_sched_fini_wrap, drm_mock_sched_fini,
+                           struct drm_mock_scheduler *);
+
+KUNIT_DEFINE_ACTION_WRAPPER(drm_mock_sched_entity_free_wrap, 
drm_mock_sched_entity_free,
+                           struct drm_mock_sched_entity *);
+
+static int drm_sched_concurrent_init(struct kunit *test)
+{
+       struct sched_concurrent_test_context *ctx;
+       int ret;
+
+       ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL);
+
+       init_completion(&ctx->wait_go);
+
+       ctx->sched = drm_mock_sched_new(test, MAX_SCHEDULE_TIMEOUT);
+
+       ret = kunit_add_action_or_reset(test, drm_mock_sched_fini_wrap, 
ctx->sched);
+       KUNIT_ASSERT_EQ(test, ret, 0);
+
+       /* Use an unbounded workqueue to maximize job submission concurrency */
+       ctx->sub_wq = alloc_workqueue("drm-sched-submitters-wq", WQ_UNBOUND,
+                                     WQ_UNBOUND_MAX_ACTIVE);
+       KUNIT_ASSERT_NOT_NULL(test, ctx->sub_wq);
+
+       ret = kunit_add_action_or_reset(test, destroy_workqueue_wrap, 
ctx->sub_wq);
+       KUNIT_ASSERT_EQ(test, ret, 0);
+
+       test->priv = ctx;
+
+       return 0;
+}
+
+struct drm_sched_concurrent_params {
+       const char *description;
+       unsigned int job_base_us;
+       unsigned int num_jobs;
+       unsigned int num_subs;
+};
+
+static const struct drm_sched_concurrent_params drm_sched_concurrent_cases[] = 
{
+       {
+               .description = "Concurrently submit a single job in a single 
entity",
+               .job_base_us = 1000,
+               .num_jobs = 1,
+               .num_subs = 32,
+       },

Why is submission from a single thread interesting if it is already covered?

+       {
+               .description = "Concurrently submit multiple jobs in a single 
entity",
+               .job_base_us = 1000,
+               .num_jobs = 10,
+               .num_subs = 64,
+       },
+};
+
+static void
+drm_sched_concurrent_desc(const struct drm_sched_concurrent_params *params, 
char *desc)
+{
+       strscpy(desc, params->description, KUNIT_PARAM_DESC_SIZE);
+}
+
+KUNIT_ARRAY_PARAM(drm_sched_concurrent, drm_sched_concurrent_cases, 
drm_sched_concurrent_desc);
+
+struct submitter_data {
+       struct work_struct work;
+       struct sched_concurrent_test_context *ctx;
+       struct drm_mock_sched_entity *entity;
+       struct drm_mock_sched_job **jobs;
+       struct kunit *test;
+       unsigned int id;
+       bool timedout;
+};
+
+static void drm_sched_submitter_worker(struct work_struct *work)
+{
+       const struct drm_sched_concurrent_params *params;
+       struct sched_concurrent_test_context *ctx;
+       struct submitter_data *sub_data;
+       unsigned int i, duration_us;
+       unsigned long timeout_jiffies;
+       bool done;
+
+       sub_data = container_of(work, struct submitter_data, work);
+       ctx = sub_data->ctx;
+       params = sub_data->test->param_value;
+
+       wait_for_completion(&ctx->wait_go);
+
+       for (i = 0; i < params->num_jobs; i++) {
+               duration_us = params->job_base_us + (sub_data->id * 10);

Why is the job duration dependent on the submitter id?

Would it be feasible to not use auto-completing jobs and instead advance the timeline manually? Given how the premise of the test seems to be about concurrent submission, it sounds plausible that what happens after submission maybe isn't very relevant.

+               drm_mock_sched_job_set_duration_us(sub_data->jobs[i], 
duration_us);
+               drm_mock_sched_job_submit(sub_data->jobs[i]);

On a related note, one interesting thing to add coverage for later is multi-threaded submit of multiple jobs against a single entity. But it is not an immediate need. Just mentioning it as something interesting.

It would mean open coding drm_mock_sched_job_submit() as drm_sched_job_arm() and drm_sched_entity_push_job(), and sticking some delay in between so two threads have the chance to interleave. The mock scheduler does not handle it today, and neither does the scheduler itself, which punts responsibility to callers. So adding a test and making the mock scheduler handle that properly would serve as an example of how the scheduler must be used. Or of what can go bad if it isn't.

+       }
+
+       timeout_jiffies = usecs_to_jiffies(params->job_base_us * 
params->num_subs *
+                                          params->num_jobs * 10);

The timeout calculation could use a comment. You are using num_subs * 10 to match the duration_us above being id * 10, with the logic of calculating a pessimistic timeout?

Have you tried it with qemu to check if it is pessimistic enough?

+       for (i = 0; i < params->num_jobs; i++) {
+               done = drm_mock_sched_job_wait_finished(sub_data->jobs[i],
+                                                       timeout_jiffies);
+               if (!done)
+                       sub_data->timedout = true;
+       }

Technically you only need to wait on the last job but it is passable like this too.

Also, is it important for the worker to wait for completion, or could the main thread simply wait for everything? Maybe that would simplify things.

Manual timeline advance and this combined would mean the workers only submit jobs, while the main thread simply does drm_mock_sched_advance(sched, num_subs * num_jobs) and waits for last job from each submitter to finish.

Again, auto-completion and timeout reporting are something I do not immediately see as relevant for multi-threaded submission testing.

Maybe if you want to test the mock scheduler itself it could be, but then I would add it as a separate entry in drm_sched_concurrent_cases[]. For example, maybe have a flag/boolean "auto-complete jobs", so there is one case without it and one case with it set.

Other than that it looks tidy and was easy to follow. The only thing which slightly got me was the term "subs", since I don't intuitively associate it with a submitter but, well, with a sub-entity of some kind. It might be worth renaming that to submitter(s), or in some cases even dropping the prefix might be feasible (like sub(s)_data).

Regards,

Tvrtko

+}
+
+static void drm_sched_concurrent_submit_test(struct kunit *test)
+{
+       struct sched_concurrent_test_context *ctx = test->priv;
+       const struct drm_sched_concurrent_params *params = test->param_value;
+       struct submitter_data *subs_data;
+       unsigned int i, j;
+       int ret;
+
+       subs_data = kunit_kcalloc(test, params->num_subs, sizeof(*subs_data),
+                                 GFP_KERNEL);
+       KUNIT_ASSERT_NOT_NULL(test, subs_data);
+
+       /*
+        * Pre-allocate entities and jobs in the main thread to avoid KUnit
+        * assertions in submitters threads
+        */
+       for (i = 0; i < params->num_subs; i++) {
+               subs_data[i].id = i;
+               subs_data[i].ctx = ctx;
+               subs_data[i].test = test;
+               subs_data[i].timedout = false;
+               subs_data[i].entity = drm_mock_sched_entity_new(test,
+                                                               
DRM_SCHED_PRIORITY_NORMAL,
+                                                               ctx->sched);
+
+               ret = kunit_add_action_or_reset(test, 
drm_mock_sched_entity_free_wrap,
+                                               subs_data[i].entity);
+               KUNIT_ASSERT_EQ(test, ret, 0);
+
+               subs_data[i].jobs = kunit_kcalloc(test, params->num_jobs,
+                                                 sizeof(*subs_data[i].jobs), 
GFP_KERNEL);
+               KUNIT_ASSERT_NOT_NULL(test, subs_data[i].jobs);
+
+               for (j = 0; j < params->num_jobs; j++)
+                       subs_data[i].jobs[j] = drm_mock_sched_job_new(test,
+                                                                     
subs_data[i].entity);
+
+               INIT_WORK(&subs_data[i].work, drm_sched_submitter_worker);
+               queue_work(ctx->sub_wq, &subs_data[i].work);
+       }
+
+       complete_all(&ctx->wait_go);
+       flush_workqueue(ctx->sub_wq);
+
+       for (i = 0; i < params->num_subs; i++)
+               KUNIT_ASSERT_FALSE_MSG(test, subs_data[i].timedout,
+                                      "Job submitter worker %u timedout", i);
+}
+
+static struct kunit_case drm_sched_concurrent_tests[] = {
+       KUNIT_CASE_PARAM(drm_sched_concurrent_submit_test, 
drm_sched_concurrent_gen_params),
+       {}
+};
+
+static struct kunit_suite drm_sched_concurrent = {
+       .name = "drm_sched_concurrent_tests",
+       .init = drm_sched_concurrent_init,
+       .test_cases = drm_sched_concurrent_tests,
+};
+
  static struct kunit_case drm_sched_cancel_tests[] = {
        KUNIT_CASE(drm_sched_basic_cancel),
        {}
@@ -556,6 +730,7 @@ static struct kunit_suite drm_sched_credits = {
  };
kunit_test_suites(&drm_sched_basic,
+                 &drm_sched_concurrent,
                  &drm_sched_timeout,
                  &drm_sched_cancel,
                  &drm_sched_priority,

Reply via email to