Hi Krzysztof,
On Fri, Jan 30, 2026 at 07:45:08PM +0100, Krzysztof Niemiec wrote:
> The i915_active selftests live_active_wait and live_active_retire
> operate on an i915_active attached to a mock, empty request, created as
> part of test setup. A fence is attached to this request to control when
> the request is processed. The tests then wait for the completion of the
> active with __i915_active_wait(), and the test is considered successful
> if this results in setting a variable in the active callback.
>
> However, the behavior of __i915_active_wait() is such that if the
> refcount for the active is 0, the function is almost completely skipped;
> waiting on a already completed active yields no effect. This includes a
> subsequent call to the retire() function of the active (which is the
> callback that the test is interested about, and which dictates whether
> its successful or not). So, if the active is completed before the
> aforementioned call to __i915_active_wait(), the test will fail.
>
> Most of the test runs in a single thread, including creating the
> request, creating the fence for it, signalling that fence, and calling
> __i915_active_wait(). However, the request itself is handled
> asynchronously. This creates a race condition where if the request is
> completed after signalling the fence, but before waiting on its active,
> the active callback will not be invoked, failing the test.
>
> Defer signalling the request's fence, to ensure the main test thread
> gets to call __i915_active_wait() before request completion.
>
> v4:
> - Lower the delay timeout to 50ms (Jonathan)
> - Put the check on work_finished inside a helper function (Jonathan)
>
> v3:
> - Embed the variables inside the live_active struct (Andi)
> - Move the schedule_delayed_work call closer to the wait (Andi)
> - Implement error handling in case an error state - the wait has
> finished, but the deferred work didn't run - is somehow achieved (Andi)
>
> v2:
> - Clarify the need for a fix a little more (Krzysztof K., Janusz)
>
> Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14808
> Signed-off-by: Krzysztof Niemiec <[email protected]>
I'm sorry, but for now this patch, for the reason I explained you
in a previous version, is a nack.
You are trying to bypass locking issues by adding a random 50ms
delay. It's too fragile and looks hackish.
In any case, I've seen a few issues below, in case someone else
agrees and is willing to merge it.
> ---
> drivers/gpu/drm/i915/selftests/i915_active.c | 51 +++++++++++++++++---
> 1 file changed, 45 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c
> b/drivers/gpu/drm/i915/selftests/i915_active.c
> index 36c3a5460221..eadd0a8bc094 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_active.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_active.c
> @@ -21,6 +21,10 @@ struct live_active {
> struct i915_active base;
> struct kref ref;
> bool retired;
> +
> + struct i915_sw_fence *submit;
> + struct delayed_work work;
> + bool work_finished;
> };
>
> static void __live_get(struct live_active *active)
> @@ -76,11 +80,37 @@ static struct live_active *__live_alloc(struct
> drm_i915_private *i915)
> return active;
> }
>
> +static void __live_submit_work_handler(struct work_struct *work)
I don't see the point for the '__' here.
> +{
> + struct delayed_work *d_work = container_of(work, struct delayed_work,
> work);
> + struct live_active *active = container_of(d_work, struct live_active,
> work);
> + i915_sw_fence_commit(active->submit);
> + heap_fence_put(active->submit);
> + active->work_finished = true;
> +}
> +
> +static int
> +__live_work_confirm_finished(struct drm_i915_private *i915,
> + struct live_active *active)
> +{
> + int err = 0;
> +
> + if (!active->work_finished) {
> + struct drm_printer p = drm_err_printer(&i915->drm, __func__);
> +
> + drm_printf(&p, "active->work hasn't finished, something went\
> + terribly wrong\n");
until 100 characters per line is fine, but I'm sure you can
reword to something better.
> + err = -EINVAL;
> + cancel_delayed_work_sync(&active->work);
> + }
> +
> + return err;
> +}
> +
> static struct live_active *
> __live_active_setup(struct drm_i915_private *i915)
> {
> struct intel_engine_cs *engine;
> - struct i915_sw_fence *submit;
> struct live_active *active;
> unsigned int count = 0;
> int err = 0;
> @@ -89,8 +119,11 @@ __live_active_setup(struct drm_i915_private *i915)
> if (!active)
> return ERR_PTR(-ENOMEM);
>
> - submit = heap_fence_create(GFP_KERNEL);
> - if (!submit) {
> + INIT_DELAYED_WORK(&active->work, __live_submit_work_handler);
> + active->work_finished = false;
> +
> + active->submit = heap_fence_create(GFP_KERNEL);
> + if (!active->submit) {
> kfree(active);
> return ERR_PTR(-ENOMEM);
> }
> @@ -109,7 +142,7 @@ __live_active_setup(struct drm_i915_private *i915)
> }
>
> err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> - submit,
> + active->submit,
> GFP_KERNEL);
> if (err >= 0)
> err = i915_active_add_request(&active->base, rq);
> @@ -134,8 +167,6 @@ __live_active_setup(struct drm_i915_private *i915)
> }
>
> out:
> - i915_sw_fence_commit(submit);
> - heap_fence_put(submit);
> if (err) {
> __live_put(active);
> active = ERR_PTR(err);
> @@ -156,6 +187,8 @@ static int live_active_wait(void *arg)
> if (IS_ERR(active))
> return PTR_ERR(active);
if we return with an error...
>
> + schedule_delayed_work(&active->work, msecs_to_jiffies(50));
... we don't schedule the work and therefore you leak the fence.
> +
> __i915_active_wait(&active->base, TASK_UNINTERRUPTIBLE);
> if (!READ_ONCE(active->retired)) {
> struct drm_printer p = drm_err_printer(&i915->drm, __func__);
> @@ -166,6 +199,8 @@ static int live_active_wait(void *arg)
> err = -EINVAL;
> }
>
> + err = __live_work_confirm_finished(i915, active);
Here you are overwriting err.
> +
> __live_put(active);
>
> if (igt_flush_test(i915))
> @@ -186,6 +221,8 @@ static int live_active_retire(void *arg)
> if (IS_ERR(active))
> return PTR_ERR(active);
>
> + schedule_delayed_work(&active->work, msecs_to_jiffies(50));
> +
> /* waits for & retires all requests */
> if (igt_flush_test(i915))
> err = -EIO;
> @@ -199,6 +236,8 @@ static int live_active_retire(void *arg)
> err = -EINVAL;
> }
>
> + err = __live_work_confirm_finished(i915, active);
here as well.
Andi
> +
> __live_put(active);
>
> return err;
> --
> 2.45.2
>