Re: [Intel-gfx] [PATCH 1/6] drm/i915: Individual request cancellation
On 22/03/2021 15:38, Matthew Auld wrote: On Thu, 18 Mar 2021 at 17:04, Tvrtko Ursulin wrote: From: Chris Wilson Currently, we cancel outstanding requests within a context when the context is closed. We may also want to cancel individual requests using the same graceful preemption mechanism. v2 (Tvrtko): * Cancel waiters carefully considering no timeline lock and RCU. * Fixed selftests. v3 (Tvrtko): * Remove error propagation to waiters for now. Signed-off-by: Chris Wilson Signed-off-by: Tvrtko Ursulin --- .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 1 + .../drm/i915/gt/intel_execlists_submission.c | 9 +- drivers/gpu/drm/i915/i915_request.c | 52 - drivers/gpu/drm/i915/i915_request.h | 4 +- drivers/gpu/drm/i915/selftests/i915_request.c | 201 ++ 5 files changed, 261 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 0b062fad1837..e2fb3ae2aaf3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -314,6 +314,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine) mutex_unlock(>timeline->mutex); } + intel_engine_flush_scheduler(engine); intel_engine_pm_put(engine); return err; } diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 85ff5fe861b4..4c2acb5a6c0a 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -421,6 +421,11 @@ static void reset_active(struct i915_request *rq, ce->lrc.lrca = lrc_update_regs(ce, engine, head); } +static bool bad_request(const struct i915_request *rq) +{ + return rq->fence.error && i915_request_started(rq); +} + static struct intel_engine_cs * __execlists_schedule_in(struct i915_request *rq) { @@ -433,7 +438,7 @@ __execlists_schedule_in(struct i915_request *rq) !intel_engine_has_heartbeat(engine))) intel_context_set_banned(ce); - if (unlikely(intel_context_is_banned(ce))) + if (unlikely(intel_context_is_banned(ce) || bad_request(rq))) reset_active(rq, engine); if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) @@ -1112,7 +1117,7 @@ static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, return 0; /* Force a fast reset for terminated contexts (ignoring sysfs!) */ - if (unlikely(intel_context_is_banned(rq->context))) + if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq))) return 1; return READ_ONCE(engine->props.preempt_timeout_ms); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index e7b4c4bc41a6..b4511ac05e9a 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -33,7 +33,10 @@ #include "gem/i915_gem_context.h" #include "gt/intel_breadcrumbs.h" #include "gt/intel_context.h" +#include "gt/intel_engine.h" +#include "gt/intel_engine_heartbeat.h" #include "gt/intel_gpu_commands.h" +#include "gt/intel_reset.h" #include "gt/intel_ring.h" #include "gt/intel_rps.h" @@ -429,20 +432,22 @@ void __i915_request_skip(struct i915_request *rq) rq->infix = rq->postfix; } -void i915_request_set_error_once(struct i915_request *rq, int error) +bool i915_request_set_error_once(struct i915_request *rq, int error) { int old; GEM_BUG_ON(!IS_ERR_VALUE((long)error)); if (i915_request_signaled(rq)) - return; + return false; old = READ_ONCE(rq->fence.error); do { if (fatal_error(old)) - return; + return false; } while (!try_cmpxchg(>fence.error, , error)); + + return true; } struct i915_request *i915_request_mark_eio(struct i915_request *rq) @@ -609,6 +614,47 @@ void i915_request_unsubmit(struct i915_request *request) spin_unlock_irqrestore(>lock, flags); } +static struct intel_engine_cs *active_engine(struct i915_request *rq) +{ + struct intel_engine_cs *engine, *locked; + + locked = READ_ONCE(rq->engine); + spin_lock_irq(>sched.lock); + while (unlikely(locked != (engine = READ_ONCE(rq->engine { + spin_unlock(>sched.lock); + locked = engine; + spin_lock(>sched.lock); + } + + engine = NULL; + if (i915_request_is_active(rq) && !__i915_request_is_complete(rq)) + engine = locked; + + spin_unlock_irq(>sched.lock); + + return engine; Bad idea to reuse __active_engine() somehow? I can try and see how it ends up looking. Reviewed-by: Matthew Auld Thanks, Tvrtko
Re: [Intel-gfx] [PATCH 1/6] drm/i915: Individual request cancellation
On Thu, 18 Mar 2021 at 17:04, Tvrtko Ursulin wrote: > > From: Chris Wilson > > Currently, we cancel outstanding requests within a context when the > context is closed. We may also want to cancel individual requests using > the same graceful preemption mechanism. > > v2 (Tvrtko): > * Cancel waiters carefully considering no timeline lock and RCU. > * Fixed selftests. > > v3 (Tvrtko): > * Remove error propagation to waiters for now. > > Signed-off-by: Chris Wilson > Signed-off-by: Tvrtko Ursulin > --- > .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 1 + > .../drm/i915/gt/intel_execlists_submission.c | 9 +- > drivers/gpu/drm/i915/i915_request.c | 52 - > drivers/gpu/drm/i915/i915_request.h | 4 +- > drivers/gpu/drm/i915/selftests/i915_request.c | 201 ++ > 5 files changed, 261 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > index 0b062fad1837..e2fb3ae2aaf3 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c > @@ -314,6 +314,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine) > mutex_unlock(>timeline->mutex); > } > > + intel_engine_flush_scheduler(engine); > intel_engine_pm_put(engine); > return err; > } > diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c > b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c > index 85ff5fe861b4..4c2acb5a6c0a 100644 > --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c > +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c > @@ -421,6 +421,11 @@ static void reset_active(struct i915_request *rq, > ce->lrc.lrca = lrc_update_regs(ce, engine, head); > } > > +static bool bad_request(const struct i915_request *rq) > +{ > + return rq->fence.error && i915_request_started(rq); > +} > + > static struct intel_engine_cs * > __execlists_schedule_in(struct i915_request *rq) > { > @@ -433,7 +438,7 @@ __execlists_schedule_in(struct i915_request *rq) > !intel_engine_has_heartbeat(engine))) > intel_context_set_banned(ce); > > - if (unlikely(intel_context_is_banned(ce))) > + if (unlikely(intel_context_is_banned(ce) || bad_request(rq))) > reset_active(rq, engine); > > if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) > @@ -1112,7 +1117,7 @@ static unsigned long active_preempt_timeout(struct > intel_engine_cs *engine, > return 0; > > /* Force a fast reset for terminated contexts (ignoring sysfs!) */ > - if (unlikely(intel_context_is_banned(rq->context))) > + if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq))) > return 1; > > return READ_ONCE(engine->props.preempt_timeout_ms); > diff --git a/drivers/gpu/drm/i915/i915_request.c > b/drivers/gpu/drm/i915/i915_request.c > index e7b4c4bc41a6..b4511ac05e9a 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -33,7 +33,10 @@ > #include "gem/i915_gem_context.h" > #include "gt/intel_breadcrumbs.h" > #include "gt/intel_context.h" > +#include "gt/intel_engine.h" > +#include "gt/intel_engine_heartbeat.h" > #include "gt/intel_gpu_commands.h" > +#include "gt/intel_reset.h" > #include "gt/intel_ring.h" > #include "gt/intel_rps.h" > > @@ -429,20 +432,22 @@ void __i915_request_skip(struct i915_request *rq) > rq->infix = rq->postfix; > } > > -void i915_request_set_error_once(struct i915_request *rq, int error) > +bool i915_request_set_error_once(struct i915_request *rq, int error) > { > int old; > > GEM_BUG_ON(!IS_ERR_VALUE((long)error)); > > if (i915_request_signaled(rq)) > - return; > + return false; > > old = READ_ONCE(rq->fence.error); > do { > if (fatal_error(old)) > - return; > + return false; > } while (!try_cmpxchg(>fence.error, , error)); > + > + return true; > } > > struct i915_request *i915_request_mark_eio(struct i915_request *rq) > @@ -609,6 +614,47 @@ void i915_request_unsubmit(struct i915_request *request) > spin_unlock_irqrestore(>lock, flags); > } > > +static struct intel_engine_cs *active_engine(struct i915_request *rq) > +{ > + struct intel_engine_cs *engine, *locked; > + > + locked = READ_ONCE(rq->engine); > + spin_lock_irq(>sched.lock); > + while (unlikely(locked != (engine = READ_ONCE(rq->engine { > + spin_unlock(>sched.lock); > + locked = engine; > + spin_lock(>sched.lock); > + } > + > + engine = NULL; > + if (i915_request_is_active(rq) && !__i915_request_is_complete(rq)) > + engine = locked; > + > + spin_unlock_irq(>sched.lock); > + > + return