Re: [Intel-gfx] [PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

2022-01-26 Thread Matthew Brost
On Wed, Jan 26, 2022 at 11:03:24AM -0800, John Harrison wrote:
> On 1/24/2022 07:01, Matthew Brost wrote:
> > More than 1 request can be submitted to a single ELSP at a time if
> > multiple requests are ready run to on the same context. When a request
> > is canceled it is marked bad, an idle pulse is triggered to the engine
> > (high priority kernel request), the execlists scheduler sees that
> > running request is bad and sets preemption timeout to minimum value (1
> > ms). This fails to work if multiple requests are combined on the ELSP as
> > only the most recent request is stored in the execlists schedule (the
> > request stored in the ELSP isn't marked bad, thus preemption timeout
> > isn't set to the minimum value). If the preempt timeout is configured to
> > zero, the engine is permanently hung. This is shown by an upcoming
> > selftest.
> > 
> > To work around this, mark the idle pulse with a flag to force a preempt
> > with the minimum value.
> > 
> > Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
> > Signed-off-by: Matthew Brost 
> > ---
> >   .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++
> >   .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
> >   .../drm/i915/gt/intel_execlists_submission.c  | 18 ++-
> >   drivers/gpu/drm/i915/i915_request.h   |  6 +
> >   4 files changed, 38 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
> > b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > index a3698f611f457..efd1c719b4072 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > @@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs 
> > *engine)
> > INIT_DELAYED_WORK(>heartbeat.work, heartbeat);
> >   }
> > -static int __intel_engine_pulse(struct intel_engine_cs *engine)
> > +static int __intel_engine_pulse(struct intel_engine_cs *engine,
> > +   bool force_preempt)
> >   {
> > struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
> > struct intel_context *ce = engine->kernel_context;
> > @@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs 
> > *engine)
> > return PTR_ERR(rq);
> > __set_bit(I915_FENCE_FLAG_SENTINEL, >fence.flags);
> > +   if (force_preempt)
> > +   __set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, >fence.flags);
> > heartbeat_commit(rq, );
> > GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
> > @@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
> > *engine,
> > /* recheck current execution */
> > if (intel_engine_has_preemption(engine)) {
> > -   err = __intel_engine_pulse(engine);
> > +   err = __intel_engine_pulse(engine, false);
> > if (err)
> > set_heartbeat(engine, saved);
> > }
> > @@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
> > *engine,
> > return err;
> >   }
> > -int intel_engine_pulse(struct intel_engine_cs *engine)
> > +static int _intel_engine_pulse(struct intel_engine_cs *engine,
> > +  bool force_preempt)
> >   {
> > struct intel_context *ce = engine->kernel_context;
> > int err;
> > @@ -325,7 +329,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
> > err = -EINTR;
> > if (!mutex_lock_interruptible(>timeline->mutex)) {
> > -   err = __intel_engine_pulse(engine);
> > +   err = __intel_engine_pulse(engine, force_preempt);
> > mutex_unlock(>timeline->mutex);
> > }
> > @@ -334,6 +338,17 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
> > return err;
> >   }
> > +int intel_engine_pulse(struct intel_engine_cs *engine)
> > +{
> > +   return _intel_engine_pulse(engine, false);
> > +}
> > +
> > +
> > +int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine)
> > +{
> > +   return _intel_engine_pulse(engine, true);
> > +}
> > +
> >   int intel_engine_flush_barriers(struct intel_engine_cs *engine)
> >   {
> > struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h 
> > b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> > index 5da6d809a87a2..d9c8386754cb3 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> > @@ -21,6 +21,7 @@ void intel_gt_park_heartbeats(struct intel_gt *gt);
> >   void intel_gt_unpark_heartbeats(struct intel_gt *gt);
> >   int intel_engine_pulse(struct intel_engine_cs *engine);
> > +int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine);
> >   int intel_engine_flush_barriers(struct intel_engine_cs *engine);
> >   #endif /* INTEL_ENGINE_HEARTBEAT_H */
> > diff --git 

Re: [Intel-gfx] [PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

2022-01-26 Thread John Harrison

On 1/24/2022 07:01, Matthew Brost wrote:

More than 1 request can be submitted to a single ELSP at a time if
multiple requests are ready to run on the same context. When a request
is canceled it is marked bad, an idle pulse is triggered to the engine
(high priority kernel request), the execlists scheduler sees that
running request is bad and sets preemption timeout to minimum value (1
ms). This fails to work if multiple requests are combined on the ELSP as
only the most recent request is stored in the execlists schedule (the
request stored in the ELSP isn't marked bad, thus preemption timeout
isn't set to the minimum value). If the preempt timeout is configured to
zero, the engine is permanently hung. This is shown by an upcoming
selftest.

To work around this, mark the idle pulse with a flag to force a preempt
with the minimum value.

Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
Signed-off-by: Matthew Brost 
---
  .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++
  .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
  .../drm/i915/gt/intel_execlists_submission.c  | 18 ++-
  drivers/gpu/drm/i915/i915_request.h   |  6 +
  4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index a3698f611f457..efd1c719b4072 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs 
*engine)
INIT_DELAYED_WORK(>heartbeat.work, heartbeat);
  }
  
-static int __intel_engine_pulse(struct intel_engine_cs *engine)

+static int __intel_engine_pulse(struct intel_engine_cs *engine,
+   bool force_preempt)
  {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
struct intel_context *ce = engine->kernel_context;
@@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs 
*engine)
return PTR_ERR(rq);
  
  	__set_bit(I915_FENCE_FLAG_SENTINEL, >fence.flags);

+   if (force_preempt)
+   __set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, >fence.flags);
  
  	heartbeat_commit(rq, );

GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
@@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
  
  		/* recheck current execution */

if (intel_engine_has_preemption(engine)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, false);
if (err)
set_heartbeat(engine, saved);
}
@@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
return err;
  }
  
-int intel_engine_pulse(struct intel_engine_cs *engine)

+static int _intel_engine_pulse(struct intel_engine_cs *engine,
+  bool force_preempt)
  {
struct intel_context *ce = engine->kernel_context;
int err;
@@ -325,7 +329,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
  
  	err = -EINTR;

if (!mutex_lock_interruptible(>timeline->mutex)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, force_preempt);
mutex_unlock(>timeline->mutex);
}
  
@@ -334,6 +338,17 @@ int intel_engine_pulse(struct intel_engine_cs *engine)

return err;
  }
  
+int intel_engine_pulse(struct intel_engine_cs *engine)

+{
+   return _intel_engine_pulse(engine, false);
+}
+
+
+int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine)
+{
+   return _intel_engine_pulse(engine, true);
+}
+
  int intel_engine_flush_barriers(struct intel_engine_cs *engine)
  {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
index 5da6d809a87a2..d9c8386754cb3 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -21,6 +21,7 @@ void intel_gt_park_heartbeats(struct intel_gt *gt);
  void intel_gt_unpark_heartbeats(struct intel_gt *gt);
  
  int intel_engine_pulse(struct intel_engine_cs *engine);

+int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine);
  int intel_engine_flush_barriers(struct intel_engine_cs *engine);
  
  #endif /* INTEL_ENGINE_HEARTBEAT_H */

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 960a9aaf4f3a3..f0c2024058731 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1222,26 +1222,29 @@ static void record_preemption(struct 
intel_engine_execlists 

Re: [Intel-gfx] [PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

2022-01-26 Thread Tvrtko Ursulin



On 25/01/2022 16:32, Matthew Brost wrote:

On Tue, Jan 25, 2022 at 03:27:31PM +0000, Tvrtko Ursulin wrote:


On 24/01/2022 15:01, Matthew Brost wrote:

More than 1 request can be submitted to a single ELSP at a time if
multiple requests are ready to run on the same context. When a request
is canceled it is marked bad, an idle pulse is triggered to the engine
(high priority kernel request), the execlists scheduler sees that
running request is bad and sets preemption timeout to minimum value (1
ms). This fails to work if multiple requests are combined on the ELSP as
only the most recent request is stored in the execlists schedule (the
request stored in the ELSP isn't marked bad, thus preemption timeout
isn't set to the minimum value). If the preempt timeout is configured to
zero, the engine is permanently hung. This is shown by an upcoming
selftest.

To work around this, mark the idle pulse with a flag to force a preempt
with the minimum value.


A couple of quick questions first before I find time to dig deeper.

First about the "permanently hung" statement. How permanent? Does the
heartbeat eventually resolve it and if not why not? Naive view is that
missed heartbeats would identify the stuck non-preemptible request and then
engine reset would skip over it.



Yes, if the heartbeat is enabled then the heartbeat would eventually
recover the engine. It is not always enabled though...


If it does resolve, then the problem is only that request timeout works less
well if someone set preempt timeout to zero? Which may not be as bad, since
request timeout was never about any time guarantees.



Yes, if the heartbeat is enabled the problem isn't as bad.


Good, so commit message needs some work to be accurate.

On the technical side, the patch looks reasonable to me, and the idea of
making the cancellation pulse special also sounds plausible. The question
is whether we want to add code to support this, considering the open
questions I have:


1)
Do we care about request cancellation working for non-preemptible 
batches, *if* someone explicitly disabled both preemption timeout and 
heartbeats?


2)
Do we care to improve the responsiveness of request cancellation if only 
preempt timeout was disabled?


Conclusions here will also dictate whether Fixes: tag is warranted. Best 
to avoid hairy backports if we decide it is not really needed.


Also, in the next patch you change one selftest to only run with preempt 
timeout disabled. I think it makes sense to have this test run in the 
default config (preempt timeout enabled) to reflect the typical 
configuration. You may add a second pass with it disabled to exercise the
corner case, again, depending on conclusions after above questions.


Regards,

Tvrtko



Matt


Regards,

Tvrtko



Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
Signed-off-by: Matthew Brost 
---
   .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++
   .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
   .../drm/i915/gt/intel_execlists_submission.c  | 18 ++-
   drivers/gpu/drm/i915/i915_request.h   |  6 +
   4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index a3698f611f457..efd1c719b4072 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs 
*engine)
INIT_DELAYED_WORK(>heartbeat.work, heartbeat);
   }
-static int __intel_engine_pulse(struct intel_engine_cs *engine)
+static int __intel_engine_pulse(struct intel_engine_cs *engine,
+   bool force_preempt)
   {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
struct intel_context *ce = engine->kernel_context;
@@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs 
*engine)
return PTR_ERR(rq);
__set_bit(I915_FENCE_FLAG_SENTINEL, >fence.flags);
+   if (force_preempt)
+   __set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, >fence.flags);
heartbeat_commit(rq, );
GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
@@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
/* recheck current execution */
if (intel_engine_has_preemption(engine)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, false);
if (err)
set_heartbeat(engine, saved);
}
@@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
return err;
   }
-int intel_engine_pulse(struct intel_engine_cs *engine)
+static int _intel_engine_pulse(struct intel_engine_cs *engine,
+  bool 

Re: [Intel-gfx] [PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

2022-01-25 Thread Matthew Brost
On Tue, Jan 25, 2022 at 03:27:31PM +0000, Tvrtko Ursulin wrote:
> 
> On 24/01/2022 15:01, Matthew Brost wrote:
> > More than 1 request can be submitted to a single ELSP at a time if
> > multiple requests are ready run to on the same context. When a request
> > is canceled it is marked bad, an idle pulse is triggered to the engine
> > (high priority kernel request), the execlists scheduler sees that
> > running request is bad and sets preemption timeout to minimum value (1
> > ms). This fails to work if multiple requests are combined on the ELSP as
> > only the most recent request is stored in the execlists schedule (the
> > request stored in the ELSP isn't marked bad, thus preemption timeout
> > isn't set to the minimum value). If the preempt timeout is configured to
> > zero, the engine is permanently hung. This is shown by an upcoming
> > selftest.
> > 
> > To work around this, mark the idle pulse with a flag to force a preempt
> > with the minimum value.
> 
> A couple of quick questions first before I find time to dig deeper.
> 
> First about the "permanently hung" statement. How permanent? Does the
> heartbeat eventually resolve it and if not why not? Naive view is that
> missed heartbeats would identify the stuck non-preemptible request and then
> engine reset would skip over it.
> 

Yes, if the heartbeat is enabled then the heartbeat would eventually
recover the engine. It is not always enabled though...

> If it does resolve, then the problem is only that request timeout works less
> well if someone set preempt timeout to zero? Which may not be as bad, since
> request timeout was never about any time guarantees.
>

Yes, if the heartbeat is enabled the problem isn't as bad.

Matt

> Regards,
> 
> Tvrtko
> 
> > 
> > Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
> > Signed-off-by: Matthew Brost 
> > ---
> >   .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++
> >   .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
> >   .../drm/i915/gt/intel_execlists_submission.c  | 18 ++-
> >   drivers/gpu/drm/i915/i915_request.h   |  6 +
> >   4 files changed, 38 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
> > b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > index a3698f611f457..efd1c719b4072 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > @@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs 
> > *engine)
> > INIT_DELAYED_WORK(>heartbeat.work, heartbeat);
> >   }
> > -static int __intel_engine_pulse(struct intel_engine_cs *engine)
> > +static int __intel_engine_pulse(struct intel_engine_cs *engine,
> > +   bool force_preempt)
> >   {
> > struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
> > struct intel_context *ce = engine->kernel_context;
> > @@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs 
> > *engine)
> > return PTR_ERR(rq);
> > __set_bit(I915_FENCE_FLAG_SENTINEL, >fence.flags);
> > +   if (force_preempt)
> > +   __set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, >fence.flags);
> > heartbeat_commit(rq, );
> > GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
> > @@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
> > *engine,
> > /* recheck current execution */
> > if (intel_engine_has_preemption(engine)) {
> > -   err = __intel_engine_pulse(engine);
> > +   err = __intel_engine_pulse(engine, false);
> > if (err)
> > set_heartbeat(engine, saved);
> > }
> > @@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
> > *engine,
> > return err;
> >   }
> > -int intel_engine_pulse(struct intel_engine_cs *engine)
> > +static int _intel_engine_pulse(struct intel_engine_cs *engine,
> > +  bool force_preempt)
> >   {
> > struct intel_context *ce = engine->kernel_context;
> > int err;
> > @@ -325,7 +329,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
> > err = -EINTR;
> > if (!mutex_lock_interruptible(>timeline->mutex)) {
> > -   err = __intel_engine_pulse(engine);
> > +   err = __intel_engine_pulse(engine, force_preempt);
> > mutex_unlock(>timeline->mutex);
> > }
> > @@ -334,6 +338,17 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
> > return err;
> >   }
> > +int intel_engine_pulse(struct intel_engine_cs *engine)
> > +{
> > +   return _intel_engine_pulse(engine, false);
> > +}
> > +
> > +
> > +int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine)
> > +{
> > +   return _intel_engine_pulse(engine, true);
> > +}
> > +
> >   int intel_engine_flush_barriers(struct intel_engine_cs *engine)
> >   {
> > 

Re: [Intel-gfx] [PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

2022-01-25 Thread Tvrtko Ursulin



On 24/01/2022 15:01, Matthew Brost wrote:

More than 1 request can be submitted to a single ELSP at a time if
multiple requests are ready to run on the same context. When a request
is canceled it is marked bad, an idle pulse is triggered to the engine
(high priority kernel request), the execlists scheduler sees that
running request is bad and sets preemption timeout to minimum value (1
ms). This fails to work if multiple requests are combined on the ELSP as
only the most recent request is stored in the execlists schedule (the
request stored in the ELSP isn't marked bad, thus preemption timeout
isn't set to the minimum value). If the preempt timeout is configured to
zero, the engine is permanently hung. This is shown by an upcoming
selftest.

To work around this, mark the idle pulse with a flag to force a preempt
with the minimum value.


A couple of quick questions first before I find time to dig deeper.

First about the "permanently hung" statement. How permanent? Does the 
heartbeat eventually resolve it and if not why not? Naive view is that 
missed heartbeats would identify the stuck non-preemptible request and 
then engine reset would skip over it.


If it does resolve, then the problem is only that request timeout works 
less well if someone set preempt timeout to zero? Which may not be as 
bad, since request timeout was never about any time guarantees.


Regards,

Tvrtko



Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
Signed-off-by: Matthew Brost 
---
  .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++
  .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
  .../drm/i915/gt/intel_execlists_submission.c  | 18 ++-
  drivers/gpu/drm/i915/i915_request.h   |  6 +
  4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index a3698f611f457..efd1c719b4072 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs 
*engine)
INIT_DELAYED_WORK(>heartbeat.work, heartbeat);
  }
  
-static int __intel_engine_pulse(struct intel_engine_cs *engine)

+static int __intel_engine_pulse(struct intel_engine_cs *engine,
+   bool force_preempt)
  {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
struct intel_context *ce = engine->kernel_context;
@@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs 
*engine)
return PTR_ERR(rq);
  
  	__set_bit(I915_FENCE_FLAG_SENTINEL, >fence.flags);

+   if (force_preempt)
+   __set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, >fence.flags);
  
  	heartbeat_commit(rq, );

GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
@@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
  
  		/* recheck current execution */

if (intel_engine_has_preemption(engine)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, false);
if (err)
set_heartbeat(engine, saved);
}
@@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
return err;
  }
  
-int intel_engine_pulse(struct intel_engine_cs *engine)

+static int _intel_engine_pulse(struct intel_engine_cs *engine,
+  bool force_preempt)
  {
struct intel_context *ce = engine->kernel_context;
int err;
@@ -325,7 +329,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
  
  	err = -EINTR;

if (!mutex_lock_interruptible(>timeline->mutex)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, force_preempt);
mutex_unlock(>timeline->mutex);
}
  
@@ -334,6 +338,17 @@ int intel_engine_pulse(struct intel_engine_cs *engine)

return err;
  }
  
+int intel_engine_pulse(struct intel_engine_cs *engine)

+{
+   return _intel_engine_pulse(engine, false);
+}
+
+
+int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine)
+{
+   return _intel_engine_pulse(engine, true);
+}
+
  int intel_engine_flush_barriers(struct intel_engine_cs *engine)
  {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
index 5da6d809a87a2..d9c8386754cb3 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -21,6 +21,7 @@ void intel_gt_park_heartbeats(struct intel_gt *gt);
  void intel_gt_unpark_heartbeats(struct intel_gt *gt);
  
  int intel_engine_pulse(struct intel_engine_cs 

[Intel-gfx] [PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

2022-01-24 Thread Matthew Brost
More than 1 request can be submitted to a single ELSP at a time if
multiple requests are ready to run on the same context. When a request
is canceled it is marked bad, an idle pulse is triggered to the engine
(high priority kernel request), the execlists scheduler sees that
running request is bad and sets preemption timeout to minimum value (1
ms). This fails to work if multiple requests are combined on the ELSP as
only the most recent request is stored in the execlists schedule (the
request stored in the ELSP isn't marked bad, thus preemption timeout
isn't set to the minimum value). If the preempt timeout is configured to
zero, the engine is permanently hung. This is shown by an upcoming
selftest.

To work around this, mark the idle pulse with a flag to force a preempt
with the minimum value.

Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
Signed-off-by: Matthew Brost 
---
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++
 .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
 .../drm/i915/gt/intel_execlists_submission.c  | 18 ++-
 drivers/gpu/drm/i915/i915_request.h   |  6 +
 4 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index a3698f611f457..efd1c719b4072 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs 
*engine)
INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
 }
 
-static int __intel_engine_pulse(struct intel_engine_cs *engine)
+static int __intel_engine_pulse(struct intel_engine_cs *engine,
+   bool force_preempt)
 {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
struct intel_context *ce = engine->kernel_context;
@@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs 
*engine)
return PTR_ERR(rq);
 
__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
+   if (force_preempt)
+   __set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, &rq->fence.flags);
 
heartbeat_commit(rq, &attr);
GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
@@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
 
/* recheck current execution */
if (intel_engine_has_preemption(engine)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, false);
if (err)
set_heartbeat(engine, saved);
}
@@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs 
*engine,
return err;
 }
 
-int intel_engine_pulse(struct intel_engine_cs *engine)
+static int _intel_engine_pulse(struct intel_engine_cs *engine,
+  bool force_preempt)
 {
struct intel_context *ce = engine->kernel_context;
int err;
@@ -325,7 +329,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
 
err = -EINTR;
if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
-   err = __intel_engine_pulse(engine);
+   err = __intel_engine_pulse(engine, force_preempt);
mutex_unlock(&ce->timeline->mutex);
}
 
@@ -334,6 +338,17 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
return err;
 }
 
+int intel_engine_pulse(struct intel_engine_cs *engine)
+{
+   return _intel_engine_pulse(engine, false);
+}
+
+
+int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine)
+{
+   return _intel_engine_pulse(engine, true);
+}
+
 int intel_engine_flush_barriers(struct intel_engine_cs *engine)
 {
struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h 
b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
index 5da6d809a87a2..d9c8386754cb3 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -21,6 +21,7 @@ void intel_gt_park_heartbeats(struct intel_gt *gt);
 void intel_gt_unpark_heartbeats(struct intel_gt *gt);
 
 int intel_engine_pulse(struct intel_engine_cs *engine);
+int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine);
 int intel_engine_flush_barriers(struct intel_engine_cs *engine);
 
 #endif /* INTEL_ENGINE_HEARTBEAT_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 960a9aaf4f3a3..f0c2024058731 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1222,26 +1222,29 @@ static void record_preemption(struct 
intel_engine_execlists *execlists)
 }
 
 static unsigned long