Re: [Intel-gfx] [PATCH 4/4] drm/i915/gem: Always test execution status on closing the context

2020-09-25 Thread Tvrtko Ursulin



On 25/09/2020 11:05, Chris Wilson wrote:

Quoting Tvrtko Ursulin (2020-09-24 15:26:56)


On 16/09/2020 10:42, Chris Wilson wrote:

Verify that if a context is active at the time it is closed, it is
either persistent and preemptible (with hangcheck running) or it shall
be removed from execution.

Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs")
Testcase: igt/gem_ctx_persistence/heartbeat-close
Signed-off-by: Chris Wilson 
Cc: Joonas Lahtinen 
Cc:  # v5.7+
---
   drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +
   1 file changed, 10 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index a548626fa8bc..4fd38101bb56 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx)
   return rcu_dereference_protected(ctx->engines, true);
   }
   
-static bool __reset_engine(struct intel_engine_cs *engine)
-{
-	struct intel_gt *gt = engine->gt;
-	bool success = false;
-
-	if (!intel_has_reset_engine(gt))
-		return false;
-
-	if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
-			      &gt->reset.flags)) {
-		success = intel_engine_reset(engine, NULL) == 0;
-		clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
-				      &gt->reset.flags);
-	}
-
-	return success;
-}
-
   static void __reset_context(struct i915_gem_context *ctx,
   struct intel_engine_cs *engine)
   {
@@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine)
* kill the banned context, we fallback to doing a local reset
* instead.
*/
-	if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
-	    !intel_engine_pulse(engine))
-		return true;
-
-	/* If we are unable to send a pulse, try resetting this engine. */
-	return __reset_engine(engine);
+	return intel_engine_pulse(engine) == 0;


Where is the path now that actually resets the currently running
workload (engine) of a non-persistent context? Is a pulse sent and we
then rely on hangcheck, otherwise letting it run?


If the pulse fails, we just call intel_handle_error() on the engine. On
looking at this code again, I could not justify the open-coding of the
engine reset fragment of the general error handler, especially as we end
up taking that route anyway for device resets. (Unlike inside the
tasklet, there are no atomicity concerns on this engine-reset path.)


I think yesterday I got lost in the boolean logic and flows here. Today it
looks fine. The ban flag will be true and the code will indeed enter the
kill_context path.
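
For reference, here is the flow being agreed on, condensed into one place from
the hunks quoted above. This is a sketch, not the applied file: the trailing
__reset_context() call and the engines->ctx member come from the unchanged
context around the hunk rather than from the diff itself, and 'ban' is the
value computed in kill_context() as !persistent || !hangcheck.

static void kill_engines(struct i915_gem_engines *engines, bool ban)
{
	struct i915_gem_engines_iter it;
	struct intel_context *ce;

	for_each_gem_engine(ce, engines, it) {
		struct intel_engine_cs *engine;

		/*
		 * Only ban when we intend to evict; skip contexts that
		 * are already banned.
		 */
		if (ban && intel_context_set_banned(ce))
			continue;

		/* Is anything from this context still running on the GPU? */
		engine = active_engine(ce);

		/* Try to bump it off with a preemptive heartbeat pulse ... */
		if (engine && !__cancel_engine(engine) && ban)
			/*
			 * ... and only when the pulse fails and we must
			 * evict, escalate to a full reset through the
			 * general error handler.
			 */
			__reset_context(engines->ctx, engine);
	}
}

With ban false (persistent context, hangcheck enabled) the loop still sends the
pulse to an active engine but never bans the context or forces a reset, which
is the behaviour confirmed above.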


Reviewed-by: Tvrtko Ursulin 

Regards,

Tvrtko



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 4/4] drm/i915/gem: Always test execution status on closing the context

2020-09-25 Thread Chris Wilson
Quoting Tvrtko Ursulin (2020-09-24 15:26:56)
> 
> On 16/09/2020 10:42, Chris Wilson wrote:
> > Verify that if a context is active at the time it is closed, it is
> > either persistent and preemptible (with hangcheck running) or it shall
> > be removed from execution.
> > 
> > Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs")
> > Testcase: igt/gem_ctx_persistence/heartbeat-close
> > Signed-off-by: Chris Wilson 
> > Cc: Joonas Lahtinen 
> > Cc:  # v5.7+
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +
> >   1 file changed, 10 insertions(+), 38 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > index a548626fa8bc..4fd38101bb56 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx)
> >   return rcu_dereference_protected(ctx->engines, true);
> >   }
> >   
> > -static bool __reset_engine(struct intel_engine_cs *engine)
> > -{
> > - struct intel_gt *gt = engine->gt;
> > - bool success = false;
> > -
> > - if (!intel_has_reset_engine(gt))
> > - return false;
> > -
> > - if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
> > -   &gt->reset.flags)) {
> > - success = intel_engine_reset(engine, NULL) == 0;
> > - clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
> > -   &gt->reset.flags);
> > - }
> > -
> > - return success;
> > -}
> > -
> >   static void __reset_context(struct i915_gem_context *ctx,
> >   struct intel_engine_cs *engine)
> >   {
> > @@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine)
> >* kill the banned context, we fallback to doing a local reset
> >* instead.
> >*/
> > - if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
> > - !intel_engine_pulse(engine))
> > - return true;
> > -
> > - /* If we are unable to send a pulse, try resetting this engine. */
> > - return __reset_engine(engine);
> > + return intel_engine_pulse(engine) == 0;
> 
> Where is the path now that actually resets the currently running
> workload (engine) of a non-persistent context? Is a pulse sent and we
> then rely on hangcheck, otherwise letting it run?

If the pulse fails, we just call intel_handle_error() on the engine. On
looking at this code again, I could not justify the open-coding of the
engine reset fragment of the general error handler, especially as we end
up taking that route anyway for device resets. (Unlike inside the
tasklet, there are no atomicity concerns on this engine-reset path.)
-Chris
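
For completeness: the fallback referred to here (the "intel_handle_error()"
shorthand) is the existing __reset_context() helper in i915_gem_context.c,
which hands the engine to the general GT error handler. Its body is not part
of the quoted hunks, so the following is only a sketch of how that path reads
in this era of the driver; the exact flags argument and message string are
assumptions.

/* After the patch, __cancel_engine() is reduced to the heartbeat pulse. */
static bool __cancel_engine(struct intel_engine_cs *engine)
{
	return intel_engine_pulse(engine) == 0;
}

/*
 * A failed pulse on a to-be-banned context is escalated by kill_engines()
 * to __reset_context(), i.e. to the general error handler, which already
 * owns the engine-reset (and, failing that, device-reset) machinery that
 * the removed __reset_engine() open-coded.
 */
static void __reset_context(struct i915_gem_context *ctx,
			    struct intel_engine_cs *engine)
{
	intel_gt_handle_error(engine->gt, engine->mask, 0,
			      "context closure in %s", ctx->name);
}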
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 4/4] drm/i915/gem: Always test execution status on closing the context

2020-09-24 Thread Tvrtko Ursulin



On 16/09/2020 10:42, Chris Wilson wrote:

Verify that if a context is active at the time it is closed, it is
either persistent and preemptible (with hangcheck running) or it shall
be removed from execution.

Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs")
Testcase: igt/gem_ctx_persistence/heartbeat-close
Signed-off-by: Chris Wilson 
Cc: Joonas Lahtinen 
Cc:  # v5.7+
---
  drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +
  1 file changed, 10 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index a548626fa8bc..4fd38101bb56 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx)
return rcu_dereference_protected(ctx->engines, true);
  }
  
-static bool __reset_engine(struct intel_engine_cs *engine)
-{
-   struct intel_gt *gt = engine->gt;
-   bool success = false;
-
-   if (!intel_has_reset_engine(gt))
-   return false;
-
-   if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
-                         &gt->reset.flags)) {
-   success = intel_engine_reset(engine, NULL) == 0;
-   clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
-                         &gt->reset.flags);
-   }
-
-   return success;
-}
-
  static void __reset_context(struct i915_gem_context *ctx,
struct intel_engine_cs *engine)
  {
@@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine)
 * kill the banned context, we fallback to doing a local reset
 * instead.
 */
-   if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
-   !intel_engine_pulse(engine))
-   return true;
-
-   /* If we are unable to send a pulse, try resetting this engine. */
-   return __reset_engine(engine);
+   return intel_engine_pulse(engine) == 0;


Where is the path now that actually resets the currently running
workload (engine) of a non-persistent context? Is a pulse sent and we
then rely on hangcheck, otherwise letting it run?


Regards,

Tvrtko


  }
  
  static bool

@@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
return engine;
  }
  
-static void kill_engines(struct i915_gem_engines *engines)
+static void kill_engines(struct i915_gem_engines *engines, bool ban)
  {
struct i915_gem_engines_iter it;
struct intel_context *ce;
@@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
  
-		if (intel_context_set_banned(ce))
+		if (ban && intel_context_set_banned(ce))
continue;
  
  		/*

@@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines)
engine = active_engine(ce);
  
		/* First attempt to gracefully cancel the context */
-		if (engine && !__cancel_engine(engine))
+		if (engine && !__cancel_engine(engine) && ban)
/*
 * If we are unable to send a preemptive pulse to bump
 * the context from the GPU, we have to resort to a full
@@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines)
}
  }
  
-static void kill_stale_engines(struct i915_gem_context *ctx)
+static void kill_context(struct i915_gem_context *ctx)
  {
+   bool ban = (!i915_gem_context_is_persistent(ctx) ||
+   !ctx->i915->params.enable_hangcheck);
struct i915_gem_engines *pos, *next;
  
	spin_lock_irq(&ctx->stale.lock);
@@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
  
	spin_unlock_irq(&ctx->stale.lock);
  
-		kill_engines(pos);
+		kill_engines(pos, ban);
  
		spin_lock_irq(&ctx->stale.lock);
		GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence));
@@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
	spin_unlock_irq(&ctx->stale.lock);
  }
  
-static void kill_context(struct i915_gem_context *ctx)
-{
-   kill_stale_engines(ctx);
-}
-
  static void engines_idle_release(struct i915_gem_context *ctx,
 struct i915_gem_engines *engines)
  {
@@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx,
  
 kill:
	if (list_empty(&ctx->link)) /* raced, already closed */
-		kill_engines(engines);
+		kill_engines(engines, true);
  
	i915_sw_fence_commit(&engines->fence);
  }
@@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx)
 * case we opt to forcibly kill off all remaining requests on
 * context close.
 */
-   if (!i915_gem_context_is_persistent(ctx) ||
-   !ctx->i915->params.enable_hangcheck)
-   kill_context(ctx);
+   kill_context(ctx);

	i915_gem_context_put(ctx);
  }

[Intel-gfx] [PATCH 4/4] drm/i915/gem: Always test execution status on closing the context

2020-09-16 Thread Chris Wilson
Verify that if a context is active at the time it is closed, it is
either persistent and preemptible (with hangcheck running) or it shall
be removed from execution.

Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs")
Testcase: igt/gem_ctx_persistence/heartbeat-close
Signed-off-by: Chris Wilson 
Cc: Joonas Lahtinen 
Cc:  # v5.7+
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +
 1 file changed, 10 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index a548626fa8bc..4fd38101bb56 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx)
return rcu_dereference_protected(ctx->engines, true);
 }
 
-static bool __reset_engine(struct intel_engine_cs *engine)
-{
-   struct intel_gt *gt = engine->gt;
-   bool success = false;
-
-   if (!intel_has_reset_engine(gt))
-   return false;
-
-   if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
-                         &gt->reset.flags)) {
-   success = intel_engine_reset(engine, NULL) == 0;
-   clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
-                         &gt->reset.flags);
-   }
-
-   return success;
-}
-
 static void __reset_context(struct i915_gem_context *ctx,
struct intel_engine_cs *engine)
 {
@@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine)
 * kill the banned context, we fallback to doing a local reset
 * instead.
 */
-   if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
-   !intel_engine_pulse(engine))
-   return true;
-
-   /* If we are unable to send a pulse, try resetting this engine. */
-   return __reset_engine(engine);
+   return intel_engine_pulse(engine) == 0;
 }
 
 static bool
@@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
return engine;
 }
 
-static void kill_engines(struct i915_gem_engines *engines)
+static void kill_engines(struct i915_gem_engines *engines, bool ban)
 {
struct i915_gem_engines_iter it;
struct intel_context *ce;
@@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
 
-   if (intel_context_set_banned(ce))
+   if (ban && intel_context_set_banned(ce))
continue;
 
/*
@@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines)
engine = active_engine(ce);
 
/* First attempt to gracefully cancel the context */
-   if (engine && !__cancel_engine(engine))
+   if (engine && !__cancel_engine(engine) && ban)
/*
 * If we are unable to send a preemptive pulse to bump
 * the context from the GPU, we have to resort to a full
@@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines)
}
 }
 
-static void kill_stale_engines(struct i915_gem_context *ctx)
+static void kill_context(struct i915_gem_context *ctx)
 {
+   bool ban = (!i915_gem_context_is_persistent(ctx) ||
+   !ctx->i915->params.enable_hangcheck);
struct i915_gem_engines *pos, *next;
 
	spin_lock_irq(&ctx->stale.lock);
@@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
 
	spin_unlock_irq(&ctx->stale.lock);
 
-   kill_engines(pos);
+   kill_engines(pos, ban);
 
	spin_lock_irq(&ctx->stale.lock);
	GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence));
@@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
	spin_unlock_irq(&ctx->stale.lock);
 }
 
-static void kill_context(struct i915_gem_context *ctx)
-{
-   kill_stale_engines(ctx);
-}
-
 static void engines_idle_release(struct i915_gem_context *ctx,
 struct i915_gem_engines *engines)
 {
@@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx,
 
 kill:
	if (list_empty(&ctx->link)) /* raced, already closed */
-   kill_engines(engines);
+   kill_engines(engines, true);
 
	i915_sw_fence_commit(&engines->fence);
 }
@@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx)
 * case we opt to forcibly kill off all remaining requests on
 * context close.
 */
-   if (!i915_gem_context_is_persistent(ctx) ||
-   !ctx->i915->params.enable_hangcheck)
-   kill_context(ctx);
+   kill_context(ctx);
 
i915_gem_context_put(ctx);
 }
-- 
2.20.1
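
Taken together, the closing path after this patch looks roughly as follows.
This is a condensed sketch: context_close() is abridged to its tail, and the
stale-engines walk inside kill_context() (the former kill_stale_engines()
body) is elided.

static void kill_context(struct i915_gem_context *ctx)
{
	/*
	 * Ban (and, if needed, reset) only when the context may not outlive
	 * its owner: it is marked non-persistent, or hangcheck is disabled
	 * so nothing would ever clean up a runaway workload.
	 */
	bool ban = (!i915_gem_context_is_persistent(ctx) ||
		    !ctx->i915->params.enable_hangcheck);

	/* ... walk ctx->stale.engines, calling kill_engines(pos, ban) ... */
}

static void context_close(struct i915_gem_context *ctx)
{
	/* ... flush and unpublish the context ... */

	/*
	 * Always test the execution status on close: a persistent context
	 * with hangcheck enabled is left to the heartbeat, anything else
	 * is removed from execution now.
	 */
	kill_context(ctx);

	i915_gem_context_put(ctx);
}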