[Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-07-22 Thread Matthew Brost
When using GuC submission, if a context gets banned, disable scheduling
and mark all of its inflight requests as complete.

Cc: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: John Harrison 
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
 drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
 .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 --
 drivers/gpu/drm/i915/i915_trace.h |  10 ++
 8 files changed, 195 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e3df01a201d7..05c3ee191710 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
 
-   if (ban && intel_context_set_banned(ce))
+   if (ban && intel_context_ban(ce, NULL))
continue;
 
/*
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
b/drivers/gpu/drm/i915/gt/intel_context.h
index 2ed9bf5f91a5..814d9277096a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -16,6 +16,7 @@
 #include "intel_engine_types.h"
 #include "intel_ring_types.h"
 #include "intel_timeline_types.h"
+#include "i915_trace.h"
 
 #define CE_TRACE(ce, fmt, ...) do {\
const struct intel_context *ce__ = (ce);\
@@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
intel_context *ce)
return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
 }
 
+static inline bool intel_context_ban(struct intel_context *ce,
+struct i915_request *rq)
+{
+   bool ret = intel_context_set_banned(ce);
+
+   trace_intel_context_ban(ce);
+   if (ce->ops->ban)
+   ce->ops->ban(ce, rq);
+
+   return ret;
+}
+
 static inline bool
 intel_context_force_single_submission(const struct intel_context *ce)
 {
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 035108c10b2c..57c19ee3e313 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -35,6 +35,8 @@ struct intel_context_ops {
 
int (*alloc)(struct intel_context *ce);
 
+   void (*ban)(struct intel_context *ce, struct i915_request *rq);
+
int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, 
void **vaddr);
int (*pin)(struct intel_context *ce, void *vaddr);
void (*unpin)(struct intel_context *ce);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index f3cdbf4ba5c8..3ed694cab5af 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -22,7 +22,6 @@
 #include "intel_reset.h"
 
 #include "uc/intel_guc.h"
-#include "uc/intel_guc_submission.h"
 
 #define RESET_MAX_RETRIES 3
 
@@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, 
i915_reg_t reg, u32 clr)
intel_uncore_rmw_fw(uncore, reg, clr, 0);
 }
 
-static void skip_context(struct i915_request *rq)
-{
-   struct intel_context *hung_ctx = rq->context;
-
-   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
-   if (!i915_request_is_active(rq))
-   return;
-
-   if (rq->context == hung_ctx) {
-   i915_request_set_error_once(rq, -EIO);
-   __i915_request_skip(rq);
-   }
-   }
-}
-
 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
 {
struct drm_i915_file_private *file_priv = ctx->file_priv;
@@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
bool banned;
int i;
 
-   if (intel_context_is_closed(rq->context)) {
-   intel_context_set_banned(rq->context);
+   if (intel_context_is_closed(rq->context))
return true;
-   }
 
rcu_read_lock();
ctx = rcu_dereference(rq->context->gem_context);
@@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
banned = !i915_gem_context_is_recoverable(ctx);
if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
banned = true;
-   if (banned) {
+   if (banned)
drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
ctx->name, atomic_read(&ctx->guilty_count));
-   intel_context_set_banned(rq->con

[Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-07-26 Thread Matthew Brost
When using GuC submission, if a context gets banned disable scheduling
and mark all inflight requests as complete.

Cc: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: John Harrison 
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
 drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
 drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
 drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
 .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 --
 drivers/gpu/drm/i915/i915_trace.h |  10 ++
 8 files changed, 195 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e3df01a201d7..05c3ee191710 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
*engines, bool ban)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
 
-   if (ban && intel_context_set_banned(ce))
+   if (ban && intel_context_ban(ce, NULL))
continue;
 
/*
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
b/drivers/gpu/drm/i915/gt/intel_context.h
index 2ed9bf5f91a5..814d9277096a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -16,6 +16,7 @@
 #include "intel_engine_types.h"
 #include "intel_ring_types.h"
 #include "intel_timeline_types.h"
+#include "i915_trace.h"
 
 #define CE_TRACE(ce, fmt, ...) do {\
const struct intel_context *ce__ = (ce);\
@@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
intel_context *ce)
return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
 }
 
+static inline bool intel_context_ban(struct intel_context *ce,
+struct i915_request *rq)
+{
+   bool ret = intel_context_set_banned(ce);
+
+   trace_intel_context_ban(ce);
+   if (ce->ops->ban)
+   ce->ops->ban(ce, rq);
+
+   return ret;
+}
+
 static inline bool
 intel_context_force_single_submission(const struct intel_context *ce)
 {
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 035108c10b2c..57c19ee3e313 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -35,6 +35,8 @@ struct intel_context_ops {
 
int (*alloc)(struct intel_context *ce);
 
+   void (*ban)(struct intel_context *ce, struct i915_request *rq);
+
int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, 
void **vaddr);
int (*pin)(struct intel_context *ce, void *vaddr);
void (*unpin)(struct intel_context *ce);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 4d281bc8a38c..91200c43951f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -22,7 +22,6 @@
 #include "intel_reset.h"
 
 #include "uc/intel_guc.h"
-#include "uc/intel_guc_submission.h"
 
 #define RESET_MAX_RETRIES 3
 
@@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, 
i915_reg_t reg, u32 clr)
intel_uncore_rmw_fw(uncore, reg, clr, 0);
 }
 
-static void skip_context(struct i915_request *rq)
-{
-   struct intel_context *hung_ctx = rq->context;
-
-   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
-   if (!i915_request_is_active(rq))
-   return;
-
-   if (rq->context == hung_ctx) {
-   i915_request_set_error_once(rq, -EIO);
-   __i915_request_skip(rq);
-   }
-   }
-}
-
 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
 {
struct drm_i915_file_private *file_priv = ctx->file_priv;
@@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
bool banned;
int i;
 
-   if (intel_context_is_closed(rq->context)) {
-   intel_context_set_banned(rq->context);
+   if (intel_context_is_closed(rq->context))
return true;
-   }
 
rcu_read_lock();
ctx = rcu_dereference(rq->context->gem_context);
@@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
banned = !i915_gem_context_is_recoverable(ctx);
if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
banned = true;
-   if (banned) {
+   if (banned)
drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
ctx->name, atomic_read(&ctx->guilty_count));
-   intel_context_set_banned(rq->con

Re: [Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-08-05 Thread Tvrtko Ursulin



On 27/07/2021 01:23, Matthew Brost wrote:

When using GuC submission, if a context gets banned disable scheduling
and mark all inflight requests as complete.

Cc: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: John Harrison 
---
  drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
  drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
  drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
  drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
  .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 --
  drivers/gpu/drm/i915/i915_trace.h |  10 ++
  8 files changed, 195 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e3df01a201d7..05c3ee191710 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
*engines, bool ban)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
  
-		if (ban && intel_context_set_banned(ce))

+   if (ban && intel_context_ban(ce, NULL))
continue;
  
  		/*

diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
b/drivers/gpu/drm/i915/gt/intel_context.h
index 2ed9bf5f91a5..814d9277096a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -16,6 +16,7 @@
  #include "intel_engine_types.h"
  #include "intel_ring_types.h"
  #include "intel_timeline_types.h"
+#include "i915_trace.h"
  
  #define CE_TRACE(ce, fmt, ...) do {	\

const struct intel_context *ce__ = (ce);\
@@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
intel_context *ce)
return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
  }
  
+static inline bool intel_context_ban(struct intel_context *ce,

+struct i915_request *rq)
+{
+   bool ret = intel_context_set_banned(ce);
+
+   trace_intel_context_ban(ce);
+   if (ce->ops->ban)
+   ce->ops->ban(ce, rq);


Do you want to skip this call if the context is already banned?


+
+   return ret;
+}
+
  static inline bool
  intel_context_force_single_submission(const struct intel_context *ce)
  {
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 035108c10b2c..57c19ee3e313 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -35,6 +35,8 @@ struct intel_context_ops {
  
  	int (*alloc)(struct intel_context *ce);
  
+	void (*ban)(struct intel_context *ce, struct i915_request *rq);

+
int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, 
void **vaddr);
int (*pin)(struct intel_context *ce, void *vaddr);
void (*unpin)(struct intel_context *ce);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 4d281bc8a38c..91200c43951f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -22,7 +22,6 @@
  #include "intel_reset.h"
  
  #include "uc/intel_guc.h"

-#include "uc/intel_guc_submission.h"
  
  #define RESET_MAX_RETRIES 3
  
@@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)

intel_uncore_rmw_fw(uncore, reg, clr, 0);
  }
  
-static void skip_context(struct i915_request *rq)

-{
-   struct intel_context *hung_ctx = rq->context;
-
-   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
-   if (!i915_request_is_active(rq))
-   return;
-
-   if (rq->context == hung_ctx) {
-   i915_request_set_error_once(rq, -EIO);
-   __i915_request_skip(rq);
-   }
-   }
-}


More importantly, I must be missing something: this code has been moved
to ring_context_ban, so what am I not seeing on the execlists side of things?


Regards,

Tvrtko


-
  static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
  {
struct drm_i915_file_private *file_priv = ctx->file_priv;
@@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
bool banned;
int i;
  
-	if (intel_context_is_closed(rq->context)) {

-   intel_context_set_banned(rq->context);
+   if (intel_context_is_closed(rq->context))
return true;
-   }
  
  	rcu_read_lock();

ctx = rcu_dereference(rq->context->gem_context);
@@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
banned = !i915_gem_context_is_recoverable(ctx);
if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
banned = tru

Re: [Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-08-25 Thread Tvrtko Ursulin



On 27/07/2021 01:23, Matthew Brost wrote:

When using GuC submission, if a context gets banned disable scheduling
and mark all inflight requests as complete.

Cc: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: John Harrison 
---
  drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
  drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
  drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
  drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
  .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 --
  drivers/gpu/drm/i915/i915_trace.h |  10 ++
  8 files changed, 195 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e3df01a201d7..05c3ee191710 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
*engines, bool ban)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
  
-		if (ban && intel_context_set_banned(ce))

+   if (ban && intel_context_ban(ce, NULL))
continue;
  
  		/*

diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
b/drivers/gpu/drm/i915/gt/intel_context.h
index 2ed9bf5f91a5..814d9277096a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -16,6 +16,7 @@
  #include "intel_engine_types.h"
  #include "intel_ring_types.h"
  #include "intel_timeline_types.h"
+#include "i915_trace.h"
  
  #define CE_TRACE(ce, fmt, ...) do {	\

const struct intel_context *ce__ = (ce);\
@@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
intel_context *ce)
return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
  }
  
+static inline bool intel_context_ban(struct intel_context *ce,

+struct i915_request *rq)
+{
+   bool ret = intel_context_set_banned(ce);
+
+   trace_intel_context_ban(ce);
+   if (ce->ops->ban)
+   ce->ops->ban(ce, rq);
+
+   return ret;
+}
+
  static inline bool
  intel_context_force_single_submission(const struct intel_context *ce)
  {
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 035108c10b2c..57c19ee3e313 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -35,6 +35,8 @@ struct intel_context_ops {
  
  	int (*alloc)(struct intel_context *ce);
  
+	void (*ban)(struct intel_context *ce, struct i915_request *rq);

+
int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, 
void **vaddr);
int (*pin)(struct intel_context *ce, void *vaddr);
void (*unpin)(struct intel_context *ce);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 4d281bc8a38c..91200c43951f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -22,7 +22,6 @@
  #include "intel_reset.h"
  
  #include "uc/intel_guc.h"

-#include "uc/intel_guc_submission.h"
  
  #define RESET_MAX_RETRIES 3
  
@@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)

intel_uncore_rmw_fw(uncore, reg, clr, 0);
  }
  
-static void skip_context(struct i915_request *rq)

-{
-   struct intel_context *hung_ctx = rq->context;
-
-   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
-   if (!i915_request_is_active(rq))
-   return;
-
-   if (rq->context == hung_ctx) {
-   i915_request_set_error_once(rq, -EIO);
-   __i915_request_skip(rq);
-   }
-   }
-}
-
  static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
  {
struct drm_i915_file_private *file_priv = ctx->file_priv;
@@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
bool banned;
int i;
  
-	if (intel_context_is_closed(rq->context)) {

-   intel_context_set_banned(rq->context);
+   if (intel_context_is_closed(rq->context))
return true;
-   }
  
  	rcu_read_lock();

ctx = rcu_dereference(rq->context->gem_context);
@@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
banned = !i915_gem_context_is_recoverable(ctx);
if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
banned = true;
-   if (banned) {
+   if (banned)
drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
ctx->name, atomic_read(&ctx->guilty_count));
-   intel_context_set_ba

Re: [Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-08-25 Thread Matthew Brost
On Wed, Aug 25, 2021 at 11:39:10AM +0100, Tvrtko Ursulin wrote:
> 
> On 27/07/2021 01:23, Matthew Brost wrote:
> > When using GuC submission, if a context gets banned disable scheduling
> > and mark all inflight requests as complete.
> > 
> > Cc: John Harrison 
> > Signed-off-by: Matthew Brost 
> > Reviewed-by: John Harrison 
> > ---
> >   drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
> >   drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
> >   drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
> >   drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
> >   .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 --
> >   drivers/gpu/drm/i915/i915_trace.h |  10 ++
> >   8 files changed, 195 insertions(+), 37 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
> > b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > index e3df01a201d7..05c3ee191710 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > @@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
> > *engines, bool ban)
> > for_each_gem_engine(ce, engines, it) {
> > struct intel_engine_cs *engine;
> > -   if (ban && intel_context_set_banned(ce))
> > +   if (ban && intel_context_ban(ce, NULL))
> > continue;
> > /*
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
> > b/drivers/gpu/drm/i915/gt/intel_context.h
> > index 2ed9bf5f91a5..814d9277096a 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> > @@ -16,6 +16,7 @@
> >   #include "intel_engine_types.h"
> >   #include "intel_ring_types.h"
> >   #include "intel_timeline_types.h"
> > +#include "i915_trace.h"
> >   #define CE_TRACE(ce, fmt, ...) do {   
> > \
> > const struct intel_context *ce__ = (ce);\
> > @@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
> > intel_context *ce)
> > return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
> >   }
> > +static inline bool intel_context_ban(struct intel_context *ce,
> > +struct i915_request *rq)
> > +{
> > +   bool ret = intel_context_set_banned(ce);
> > +
> > +   trace_intel_context_ban(ce);
> > +   if (ce->ops->ban)
> > +   ce->ops->ban(ce, rq);
> > +
> > +   return ret;
> > +}
> > +
> >   static inline bool
> >   intel_context_force_single_submission(const struct intel_context *ce)
> >   {
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > index 035108c10b2c..57c19ee3e313 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > @@ -35,6 +35,8 @@ struct intel_context_ops {
> > int (*alloc)(struct intel_context *ce);
> > +   void (*ban)(struct intel_context *ce, struct i915_request *rq);
> > +
> > int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, 
> > void **vaddr);
> > int (*pin)(struct intel_context *ce, void *vaddr);
> > void (*unpin)(struct intel_context *ce);
> > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
> > b/drivers/gpu/drm/i915/gt/intel_reset.c
> > index 4d281bc8a38c..91200c43951f 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> > @@ -22,7 +22,6 @@
> >   #include "intel_reset.h"
> >   #include "uc/intel_guc.h"
> > -#include "uc/intel_guc_submission.h"
> >   #define RESET_MAX_RETRIES 3
> > @@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, 
> > i915_reg_t reg, u32 clr)
> > intel_uncore_rmw_fw(uncore, reg, clr, 0);
> >   }
> > -static void skip_context(struct i915_request *rq)
> > -{
> > -   struct intel_context *hung_ctx = rq->context;
> > -
> > -   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
> > -   if (!i915_request_is_active(rq))
> > -   return;
> > -
> > -   if (rq->context == hung_ctx) {
> > -   i915_request_set_error_once(rq, -EIO);
> > -   __i915_request_skip(rq);
> > -   }
> > -   }
> > -}
> > -
> >   static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
> >   {
> > struct drm_i915_file_private *file_priv = ctx->file_priv;
> > @@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
> > bool banned;
> > int i;
> > -   if (intel_context_is_closed(rq->context)) {
> > -   intel_context_set_banned(rq->context);
> > +   if (intel_context_is_closed(rq->context))
> > return true;
> > -   }
> > rcu_read_lock();
> > ctx = rcu_dereference(rq->context->gem_context);
> > @@ -123,11 +105,9 @@ static bool ma

Re: [Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-08-26 Thread Tvrtko Ursulin



On 26/08/2021 04:49, Matthew Brost wrote:

On Wed, Aug 25, 2021 at 11:39:10AM +0100, Tvrtko Ursulin wrote:


On 27/07/2021 01:23, Matthew Brost wrote:

When using GuC submission, if a context gets banned disable scheduling
and mark all inflight requests as complete.

Cc: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: John Harrison 
---
   drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
   drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
   drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
   drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
   .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
   drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 --
   drivers/gpu/drm/i915/i915_trace.h |  10 ++
   8 files changed, 195 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e3df01a201d7..05c3ee191710 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
*engines, bool ban)
for_each_gem_engine(ce, engines, it) {
struct intel_engine_cs *engine;
-   if (ban && intel_context_set_banned(ce))
+   if (ban && intel_context_ban(ce, NULL))
continue;
/*
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
b/drivers/gpu/drm/i915/gt/intel_context.h
index 2ed9bf5f91a5..814d9277096a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -16,6 +16,7 @@
   #include "intel_engine_types.h"
   #include "intel_ring_types.h"
   #include "intel_timeline_types.h"
+#include "i915_trace.h"
   #define CE_TRACE(ce, fmt, ...) do {  \
const struct intel_context *ce__ = (ce);\
@@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
intel_context *ce)
return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
   }
+static inline bool intel_context_ban(struct intel_context *ce,
+struct i915_request *rq)
+{
+   bool ret = intel_context_set_banned(ce);
+
+   trace_intel_context_ban(ce);
+   if (ce->ops->ban)
+   ce->ops->ban(ce, rq);
+
+   return ret;
+}
+
   static inline bool
   intel_context_force_single_submission(const struct intel_context *ce)
   {
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 035108c10b2c..57c19ee3e313 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -35,6 +35,8 @@ struct intel_context_ops {
int (*alloc)(struct intel_context *ce);
+   void (*ban)(struct intel_context *ce, struct i915_request *rq);
+
int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx *ww, 
void **vaddr);
int (*pin)(struct intel_context *ce, void *vaddr);
void (*unpin)(struct intel_context *ce);
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index 4d281bc8a38c..91200c43951f 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -22,7 +22,6 @@
   #include "intel_reset.h"
   #include "uc/intel_guc.h"
-#include "uc/intel_guc_submission.h"
   #define RESET_MAX_RETRIES 3
@@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore *uncore, 
i915_reg_t reg, u32 clr)
intel_uncore_rmw_fw(uncore, reg, clr, 0);
   }
-static void skip_context(struct i915_request *rq)
-{
-   struct intel_context *hung_ctx = rq->context;
-
-   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, link) {
-   if (!i915_request_is_active(rq))
-   return;
-
-   if (rq->context == hung_ctx) {
-   i915_request_set_error_once(rq, -EIO);
-   __i915_request_skip(rq);
-   }
-   }
-}
-
   static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
   {
struct drm_i915_file_private *file_priv = ctx->file_priv;
@@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
bool banned;
int i;
-   if (intel_context_is_closed(rq->context)) {
-   intel_context_set_banned(rq->context);
+   if (intel_context_is_closed(rq->context))
return true;
-   }
rcu_read_lock();
ctx = rcu_dereference(rq->context->gem_context);
@@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
banned = !i915_gem_context_is_recoverable(ctx);
if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
banned = true;
-   if (banned) {
+   if (banned)
dr

Re: [Intel-gfx] [PATCH 24/33] drm/i915/guc: Implement banned contexts for GuC submission

2021-08-26 Thread Matthew Brost
On Thu, Aug 26, 2021 at 12:27:31PM +0100, Tvrtko Ursulin wrote:
> 
> On 26/08/2021 04:49, Matthew Brost wrote:
> > On Wed, Aug 25, 2021 at 11:39:10AM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 27/07/2021 01:23, Matthew Brost wrote:
> > > > When using GuC submission, if a context gets banned disable scheduling
> > > > and mark all inflight requests as complete.
> > > > 
> > > > Cc: John Harrison 
> > > > Signed-off-by: Matthew Brost 
> > > > Reviewed-by: John Harrison 
> > > > ---
> > > >drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
> > > >drivers/gpu/drm/i915/gt/intel_context.h   |  13 ++
> > > >drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
> > > >drivers/gpu/drm/i915/gt/intel_reset.c |  32 +---
> > > >.../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
> > > >drivers/gpu/drm/i915/gt/uc/intel_guc.h|   2 +
> > > >.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 
> > > > --
> > > >drivers/gpu/drm/i915/i915_trace.h |  10 ++
> > > >8 files changed, 195 insertions(+), 37 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
> > > > b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > > index e3df01a201d7..05c3ee191710 100644
> > > > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > > @@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
> > > > *engines, bool ban)
> > > > for_each_gem_engine(ce, engines, it) {
> > > > struct intel_engine_cs *engine;
> > > > -   if (ban && intel_context_set_banned(ce))
> > > > +   if (ban && intel_context_ban(ce, NULL))
> > > > continue;
> > > > /*
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
> > > > b/drivers/gpu/drm/i915/gt/intel_context.h
> > > > index 2ed9bf5f91a5..814d9277096a 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_context.h
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> > > > @@ -16,6 +16,7 @@
> > > >#include "intel_engine_types.h"
> > > >#include "intel_ring_types.h"
> > > >#include "intel_timeline_types.h"
> > > > +#include "i915_trace.h"
> > > >#define CE_TRACE(ce, fmt, ...) do {  
> > > > \
> > > > const struct intel_context *ce__ = (ce);
> > > > \
> > > > @@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
> > > > intel_context *ce)
> > > > return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
> > > >}
> > > > +static inline bool intel_context_ban(struct intel_context *ce,
> > > > +struct i915_request *rq)
> > > > +{
> > > > +   bool ret = intel_context_set_banned(ce);
> > > > +
> > > > +   trace_intel_context_ban(ce);
> > > > +   if (ce->ops->ban)
> > > > +   ce->ops->ban(ce, rq);
> > > > +
> > > > +   return ret;
> > > > +}
> > > > +
> > > >static inline bool
> > > >intel_context_force_single_submission(const struct intel_context *ce)
> > > >{
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> > > > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > index 035108c10b2c..57c19ee3e313 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > @@ -35,6 +35,8 @@ struct intel_context_ops {
> > > > int (*alloc)(struct intel_context *ce);
> > > > +   void (*ban)(struct intel_context *ce, struct i915_request *rq);
> > > > +
> > > > int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx 
> > > > *ww, void **vaddr);
> > > > int (*pin)(struct intel_context *ce, void *vaddr);
> > > > void (*unpin)(struct intel_context *ce);
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
> > > > b/drivers/gpu/drm/i915/gt/intel_reset.c
> > > > index 4d281bc8a38c..91200c43951f 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> > > > @@ -22,7 +22,6 @@
> > > >#include "intel_reset.h"
> > > >#include "uc/intel_guc.h"
> > > > -#include "uc/intel_guc_submission.h"
> > > >#define RESET_MAX_RETRIES 3
> > > > @@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore 
> > > > *uncore, i915_reg_t reg, u32 clr)
> > > > intel_uncore_rmw_fw(uncore, reg, clr, 0);
> > > >}
> > > > -static void skip_context(struct i915_request *rq)
> > > > -{
> > > > -   struct intel_context *hung_ctx = rq->context;
> > > > -
> > > > -   list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, 
> > > > link) {
> > > > -   if (!i915_request_is_active(rq))
> > > > -   return;
> > > > -
> > > > -   if (rq->context == hung_ctx) {
> > > > -   i915_request_set_error_once(rq, -EIO);
> >