Re: [Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-22 Thread Matthew Brost
On Wed, Dec 22, 2021 at 04:25:13PM +0000, Tvrtko Ursulin wrote:
> 
> Ping?
>

Missed this.

This was merged before your comments landed on the list.
 
> Main two points being:
> 
> 1) Commit message seems in contradiction with the change in
> guc_flush_destroyed_contexts. And the lock drop to immediately re-acquire it
> looks questionable to start with.
> 
> 2) And in deregister_destroyed_contexts and in 1) I was therefore asking if
> you can unlink all at once and process with reduced hammering on the lock.
> 

Probably can address both concerns by using a llist, right?

Be on the look out for this rework patch over the next week or so.

Matt

> Regards,
> 
> Tvrtko
> 
> On 17/12/2021 11:14, Tvrtko Ursulin wrote:
> > 
> > On 17/12/2021 11:06, Tvrtko Ursulin wrote:
> > > On 14/12/2021 17:04, Matthew Brost wrote:
> > > > From: John Harrison 
> > > > 
> > > > While attempting to debug a CT deadlock issue in various CI failures
> > > > (most easily reproduced with gem_ctx_create/basic-files), I was seeing
> > > > > CPU deadlock errors being reported. This was because the context
> > > > > destroy loop was blocking waiting on H2G space from inside an IRQ
> > > > > spinlock. There was no deadlock as such, it's just that the H2G queue
> > > > was full of context destroy commands and GuC was taking a long time to
> > > > process them. However, the kernel was seeing the large amount of time
> > > > spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
> > > > then happen (heartbeat failures, CT deadlock errors, outstanding H2G
> > > > WARNs, etc.).
> > > > 
> > > > Re-working the loop to only acquire the spinlock around the list
> > > > management (which is all it is meant to protect) rather than the
> > > > entire destroy operation seems to fix all the above issues.
> > > > 
> > > > v2:
> > > >   (John Harrison)
> > > >    - Fix typo in comment message
> > > > 
> > > > Signed-off-by: John Harrison 
> > > > Signed-off-by: Matthew Brost 
> > > > Reviewed-by: Matthew Brost 
> > > > ---
> > > >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
> > > >   1 file changed, 28 insertions(+), 17 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > index 36c2965db49b..96fcf869e3ff 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > @@ -2644,7 +2644,6 @@ static inline void
> > > > guc_lrc_desc_unpin(struct intel_context *ce)
> > > >   unsigned long flags;
> > > >   bool disabled;
> > > > -    lockdep_assert_held(>submission_state.lock);
> > > >   GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
> > > >   GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
> > > >   GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
> > > > @@ -2660,7 +2659,7 @@ static inline void
> > > > guc_lrc_desc_unpin(struct intel_context *ce)
> > > >   }
> > > >   spin_unlock_irqrestore(>guc_state.lock, flags);
> > > >   if (unlikely(disabled)) {
> > > > -    __release_guc_id(guc, ce);
> > > > +    release_guc_id(guc, ce);
> > > >   __guc_context_destroy(ce);
> > > >   return;
> > > >   }
> > > > @@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct
> > > > intel_context *ce)
> > > >   static void guc_flush_destroyed_contexts(struct intel_guc *guc)
> > > >   {
> > > > -    struct intel_context *ce, *cn;
> > > > +    struct intel_context *ce;
> > > >   unsigned long flags;
> > > >   GEM_BUG_ON(!submission_disabled(guc) &&
> > > >  guc_submission_initialized(guc));
> > > > -    spin_lock_irqsave(>submission_state.lock, flags);
> > > > -    list_for_each_entry_safe(ce, cn,
> > > > - >submission_state.destroyed_contexts,
> > > > - destroyed_link) {
> > > > -    list_del_init(>destroyed_link);
> > > > -    __release_guc_id(guc, ce);
> > > > +    while (!list_empty(>submission_state.destroyed_contexts)) {
> > > 
> > > Are lockless false negatives a concern here - I mean this thread not
> > > seeing something just got added to the list?
> > > 
> > > > +    spin_lock_irqsave(>submission_state.lock, flags);
> > > > +    ce =
> > > > list_first_entry_or_null(>submission_state.destroyed_contexts,
> > > > +  struct intel_context,
> > > > +  destroyed_link);
> > > > +    if (ce)
> > > > +    list_del_init(>destroyed_link);
> > > > +    spin_unlock_irqrestore(>submission_state.lock, flags);
> > > > +
> > > > +    if (!ce)
> > > > +    break;
> > > > +
> > > > +    release_guc_id(guc, ce);
> > > 
> > > This looks suboptimal and in conflict with this part of the commit
> > > message:
> > > 
> > > """
> > >   Re-working the loop to only acquire the spinlock around the list
> > >   management (which is all it is meant to protect) 

Re: [Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-22 Thread Tvrtko Ursulin



Ping?

Main two points being:

1) Commit message seems in contradiction with the change in 
guc_flush_destroyed_contexts. And the lock drop to immediately 
re-acquire it looks questionable to start with.


2) And in deregister_destroyed_contexts and in 1) I was therefore asking 
if you can unlink all at once and process with reduced hammering on the 
lock.


Regards,

Tvrtko

On 17/12/2021 11:14, Tvrtko Ursulin wrote:


On 17/12/2021 11:06, Tvrtko Ursulin wrote:

On 14/12/2021 17:04, Matthew Brost wrote:

From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue
was full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

v2:
  (John Harrison)
   - Fix typo in comment message

Signed-off-by: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: Matthew Brost 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
  1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)

  unsigned long flags;
  bool disabled;
-    lockdep_assert_held(>submission_state.lock);
  GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
  GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
  GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)

  }
  spin_unlock_irqrestore(>guc_state.lock, flags);
  if (unlikely(disabled)) {
-    __release_guc_id(guc, ce);
+    release_guc_id(guc, ce);
  __guc_context_destroy(ce);
  return;
  }
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct 
intel_context *ce)

  static void guc_flush_destroyed_contexts(struct intel_guc *guc)
  {
-    struct intel_context *ce, *cn;
+    struct intel_context *ce;
  unsigned long flags;
  GEM_BUG_ON(!submission_disabled(guc) &&
 guc_submission_initialized(guc));
-    spin_lock_irqsave(>submission_state.lock, flags);
-    list_for_each_entry_safe(ce, cn,
- >submission_state.destroyed_contexts,
- destroyed_link) {
-    list_del_init(>destroyed_link);
-    __release_guc_id(guc, ce);
+    while (!list_empty(>submission_state.destroyed_contexts)) {


Are lockless false negatives a concern here - I mean this thread not 
seeing something just got added to the list?



+    spin_lock_irqsave(>submission_state.lock, flags);
+    ce = 
list_first_entry_or_null(>submission_state.destroyed_contexts,

+  struct intel_context,
+  destroyed_link);
+    if (ce)
+    list_del_init(>destroyed_link);
+    spin_unlock_irqrestore(>submission_state.lock, flags);
+
+    if (!ce)
+    break;
+
+    release_guc_id(guc, ce);


This looks suboptimal and in conflict with this part of the commit 
message:


"""
  Re-working the loop to only acquire the spinlock around the list
  management (which is all it is meant to protect) rather than the
  entire destroy operation seems to fix all the above issues.
"""

Because you end up doing:

... loop ...
   spin_lock_irqsave(&guc->submission_state.lock, flags);
   list_del_init(&ce->destroyed_link);
   spin_unlock_irqrestore(&guc->submission_state.lock, flags);

   release_guc_id, which calls:
 spin_lock_irqsave(&guc->submission_state.lock, flags);
 __release_guc_id(guc, ce);
 spin_unlock_irqrestore(&guc->submission_state.lock, flags);

So a) the lock seems to be protecting more than just list management, 
or release_guc_id is wrong, and b) the loop ends up with highly 
questionable hammering on the lock.


Is there any point to this part of the patch? Or the only business end 
of the patch is below:



  __guc_context_destroy(ce);
  }
-    spin_unlock_irqrestore(>submission_state.lock, flags);
  }
  static void deregister_destroyed_contexts(struct intel_guc *guc)
  {
-    struct intel_context *ce, *cn;
+    struct intel_context *ce;
  unsigned long flags;
-    

Re: [Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-17 Thread Tvrtko Ursulin



On 17/12/2021 11:06, Tvrtko Ursulin wrote:

On 14/12/2021 17:04, Matthew Brost wrote:

From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue
was full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

v2:
  (John Harrison)
   - Fix typo in comment message

Signed-off-by: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: Matthew Brost 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
  1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)

  unsigned long flags;
  bool disabled;
-    lockdep_assert_held(>submission_state.lock);
  GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
  GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
  GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)

  }
  spin_unlock_irqrestore(>guc_state.lock, flags);
  if (unlikely(disabled)) {
-    __release_guc_id(guc, ce);
+    release_guc_id(guc, ce);
  __guc_context_destroy(ce);
  return;
  }
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct 
intel_context *ce)

  static void guc_flush_destroyed_contexts(struct intel_guc *guc)
  {
-    struct intel_context *ce, *cn;
+    struct intel_context *ce;
  unsigned long flags;
  GEM_BUG_ON(!submission_disabled(guc) &&
 guc_submission_initialized(guc));
-    spin_lock_irqsave(>submission_state.lock, flags);
-    list_for_each_entry_safe(ce, cn,
- >submission_state.destroyed_contexts,
- destroyed_link) {
-    list_del_init(>destroyed_link);
-    __release_guc_id(guc, ce);
+    while (!list_empty(>submission_state.destroyed_contexts)) {


Are lockless false negatives a concern here - I mean this thread not 
seeing something just got added to the list?



+    spin_lock_irqsave(>submission_state.lock, flags);
+    ce = 
list_first_entry_or_null(>submission_state.destroyed_contexts,

+  struct intel_context,
+  destroyed_link);
+    if (ce)
+    list_del_init(>destroyed_link);
+    spin_unlock_irqrestore(>submission_state.lock, flags);
+
+    if (!ce)
+    break;
+
+    release_guc_id(guc, ce);


This looks suboptimal and in conflict with this part of the commit message:

"""
  Re-working the loop to only acquire the spinlock around the list
  management (which is all it is meant to protect) rather than the
  entire destroy operation seems to fix all the above issues.
"""

Because you end up doing:

... loop ...
   spin_lock_irqsave(&guc->submission_state.lock, flags);
   list_del_init(&ce->destroyed_link);
   spin_unlock_irqrestore(&guc->submission_state.lock, flags);

   release_guc_id, which calls:
     spin_lock_irqsave(&guc->submission_state.lock, flags);
     __release_guc_id(guc, ce);
     spin_unlock_irqrestore(&guc->submission_state.lock, flags);

So a) the lock seems to be protecting more than just list management, or 
release_guc_id is wrong, and b) the loop ends up with highly 
questionable hammering on the lock.


Is there any point to this part of the patch? Or the only business end 
of the patch is below:



  __guc_context_destroy(ce);
  }
-    spin_unlock_irqrestore(>submission_state.lock, flags);
  }
  static void deregister_destroyed_contexts(struct intel_guc *guc)
  {
-    struct intel_context *ce, *cn;
+    struct intel_context *ce;
  unsigned long flags;
-    spin_lock_irqsave(>submission_state.lock, flags);
-    list_for_each_entry_safe(ce, cn,
- >submission_state.destroyed_contexts,
- destroyed_link) {
-    list_del_init(>destroyed_link);
+    while (!list_empty(>submission_state.destroyed_contexts)) {
+    spin_lock_irqsave(>submission_state.lock, flags);
+    ce = 
list_first_entry_or_null(>submission_state.destroyed_contexts,

+ 

Re: [Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-17 Thread Tvrtko Ursulin

On 14/12/2021 17:04, Matthew Brost wrote:

From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue
was full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

v2:
  (John Harrison)
   - Fix typo in comment message

Signed-off-by: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: Matthew Brost 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
  1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
unsigned long flags;
bool disabled;
  
-	lockdep_assert_held(>submission_state.lock);

GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
}
spin_unlock_irqrestore(>guc_state.lock, flags);
if (unlikely(disabled)) {
-   __release_guc_id(guc, ce);
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
return;
}
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context 
*ce)
  
  static void guc_flush_destroyed_contexts(struct intel_guc *guc)

  {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
  
  	GEM_BUG_ON(!submission_disabled(guc) &&

   guc_submission_initialized(guc));
  
-	spin_lock_irqsave(>submission_state.lock, flags);

-   list_for_each_entry_safe(ce, cn,
->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(>destroyed_link);
-   __release_guc_id(guc, ce);
+   while (!list_empty(>submission_state.destroyed_contexts)) {


Are lockless false negatives a concern here - I mean this thread not seeing 
something just got added to the list?


+   spin_lock_irqsave(>submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(>submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(>destroyed_link);
+   spin_unlock_irqrestore(>submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
+   release_guc_id(guc, ce);


This looks suboptimal and in conflict with this part of the commit message:

"""
 Re-working the loop to only acquire the spinlock around the list
 management (which is all it is meant to protect) rather than the
 entire destroy operation seems to fix all the above issues.
"""

Because you end up doing:

... loop ...
  spin_lock_irqsave(&guc->submission_state.lock, flags);
  list_del_init(&ce->destroyed_link);
  spin_unlock_irqrestore(&guc->submission_state.lock, flags);

  release_guc_id, which calls:
spin_lock_irqsave(&guc->submission_state.lock, flags);
__release_guc_id(guc, ce);
spin_unlock_irqrestore(&guc->submission_state.lock, flags);

So a) the lock seems to be protecting more than just list management, or 
release_guc_id is wrong, and b) the loop ends up with highly questionable 
hammering on the lock.

Is there any point to this part of the patch? Or the only business end of the 
patch is below:


__guc_context_destroy(ce);
}
-   spin_unlock_irqrestore(>submission_state.lock, flags);
  }
  
  static void deregister_destroyed_contexts(struct intel_guc *guc)

  {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
  
-	spin_lock_irqsave(>submission_state.lock, flags);

-   list_for_each_entry_safe(ce, cn,
->submission_state.destroyed_contexts,
-destroyed_link) {
-   

[Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-14 Thread Matthew Brost
From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue
was full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

v2:
 (John Harrison)
  - Fix typo in comment message

Signed-off-by: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: Matthew Brost 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
unsigned long flags;
bool disabled;
 
-   lockdep_assert_held(&guc->submission_state.lock);
GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
}
spin_unlock_irqrestore(&ce->guc_state.lock, flags);
if (unlikely(disabled)) {
-   __release_guc_id(guc, ce);
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
return;
}
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context 
*ce)
 
 static void guc_flush_destroyed_contexts(struct intel_guc *guc)
 {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
 
GEM_BUG_ON(!submission_disabled(guc) &&
   guc_submission_initialized(guc));
 
-   spin_lock_irqsave(&guc->submission_state.lock, flags);
-   list_for_each_entry_safe(ce, cn,
- &guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
-   __release_guc_id(guc, ce);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void deregister_destroyed_contexts(struct intel_guc *guc)
 {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
 
-   spin_lock_irqsave(&guc->submission_state.lock, flags);
-   list_for_each_entry_safe(ce, cn,
- &guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
guc_lrc_desc_unpin(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void destroyed_worker_func(struct work_struct *w)
-- 
2.33.1



[Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-11 Thread Matthew Brost
From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue
was full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

v2:
 (John Harrison)
  - Fix typo in comment message

Signed-off-by: John Harrison 
Signed-off-by: Matthew Brost 
Reviewed-by: Matthew Brost 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
unsigned long flags;
bool disabled;
 
-   lockdep_assert_held(&guc->submission_state.lock);
GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
}
spin_unlock_irqrestore(&ce->guc_state.lock, flags);
if (unlikely(disabled)) {
-   __release_guc_id(guc, ce);
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
return;
}
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context 
*ce)
 
 static void guc_flush_destroyed_contexts(struct intel_guc *guc)
 {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
 
GEM_BUG_ON(!submission_disabled(guc) &&
   guc_submission_initialized(guc));
 
-   spin_lock_irqsave(&guc->submission_state.lock, flags);
-   list_for_each_entry_safe(ce, cn,
- &guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
-   __release_guc_id(guc, ce);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void deregister_destroyed_contexts(struct intel_guc *guc)
 {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
 
-   spin_lock_irqsave(&guc->submission_state.lock, flags);
-   list_for_each_entry_safe(ce, cn,
- &guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
guc_lrc_desc_unpin(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void destroyed_worker_func(struct work_struct *w)
-- 
2.33.1



Re: [Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-10 Thread Matthew Brost
On Fri, Dec 10, 2021 at 05:07:12PM -0800, John Harrison wrote:
> On 12/10/2021 16:56, Matthew Brost wrote:
> > From: John Harrison 
> > 
> > While attempting to debug a CT deadlock issue in various CI failures
> > (most easily reproduced with gem_ctx_create/basic-files), I was seeing
> > CPU deadlock errors being reported. This was because the context
> > destroy loop was blocking waiting on H2G space from inside an IRQ
> > spinlock. There was deadlock as such, it's just that the H2G queue was
> There was *no* deadlock as such
> 

Let's fix this up when applying the series.

With that:
Reviewed-by: Matthew Brost 

> John.
> 
> > full of context destroy commands and GuC was taking a long time to
> > process them. However, the kernel was seeing the large amount of time
> > spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
> > then happen (heartbeat failures, CT deadlock errors, outstanding H2G
> > WARNs, etc.).
> > 
> > Re-working the loop to only acquire the spinlock around the list
> > management (which is all it is meant to protect) rather than the
> > entire destroy operation seems to fix all the above issues.
> > 
> > Signed-off-by: John Harrison 
> > ---
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
> >   1 file changed, 28 insertions(+), 17 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 36c2965db49b..96fcf869e3ff 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
> > intel_context *ce)
> > unsigned long flags;
> > bool disabled;
> > -   lockdep_assert_held(>submission_state.lock);
> > GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
> > GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
> > GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
> > @@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
> > intel_context *ce)
> > }
> > spin_unlock_irqrestore(>guc_state.lock, flags);
> > if (unlikely(disabled)) {
> > -   __release_guc_id(guc, ce);
> > +   release_guc_id(guc, ce);
> > __guc_context_destroy(ce);
> > return;
> > }
> > @@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct 
> > intel_context *ce)
> >   static void guc_flush_destroyed_contexts(struct intel_guc *guc)
> >   {
> > -   struct intel_context *ce, *cn;
> > +   struct intel_context *ce;
> > unsigned long flags;
> > GEM_BUG_ON(!submission_disabled(guc) &&
> >guc_submission_initialized(guc));
> > -   spin_lock_irqsave(>submission_state.lock, flags);
> > -   list_for_each_entry_safe(ce, cn,
> > ->submission_state.destroyed_contexts,
> > -destroyed_link) {
> > -   list_del_init(>destroyed_link);
> > -   __release_guc_id(guc, ce);
> > +   while (!list_empty(>submission_state.destroyed_contexts)) {
> > +   spin_lock_irqsave(>submission_state.lock, flags);
> > +   ce = 
> > list_first_entry_or_null(>submission_state.destroyed_contexts,
> > + struct intel_context,
> > + destroyed_link);
> > +   if (ce)
> > +   list_del_init(>destroyed_link);
> > +   spin_unlock_irqrestore(>submission_state.lock, flags);
> > +
> > +   if (!ce)
> > +   break;
> > +
> > +   release_guc_id(guc, ce);
> > __guc_context_destroy(ce);
> > }
> > -   spin_unlock_irqrestore(>submission_state.lock, flags);
> >   }
> >   static void deregister_destroyed_contexts(struct intel_guc *guc)
> >   {
> > -   struct intel_context *ce, *cn;
> > +   struct intel_context *ce;
> > unsigned long flags;
> > -   spin_lock_irqsave(>submission_state.lock, flags);
> > -   list_for_each_entry_safe(ce, cn,
> > ->submission_state.destroyed_contexts,
> > -destroyed_link) {
> > -   list_del_init(>destroyed_link);
> > +   while (!list_empty(>submission_state.destroyed_contexts)) {
> > +   spin_lock_irqsave(>submission_state.lock, flags);
> > +   ce = 
> > list_first_entry_or_null(>submission_state.destroyed_contexts,
> > + struct intel_context,
> > + destroyed_link);
> > +   if (ce)
> > +   list_del_init(>destroyed_link);
> > +   spin_unlock_irqrestore(>submission_state.lock, flags);
> > +
> > +   if (!ce)
> > +   break;
> > +
> > guc_lrc_desc_unpin(ce);
> > }
> > -   spin_unlock_irqrestore(>submission_state.lock, flags);
> >   }
> >   static void destroyed_worker_func(struct work_struct *w)
> 


Re: [Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-10 Thread John Harrison

On 12/10/2021 16:56, Matthew Brost wrote:

From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was deadlock as such, it's just that the H2G queue was

There was *no* deadlock as such

John.


full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

Signed-off-by: John Harrison 
---
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
  1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
unsigned long flags;
bool disabled;
  
-	lockdep_assert_held(&guc->submission_state.lock);

GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
}
spin_unlock_irqrestore(&ce->guc_state.lock, flags);
if (unlikely(disabled)) {
-   __release_guc_id(guc, ce);
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
return;
}
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context 
*ce)
  
  static void guc_flush_destroyed_contexts(struct intel_guc *guc)

  {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
  
  	GEM_BUG_ON(!submission_disabled(guc) &&

   guc_submission_initialized(guc));
  
-	spin_lock_irqsave(&guc->submission_state.lock, flags);

-   list_for_each_entry_safe(ce, cn,
-&guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
-   __release_guc_id(guc, ce);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
  }
  
  static void deregister_destroyed_contexts(struct intel_guc *guc)

  {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
  
-	spin_lock_irqsave(&guc->submission_state.lock, flags);

-   list_for_each_entry_safe(ce, cn,
-&guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
guc_lrc_desc_unpin(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
  }
  
  static void destroyed_worker_func(struct work_struct *w)




[Intel-gfx] [PATCH 4/7] drm/i915/guc: Don't hog IRQs when destroying contexts

2021-12-10 Thread Matthew Brost
From: John Harrison 

While attempting to debug a CT deadlock issue in various CI failures
(most easily reproduced with gem_ctx_create/basic-files), I was seeing
CPU deadlock errors being reported. This was because the context
destroy loop was blocking waiting on H2G space from inside an IRQ
spinlock. There was no deadlock as such, it's just that the H2G queue was
full of context destroy commands and GuC was taking a long time to
process them. However, the kernel was seeing the large amount of time
spent inside the IRQ lock as a dead CPU. Various Bad Things(tm) would
then happen (heartbeat failures, CT deadlock errors, outstanding H2G
WARNs, etc.).

Re-working the loop to only acquire the spinlock around the list
management (which is all it is meant to protect) rather than the
entire destroy operation seems to fix all the above issues.

Signed-off-by: John Harrison 
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 45 ---
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 36c2965db49b..96fcf869e3ff 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2644,7 +2644,6 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
unsigned long flags;
bool disabled;
 
-   lockdep_assert_held(&guc->submission_state.lock);
GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
GEM_BUG_ON(!lrc_desc_registered(guc, ce->guc_id.id));
GEM_BUG_ON(ce != __get_context(guc, ce->guc_id.id));
@@ -2660,7 +2659,7 @@ static inline void guc_lrc_desc_unpin(struct 
intel_context *ce)
}
spin_unlock_irqrestore(&ce->guc_state.lock, flags);
if (unlikely(disabled)) {
-   __release_guc_id(guc, ce);
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
return;
}
@@ -2694,36 +2693,48 @@ static void __guc_context_destroy(struct intel_context 
*ce)
 
 static void guc_flush_destroyed_contexts(struct intel_guc *guc)
 {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
 
GEM_BUG_ON(!submission_disabled(guc) &&
   guc_submission_initialized(guc));
 
-   spin_lock_irqsave(&guc->submission_state.lock, flags);
-   list_for_each_entry_safe(ce, cn,
-&guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
-   __release_guc_id(guc, ce);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
+   release_guc_id(guc, ce);
__guc_context_destroy(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void deregister_destroyed_contexts(struct intel_guc *guc)
 {
-   struct intel_context *ce, *cn;
+   struct intel_context *ce;
unsigned long flags;
 
-   spin_lock_irqsave(&guc->submission_state.lock, flags);
-   list_for_each_entry_safe(ce, cn,
-&guc->submission_state.destroyed_contexts,
-destroyed_link) {
-   list_del_init(&ce->destroyed_link);
+   while (!list_empty(&guc->submission_state.destroyed_contexts)) {
+   spin_lock_irqsave(&guc->submission_state.lock, flags);
+   ce = 
list_first_entry_or_null(&guc->submission_state.destroyed_contexts,
+ struct intel_context,
+ destroyed_link);
+   if (ce)
+   list_del_init(&ce->destroyed_link);
+   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
+
+   if (!ce)
+   break;
+
guc_lrc_desc_unpin(ce);
}
-   spin_unlock_irqrestore(&guc->submission_state.lock, flags);
 }
 
 static void destroyed_worker_func(struct work_struct *w)
-- 
2.33.1