Re: [Intel-gfx] [PATCH 3/3] drm/i915/selftests: Add initial GuC selftest for scrubbing lost G2H

2021-08-10 Thread Daniel Vetter
On Mon, Aug 09, 2021 at 07:41:29PM +, Matthew Brost wrote:
> On Mon, Aug 09, 2021 at 04:03:28PM +0200, Daniel Vetter wrote:
> > On Sun, Aug 08, 2021 at 11:07:57AM -0700, Matthew Brost wrote:
> > > While debugging an issue with full GT resets I went down a rabbit hole
> > > thinking the scrubbing of lost G2H wasn't working correctly. This proved
> > > to be incorrect as this was working just fine but this chase inspired me
> > > to write a selftest to prove that this works. This simple selftest
> > > injects errors dropping various G2H and then issues a full GT reset
> > > proving that the scrubbing of these G2H doesn't blow up.
> > > 
> > > Signed-off-by: Matthew Brost 
> > > ---
> > >  drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +
> > >  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  18 +++
> > >  drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 126 ++
> > >  .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> > >  .../i915/selftests/intel_scheduler_helpers.c  |  12 ++
> > >  .../i915/selftests/intel_scheduler_helpers.h  |   2 +
> > >  6 files changed, 163 insertions(+)
> > >  create mode 100644 drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> > > 
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> > > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > index e54351a170e2..fec5ff7ef168 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > @@ -198,6 +198,10 @@ struct intel_context {
> > >*/
> > >   u8 guc_prio;
> > >   u32 guc_prio_count[GUC_CLIENT_PRIORITY_NUM];
> > > +
> > 
> > I know the existing stuff isn't following this at all, but for anything
> > new we really should put some kerneldoc into structures. This probably
> > means you need to open-code the #ifdef here, since this macro will likely
> > upset kerneldoc parsing.
> > 
> 
> Ok, got it.
> 
> > > + I915_SELFTEST_DECLARE(bool drop_schedule_enable);
> > > + I915_SELFTEST_DECLARE(bool drop_schedule_disable);
> > > + I915_SELFTEST_DECLARE(bool drop_deregister);
> > >  };
> > >  
> > >  #endif /* __INTEL_CONTEXT_TYPES__ */
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > index cd8df078ca87..d13dc56bae43 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > @@ -2618,6 +2618,11 @@ int intel_guc_deregister_done_process_msg(struct 
> > > intel_guc *guc,
> > >  
> > >   trace_intel_context_deregister_done(ce);
> > >  
> > > + if (I915_SELFTEST_ONLY(ce->drop_deregister)) {
> > > + I915_SELFTEST_DECLARE(ce->drop_deregister = false;)
> > 
> > This macro wrapping is quite nasty, can't we just #ifdef this? Especially
> > the _DECLARE name really doesn't expect a statement.
> >
> 
> Had it like that originally then remember these marcos and in the past
> people have complained when I didn't use them, so yes pretty much a
> bikeshed. I personally like the ifdef myself.

I think the plain #ifdef is much clearer in the code. The
I915_SELFTEST_ONLY macro makes some sense to compile stuff out in some
cases and make sure it's wrapped in unlikely when enabled, and often
that's good enough. But not here.

Also because it breaks kerneldoc I don't like the macro really in structs
either. Anything that discourages people from writing solid comments is
Not Good at All :-) So another reason to not like I915_SELFTEST_DECLARE()
macro.
-Daniel

> 
> Matt
>  
> > Aside from these bikesheds I don't have a much to say on the test logic
> > itself, since I'm far from knowledgable on guc stuff ...
> > -Daniel
> > 
> > 
> > > + return 0;
> > > + }
> > > +
> > >   if (context_wait_for_deregister_to_register(ce)) {
> > >   struct intel_runtime_pm *runtime_pm =
> > >   >engine->gt->i915->runtime_pm;
> > > @@ -2672,10 +2677,19 @@ int intel_guc_sched_done_process_msg(struct 
> > > intel_guc *guc,
> > >   trace_intel_context_sched_done(ce);
> > >  
> > >   if (context_pending_enable(ce)) {
> > > + if (I915_SELFTEST_ONLY(ce->drop_schedule_enable)) {
> > > + I915_SELFTEST_DECLARE(ce->drop_schedule_enable = false;)
> > > + return 0;
> > > + }
> > >   clr_context_pending_enable(ce);
> > >   } else if (context_pending_disable(ce)) {
> > >   bool banned;
> > >  
> > > + if (I915_SELFTEST_ONLY(ce->drop_schedule_disable)) {
> > > + I915_SELFTEST_DECLARE(ce->drop_schedule_disable = 
> > > false;)
> > > + return 0;
> > > + }
> > > +
> > >   /*
> > >* Unpin must be done before __guc_signal_context_fence,
> > >* otherwise a race exists between the requests getting
> > > @@ -3047,3 +3061,7 @@ bool intel_guc_virtual_engine_has_heartbeat(const 
> > > struct intel_engine_cs *ve)
> > >  
> > >   return false;
> > >  }
> > > +
> > > 

Re: [Intel-gfx] [PATCH 3/3] drm/i915/selftests: Add initial GuC selftest for scrubbing lost G2H

2021-08-09 Thread Matthew Brost
On Mon, Aug 09, 2021 at 04:03:28PM +0200, Daniel Vetter wrote:
> On Sun, Aug 08, 2021 at 11:07:57AM -0700, Matthew Brost wrote:
> > While debugging an issue with full GT resets I went down a rabbit hole
> > thinking the scrubbing of lost G2H wasn't working correctly. This proved
> > to be incorrect as this was working just fine but this chase inspired me
> > to write a selftest to prove that this works. This simple selftest
> > injects errors dropping various G2H and then issues a full GT reset
> > proving that the scrubbing of these G2H doesn't blow up.
> > 
> > Signed-off-by: Matthew Brost 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +
> >  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  18 +++
> >  drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 126 ++
> >  .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> >  .../i915/selftests/intel_scheduler_helpers.c  |  12 ++
> >  .../i915/selftests/intel_scheduler_helpers.h  |   2 +
> >  6 files changed, 163 insertions(+)
> >  create mode 100644 drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > index e54351a170e2..fec5ff7ef168 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > @@ -198,6 +198,10 @@ struct intel_context {
> >  */
> > u8 guc_prio;
> > u32 guc_prio_count[GUC_CLIENT_PRIORITY_NUM];
> > +
> 
> I know the existing stuff isn't following this at all, but for anything
> new we really should put some kerneldoc into structures. This probably
> means you need to open-code the #ifdef here, since this macro will likely
> upset kerneldoc parsing.
> 

Ok, got it.

> > +   I915_SELFTEST_DECLARE(bool drop_schedule_enable);
> > +   I915_SELFTEST_DECLARE(bool drop_schedule_disable);
> > +   I915_SELFTEST_DECLARE(bool drop_deregister);
> >  };
> >  
> >  #endif /* __INTEL_CONTEXT_TYPES__ */
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index cd8df078ca87..d13dc56bae43 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -2618,6 +2618,11 @@ int intel_guc_deregister_done_process_msg(struct 
> > intel_guc *guc,
> >  
> > trace_intel_context_deregister_done(ce);
> >  
> > +   if (I915_SELFTEST_ONLY(ce->drop_deregister)) {
> > +   I915_SELFTEST_DECLARE(ce->drop_deregister = false;)
> 
> This macro wrapping is quite nasty, can't we just #ifdef this? Especially
> the _DECLARE name really doesn't expect a statement.
>

Had it like that originally then remember these marcos and in the past
people have complained when I didn't use them, so yes pretty much a
bikeshed. I personally like the ifdef myself.

Matt
 
> Aside from these bikesheds I don't have a much to say on the test logic
> itself, since I'm far from knowledgable on guc stuff ...
> -Daniel
> 
> 
> > +   return 0;
> > +   }
> > +
> > if (context_wait_for_deregister_to_register(ce)) {
> > struct intel_runtime_pm *runtime_pm =
> > >engine->gt->i915->runtime_pm;
> > @@ -2672,10 +2677,19 @@ int intel_guc_sched_done_process_msg(struct 
> > intel_guc *guc,
> > trace_intel_context_sched_done(ce);
> >  
> > if (context_pending_enable(ce)) {
> > +   if (I915_SELFTEST_ONLY(ce->drop_schedule_enable)) {
> > +   I915_SELFTEST_DECLARE(ce->drop_schedule_enable = false;)
> > +   return 0;
> > +   }
> > clr_context_pending_enable(ce);
> > } else if (context_pending_disable(ce)) {
> > bool banned;
> >  
> > +   if (I915_SELFTEST_ONLY(ce->drop_schedule_disable)) {
> > +   I915_SELFTEST_DECLARE(ce->drop_schedule_disable = 
> > false;)
> > +   return 0;
> > +   }
> > +
> > /*
> >  * Unpin must be done before __guc_signal_context_fence,
> >  * otherwise a race exists between the requests getting
> > @@ -3047,3 +3061,7 @@ bool intel_guc_virtual_engine_has_heartbeat(const 
> > struct intel_engine_cs *ve)
> >  
> > return false;
> >  }
> > +
> > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > +#include "selftest_guc.c"
> > +#endif
> > diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c 
> > b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> > new file mode 100644
> > index ..46ca6554f65d
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> > @@ -0,0 +1,126 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright �� 2021 Intel Corporation
> > + */
> > +
> > +#include "selftests/intel_scheduler_helpers.h"
> > +
> > +static struct i915_request *nop_user_request(struct intel_context *ce,
> > +struct i915_request *from)
> > +{
> > +   struct 

Re: [Intel-gfx] [PATCH 3/3] drm/i915/selftests: Add initial GuC selftest for scrubbing lost G2H

2021-08-09 Thread Daniel Vetter
On Sun, Aug 08, 2021 at 11:07:57AM -0700, Matthew Brost wrote:
> While debugging an issue with full GT resets I went down a rabbit hole
> thinking the scrubbing of lost G2H wasn't working correctly. This proved
> to be incorrect as this was working just fine but this chase inspired me
> to write a selftest to prove that this works. This simple selftest
> injects errors dropping various G2H and then issues a full GT reset
> proving that the scrubbing of these G2H doesn't blow up.
> 
> Signed-off-by: Matthew Brost 
> ---
>  drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  18 +++
>  drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 126 ++
>  .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>  .../i915/selftests/intel_scheduler_helpers.c  |  12 ++
>  .../i915/selftests/intel_scheduler_helpers.h  |   2 +
>  6 files changed, 163 insertions(+)
>  create mode 100644 drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index e54351a170e2..fec5ff7ef168 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -198,6 +198,10 @@ struct intel_context {
>*/
>   u8 guc_prio;
>   u32 guc_prio_count[GUC_CLIENT_PRIORITY_NUM];
> +

I know the existing stuff isn't following this at all, but for anything
new we really should put some kerneldoc into structures. This probably
means you need to open-code the #ifdef here, since this macro will likely
upset kerneldoc parsing.

> + I915_SELFTEST_DECLARE(bool drop_schedule_enable);
> + I915_SELFTEST_DECLARE(bool drop_schedule_disable);
> + I915_SELFTEST_DECLARE(bool drop_deregister);
>  };
>  
>  #endif /* __INTEL_CONTEXT_TYPES__ */
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index cd8df078ca87..d13dc56bae43 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -2618,6 +2618,11 @@ int intel_guc_deregister_done_process_msg(struct 
> intel_guc *guc,
>  
>   trace_intel_context_deregister_done(ce);
>  
> + if (I915_SELFTEST_ONLY(ce->drop_deregister)) {
> + I915_SELFTEST_DECLARE(ce->drop_deregister = false;)

This macro wrapping is quite nasty, can't we just #ifdef this? Especially
the _DECLARE name really doesn't expect a statement.

Aside from these bikesheds I don't have a much to say on the test logic
itself, since I'm far from knowledgable on guc stuff ...
-Daniel


> + return 0;
> + }
> +
>   if (context_wait_for_deregister_to_register(ce)) {
>   struct intel_runtime_pm *runtime_pm =
>   >engine->gt->i915->runtime_pm;
> @@ -2672,10 +2677,19 @@ int intel_guc_sched_done_process_msg(struct intel_guc 
> *guc,
>   trace_intel_context_sched_done(ce);
>  
>   if (context_pending_enable(ce)) {
> + if (I915_SELFTEST_ONLY(ce->drop_schedule_enable)) {
> + I915_SELFTEST_DECLARE(ce->drop_schedule_enable = false;)
> + return 0;
> + }
>   clr_context_pending_enable(ce);
>   } else if (context_pending_disable(ce)) {
>   bool banned;
>  
> + if (I915_SELFTEST_ONLY(ce->drop_schedule_disable)) {
> + I915_SELFTEST_DECLARE(ce->drop_schedule_disable = 
> false;)
> + return 0;
> + }
> +
>   /*
>* Unpin must be done before __guc_signal_context_fence,
>* otherwise a race exists between the requests getting
> @@ -3047,3 +3061,7 @@ bool intel_guc_virtual_engine_has_heartbeat(const 
> struct intel_engine_cs *ve)
>  
>   return false;
>  }
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftest_guc.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c 
> b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> new file mode 100644
> index ..46ca6554f65d
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
> @@ -0,0 +1,126 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright �� 2021 Intel Corporation
> + */
> +
> +#include "selftests/intel_scheduler_helpers.h"
> +
> +static struct i915_request *nop_user_request(struct intel_context *ce,
> +  struct i915_request *from)
> +{
> + struct i915_request *rq;
> + int ret;
> +
> + rq = intel_context_create_request(ce);
> + if (IS_ERR(rq))
> + return rq;
> +
> + if (from) {
> + ret = i915_sw_fence_await_dma_fence(>submit,
> + >fence, 0,
> + I915_FENCE_GFP);
> + if (ret < 0) {
> + i915_request_put(rq);
> +  

[PATCH 3/3] drm/i915/selftests: Add initial GuC selftest for scrubbing lost G2H

2021-08-08 Thread Matthew Brost
While debugging an issue with full GT resets I went down a rabbit hole
thinking the scrubbing of lost G2H wasn't working correctly. This proved
to be incorrect as this was working just fine but this chase inspired me
to write a selftest to prove that this works. This simple selftest
injects errors dropping various G2H and then issues a full GT reset
proving that the scrubbing of these G2H doesn't blow up.

Signed-off-by: Matthew Brost 
---
 drivers/gpu/drm/i915/gt/intel_context_types.h |   4 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  18 +++
 drivers/gpu/drm/i915/gt/uc/selftest_guc.c | 126 ++
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 .../i915/selftests/intel_scheduler_helpers.c  |  12 ++
 .../i915/selftests/intel_scheduler_helpers.h  |   2 +
 6 files changed, 163 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/gt/uc/selftest_guc.c

diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index e54351a170e2..fec5ff7ef168 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -198,6 +198,10 @@ struct intel_context {
 */
u8 guc_prio;
u32 guc_prio_count[GUC_CLIENT_PRIORITY_NUM];
+
+   I915_SELFTEST_DECLARE(bool drop_schedule_enable);
+   I915_SELFTEST_DECLARE(bool drop_schedule_disable);
+   I915_SELFTEST_DECLARE(bool drop_deregister);
 };
 
 #endif /* __INTEL_CONTEXT_TYPES__ */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index cd8df078ca87..d13dc56bae43 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2618,6 +2618,11 @@ int intel_guc_deregister_done_process_msg(struct 
intel_guc *guc,
 
trace_intel_context_deregister_done(ce);
 
+   if (I915_SELFTEST_ONLY(ce->drop_deregister)) {
+   I915_SELFTEST_DECLARE(ce->drop_deregister = false;)
+   return 0;
+   }
+
if (context_wait_for_deregister_to_register(ce)) {
struct intel_runtime_pm *runtime_pm =
>engine->gt->i915->runtime_pm;
@@ -2672,10 +2677,19 @@ int intel_guc_sched_done_process_msg(struct intel_guc 
*guc,
trace_intel_context_sched_done(ce);
 
if (context_pending_enable(ce)) {
+   if (I915_SELFTEST_ONLY(ce->drop_schedule_enable)) {
+   I915_SELFTEST_DECLARE(ce->drop_schedule_enable = false;)
+   return 0;
+   }
clr_context_pending_enable(ce);
} else if (context_pending_disable(ce)) {
bool banned;
 
+   if (I915_SELFTEST_ONLY(ce->drop_schedule_disable)) {
+   I915_SELFTEST_DECLARE(ce->drop_schedule_disable = 
false;)
+   return 0;
+   }
+
/*
 * Unpin must be done before __guc_signal_context_fence,
 * otherwise a race exists between the requests getting
@@ -3047,3 +3061,7 @@ bool intel_guc_virtual_engine_has_heartbeat(const struct 
intel_engine_cs *ve)
 
return false;
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_guc.c"
+#endif
diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc.c 
b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
new file mode 100644
index ..46ca6554f65d
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright �� 2021 Intel Corporation
+ */
+
+#include "selftests/intel_scheduler_helpers.h"
+
+static struct i915_request *nop_user_request(struct intel_context *ce,
+struct i915_request *from)
+{
+   struct i915_request *rq;
+   int ret;
+
+   rq = intel_context_create_request(ce);
+   if (IS_ERR(rq))
+   return rq;
+
+   if (from) {
+   ret = i915_sw_fence_await_dma_fence(>submit,
+   >fence, 0,
+   I915_FENCE_GFP);
+   if (ret < 0) {
+   i915_request_put(rq);
+   return ERR_PTR(ret);
+   }
+   }
+
+   i915_request_get(rq);
+   i915_request_add(rq);
+
+   return rq;
+}
+
+static int intel_guc_scrub_ctbs(void *arg)
+{
+   struct intel_gt *gt = arg;
+   int ret = 0;
+   int i;
+   struct i915_request *last[3] = {NULL, NULL, NULL}, *rq;
+   intel_wakeref_t wakeref;
+   struct intel_engine_cs *engine;
+   struct intel_context *ce;
+
+   wakeref = intel_runtime_pm_get(gt->uncore->rpm);
+   engine = intel_selftest_find_any_engine(gt);
+
+   /* Submit requests and inject errors forcing G2H to be dropped */
+   for (i = 0; i < 3; ++i) {
+   ce = intel_context_create(engine);
+