Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-09-08 Thread Tvrtko Ursulin



On 07/09/2022 16:03, Dixit, Ashutosh wrote:

On Wed, 07 Sep 2022 00:28:48 -0700, Tvrtko Ursulin wrote:


On 06/09/2022 19:29, Umesh Nerlige Ramappa wrote:

On Thu, Sep 01, 2022 at 04:55:22PM -0700, Dixit, Ashutosh wrote:

On Wed, 31 Aug 2022 15:45:49 -0700, Umesh Nerlige Ramappa wrote:


[snip]



     intel_gt_reset_unlock(gt, srcu);

@@ -1476,17 +1476,21 @@ void intel_guc_busyness_unpark(struct

intel_gt *gt)

  guc->timestamp.ping_delay);
}

-static void __guc_context_update_clks(struct intel_context *ce)
+static u64 guc_context_update_stats(struct intel_context *ce)
{
     struct intel_guc *guc = ce_to_guc(ce);
     struct intel_gt *gt = ce->engine->gt;
     u32 *pphwsp, last_switch, engine_id;
-    u64 start_gt_clk, active;
     unsigned long flags;
+    u64 total, active = 0;
     ktime_t unused;

+    intel_context_pin(ce);


intel_context_pin can sleep, and we are not allowed to sleep in this path -
intel_context_get_total_runtime_ns(). However, we can sleep in the ping
worker path, so ideally we want to separate it out for the two paths.


Do we know which intel_context_get_total_runtime_ns() call is not allowed
to sleep? E.g. the code path from i915_drm_client_fdinfo() is allowed to
sleep. As mentioned, I have done this in v2 of the RFC patch, which I think is
sufficient, but a more complicated scheme (which I think we can avoid for
now) would be to pin in code paths when sleeping is allowed.



Hmm, maybe intel_context_get_total_runtime_ns can sleep, not sure why I
was assuming that this falls in the perf_pmu path. This is now in the
drm_fdinfo query path. + Tvrtko.

@Tvrtko, any idea if intel_context_get_total_runtime_ns path can sleep?


Not at the moment - the query calls it from a lockless (RCU) section when
walking all the contexts belonging to a client, the idea being that
performance queries should have minimal effect on a running system.
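
For reference, that query path looks roughly like the sketch below. This is
only an illustration of why sleeping is not allowed there; the list/field
names (client->ctx_list, client_link) are assumptions about the
i915_drm_client bookkeeping rather than quotes of the exact code.

	/*
	 * Sketch of the fdinfo busyness walk: the per-client context list is
	 * traversed under rcu_read_lock(), so nothing called from here,
	 * including intel_context_get_total_runtime_ns(), may sleep.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(ce, &client->ctx_list, client_link)
		total += intel_context_get_total_runtime_ns(ce);
	rcu_read_unlock();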


Hmm, my bad - I missed the RCU and assumed a userspace thread would be able
to sleep.


I think it would be possible to change in theory, but I am not sure how much
work it would be. Is there a hard need to sleep in there, or what?


GuC contexts need to be pinned/unpinned, which can sleep, but we are
investigating whether we can return a previously computed busyness value when
we cannot pin/sleep.


Yeah, it would be conceptually nice to keep that query lightweight.
Doing too much work to query accumulated state, like in the case of pinning
the unpinned contexts, feels a bit over the top. Hopefully there aren't
any nasty races which would make this hard.
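
For illustration, the direction being discussed amounts to something like the
sketch below on the query side. This is only a sketch:
__guc_context_sample_busyness() is a hypothetical helper standing in for the
PPHWSP sampling, and whether falling back to ce->stats.runtime.total is
acceptable is exactly the open question.

	static u64 guc_context_update_stats(struct intel_context *ce)
	{
		u64 total;

		/*
		 * Query path must not sleep: only touch the context if it is
		 * already pinned/active, otherwise report the last value that
		 * was folded into the stats.
		 */
		if (!intel_context_pin_if_active(ce))
			return ce->stats.runtime.total;

		total = __guc_context_sample_busyness(ce); /* hypothetical helper */
		intel_context_unpin(ce);

		return total;
	}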


Regards,

Tvrtko


Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-09-07 Thread Dixit, Ashutosh
On Wed, 07 Sep 2022 00:28:48 -0700, Tvrtko Ursulin wrote:
>
> On 06/09/2022 19:29, Umesh Nerlige Ramappa wrote:
> > On Thu, Sep 01, 2022 at 04:55:22PM -0700, Dixit, Ashutosh wrote:
> >> On Wed, 31 Aug 2022 15:45:49 -0700, Umesh Nerlige Ramappa wrote:
>
> [snip]
>
> >>> >
> >>> >    intel_gt_reset_unlock(gt, srcu);
> >>> >
> >>> > @@ -1476,17 +1476,21 @@ void intel_guc_busyness_unpark(struct
> >>> intel_gt *gt)
> >>> > guc->timestamp.ping_delay);
> >>> > }
> >>> >
> >>> > -static void __guc_context_update_clks(struct intel_context *ce)
> >>> > +static u64 guc_context_update_stats(struct intel_context *ce)
> >>> > {
> >>> >    struct intel_guc *guc = ce_to_guc(ce);
> >>> >    struct intel_gt *gt = ce->engine->gt;
> >>> >    u32 *pphwsp, last_switch, engine_id;
> >>> > -    u64 start_gt_clk, active;
> >>> >    unsigned long flags;
> >>> > +    u64 total, active = 0;
> >>> >    ktime_t unused;
> >>> >
> >>> > +    intel_context_pin(ce);
> >>>
> >>> intel_context_pin can sleep and we are not allowed to sleep in this
> >>> path -
> >>> intel_context_get_total_runtime_ns(), however we can sleep in the ping
> >>> worker path, so ideally we want to separate it out for the 2 paths.
> >>
> >> Do we know which intel_context_get_total_runtime_ns() call is not allowed
> >> to sleep? E.g. the code path from i915_drm_client_fdinfo() is allowed to
> >> sleep. As mentioned I have done this is v2 of RFC patch which I think is
> >> sufficient, but a more complicated scheme (which I think we can avoid for
> >> now) would be to pin in code paths when sleeping is allowed.
> >>
> >
> > Hmm, maybe intel_context_get_total_runtime_ns can sleep, not sure why I
> > was assuming that this falls in the perf_pmu path. This is now in the
> > drm_fdinfo query path. + Tvrtko.
> >
> > @Tvrtko, any idea if intel_context_get_total_runtime_ns path can sleep?
>
> Not at the moment - it calls it from a lockless (rcu) section when walking
> all the contexts belonging to a client. Idea being performance queries
> should have minimum effect on a running system.

Hmm, my bad - I missed the RCU and assumed a userspace thread would be able
to sleep.

> I think it would be possible to change in theory but not sure how much
> work. There is a hard need to sleep in there or what?

GuC contexts need to be pinned/unpinned, which can sleep, but we are
investigating whether we can return a previously computed busyness value when
we cannot pin/sleep.

Thanks.
--
Ashutosh


Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-09-07 Thread Tvrtko Ursulin




On 06/09/2022 19:29, Umesh Nerlige Ramappa wrote:

On Thu, Sep 01, 2022 at 04:55:22PM -0700, Dixit, Ashutosh wrote:

On Wed, 31 Aug 2022 15:45:49 -0700, Umesh Nerlige Ramappa wrote:


[snip]


>
>    intel_gt_reset_unlock(gt, srcu);
>
> @@ -1476,17 +1476,21 @@ void intel_guc_busyness_unpark(struct 
intel_gt *gt)

> guc->timestamp.ping_delay);
> }
>
> -static void __guc_context_update_clks(struct intel_context *ce)
> +static u64 guc_context_update_stats(struct intel_context *ce)
> {
>    struct intel_guc *guc = ce_to_guc(ce);
>    struct intel_gt *gt = ce->engine->gt;
>    u32 *pphwsp, last_switch, engine_id;
> -    u64 start_gt_clk, active;
>    unsigned long flags;
> +    u64 total, active = 0;
>    ktime_t unused;
>
> +    intel_context_pin(ce);

intel_context_pin can sleep, and we are not allowed to sleep in this path -
intel_context_get_total_runtime_ns(). However, we can sleep in the ping
worker path, so ideally we want to separate it out for the two paths.


Do we know which intel_context_get_total_runtime_ns() call is not allowed
to sleep? E.g. the code path from i915_drm_client_fdinfo() is allowed to
sleep. As mentioned, I have done this in v2 of the RFC patch, which I think is
sufficient, but a more complicated scheme (which I think we can avoid for
now) would be to pin in code paths when sleeping is allowed.



Hmm, maybe intel_context_get_total_runtime_ns can sleep, not sure why I 
was assuming that this falls in the perf_pmu path. This is now in the 
drm_fdinfo query path. + Tvrtko.


@Tvrtko, any idea if intel_context_get_total_runtime_ns path can sleep?


Not at the moment - the query calls it from a lockless (RCU) section when
walking all the contexts belonging to a client, the idea being that
performance queries should have minimal effect on a running system. I think
it would be possible to change in theory, but I am not sure how much work it
would be. Is there a hard need to sleep in there, or what?


Regards,

Tvrtko


Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-09-07 Thread Dixit, Ashutosh
On Tue, 06 Sep 2022 11:29:15 -0700, Umesh Nerlige Ramappa wrote:
>
> On Thu, Sep 01, 2022 at 04:55:22PM -0700, Dixit, Ashutosh wrote:
> > On Wed, 31 Aug 2022 15:45:49 -0700, Umesh Nerlige Ramappa wrote:
> >>
> >
> > Hi Umesh,
> >
> > I have updated my RFC patch based on your feedback so we can discuss again.
> >
> >> On Wed, Aug 31, 2022 at 12:33:55PM -0700, Ashutosh Dixit wrote:
> >> > 1. Do all ce->stats updates and reads under guc->timestamp.lock
> >>
> >> Other than the question about READ_ONCE/WRITE_ONCE, I am not sure I
> >> understand what's broken in the locking logic here. Can you please add some
> >> description? thanks
> >
> > Basically ce->stats.runtime.total and ce->stats.active are tied together so
> > should be accessed/updated atomically. Also just the update of
> > ce->stats.runtime.total (via lrc_update_runtime()) can have multiple
> > concurrent writers so even that has to be protected (since that update is
> > not atomic).
> >
> > These was the reason for:
> > * Introducing lrc_update_runtime_locked
> > * Returning early from intel_context_get_total_runtime_ns otherwise we'll
> >  need to introduce the same locking there
> > * Enforcing locking in guc_context_update_stats (which was there in your
> >  original patch too)
> >
> > (I think execlists code didn't have this issue because there
> > lrc_update_runtime is called from a tasklet (which iirc is like a single
> > thread/cpu). I am not sure how 'active' is updated in execlist mode so
> > there may or may not be a problem in intel_context_get_total_runtime_ns).
> >
> > I have another question: what about that GPU vs. GuC race mentioned in the
> > comment? What is the sequence of events there between GPU updating the
> > timestamp in the context image vs. GuC setting engine_id to -1 (at least
> > what is the sequence in which GPU and GuC supposed to do these updates
> > though it may not matter to i915 due to scheduling delays)?
>
> With GPU, KMD and GuC running concurrently, after a context is done, this
> is the sequence.
>
> GPU) updates context image (total_runtime)
> KMD) reads total_runtime.
> KMD) sees engine_id is valid. KMD uses start_time from pphwsp to calculate
> active_time.
> KMD) returns total_runtime + active_time (double accounted busyness!!)
> GuC) gets ctxt switchout event and sets engine_id and start_time to ~0

Thanks for the explanation, so yes I understand the double accounting now.

>
> This is not rare when running the IGT tests, so the total runtime is
> updated in the query only if engine_id is idle. continued below ...
>
> >
> >>
> >> > 2. Pin context image before reading
> >> > 3. Merge __guc_context_update_clks and guc_context_update_stats into a
> >> >   single function
> >> > 4. Call lrc_update_runtime() unconditionally in guc_context_update_stats
> >> > 5. Seems no need to update ce->stats.active with this approach
> >> >
> >> > Some of the above steps may not be correct or complete but the idea is to
> >> > discuss/improve the original patch.
> >> >
> >> > Cc: Umesh Nerlige Ramappa 
> >> > Signed-off-by: Ashutosh Dixit 
> >> > ---
> >> > drivers/gpu/drm/i915/gt/intel_context.c   |  2 +-
> >> > drivers/gpu/drm/i915/gt/intel_context_types.h |  2 +-
> >> > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 41 ++-
> >> > 3 files changed, 24 insertions(+), 21 deletions(-)
> >> >
> >> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
> >> > b/drivers/gpu/drm/i915/gt/intel_context.c
> >> > index e2d70a9fdac0..c004f676de27 100644
> >> > --- a/drivers/gpu/drm/i915/gt/intel_context.c
> >> > +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> >> > @@ -581,7 +581,7 @@ u64 intel_context_get_total_runtime_ns(struct 
> >> > intel_context *ce)
> >> >  u64 total, active;
> >> >
> >> >  if (ce->ops->update_stats)
> >> > -ce->ops->update_stats(ce);
> >> > +return ce->ops->update_stats(ce);
> >>
> >> This is an improvement that we can add and eventually, we can make this a
> >> GuC specific vfunc. Doing so may also remove the COPS_RUNTIME_ACTIVE_TOTAL
> >> option that I added to GuC specific context ops.
> >
> > I went ahead and did this in v2 of the RFC patch.
> >
> >> >
> >> >  total = ce->stats.runtime.total;
> >> >  if (ce->ops->flags & COPS_RUNTIME_CYCLES)
> >> > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> >> > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> > index f7ff4c7d81c7..699435bfff99 100644
> >> > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> >> > @@ -59,7 +59,7 @@ struct intel_context_ops {
> >> >
> >> >  void (*sched_disable)(struct intel_context *ce);
> >> >
> >> > -void (*update_stats)(struct intel_context *ce);
> >> > +u64 (*update_stats)(struct intel_context *ce);
> >> >
> >> >  void (*reset)(struct intel_context *ce);
> >> >  void (*destroy)(struct kref *kref);
> >> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> >> > 

Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-09-06 Thread Umesh Nerlige Ramappa

On Thu, Sep 01, 2022 at 04:55:22PM -0700, Dixit, Ashutosh wrote:

On Wed, 31 Aug 2022 15:45:49 -0700, Umesh Nerlige Ramappa wrote:




Hi Umesh,

I have updated my RFC patch based on your feedback so we can discuss again.


On Wed, Aug 31, 2022 at 12:33:55PM -0700, Ashutosh Dixit wrote:
> 1. Do all ce->stats updates and reads under guc->timestamp.lock

Other than the question about READ_ONCE/WRITE_ONCE, I am not sure I
understand what's broken in the locking logic here. Can you please add some
description? thanks


Basically ce->stats.runtime.total and ce->stats.active are tied together so
should be accessed/updated atomically. Also just the update of
ce->stats.runtime.total (via lrc_update_runtime()) can have multiple
concurrent writers so even that has to be protected (since that update is
not atomic).

These were the reasons for:
* Introducing lrc_update_runtime_locked
* Returning early from intel_context_get_total_runtime_ns otherwise we'll
 need to introduce the same locking there
* Enforcing locking in guc_context_update_stats (which was there in your
 original patch too)

(I think execlists code didn't have this issue because there
lrc_update_runtime is called from a tasklet (which iirc is like a single
thread/cpu). I am not sure how 'active' is updated in execlist mode so
there may or may not be a problem in intel_context_get_total_runtime_ns).

I have another question: what about that GPU vs. GuC race mentioned in the
comment? What is the sequence of events there between GPU updating the
timestamp in the context image vs. GuC setting engine_id to -1 (at least
what is the sequence in which GPU and GuC are supposed to do these updates
though it may not matter to i915 due to scheduling delays)?


With GPU, KMD and GuC running concurrently, after a context is done, 
this is the sequence.


GPU) updates context image (total_runtime)
KMD) reads total_runtime.
KMD) sees engine_id is valid. KMD uses start_time from pphwsp to 
calculate active_time.

KMD) returns total_runtime + active_time (double accounted busyness!!)
GuC) gets ctxt switchout event and sets engine_id and start_time to ~0

This is not rare when running the IGT tests, so the total runtime is 
updated in the query only if engine_id is idle. continued below ...
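
In other words, the double accounting is avoided by only folding the context
image runtime into the accumulated total once GuC has marked the context as
switched out. A simplified sketch of that guard (mirroring the code removed
later in this patch, not new logic):

	if (engine_id != 0xffffffff && last_switch) {
		/*
		 * Context is still running: estimate active time from the
		 * PPHWSP start time, but do not touch ce->stats.runtime.total.
		 */
		active = intel_gt_clock_interval_to_ns(gt,
				guc->timestamp.gt_stamp - start_gt_clk);
	} else {
		/* Context switched out: CTX_TIMESTAMP is final, fold it in. */
		lrc_update_runtime(ce);
	}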






> 2. Pin context image before reading
> 3. Merge __guc_context_update_clks and guc_context_update_stats into a
>   single function
> 4. Call lrc_update_runtime() unconditionally in guc_context_update_stats
> 5. Seems no need to update ce->stats.active with this approach
>
> Some of the above steps may not be correct or complete but the idea is to
> discuss/improve the original patch.
>
> Cc: Umesh Nerlige Ramappa 
> Signed-off-by: Ashutosh Dixit 
> ---
> drivers/gpu/drm/i915/gt/intel_context.c   |  2 +-
> drivers/gpu/drm/i915/gt/intel_context_types.h |  2 +-
> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 41 ++-
> 3 files changed, 24 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
> index e2d70a9fdac0..c004f676de27 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.c
> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> @@ -581,7 +581,7 @@ u64 intel_context_get_total_runtime_ns(struct 
intel_context *ce)
>u64 total, active;
>
>if (ce->ops->update_stats)
> -  ce->ops->update_stats(ce);
> +  return ce->ops->update_stats(ce);

This is an improvement that we can add and eventually, we can make this a
GuC specific vfunc. Doing so may also remove the COPS_RUNTIME_ACTIVE_TOTAL
option that I added to GuC specific context ops.


I went ahead and did this in v2 of the RFC patch.


>
>total = ce->stats.runtime.total;
>if (ce->ops->flags & COPS_RUNTIME_CYCLES)
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index f7ff4c7d81c7..699435bfff99 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -59,7 +59,7 @@ struct intel_context_ops {
>
>void (*sched_disable)(struct intel_context *ce);
>
> -  void (*update_stats)(struct intel_context *ce);
> +  u64 (*update_stats)(struct intel_context *ce);
>
>void (*reset)(struct intel_context *ce);
>void (*destroy)(struct kref *kref);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index bee8cf10f749..40d0edaf2e0a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1376,7 +1376,6 @@ static void __update_guc_busyness_stats(struct 
intel_guc *guc)
> spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> }
>
> -static void __guc_context_update_clks(struct intel_context *ce);
> static void guc_timestamp_ping(struct work_struct *wrk)
> {
>struct intel_guc *guc = container_of(wrk, typeof(*guc),
> @@ -1401,7 +1400,8 @@ static void 

Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-09-01 Thread Dixit, Ashutosh
On Wed, 31 Aug 2022 15:45:49 -0700, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

I have updated my RFC patch based on your feedback so we can discuss again.

> On Wed, Aug 31, 2022 at 12:33:55PM -0700, Ashutosh Dixit wrote:
> > 1. Do all ce->stats updates and reads under guc->timestamp.lock
>
> Other than the question about READ_ONCE/WRITE_ONCE, I am not sure I
> understand what's broken in the locking logic here. Can you please add some
> description? thanks

Basically ce->stats.runtime.total and ce->stats.active are tied together so
should be accessed/updated atomically. Also just the update of
ce->stats.runtime.total (via lrc_update_runtime()) can have multiple
concurrent writers so even that has to be protected (since that update is
not atomic).

These were the reasons for:
* Introducing lrc_update_runtime_locked
* Returning early from intel_context_get_total_runtime_ns otherwise we'll
  need to introduce the same locking there
* Enforcing locking in guc_context_update_stats (which was there in your
  original patch too)

(I think execlists code didn't have this issue because there
lrc_update_runtime is called from a tasklet (which iirc is like a single
thread/cpu). I am not sure how 'active' is updated in execlist mode so
there may or may not be a problem in intel_context_get_total_runtime_ns).
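
Concretely, the locked variant mentioned above is little more than wrapping
the existing helper in guc->timestamp.lock, along the lines of the sketch
below (the exact shape in the v2 patch may differ):

	void lrc_update_runtime_locked(struct intel_context *ce)
	{
		struct intel_guc *guc = ce_to_guc(ce);
		unsigned long flags;

		spin_lock_irqsave(&guc->timestamp.lock, flags);
		/* Serialize the read-modify-write of ce->stats.runtime.total. */
		lrc_update_runtime(ce);
		spin_unlock_irqrestore(&guc->timestamp.lock, flags);
	}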

I have another question: what about that GPU vs. GuC race mentioned in the
comment? What is the sequence of events there between GPU updating the
timestamp in the context image vs. GuC setting engine_id to -1 (at least
what is the sequence in which GPU and GuC are supposed to do these updates
though it may not matter to i915 due to scheduling delays)?

>
> > 2. Pin context image before reading
> > 3. Merge __guc_context_update_clks and guc_context_update_stats into a
> >   single function
> > 4. Call lrc_update_runtime() unconditionally in guc_context_update_stats
> > 5. Seems no need to update ce->stats.active with this approach
> >
> > Some of the above steps may not be correct or complete but the idea is to
> > discuss/improve the original patch.
> >
> > Cc: Umesh Nerlige Ramappa 
> > Signed-off-by: Ashutosh Dixit 
> > ---
> > drivers/gpu/drm/i915/gt/intel_context.c   |  2 +-
> > drivers/gpu/drm/i915/gt/intel_context_types.h |  2 +-
> > .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 41 ++-
> > 3 files changed, 24 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
> > b/drivers/gpu/drm/i915/gt/intel_context.c
> > index e2d70a9fdac0..c004f676de27 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_context.c
> > @@ -581,7 +581,7 @@ u64 intel_context_get_total_runtime_ns(struct 
> > intel_context *ce)
> > u64 total, active;
> >
> > if (ce->ops->update_stats)
> > -   ce->ops->update_stats(ce);
> > +   return ce->ops->update_stats(ce);
>
> This is an improvement that we can add and eventually, we can make this a
> GuC specific vfunc. Doing so may also remove the COPS_RUNTIME_ACTIVE_TOTAL
> option that I added to GuC specific context ops.

I went ahead and did this in v2 of the RFC patch.

> >
> > total = ce->stats.runtime.total;
> > if (ce->ops->flags & COPS_RUNTIME_CYCLES)
> > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > index f7ff4c7d81c7..699435bfff99 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > @@ -59,7 +59,7 @@ struct intel_context_ops {
> >
> > void (*sched_disable)(struct intel_context *ce);
> >
> > -   void (*update_stats)(struct intel_context *ce);
> > +   u64 (*update_stats)(struct intel_context *ce);
> >
> > void (*reset)(struct intel_context *ce);
> > void (*destroy)(struct kref *kref);
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index bee8cf10f749..40d0edaf2e0a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -1376,7 +1376,6 @@ static void __update_guc_busyness_stats(struct 
> > intel_guc *guc)
> > spin_unlock_irqrestore(&guc->timestamp.lock, flags);
> > }
> >
> > -static void __guc_context_update_clks(struct intel_context *ce);
> > static void guc_timestamp_ping(struct work_struct *wrk)
> > {
> > struct intel_guc *guc = container_of(wrk, typeof(*guc),
> > @@ -1401,7 +1400,8 @@ static void guc_timestamp_ping(struct work_struct 
> > *wrk)
> >
> > /* adjust context stats for overflow */
> > xa_for_each(&guc->context_lookup, index, ce)
> > -   __guc_context_update_clks(ce);
> > +   if (ce->ops->update_stats)
> > +   ce->ops->update_stats(ce);
>
> context pinning logic needs to be separated for this (ping) path vs. the
> query path - intel_context_get_total_runtime_ns().

Done in v2 of RFC patch.

> >
> > 

Re: [Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-08-31 Thread Umesh Nerlige Ramappa

On Wed, Aug 31, 2022 at 12:33:55PM -0700, Ashutosh Dixit wrote:

1. Do all ce->stats updates and reads under guc->timestamp.lock


Other than the question about READ_ONCE/WRITE_ONCE, I am not sure I 
understand what's broken in the locking logic here. Can you please add 
some description? thanks



2. Pin context image before reading
3. Merge __guc_context_update_clks and guc_context_update_stats into a
  single function
4. Call lrc_update_runtime() unconditionally in guc_context_update_stats
5. Seems no need to update ce->stats.active with this approach

Some of the above steps may not be correct or complete but the idea is to
discuss/improve the original patch.

Cc: Umesh Nerlige Ramappa 
Signed-off-by: Ashutosh Dixit 
---
drivers/gpu/drm/i915/gt/intel_context.c   |  2 +-
drivers/gpu/drm/i915/gt/intel_context_types.h |  2 +-
.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 41 ++-
3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
index e2d70a9fdac0..c004f676de27 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -581,7 +581,7 @@ u64 intel_context_get_total_runtime_ns(struct intel_context 
*ce)
u64 total, active;

if (ce->ops->update_stats)
-   ce->ops->update_stats(ce);
+   return ce->ops->update_stats(ce);


This is an improvement that we can add and eventually, we can make this 
a GuC specific vfunc. Doing so may also remove the 
COPS_RUNTIME_ACTIVE_TOTAL option that I added to GuC specific context 
ops.




total = ce->stats.runtime.total;
if (ce->ops->flags & COPS_RUNTIME_CYCLES)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index f7ff4c7d81c7..699435bfff99 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -59,7 +59,7 @@ struct intel_context_ops {

void (*sched_disable)(struct intel_context *ce);

-   void (*update_stats)(struct intel_context *ce);
+   u64 (*update_stats)(struct intel_context *ce);

void (*reset)(struct intel_context *ce);
void (*destroy)(struct kref *kref);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index bee8cf10f749..40d0edaf2e0a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1376,7 +1376,6 @@ static void __update_guc_busyness_stats(struct intel_guc 
*guc)
spin_unlock_irqrestore(&guc->timestamp.lock, flags);
}

-static void __guc_context_update_clks(struct intel_context *ce);
static void guc_timestamp_ping(struct work_struct *wrk)
{
struct intel_guc *guc = container_of(wrk, typeof(*guc),
@@ -1401,7 +1400,8 @@ static void guc_timestamp_ping(struct work_struct *wrk)

/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
-   __guc_context_update_clks(ce);
+   if (ce->ops->update_stats)
+   ce->ops->update_stats(ce);


context pinning logic needs to be separated for this (ping) path vs. the 
query path - intel_context_get_total_runtime_ns().
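
For illustration, the split could look roughly like the sketch below.
__guc_context_sample_busyness() is a hypothetical internal helper shared by
both call sites, not a function from the patch.

	/* Ping worker: sleeping is allowed, so take (and check) a full pin. */
	if (!intel_context_pin(ce)) {
		__guc_context_sample_busyness(ce);
		intel_context_unpin(ce);
	}

	/*
	 * fdinfo query: must not sleep, so only sample contexts that are
	 * already pinned/active and fall back to cached stats for the rest.
	 */
	if (intel_context_pin_if_active(ce)) {
		__guc_context_sample_busyness(ce);
		intel_context_unpin(ce);
	}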


intel_gt_reset_unlock(gt, srcu);

@@ -1476,17 +1476,21 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
 guc->timestamp.ping_delay);
}

-static void __guc_context_update_clks(struct intel_context *ce)
+static u64 guc_context_update_stats(struct intel_context *ce)
{
struct intel_guc *guc = ce_to_guc(ce);
struct intel_gt *gt = ce->engine->gt;
u32 *pphwsp, last_switch, engine_id;
-   u64 start_gt_clk, active;
unsigned long flags;
+   u64 total, active = 0;
ktime_t unused;

+   intel_context_pin(ce);


intel_context_pin can sleep, and we are not allowed to sleep in this path -
intel_context_get_total_runtime_ns(). However, we can sleep in the ping
worker path, so ideally we want to separate it out for the two paths.



spin_lock_irqsave(&guc->timestamp.lock, flags);

+   lrc_update_runtime(ce);
+   total = ce->stats.runtime.total;


For long-running contexts and frequent queries, calling this ^ when a
context is active does not add value. I would just call it in the else
branch like before.



+
/*
 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
@@ -1502,27 +1506,26 @@ static void __guc_context_update_clks(struct 
intel_context *ce)
guc_update_pm_timestamp(guc, &unused);

if (engine_id != 0xffffffff && last_switch) {
-   start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
-   __extend_last_switch(guc, &start_gt_clk, last_switch);
-   active = intel_gt_clock_interval_to_ns(gt, 

[Intel-gfx] [RFC PATCH 2/2] Fix per client busyness locking

2022-08-31 Thread Ashutosh Dixit
1. Do all ce->stats updates and reads under guc->timestamp.lock
2. Pin context image before reading
3. Merge __guc_context_update_clks and guc_context_update_stats into a
   single function
4. Call lrc_update_runtime() unconditionally in guc_context_update_stats
5. Seems no need to update ce->stats.active with this approach

Some of the above steps may not be correct or complete but the idea is to
discuss/improve the original patch.

Cc: Umesh Nerlige Ramappa 
Signed-off-by: Ashutosh Dixit 
---
 drivers/gpu/drm/i915/gt/intel_context.c   |  2 +-
 drivers/gpu/drm/i915/gt/intel_context_types.h |  2 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 41 ++-
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
b/drivers/gpu/drm/i915/gt/intel_context.c
index e2d70a9fdac0..c004f676de27 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -581,7 +581,7 @@ u64 intel_context_get_total_runtime_ns(struct intel_context 
*ce)
u64 total, active;
 
if (ce->ops->update_stats)
-   ce->ops->update_stats(ce);
+   return ce->ops->update_stats(ce);
 
total = ce->stats.runtime.total;
if (ce->ops->flags & COPS_RUNTIME_CYCLES)
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
b/drivers/gpu/drm/i915/gt/intel_context_types.h
index f7ff4c7d81c7..699435bfff99 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -59,7 +59,7 @@ struct intel_context_ops {
 
void (*sched_disable)(struct intel_context *ce);
 
-   void (*update_stats)(struct intel_context *ce);
+   u64 (*update_stats)(struct intel_context *ce);
 
void (*reset)(struct intel_context *ce);
void (*destroy)(struct kref *kref);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index bee8cf10f749..40d0edaf2e0a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1376,7 +1376,6 @@ static void __update_guc_busyness_stats(struct intel_guc 
*guc)
spin_unlock_irqrestore(&guc->timestamp.lock, flags);
 }
 
-static void __guc_context_update_clks(struct intel_context *ce);
 static void guc_timestamp_ping(struct work_struct *wrk)
 {
struct intel_guc *guc = container_of(wrk, typeof(*guc),
@@ -1401,7 +1400,8 @@ static void guc_timestamp_ping(struct work_struct *wrk)
 
/* adjust context stats for overflow */
xa_for_each(&guc->context_lookup, index, ce)
-   __guc_context_update_clks(ce);
+   if (ce->ops->update_stats)
+   ce->ops->update_stats(ce);
 
intel_gt_reset_unlock(gt, srcu);
 
@@ -1476,17 +1476,21 @@ void intel_guc_busyness_unpark(struct intel_gt *gt)
 guc->timestamp.ping_delay);
 }
 
-static void __guc_context_update_clks(struct intel_context *ce)
+static u64 guc_context_update_stats(struct intel_context *ce)
 {
struct intel_guc *guc = ce_to_guc(ce);
struct intel_gt *gt = ce->engine->gt;
u32 *pphwsp, last_switch, engine_id;
-   u64 start_gt_clk, active;
unsigned long flags;
+   u64 total, active = 0;
ktime_t unused;
 
+   intel_context_pin(ce);
spin_lock_irqsave(&guc->timestamp.lock, flags);
 
+   lrc_update_runtime(ce);
+   total = ce->stats.runtime.total;
+
/*
 * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when context is switched
 * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
@@ -1502,27 +1506,26 @@ static void __guc_context_update_clks(struct 
intel_context *ce)
guc_update_pm_timestamp(guc, &unused);
 
if (engine_id != 0xffffffff && last_switch) {
-   start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
-   __extend_last_switch(guc, &start_gt_clk, last_switch);
-   active = intel_gt_clock_interval_to_ns(gt, 
guc->timestamp.gt_stamp - start_gt_clk);
-   WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
-   WRITE_ONCE(ce->stats.active, active);
-   } else {
-   lrc_update_runtime(ce);
+   __extend_last_switch(guc, &ce->stats.runtime.start_gt_clk, last_switch);
+   active = intel_gt_clock_interval_to_ns(gt,
+ guc->timestamp.gt_stamp - 
ce->stats.runtime.start_gt_clk);
}
 
spin_unlock_irqrestore(&guc->timestamp.lock, flags);
+   intel_context_unpin(ce);
+
+   return total + active;
 }
 
-static void guc_context_update_stats(struct intel_context *ce)
+void lrc_update_runtime_locked(struct intel_context *ce)
 {
-   if (!intel_context_pin_if_active(ce)) {
-   WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
-   WRITE_ONCE(ce->stats.active, 0);
-   return;
-   }
+   struct