[Intel-gfx] [PATCH 06/10] drm/i915/pmu: Wire up engine busy stats to PMU

2017-09-29 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We can use engine busy stats instead of the sampling timer for
better accuracy.

By doing this we replace the stochastic sampling with busyness
metric derived directly from engine activity. This is context
switch interrupt driven, so as accurate as we can get from
software tracking.

As a secondary benefit, we can also avoid running the sampling timer
in cases where only the busyness metric is enabled.

v2: Rebase.
v3:
 * Rebase, comments.
 * Leave engine busyness controls out of workers.
v4: Checkpatch cleanup.
v5: Added comment to pmu_needs_timer change.
v6:
 * Rebase.
 * Fix style of some comments. (Chris Wilson)
v7: Rebase and commit message update. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_pmu.c | 37 +++--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index f341c904c159..93c0e7ec7d75 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -90,6 +90,11 @@ static unsigned int event_enabled_bit(struct perf_event 
*event)
return config_enabled_bit(event->attr.config);
 }
 
+static bool supports_busy_stats(void)
+{
+   return i915_modparams.enable_execlists;
+}
+
 static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
 {
u64 enable;
@@ -115,6 +120,12 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, 
bool gpu_active)
 */
if (!gpu_active)
enable &= ~ENGINE_SAMPLE_MASK;
+   /*
+* Also there is software busyness tracking available we do not
+* need the timer for I915_SAMPLE_BUSY counter.
+*/
+   else if (supports_busy_stats())
+   enable &= ~BIT(I915_SAMPLE_BUSY);
 
/*
 * If some bits remain it means we need the sampling timer running.
@@ -362,6 +373,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
 
if (WARN_ON_ONCE(!engine)) {
/* Do nothing */
+   } else if (sample == I915_SAMPLE_BUSY &&
+  engine->pmu.busy_stats) {
+   val = ktime_to_ns(intel_engine_get_busy_time(engine));
} else {
val = engine->pmu.sample[sample].cur;
}
@@ -398,6 +412,12 @@ static void i915_pmu_event_read(struct perf_event *event)
local64_add(new - prev, &event->count);
 }
 
+static bool engine_needs_busy_stats(struct intel_engine_cs *engine)
+{
+   return supports_busy_stats() &&
+  (engine->pmu.enable & BIT(I915_SAMPLE_BUSY));
+}
+
 static void i915_pmu_enable(struct perf_event *event)
 {
struct drm_i915_private *i915 =
@@ -437,7 +457,14 @@ static void i915_pmu_enable(struct perf_event *event)
 
GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
-   engine->pmu.enable_count[sample]++;
+   if (engine->pmu.enable_count[sample]++ == 0) {
+   if (engine_needs_busy_stats(engine) &&
+   !engine->pmu.busy_stats) {
+   engine->pmu.busy_stats =
+   intel_enable_engine_stats(engine) == 0;
+   WARN_ON_ONCE(!engine->pmu.busy_stats);
+   }
+   }
}
 
/*
@@ -473,8 +500,14 @@ static void i915_pmu_disable(struct perf_event *event)
 * Decrement the reference count and clear the enabled
 * bitmask when the last listener on an event goes away.
 */
-   if (--engine->pmu.enable_count[sample] == 0)
+   if (--engine->pmu.enable_count[sample] == 0) {
engine->pmu.enable &= ~BIT(sample);
+   if (!engine_needs_busy_stats(engine) &&
+   engine->pmu.busy_stats) {
+   engine->pmu.busy_stats = false;
+   intel_disable_engine_stats(engine);
+   }
+   }
}
 
GEM_BUG_ON(bit >= I915_PMU_MASK_BITS);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ee738c7694e5..0f8ccb243407 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -351,6 +351,11 @@ struct intel_engine_cs {
 * Our internal timer stores the current counters in this field.
 */
struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
+   /**
+* @busy_stats: Has enablement of engine stats tracking been
+*  requested.
+*/
+   bool busy_stats;
} pmu;
 

Re: [Intel-gfx] [PATCH 06/10] drm/i915/pmu: Wire up engine busy stats to PMU

2017-10-04 Thread Tvrtko Ursulin


On 29/09/2017 13:34, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

We can use engine busy stats instead of the sampling timer for
better accuracy.

By doing this we replace the stochastic sampling with busyness
metric derived directly from engine activity. This is context
switch interrupt driven, so as accurate as we can get from
software tracking.


To quantify the precision benefit with a graph:

https://people.freedesktop.org/~tursulin/sampling-vs-busy-stats.png

This represents the render engine busyness on neverball menu screen 
looping a few times. PMU sampling interval is 100ms. It was generated 
from a special build with both PMU counters running in parallel so the 
data is absolutely aligned.


The anomaly of busyness going over 100% is caused by me not bothering to use 
the actual timestamps obtained from the PMU, but using a fixed interval (perf stat 
-I 100). But I don't think it matters for the rough comparison and 
looking at the noise and error in the sampling version.



As a secondary benefit, we can also avoid running the sampling timer
in cases where only the busyness metric is enabled.


For this one data is not showing anything significant. I've seen the 
i915_sample function use <0.1% of CPU time but that's it. Probably with 
the original version which used MMIO it was much worse.


Regards,

Tvrtko


v2: Rebase.
v3:
  * Rebase, comments.
  * Leave engine busyness controls out of workers.
v4: Checkpatch cleanup.
v5: Added comment to pmu_needs_timer change.
v6:
  * Rebase.
  * Fix style of some comments. (Chris Wilson)
v7: Rebase and commit message update. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Chris Wilson 
---
  drivers/gpu/drm/i915/i915_pmu.c | 37 +++--
  drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +
  2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index f341c904c159..93c0e7ec7d75 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -90,6 +90,11 @@ static unsigned int event_enabled_bit(struct perf_event 
*event)
return config_enabled_bit(event->attr.config);
  }
  
+static bool supports_busy_stats(void)

+{
+   return i915_modparams.enable_execlists;
+}
+
  static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
  {
u64 enable;
@@ -115,6 +120,12 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, 
bool gpu_active)
 */
if (!gpu_active)
enable &= ~ENGINE_SAMPLE_MASK;
+   /*
+* Also there is software busyness tracking available we do not
+* need the timer for I915_SAMPLE_BUSY counter.
+*/
+   else if (supports_busy_stats())
+   enable &= ~BIT(I915_SAMPLE_BUSY);
  
  	/*

 * If some bits remain it means we need the sampling timer running.
@@ -362,6 +373,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
  
  		if (WARN_ON_ONCE(!engine)) {

/* Do nothing */
+   } else if (sample == I915_SAMPLE_BUSY &&
+  engine->pmu.busy_stats) {
+   val = ktime_to_ns(intel_engine_get_busy_time(engine));
} else {
val = engine->pmu.sample[sample].cur;
}
@@ -398,6 +412,12 @@ static void i915_pmu_event_read(struct perf_event *event)
local64_add(new - prev, &event->count);
  }
  
+static bool engine_needs_busy_stats(struct intel_engine_cs *engine)

+{
+   return supports_busy_stats() &&
+  (engine->pmu.enable & BIT(I915_SAMPLE_BUSY));
+}
+
  static void i915_pmu_enable(struct perf_event *event)
  {
struct drm_i915_private *i915 =
@@ -437,7 +457,14 @@ static void i915_pmu_enable(struct perf_event *event)
  
  		GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);

GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
-   engine->pmu.enable_count[sample]++;
+   if (engine->pmu.enable_count[sample]++ == 0) {
+   if (engine_needs_busy_stats(engine) &&
+   !engine->pmu.busy_stats) {
+   engine->pmu.busy_stats =
+   intel_enable_engine_stats(engine) == 0;
+   WARN_ON_ONCE(!engine->pmu.busy_stats);
+   }
+   }
}
  
  	/*

@@ -473,8 +500,14 @@ static void i915_pmu_disable(struct perf_event *event)
 * Decrement the reference count and clear the enabled
 * bitmask when the last listener on an event goes away.
 */
-   if (--engine->pmu.enable_count[sample] == 0)
+   if (--engine->pmu.enable_count[sample] == 0) {
engine->pmu.enable &= ~BIT(sample);
+   if (!engine_needs_busy_stats(engine) &&
+   engine->pmu.busy_stats) {
+ 

Re: [Intel-gfx] [PATCH 06/10] drm/i915/pmu: Wire up engine busy stats to PMU

2017-10-04 Thread Rogozhkin, Dmitry V
Nice data, thank you!

-Original Message-
From: Tvrtko Ursulin [mailto:tvrtko.ursu...@linux.intel.com] 
Sent: Wednesday, October 4, 2017 10:35 AM
To: Tvrtko Ursulin ; Intel-gfx@lists.freedesktop.org
Cc: Chris Wilson ; Rogozhkin, Dmitry V 

Subject: Re: [Intel-gfx] [PATCH 06/10] drm/i915/pmu: Wire up engine busy stats 
to PMU


On 29/09/2017 13:34, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin 
> 
> We can use engine busy stats instead of the sampling timer for better 
> accuracy.
> 
> By doing this we replace the stochastic sampling with busyness metric 
> derived directly from engine activity. This is context switch 
> interrupt driven, so as accurate as we can get from software tracking.

To quantify the precision benefit with a graph:

https://people.freedesktop.org/~tursulin/sampling-vs-busy-stats.png

This represents the render engine busyness on neverball menu screen looping a 
few times. PMU sampling interval is 100ms. It was generated from a special 
build with both PMU counters running in parallel so the data is absolutely 
aligned.

The anomaly of busyness going over 100% is caused by me not bothering to use the 
actual timestamps obtained from the PMU, but using a fixed interval (perf stat -I 100). 
But I don't think it matters for the rough comparison and looking at the noise 
and error in the sampling version.

> As a secondary benefit, we can also avoid running the sampling timer in 
> cases where only the busyness metric is enabled.

For this one data is not showing anything significant. I've seen the 
i915_sample function use <0.1% of CPU time but that's it. Probably with the 
original version which used MMIO it was much worse.

Regards,

Tvrtko

> v2: Rebase.
> v3:
>   * Rebase, comments.
>   * Leave engine busyness controls out of workers.
> v4: Checkpatch cleanup.
> v5: Added comment to pmu_needs_timer change.
> v6:
>   * Rebase.
>   * Fix style of some comments. (Chris Wilson)
> v7: Rebase and commit message update. (Chris Wilson)
> 
> Signed-off-by: Tvrtko Ursulin 
> Reviewed-by: Chris Wilson 
> ---
>   drivers/gpu/drm/i915/i915_pmu.c | 37 
> +++--
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +
>   2 files changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c 
> b/drivers/gpu/drm/i915/i915_pmu.c index f341c904c159..93c0e7ec7d75 
> 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -90,6 +90,11 @@ static unsigned int event_enabled_bit(struct perf_event 
> *event)
>   return config_enabled_bit(event->attr.config);
>   }
>   
> +static bool supports_busy_stats(void) {
> + return i915_modparams.enable_execlists; }
> +
>   static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
>   {
>   u64 enable;
> @@ -115,6 +120,12 @@ static bool pmu_needs_timer(struct drm_i915_private 
> *i915, bool gpu_active)
>*/
>   if (!gpu_active)
>   enable &= ~ENGINE_SAMPLE_MASK;
> + /*
> +  * Also there is software busyness tracking available we do not
> +  * need the timer for I915_SAMPLE_BUSY counter.
> +  */
> + else if (supports_busy_stats())
> + enable &= ~BIT(I915_SAMPLE_BUSY);
>   
>   /*
>* If some bits remain it means we need the sampling timer running.
> @@ -362,6 +373,9 @@ static u64 __i915_pmu_event_read(struct perf_event 
> *event)
>   
>   if (WARN_ON_ONCE(!engine)) {
>   /* Do nothing */
> + } else if (sample == I915_SAMPLE_BUSY &&
> +engine->pmu.busy_stats) {
> + val = ktime_to_ns(intel_engine_get_busy_time(engine));
>   } else {
>   val = engine->pmu.sample[sample].cur;
>   }
> @@ -398,6 +412,12 @@ static void i915_pmu_event_read(struct perf_event *event)
>   local64_add(new - prev, &event->count);
>   }
>   
> +static bool engine_needs_busy_stats(struct intel_engine_cs *engine) {
> + return supports_busy_stats() &&
> +(engine->pmu.enable & BIT(I915_SAMPLE_BUSY)); }
> +
>   static void i915_pmu_enable(struct perf_event *event)
>   {
>   struct drm_i915_private *i915 =
> @@ -437,7 +457,14 @@ static void i915_pmu_enable(struct perf_event 
> *event)
>   
>   GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
>   GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
> - engine->pmu.enable_count[sample]++;
> + if (engine->pmu.enable_count[sample]++ == 0) {
> + if (engine_needs_busy_stats(engine) &&
> + !engine->pmu.bus