Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-06-07 Thread Chris Wilson
Quoting Tvrtko Ursulin (2018-06-07 14:25:28)
> From: Tvrtko Ursulin 
> 
> We add a PMU counter to expose the number of requests currently executing
> on the GPU.
> 
> This is useful to analyze the overall load of the system.
> 
> v2:
>  * Rebase.
>  * Drop floating point constant. (Chris Wilson)
> 
> v3:
>  * Change scale to 1024 for faster arithmetics. (Chris Wilson)
> 
> v4:
>  * Refactored for timer period accounting.
> 
> v5:
>  * Avoid 64-division. (Chris Wilson)
> 
> v6:
>  * Do fewer divisions by accumulating in qd.ns units. (Chris Wilson)
>  * Change counter scale to avoid multiplication in readout and increase
>    counter headroom.
> 
> Signed-off-by: Tvrtko Ursulin 

I can't spot any nits to pick. That means I actually have to review it
now, right?
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-06-07 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
 * Rebase.
 * Drop floating point constant. (Chris Wilson)

v3:
 * Change scale to 1024 for faster arithmetics. (Chris Wilson)

v4:
 * Refactored for timer period accounting.

v5:
 * Avoid 64-division. (Chris Wilson)

v6:
 * Do fewer divisions by accumulating in qd.ns units. (Chris Wilson)
 * Change counter scale to avoid multiplication in readout and increase
   counter headroom.

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_pmu.c | 20 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +-
 include/uapi/drm/i915_drm.h |  5 +
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index bdfb430909b4..73b6fe7cc6af 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -17,7 +17,8 @@
 BIT(I915_SAMPLE_WAIT) | \
 BIT(I915_SAMPLE_SEMA) | \
 BIT(I915_SAMPLE_QUEUED) | \
-BIT(I915_SAMPLE_RUNNABLE))
+BIT(I915_SAMPLE_RUNNABLE) | \
+BIT(I915_SAMPLE_RUNNING))
 
 #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
 
@@ -222,6 +223,11 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)

add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNABLE],
engine->request_stats.runnable,
period_ns);
+
+   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
+   add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING],
+   last_seqno - current_seqno,
+   period_ns);
}
 
if (fw)
@@ -337,6 +343,7 @@ engine_event_status(struct intel_engine_cs *engine,
case I915_SAMPLE_WAIT:
case I915_SAMPLE_QUEUED:
case I915_SAMPLE_RUNNABLE:
+   case I915_SAMPLE_RUNNING:
break;
case I915_SAMPLE_SEMA:
if (INTEL_GEN(engine->i915) < 6)
@@ -556,11 +563,14 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
val = engine->pmu.sample[sample].cur;
 
if (sample == I915_SAMPLE_QUEUED ||
-   sample == I915_SAMPLE_RUNNABLE) {
+   sample == I915_SAMPLE_RUNNABLE ||
+   sample == I915_SAMPLE_RUNNING) {
BUILD_BUG_ON(NSEC_PER_SEC %
 I915_SAMPLE_QUEUED_DIVISOR);
BUILD_BUG_ON(I915_SAMPLE_QUEUED_DIVISOR !=
 I915_SAMPLE_RUNNABLE_DIVISOR);
+   BUILD_BUG_ON(I915_SAMPLE_QUEUED_DIVISOR !=
+I915_SAMPLE_RUNNING_DIVISOR);
/* to qd */
val = div_u64(val,
  NSEC_PER_SEC /
@@ -862,6 +872,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name,
 /* No brackets or quotes below please. */
 #define I915_SAMPLE_QUEUED_SCALE 0.001
 #define I915_SAMPLE_RUNNABLE_SCALE 0.001
+#define I915_SAMPLE_RUNNING_SCALE 0.001
 
 static struct attribute **
 create_event_attributes(struct drm_i915_private *i915)
@@ -889,6 +900,8 @@ create_event_attributes(struct drm_i915_private *i915)
 __stringify(I915_SAMPLE_QUEUED_SCALE)),
__engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable",
 __stringify(I915_SAMPLE_RUNNABLE_SCALE)),
+   __engine_event_scale(I915_SAMPLE_RUNNING, "running",
+__stringify(I915_SAMPLE_RUNNING_SCALE)),
};
unsigned int count = 0;
struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
@@ -904,6 +917,9 @@ create_event_attributes(struct drm_i915_private *i915)
BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR !=
 (1 / I915_SAMPLE_RUNNABLE_SCALE));
 
+   BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR !=
+(1 / I915_SAMPLE_RUNNING_SCALE));
+
/* Count how many counters we will be exposing. */
for (i = 0; i < ARRAY_SIZE(events); i++) {
if (!config_status(i915, events[i].config))
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 703cea694f0d..bff20cfd6870 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -420,7 +420,7 @@ struct intel_engine_cs {
 *
 * Our internal timer stores the current counters in this field.
 */
-#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNN

Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-06-06 Thread Tvrtko Ursulin


On 06/06/2018 16:23, Chris Wilson wrote:

Quoting Tvrtko Ursulin (2018-06-06 15:40:10)

From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
  * Rebase.
  * Drop floating point constant. (Chris Wilson)

v3:
  * Change scale to 1024 for faster arithmetics. (Chris Wilson)

v4:
  * Refactored for timer period accounting.

v5:
  * Avoid 64-division. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
---
  #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
  
@@ -226,6 +227,13 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)

 div_u64((u64)period_ns *
 I915_SAMPLE_QUEUED_DIVISOR,
 100));
+
+   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
+   add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING],
+   last_seqno - current_seqno,
+   div_u64((u64)period_ns *
+   I915_SAMPLE_QUEUED_DIVISOR,
+   100));


Are we worried about losing precision with qd.ns?

add_sample_mult(SAMPLE, x, period_ns); here


@@ -560,7 +569,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
 val = engine->pmu.sample[sample].cur;
  
 if (sample == I915_SAMPLE_QUEUED ||

-   sample == I915_SAMPLE_RUNNABLE)
+   sample == I915_SAMPLE_RUNNABLE ||
+   sample == I915_SAMPLE_RUNNING)
 val = div_u64(val, MSEC_PER_SEC);  /* to qd */


and val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR, NSEC_PER_SEC);


Yeah that works, thanks.


So that gives us a limit of ~1 million qd (assuming the user cares for
about 1s intervals). Up to 8 million wlog with

val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR/8, NSEC_PER_SEC/8);


Or keep in qd.us as for frequency. I think precision is plenty in any case.


Anyway, just concerned to have more than one 64b division and want to
provoke you into thinking of a way of avoiding it :)


It is an optimized 64-bit divide, or 64-divide as I faltered in the 
commit message :), so not as bad as 64/64, but still your idea is very good.


Regards,

Tvrtko
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-06-06 Thread Chris Wilson
Quoting Tvrtko Ursulin (2018-06-06 15:40:10)
> From: Tvrtko Ursulin 
> 
> We add a PMU counter to expose the number of requests currently executing
> on the GPU.
> 
> This is useful to analyze the overall load of the system.
> 
> v2:
>  * Rebase.
>  * Drop floating point constant. (Chris Wilson)
> 
> v3:
>  * Change scale to 1024 for faster arithmetics. (Chris Wilson)
> 
> v4:
>  * Refactored for timer period accounting.
> 
> v5:
>  * Avoid 64-division. (Chris Wilson)
> 
> Signed-off-by: Tvrtko Ursulin 
> ---
>  #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
>  
> @@ -226,6 +227,13 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
> div_u64((u64)period_ns *
> I915_SAMPLE_QUEUED_DIVISOR,
> 100));
> +
> +   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
> +   add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING],
> +   last_seqno - current_seqno,
> +   div_u64((u64)period_ns *
> +   I915_SAMPLE_QUEUED_DIVISOR,
> +   100));

Are we worried about losing precision with qd.ns?

add_sample_mult(SAMPLE, x, period_ns); here

> @@ -560,7 +569,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
> val = engine->pmu.sample[sample].cur;
>  
> if (sample == I915_SAMPLE_QUEUED ||
> -   sample == I915_SAMPLE_RUNNABLE)
> +   sample == I915_SAMPLE_RUNNABLE ||
> +   sample == I915_SAMPLE_RUNNING)
> val = div_u64(val, MSEC_PER_SEC);  /* to qd */

and val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR, NSEC_PER_SEC);

So that gives us a limit of ~1 million qd (assuming the user cares for
about 1s intervals). Up to 8 million wlog with

val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR/8, NSEC_PER_SEC/8);

Anyway, just concerned to have more than one 64b division and want to
provoke you into thinking of a way of avoiding it :)
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-06-06 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
 * Rebase.
 * Drop floating point constant. (Chris Wilson)

v3:
 * Change scale to 1024 for faster arithmetics. (Chris Wilson)

v4:
 * Refactored for timer period accounting.

v5:
 * Avoid 64-division. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_pmu.c | 20 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +-
 include/uapi/drm/i915_drm.h |  5 +
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 46a516a748c8..9ecaf662b5c1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -17,7 +17,8 @@
 BIT(I915_SAMPLE_WAIT) | \
 BIT(I915_SAMPLE_SEMA) | \
 BIT(I915_SAMPLE_QUEUED) | \
-BIT(I915_SAMPLE_RUNNABLE))
+BIT(I915_SAMPLE_RUNNABLE) | \
+BIT(I915_SAMPLE_RUNNING))
 
 #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
 
@@ -226,6 +227,13 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
div_u64((u64)period_ns *
I915_SAMPLE_QUEUED_DIVISOR,
100));
+
+   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
+   add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING],
+   last_seqno - current_seqno,
+   div_u64((u64)period_ns *
+   I915_SAMPLE_QUEUED_DIVISOR,
+   100));
}
 
if (fw)
@@ -341,6 +349,7 @@ engine_event_status(struct intel_engine_cs *engine,
case I915_SAMPLE_WAIT:
case I915_SAMPLE_QUEUED:
case I915_SAMPLE_RUNNABLE:
+   case I915_SAMPLE_RUNNING:
break;
case I915_SAMPLE_SEMA:
if (INTEL_GEN(engine->i915) < 6)
@@ -560,7 +569,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
val = engine->pmu.sample[sample].cur;
 
if (sample == I915_SAMPLE_QUEUED ||
-   sample == I915_SAMPLE_RUNNABLE)
+   sample == I915_SAMPLE_RUNNABLE ||
+   sample == I915_SAMPLE_RUNNING)
val = div_u64(val, MSEC_PER_SEC);  /* to qd */
}
} else {
@@ -858,6 +868,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name,
 /* No brackets or quotes below please. */
 #define I915_SAMPLE_QUEUED_SCALE 0.0009765625
 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625
+#define I915_SAMPLE_RUNNING_SCALE 0.0009765625
 
 static struct attribute **
 create_event_attributes(struct drm_i915_private *i915)
@@ -885,6 +896,8 @@ create_event_attributes(struct drm_i915_private *i915)
 __stringify(I915_SAMPLE_QUEUED_SCALE)),
__engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable",
 __stringify(I915_SAMPLE_RUNNABLE_SCALE)),
+   __engine_event_scale(I915_SAMPLE_RUNNING, "running",
+__stringify(I915_SAMPLE_RUNNING_SCALE)),
};
unsigned int count = 0;
struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
@@ -900,6 +913,9 @@ create_event_attributes(struct drm_i915_private *i915)
BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR !=
 (1 / I915_SAMPLE_RUNNABLE_SCALE));
 
+   BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR !=
+(1 / I915_SAMPLE_RUNNING_SCALE));
+
/* Count how many counters we will be exposing. */
for (i = 0; i < ARRAY_SIZE(events); i++) {
if (!config_status(i915, events[i].config))
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 703cea694f0d..bff20cfd6870 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -420,7 +420,7 @@ struct intel_engine_cs {
 *
 * Our internal timer stores the current counters in this field.
 */
-#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1)
+#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1)
struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
} pmu;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index cf0265b20e37..9a00c30e4071 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample {
I915_SAMPLE_SEMA = 2,
I915_SAMPLE_QUEUED = 3,
I915_SAMPLE_RUNNABLE = 4,
+ 

[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-06-06 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
 * Rebase.
 * Drop floating point constant. (Chris Wilson)

v3:
 * Change scale to 1024 for faster arithmetics. (Chris Wilson)

v4:
 * Refactored for timer period accounting.

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_pmu.c | 19 +--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +-
 include/uapi/drm/i915_drm.h |  5 +
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 41527b682c72..60dc68e4911c 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -17,7 +17,8 @@
 BIT(I915_SAMPLE_WAIT) | \
 BIT(I915_SAMPLE_SEMA) | \
 BIT(I915_SAMPLE_QUEUED) | \
-BIT(I915_SAMPLE_RUNNABLE))
+BIT(I915_SAMPLE_RUNNABLE) | \
+BIT(I915_SAMPLE_RUNNING))
 
 #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
 
@@ -224,6 +225,12 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
engine->request_stats.runnable,
(u64)period_ns *
I915_SAMPLE_QUEUED_DIVISOR / 100);
+
+   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
+   add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING],
+   last_seqno - current_seqno,
+   (u64)period_ns *
+   I915_SAMPLE_QUEUED_DIVISOR / 100);
}
 
if (fw)
@@ -339,6 +346,7 @@ engine_event_status(struct intel_engine_cs *engine,
case I915_SAMPLE_WAIT:
case I915_SAMPLE_QUEUED:
case I915_SAMPLE_RUNNABLE:
+   case I915_SAMPLE_RUNNING:
break;
case I915_SAMPLE_SEMA:
if (INTEL_GEN(engine->i915) < 6)
@@ -558,7 +566,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
val = engine->pmu.sample[sample].cur;
 
if (sample == I915_SAMPLE_QUEUED ||
-   sample == I915_SAMPLE_RUNNABLE)
+   sample == I915_SAMPLE_RUNNABLE ||
+   sample == I915_SAMPLE_RUNNING)
val = div_u64(val, MSEC_PER_SEC);  /* to qd */
}
} else {
@@ -856,6 +865,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name,
 /* No brackets or quotes below please. */
 #define I915_SAMPLE_QUEUED_SCALE 0.0009765625
 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625
+#define I915_SAMPLE_RUNNING_SCALE 0.0009765625
 
 static struct attribute **
 create_event_attributes(struct drm_i915_private *i915)
@@ -883,6 +893,8 @@ create_event_attributes(struct drm_i915_private *i915)
 __stringify(I915_SAMPLE_QUEUED_SCALE)),
__engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable",
 __stringify(I915_SAMPLE_RUNNABLE_SCALE)),
+   __engine_event_scale(I915_SAMPLE_RUNNING, "running",
+__stringify(I915_SAMPLE_RUNNING_SCALE)),
};
unsigned int count = 0;
struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
@@ -898,6 +910,9 @@ create_event_attributes(struct drm_i915_private *i915)
BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR !=
 (1 / I915_SAMPLE_RUNNABLE_SCALE));
 
+   BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR !=
+(1 / I915_SAMPLE_RUNNING_SCALE));
+
/* Count how many counters we will be exposing. */
for (i = 0; i < ARRAY_SIZE(events); i++) {
if (!config_status(i915, events[i].config))
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 703cea694f0d..bff20cfd6870 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -420,7 +420,7 @@ struct intel_engine_cs {
 *
 * Our internal timer stores the current counters in this field.
 */
-#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1)
+#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1)
struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
} pmu;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index cf0265b20e37..9a00c30e4071 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample {
I915_SAMPLE_SEMA = 2,
I915_SAMPLE_QUEUED = 3,
I915_SAMPLE_RUNNABLE = 4,
+   I915_SAMPLE_RUNNING = 5,
 };
 
  /* Divide counter value by divisor to get the real value. */

Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-04-09 Thread Tvrtko Ursulin


On 06/04/2018 21:24, Chris Wilson wrote:

Quoting Tvrtko Ursulin (2018-04-05 13:39:22)

From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
  * Rebase.
  * Drop floating point constant. (Chris Wilson)

v3:
  * Change scale to 1024 for faster arithmetics. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 


Reviewed-by: Chris Wilson 

Do we want these separate in the final push? Is there value in reverting
one but not the others? They seem a triumvirate.


I think the only benefit to have them separate for me was that rebasing 
was marginally easier. I can just as well squash them if that is preferred.


Regards,

Tvrtko
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-04-06 Thread Chris Wilson
Quoting Tvrtko Ursulin (2018-04-05 13:39:22)
> From: Tvrtko Ursulin 
> 
> We add a PMU counter to expose the number of requests currently executing
> on the GPU.
> 
> This is useful to analyze the overall load of the system.
> 
> v2:
>  * Rebase.
>  * Drop floating point constant. (Chris Wilson)
> 
> v3:
>  * Change scale to 1024 for faster arithmetics. (Chris Wilson)
> 
> Signed-off-by: Tvrtko Ursulin 

Reviewed-by: Chris Wilson 

Do we want these separate in the final push? Is there value in reverting
one but not the others? They seem a triumvirate.
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-04-05 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
 * Rebase.
 * Drop floating point constant. (Chris Wilson)

v3:
 * Change scale to 1024 for faster arithmetics. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_pmu.c | 18 --
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +-
 include/uapi/drm/i915_drm.h |  5 +
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index afc561e1aa92..bd7e695fc663 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -17,7 +17,8 @@
 BIT(I915_SAMPLE_WAIT) | \
 BIT(I915_SAMPLE_SEMA) | \
 BIT(I915_SAMPLE_QUEUED) | \
-BIT(I915_SAMPLE_RUNNABLE))
+BIT(I915_SAMPLE_RUNNABLE) | \
+BIT(I915_SAMPLE_RUNNING))
 
 #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
 
@@ -211,6 +212,11 @@ static void engines_sample(struct drm_i915_private *dev_priv)
update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNABLE],
  I915_SAMPLE_RUNNABLE_DIVISOR,
  engine->request_stats.runnable);
+
+   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
+   update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNING],
+ I915_SAMPLE_RUNNING_DIVISOR,
+ last_seqno - current_seqno);
}
 
if (fw)
@@ -310,6 +316,7 @@ engine_event_status(struct intel_engine_cs *engine,
case I915_SAMPLE_WAIT:
case I915_SAMPLE_QUEUED:
case I915_SAMPLE_RUNNABLE:
+   case I915_SAMPLE_RUNNING:
break;
case I915_SAMPLE_SEMA:
if (INTEL_GEN(engine->i915) < 6)
@@ -513,7 +520,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
}
 
if (sample == I915_SAMPLE_QUEUED ||
-   sample == I915_SAMPLE_RUNNABLE)
+   sample == I915_SAMPLE_RUNNABLE ||
+   sample == I915_SAMPLE_RUNNING)
val = div_u64(val, FREQUENCY);
} else {
switch (event->attr.config) {
@@ -810,6 +818,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name,
 /* No brackets or quotes below please. */
 #define I915_SAMPLE_QUEUED_SCALE 0.0009765625
 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625
+#define I915_SAMPLE_RUNNING_SCALE 0.0009765625
 
 static struct attribute **
 create_event_attributes(struct drm_i915_private *i915)
@@ -837,6 +846,8 @@ create_event_attributes(struct drm_i915_private *i915)
 __stringify(I915_SAMPLE_QUEUED_SCALE)),
__engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable",
 __stringify(I915_SAMPLE_RUNNABLE_SCALE)),
+   __engine_event_scale(I915_SAMPLE_RUNNING, "running",
+__stringify(I915_SAMPLE_RUNNING_SCALE)),
};
unsigned int count = 0;
struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
@@ -852,6 +863,9 @@ create_event_attributes(struct drm_i915_private *i915)
BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR !=
 (1 / I915_SAMPLE_RUNNABLE_SCALE));
 
+   BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR !=
+(1 / I915_SAMPLE_RUNNING_SCALE));
+
/* Count how many counters we will be exposing. */
for (i = 0; i < ARRAY_SIZE(events); i++) {
if (!config_status(i915, events[i].config))
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5af93e88c90f..d50b31eb43a5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -414,7 +414,7 @@ struct intel_engine_cs {
 *
 * Our internal timer stores the current counters in this field.
 */
-#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1)
+#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1)
struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
} pmu;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index cf0265b20e37..9a00c30e4071 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample {
I915_SAMPLE_SEMA = 2,
I915_SAMPLE_QUEUED = 3,
I915_SAMPLE_RUNNABLE = 4,
+   I915_SAMPLE_RUNNING = 5,
 };
 
  /* Divide counter value by divisor to get the real value. */
 #define I915_SAMPLE_QUEUED_DIVISOR (1024)
 #define I915_SAMPLE_RUNNABLE_DIVISOR (1024)
+#define I915_SAMPLE_RUNNING_DIVISOR (1024)
 
 #define I915_PMU_SAMPLE_BITS (4)
 #define I915_PMU_SAMPLE_

[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter

2018-03-19 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

We add a PMU counter to expose the number of requests currently executing
on the GPU.

This is useful to analyze the overall load of the system.

v2:
 * Rebase.
 * Drop floating point constant. (Chris Wilson)

v3:
 * Change scale to 1024 for faster arithmetics. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_pmu.c | 18 --
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +-
 include/uapi/drm/i915_drm.h |  5 +
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index afc561e1aa92..bd7e695fc663 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -17,7 +17,8 @@
 BIT(I915_SAMPLE_WAIT) | \
 BIT(I915_SAMPLE_SEMA) | \
 BIT(I915_SAMPLE_QUEUED) | \
-BIT(I915_SAMPLE_RUNNABLE))
+BIT(I915_SAMPLE_RUNNABLE) | \
+BIT(I915_SAMPLE_RUNNING))
 
 #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
 
@@ -211,6 +212,11 @@ static void engines_sample(struct drm_i915_private *dev_priv)
update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNABLE],
  I915_SAMPLE_RUNNABLE_DIVISOR,
  engine->request_stats.runnable);
+
+   if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING))
+   update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNING],
+ I915_SAMPLE_RUNNING_DIVISOR,
+ last_seqno - current_seqno);
}
 
if (fw)
@@ -310,6 +316,7 @@ engine_event_status(struct intel_engine_cs *engine,
case I915_SAMPLE_WAIT:
case I915_SAMPLE_QUEUED:
case I915_SAMPLE_RUNNABLE:
+   case I915_SAMPLE_RUNNING:
break;
case I915_SAMPLE_SEMA:
if (INTEL_GEN(engine->i915) < 6)
@@ -513,7 +520,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
}
 
if (sample == I915_SAMPLE_QUEUED ||
-   sample == I915_SAMPLE_RUNNABLE)
+   sample == I915_SAMPLE_RUNNABLE ||
+   sample == I915_SAMPLE_RUNNING)
val = div_u64(val, FREQUENCY);
} else {
switch (event->attr.config) {
@@ -810,6 +818,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name,
 /* No brackets or quotes below please. */
 #define I915_SAMPLE_QUEUED_SCALE 0.0009765625
 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625
+#define I915_SAMPLE_RUNNING_SCALE 0.0009765625
 
 static struct attribute **
 create_event_attributes(struct drm_i915_private *i915)
@@ -837,6 +846,8 @@ create_event_attributes(struct drm_i915_private *i915)
 __stringify(I915_SAMPLE_QUEUED_SCALE)),
__engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable",
 __stringify(I915_SAMPLE_RUNNABLE_SCALE)),
+   __engine_event_scale(I915_SAMPLE_RUNNING, "running",
+__stringify(I915_SAMPLE_RUNNING_SCALE)),
};
unsigned int count = 0;
struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
@@ -852,6 +863,9 @@ create_event_attributes(struct drm_i915_private *i915)
BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR !=
 (1 / I915_SAMPLE_RUNNABLE_SCALE));
 
+   BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR !=
+(1 / I915_SAMPLE_RUNNING_SCALE));
+
/* Count how many counters we will be exposing. */
for (i = 0; i < ARRAY_SIZE(events); i++) {
if (!config_status(i915, events[i].config))
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d7532b185fe..fe1b7d0a94e9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -414,7 +414,7 @@ struct intel_engine_cs {
 *
 * Our internal timer stores the current counters in this field.
 */
-#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1)
+#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1)
struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
} pmu;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index cf0265b20e37..9a00c30e4071 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample {
I915_SAMPLE_SEMA = 2,
I915_SAMPLE_QUEUED = 3,
I915_SAMPLE_RUNNABLE = 4,
+   I915_SAMPLE_RUNNING = 5,
 };
 
  /* Divide counter value by divisor to get the real value. */
 #define I915_SAMPLE_QUEUED_DIVISOR (1024)
 #define I915_SAMPLE_RUNNABLE_DIVISOR (1024)
+#define I915_SAMPLE_RUNNING_DIVISOR (1024)
 
 #define I915_PMU_SAMPLE_BITS (4)
 #define I915_PMU_SAMPLE_