Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
Quoting Tvrtko Ursulin (2018-06-07 14:25:28) > From: Tvrtko Ursulin > > We add a PMU counter to expose the number of requests currently executing > on the GPU. > > This is useful to analyze the overall load of the system. > > v2: > * Rebase. > * Drop floating point constant. (Chris Wilson) > > v3: > * Change scale to 1024 for faster arithmetics. (Chris Wilson) > > v4: > * Refactored for timer period accounting. > > v5: > * Avoid 64-division. (Chris Wilson) > > v6: > * Do fewer divisions by accumulating in qd.ns units. (Chris Wilson) > * Change counter scale to avoid multiplication in readout and increase >counter headroom. > > Signed-off-by: Tvrtko Ursulin I can't spot any nits to pick. That means I actually have to review it now, right? -Chris ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) v4: * Refactored for timer period accounting. v5: * Avoid 64-division. (Chris Wilson) v6: * Do fewer divisions by accumulating in qd.ns units. (Chris Wilson) * Change counter scale to avoid multiplication in readout and increase counter headroom. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_pmu.c | 20 ++-- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- include/uapi/drm/i915_drm.h | 5 + 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index bdfb430909b4..73b6fe7cc6af 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -17,7 +17,8 @@ BIT(I915_SAMPLE_WAIT) | \ BIT(I915_SAMPLE_SEMA) | \ BIT(I915_SAMPLE_QUEUED) | \ -BIT(I915_SAMPLE_RUNNABLE)) +BIT(I915_SAMPLE_RUNNABLE) | \ +BIT(I915_SAMPLE_RUNNING)) #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) @@ -222,6 +223,11 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns) add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNABLE], engine->request_stats.runnable, period_ns); + + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) + add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING], + last_seqno - current_seqno, + period_ns); } if (fw) @@ -337,6 +343,7 @@ engine_event_status(struct intel_engine_cs *engine, case I915_SAMPLE_WAIT: case I915_SAMPLE_QUEUED: case I915_SAMPLE_RUNNABLE: + case I915_SAMPLE_RUNNING: break; case I915_SAMPLE_SEMA: if (INTEL_GEN(engine->i915) < 6) @@ -556,11 +563,14 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = engine->pmu.sample[sample].cur; if (sample == I915_SAMPLE_QUEUED || - sample == I915_SAMPLE_RUNNABLE) { + sample == I915_SAMPLE_RUNNABLE || + sample == I915_SAMPLE_RUNNING) { BUILD_BUG_ON(NSEC_PER_SEC % I915_SAMPLE_QUEUED_DIVISOR); BUILD_BUG_ON(I915_SAMPLE_QUEUED_DIVISOR != I915_SAMPLE_RUNNABLE_DIVISOR); + BUILD_BUG_ON(I915_SAMPLE_QUEUED_DIVISOR != +I915_SAMPLE_RUNNING_DIVISOR); /* to qd */ val = div_u64(val, NSEC_PER_SEC / @@ -862,6 +872,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name, /* No brackets or quotes below please. */ #define I915_SAMPLE_QUEUED_SCALE 0.001 #define I915_SAMPLE_RUNNABLE_SCALE 0.001 +#define I915_SAMPLE_RUNNING_SCALE 0.001 static struct attribute ** create_event_attributes(struct drm_i915_private *i915) @@ -889,6 +900,8 @@ create_event_attributes(struct drm_i915_private *i915) __stringify(I915_SAMPLE_QUEUED_SCALE)), __engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable", __stringify(I915_SAMPLE_RUNNABLE_SCALE)), + __engine_event_scale(I915_SAMPLE_RUNNING, "running", +__stringify(I915_SAMPLE_RUNNING_SCALE)), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -904,6 +917,9 @@ create_event_attributes(struct drm_i915_private *i915) BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR != (1 / I915_SAMPLE_RUNNABLE_SCALE)); + BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR != +(1 / I915_SAMPLE_RUNNING_SCALE)); + /* Count how many counters we will be exposing. */ for (i = 0; i < ARRAY_SIZE(events); i++) { if (!config_status(i915, events[i].config)) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 703cea694f0d..bff20cfd6870 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -420,7 +420,7 @@ struct intel_engine_cs { * * Our internal timer stores the current counters in this field. */ -#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNN
Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
On 06/06/2018 16:23, Chris Wilson wrote: Quoting Tvrtko Ursulin (2018-06-06 15:40:10) From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) v4: * Refactored for timer period accounting. v5: * Avoid 64-division. (Chris Wilson) Signed-off-by: Tvrtko Ursulin --- #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) @@ -226,6 +227,13 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns) div_u64((u64)period_ns * I915_SAMPLE_QUEUED_DIVISOR, 100)); + + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) + add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING], + last_seqno - current_seqno, + div_u64((u64)period_ns * + I915_SAMPLE_QUEUED_DIVISOR, + 100)); Are we worried about losing precision with qd.ns? add_sample_mult(SAMPLE, x, period_ns); here @@ -560,7 +569,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = engine->pmu.sample[sample].cur; if (sample == I915_SAMPLE_QUEUED || - sample == I915_SAMPLE_RUNNABLE) + sample == I915_SAMPLE_RUNNABLE || + sample == I915_SAMPLE_RUNNING) val = div_u64(val, MSEC_PER_SEC); /* to qd */ and val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR, NSEC_PER_SEC); Yeah that works, thanks. So that gives us a limit of ~1 million qd (assuming the user cares for about 1s intervals). Up to 8 million wlog with val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR/8, NSEC_PER_SEC/8); Or keep in qd.us as for frequency. I think precision is plenty in any case. Anyway, just concerned to have more than one 64b division and want to provoke you into thinking of a way of avoiding it :) It is an optimized 64-bit divide, or 64-divide as I faltered in the commit message :), so not as bad as 64/64, but still your idea is very good. Regards, Tvrtko ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
Quoting Tvrtko Ursulin (2018-06-06 15:40:10) > From: Tvrtko Ursulin > > We add a PMU counter to expose the number of requests currently executing > on the GPU. > > This is useful to analyze the overall load of the system. > > v2: > * Rebase. > * Drop floating point constant. (Chris Wilson) > > v3: > * Change scale to 1024 for faster arithmetics. (Chris Wilson) > > v4: > * Refactored for timer period accounting. > > v5: > * Avoid 64-division. (Chris Wilson) > > Signed-off-by: Tvrtko Ursulin > --- > #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) > > @@ -226,6 +227,13 @@ engines_sample(struct drm_i915_private *dev_priv, > unsigned int period_ns) > div_u64((u64)period_ns * > I915_SAMPLE_QUEUED_DIVISOR, > 100)); > + > + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) > + > add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING], > + last_seqno - current_seqno, > + div_u64((u64)period_ns * > + I915_SAMPLE_QUEUED_DIVISOR, > + 100)); Are we worried about losing precision with qd.ns? add_sample_mult(SAMPLE, x, period_ns); here > @@ -560,7 +569,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event) > val = engine->pmu.sample[sample].cur; > > if (sample == I915_SAMPLE_QUEUED || > - sample == I915_SAMPLE_RUNNABLE) > + sample == I915_SAMPLE_RUNNABLE || > + sample == I915_SAMPLE_RUNNING) > val = div_u64(val, MSEC_PER_SEC); /* to qd */ and val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR, NSEC_PER_SEC); So that gives us a limit of ~1 million qd (assuming the user cares for about 1s intervals). Up to 8 million wlog with val = div_u64(val * I915_SAMPLE_QUEUED_DIVISOR/8, NSEC_PER_SEC/8); Anyway, just concerned to have more than one 64b division and want to provoke you into thinking of a way of avoiding it :) -Chris ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) v4: * Refactored for timer period accounting. v5: * Avoid 64-division. (Chris Wilson) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_pmu.c | 20 ++-- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- include/uapi/drm/i915_drm.h | 5 + 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 46a516a748c8..9ecaf662b5c1 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -17,7 +17,8 @@ BIT(I915_SAMPLE_WAIT) | \ BIT(I915_SAMPLE_SEMA) | \ BIT(I915_SAMPLE_QUEUED) | \ -BIT(I915_SAMPLE_RUNNABLE)) +BIT(I915_SAMPLE_RUNNABLE) | \ +BIT(I915_SAMPLE_RUNNING)) #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) @@ -226,6 +227,13 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns) div_u64((u64)period_ns * I915_SAMPLE_QUEUED_DIVISOR, 100)); + + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) + add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING], + last_seqno - current_seqno, + div_u64((u64)period_ns * + I915_SAMPLE_QUEUED_DIVISOR, + 100)); } if (fw) @@ -341,6 +349,7 @@ engine_event_status(struct intel_engine_cs *engine, case I915_SAMPLE_WAIT: case I915_SAMPLE_QUEUED: case I915_SAMPLE_RUNNABLE: + case I915_SAMPLE_RUNNING: break; case I915_SAMPLE_SEMA: if (INTEL_GEN(engine->i915) < 6) @@ -560,7 +569,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = engine->pmu.sample[sample].cur; if (sample == I915_SAMPLE_QUEUED || - sample == I915_SAMPLE_RUNNABLE) + sample == I915_SAMPLE_RUNNABLE || + sample == I915_SAMPLE_RUNNING) val = div_u64(val, MSEC_PER_SEC); /* to qd */ } } else { @@ -858,6 +868,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name, /* No brackets or quotes below please. */ #define I915_SAMPLE_QUEUED_SCALE 0.0009765625 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625 +#define I915_SAMPLE_RUNNING_SCALE 0.0009765625 static struct attribute ** create_event_attributes(struct drm_i915_private *i915) @@ -885,6 +896,8 @@ create_event_attributes(struct drm_i915_private *i915) __stringify(I915_SAMPLE_QUEUED_SCALE)), __engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable", __stringify(I915_SAMPLE_RUNNABLE_SCALE)), + __engine_event_scale(I915_SAMPLE_RUNNING, "running", +__stringify(I915_SAMPLE_RUNNING_SCALE)), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -900,6 +913,9 @@ create_event_attributes(struct drm_i915_private *i915) BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR != (1 / I915_SAMPLE_RUNNABLE_SCALE)); + BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR != +(1 / I915_SAMPLE_RUNNING_SCALE)); + /* Count how many counters we will be exposing. */ for (i = 0; i < ARRAY_SIZE(events); i++) { if (!config_status(i915, events[i].config)) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 703cea694f0d..bff20cfd6870 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -420,7 +420,7 @@ struct intel_engine_cs { * * Our internal timer stores the current counters in this field. */ -#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1) +#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1) struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; } pmu; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index cf0265b20e37..9a00c30e4071 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample { I915_SAMPLE_SEMA = 2, I915_SAMPLE_QUEUED = 3, I915_SAMPLE_RUNNABLE = 4, +
[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) v4: * Refactored for timer period accounting. Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_pmu.c | 19 +-- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- include/uapi/drm/i915_drm.h | 5 + 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 41527b682c72..60dc68e4911c 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -17,7 +17,8 @@ BIT(I915_SAMPLE_WAIT) | \ BIT(I915_SAMPLE_SEMA) | \ BIT(I915_SAMPLE_QUEUED) | \ -BIT(I915_SAMPLE_RUNNABLE)) +BIT(I915_SAMPLE_RUNNABLE) | \ +BIT(I915_SAMPLE_RUNNING)) #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) @@ -224,6 +225,12 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns) engine->request_stats.runnable, (u64)period_ns * I915_SAMPLE_QUEUED_DIVISOR / 100); + + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) + add_sample_mult(&engine->pmu.sample[I915_SAMPLE_RUNNING], + last_seqno - current_seqno, + (u64)period_ns * + I915_SAMPLE_QUEUED_DIVISOR / 100); } if (fw) @@ -339,6 +346,7 @@ engine_event_status(struct intel_engine_cs *engine, case I915_SAMPLE_WAIT: case I915_SAMPLE_QUEUED: case I915_SAMPLE_RUNNABLE: + case I915_SAMPLE_RUNNING: break; case I915_SAMPLE_SEMA: if (INTEL_GEN(engine->i915) < 6) @@ -558,7 +566,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = engine->pmu.sample[sample].cur; if (sample == I915_SAMPLE_QUEUED || - sample == I915_SAMPLE_RUNNABLE) + sample == I915_SAMPLE_RUNNABLE || + sample == I915_SAMPLE_RUNNING) val = div_u64(val, MSEC_PER_SEC); /* to qd */ } } else { @@ -856,6 +865,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name, /* No brackets or quotes below please. */ #define I915_SAMPLE_QUEUED_SCALE 0.0009765625 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625 +#define I915_SAMPLE_RUNNING_SCALE 0.0009765625 static struct attribute ** create_event_attributes(struct drm_i915_private *i915) @@ -883,6 +893,8 @@ create_event_attributes(struct drm_i915_private *i915) __stringify(I915_SAMPLE_QUEUED_SCALE)), __engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable", __stringify(I915_SAMPLE_RUNNABLE_SCALE)), + __engine_event_scale(I915_SAMPLE_RUNNING, "running", +__stringify(I915_SAMPLE_RUNNING_SCALE)), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -898,6 +910,9 @@ create_event_attributes(struct drm_i915_private *i915) BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR != (1 / I915_SAMPLE_RUNNABLE_SCALE)); + BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR != +(1 / I915_SAMPLE_RUNNING_SCALE)); + /* Count how many counters we will be exposing. */ for (i = 0; i < ARRAY_SIZE(events); i++) { if (!config_status(i915, events[i].config)) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 703cea694f0d..bff20cfd6870 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -420,7 +420,7 @@ struct intel_engine_cs { * * Our internal timer stores the current counters in this field. */ -#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1) +#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1) struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; } pmu; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index cf0265b20e37..9a00c30e4071 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample { I915_SAMPLE_SEMA = 2, I915_SAMPLE_QUEUED = 3, I915_SAMPLE_RUNNABLE = 4, + I915_SAMPLE_RUNNING = 5, }; /* Divide counter value by divisor to get the real value. */
Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
On 06/04/2018 21:24, Chris Wilson wrote: Quoting Tvrtko Ursulin (2018-04-05 13:39:22) From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Do we want these separate in the final push? Is there value in reverting one but not the others? They seem a triumvirate. I think the only benefit to have them separate for me was that rebasing was marginally easier. I can just as well squash them if that is preferred. Regards, Tvrtko ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
Quoting Tvrtko Ursulin (2018-04-05 13:39:22) > From: Tvrtko Ursulin > > We add a PMU counter to expose the number of requests currently executing > on the GPU. > > This is useful to analyze the overall load of the system. > > v2: > * Rebase. > * Drop floating point constant. (Chris Wilson) > > v3: > * Change scale to 1024 for faster arithmetics. (Chris Wilson) > > Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Do we want these separate in the final push? Is there value in reverting one but not the others? They seem a triumvirate. -Chris ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_pmu.c | 18 -- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- include/uapi/drm/i915_drm.h | 5 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index afc561e1aa92..bd7e695fc663 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -17,7 +17,8 @@ BIT(I915_SAMPLE_WAIT) | \ BIT(I915_SAMPLE_SEMA) | \ BIT(I915_SAMPLE_QUEUED) | \ -BIT(I915_SAMPLE_RUNNABLE)) +BIT(I915_SAMPLE_RUNNABLE) | \ +BIT(I915_SAMPLE_RUNNING)) #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) @@ -211,6 +212,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNABLE], I915_SAMPLE_RUNNABLE_DIVISOR, engine->request_stats.runnable); + + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) + update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNING], + I915_SAMPLE_RUNNING_DIVISOR, + last_seqno - current_seqno); } if (fw) @@ -310,6 +316,7 @@ engine_event_status(struct intel_engine_cs *engine, case I915_SAMPLE_WAIT: case I915_SAMPLE_QUEUED: case I915_SAMPLE_RUNNABLE: + case I915_SAMPLE_RUNNING: break; case I915_SAMPLE_SEMA: if (INTEL_GEN(engine->i915) < 6) @@ -513,7 +520,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event) } if (sample == I915_SAMPLE_QUEUED || - sample == I915_SAMPLE_RUNNABLE) + sample == I915_SAMPLE_RUNNABLE || + sample == I915_SAMPLE_RUNNING) val = div_u64(val, FREQUENCY); } else { switch (event->attr.config) { @@ -810,6 +818,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name, /* No brackets or quotes below please. */ #define I915_SAMPLE_QUEUED_SCALE 0.0009765625 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625 +#define I915_SAMPLE_RUNNING_SCALE 0.0009765625 static struct attribute ** create_event_attributes(struct drm_i915_private *i915) @@ -837,6 +846,8 @@ create_event_attributes(struct drm_i915_private *i915) __stringify(I915_SAMPLE_QUEUED_SCALE)), __engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable", __stringify(I915_SAMPLE_RUNNABLE_SCALE)), + __engine_event_scale(I915_SAMPLE_RUNNING, "running", +__stringify(I915_SAMPLE_RUNNING_SCALE)), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -852,6 +863,9 @@ create_event_attributes(struct drm_i915_private *i915) BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR != (1 / I915_SAMPLE_RUNNABLE_SCALE)); + BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR != +(1 / I915_SAMPLE_RUNNING_SCALE)); + /* Count how many counters we will be exposing. */ for (i = 0; i < ARRAY_SIZE(events); i++) { if (!config_status(i915, events[i].config)) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 5af93e88c90f..d50b31eb43a5 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -414,7 +414,7 @@ struct intel_engine_cs { * * Our internal timer stores the current counters in this field. */ -#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1) +#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1) struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; } pmu; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index cf0265b20e37..9a00c30e4071 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample { I915_SAMPLE_SEMA = 2, I915_SAMPLE_QUEUED = 3, I915_SAMPLE_RUNNABLE = 4, + I915_SAMPLE_RUNNING = 5, }; /* Divide counter value by divisor to get the real value. */ #define I915_SAMPLE_QUEUED_DIVISOR (1024) #define I915_SAMPLE_RUNNABLE_DIVISOR (1024) +#define I915_SAMPLE_RUNNING_DIVISOR (1024) #define I915_PMU_SAMPLE_BITS (4) #define I915_PMU_SAMPLE_
[Intel-gfx] [PATCH 6/7] drm/i915/pmu: Add running counter
From: Tvrtko Ursulin We add a PMU counter to expose the number of requests currently executing on the GPU. This is useful to analyze the overall load of the system. v2: * Rebase. * Drop floating point constant. (Chris Wilson) v3: * Change scale to 1024 for faster arithmetics. (Chris Wilson) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_pmu.c | 18 -- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +- include/uapi/drm/i915_drm.h | 5 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index afc561e1aa92..bd7e695fc663 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -17,7 +17,8 @@ BIT(I915_SAMPLE_WAIT) | \ BIT(I915_SAMPLE_SEMA) | \ BIT(I915_SAMPLE_QUEUED) | \ -BIT(I915_SAMPLE_RUNNABLE)) +BIT(I915_SAMPLE_RUNNABLE) | \ +BIT(I915_SAMPLE_RUNNING)) #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) @@ -211,6 +212,11 @@ static void engines_sample(struct drm_i915_private *dev_priv) update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNABLE], I915_SAMPLE_RUNNABLE_DIVISOR, engine->request_stats.runnable); + + if (engine->pmu.enable & BIT(I915_SAMPLE_RUNNING)) + update_sample(&engine->pmu.sample[I915_SAMPLE_RUNNING], + I915_SAMPLE_RUNNING_DIVISOR, + last_seqno - current_seqno); } if (fw) @@ -310,6 +316,7 @@ engine_event_status(struct intel_engine_cs *engine, case I915_SAMPLE_WAIT: case I915_SAMPLE_QUEUED: case I915_SAMPLE_RUNNABLE: + case I915_SAMPLE_RUNNING: break; case I915_SAMPLE_SEMA: if (INTEL_GEN(engine->i915) < 6) @@ -513,7 +520,8 @@ static u64 __i915_pmu_event_read(struct perf_event *event) } if (sample == I915_SAMPLE_QUEUED || - sample == I915_SAMPLE_RUNNABLE) + sample == I915_SAMPLE_RUNNABLE || + sample == I915_SAMPLE_RUNNING) val = div_u64(val, FREQUENCY); } else { switch (event->attr.config) { @@ -810,6 +818,7 @@ add_pmu_attr(struct perf_pmu_events_attr *attr, const char *name, /* No brackets or quotes below please. */ #define I915_SAMPLE_QUEUED_SCALE 0.0009765625 #define I915_SAMPLE_RUNNABLE_SCALE 0.0009765625 +#define I915_SAMPLE_RUNNING_SCALE 0.0009765625 static struct attribute ** create_event_attributes(struct drm_i915_private *i915) @@ -837,6 +846,8 @@ create_event_attributes(struct drm_i915_private *i915) __stringify(I915_SAMPLE_QUEUED_SCALE)), __engine_event_scale(I915_SAMPLE_RUNNABLE, "runnable", __stringify(I915_SAMPLE_RUNNABLE_SCALE)), + __engine_event_scale(I915_SAMPLE_RUNNING, "running", +__stringify(I915_SAMPLE_RUNNING_SCALE)), }; unsigned int count = 0; struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter; @@ -852,6 +863,9 @@ create_event_attributes(struct drm_i915_private *i915) BUILD_BUG_ON(I915_SAMPLE_RUNNABLE_DIVISOR != (1 / I915_SAMPLE_RUNNABLE_SCALE)); + BUILD_BUG_ON(I915_SAMPLE_RUNNING_DIVISOR != +(1 / I915_SAMPLE_RUNNING_SCALE)); + /* Count how many counters we will be exposing. */ for (i = 0; i < ARRAY_SIZE(events); i++) { if (!config_status(i915, events[i].config)) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 5d7532b185fe..fe1b7d0a94e9 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -414,7 +414,7 @@ struct intel_engine_cs { * * Our internal timer stores the current counters in this field. */ -#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNABLE + 1) +#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_RUNNING + 1) struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; } pmu; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index cf0265b20e37..9a00c30e4071 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -113,11 +113,13 @@ enum drm_i915_pmu_engine_sample { I915_SAMPLE_SEMA = 2, I915_SAMPLE_QUEUED = 3, I915_SAMPLE_RUNNABLE = 4, + I915_SAMPLE_RUNNING = 5, }; /* Divide counter value by divisor to get the real value. */ #define I915_SAMPLE_QUEUED_DIVISOR (1024) #define I915_SAMPLE_RUNNABLE_DIVISOR (1024) +#define I915_SAMPLE_RUNNING_DIVISOR (1024) #define I915_PMU_SAMPLE_BITS (4) #define I915_PMU_SAMPLE_