Re: [Intel-gfx] [PATCH 08/17] drm/i915/selftests: Add request throughput measurement to perf

2020-03-10 Thread Chris Wilson
Quoting Tvrtko Ursulin (2020-03-10 11:58:26)
> 
> On 10/03/2020 11:09, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-03-10 10:38:21)
> >>
> >> On 06/03/2020 13:38, Chris Wilson wrote:
> >>> + intel_engine_pm_get(engine);
> >>> +
> >>> + memset(&engines[idx].p, 0, sizeof(engines[idx].p));
> >>> + engines[idx].p.engine = engine;
> >>> +
> >>> + engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
> >>> +"igt:%s", 
> >>> engine->name);
> >>
> >> Test will get affected by the host CPU core count. How about we only
> >> measure num_cpu engines? Might be even more important with discrete.
> > 
> > No. We want to be able to fill the GPU with the different processors.
> > Comparing glk to kbl helps highlight any inefficiencies we have -- we
> > have to be efficient enough that core count is simply not a critical
> > factor to offset our submission overhead.
> > 
> > So we can run the same test and see how it scaled with engines vs cpus
> > just by running it on different machines and look for problems.
> 
> Normally you would expect one core per engine is enough to saturate the 
> engine. I am afraid adding more combinations will be confusing when 
> reading test results. (Same GPU, same engine count, different CPU core 
> count.) How about two subtest variants? One is 1:1 CPU core to engine, 
> and another can be all engines like here?

Each machine will have its own consistent configuration. The question I
have in mind is "can we saturate this machine"? This machine remains
constant for all the runs. And our goal is that the driver is not a
bottleneck on any machine.
 
> Or possibly:
> 
> 1. 1 CPU core - 1 engine - purest latency/overhead
> 2. 1 CPU core - N engines (N = all engines) - more
> 3. N CPU cores - N engines (N = min(engines, cores) - global lock 
> contention, stable setup
> 4. M CPU cores - N engines (N, M = max) - lock contention stress
> 5. N CPU cores - 1 engine (N = all cores) - more extreme lock contention

I hear you in that you would like to have a serial test as well. Where
we just use 1 cpu thread to submit to all engines as fast as we can and
see how close we get with just "1 core". (There will still be
parallelism one hopes from our interrupt handler.)
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 08/17] drm/i915/selftests: Add request throughput measurement to perf

2020-03-10 Thread Tvrtko Ursulin



On 10/03/2020 11:09, Chris Wilson wrote:

Quoting Tvrtko Ursulin (2020-03-10 10:38:21)


On 06/03/2020 13:38, Chris Wilson wrote:

+static int perf_many(void *arg)
+{
+ struct perf_parallel *p = arg;
+ struct intel_engine_cs *engine = p->engine;
+ struct intel_context *ce;
+ IGT_TIMEOUT(end_time);
+ unsigned long count;
+ int err = 0;
+ bool busy;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce))
+ return PTR_ERR(ce);
+
+ err = intel_context_pin(ce);
+ if (err) {
+ intel_context_put(ce);
+ return err;
+ }
+
+ busy = false;
+ if (intel_engine_supports_stats(engine) &&
+ !intel_enable_engine_stats(engine)) {
+ p->busy = intel_engine_get_busy_time(engine);
+ busy = true;
+ }
+
+ count = 0;
+ p->time = ktime_get();
+ do {
+ struct i915_request *rq;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ break;
+ }
+
+ i915_request_add(rq);


Any concerns on ring size here and maybe managing the wait explicitly?


No concern, the intention is to flood the ring. If we are able to wait
on the ring, we have succeeded in submitting faster than the engine can
retire. (Which might be another issue for us to resolve, as it may be
our own interrupt latency that is then the bottleneck.)

If we did a sync0, sync1, many; that could give us some more insight
into the interrupt latency in comparison to engine latency.




+ count++;
+ } while (!__igt_timeout(end_time, NULL));
+ p->time = ktime_sub(ktime_get(), p->time);
+
+ if (busy) {
+ p->busy = ktime_sub(intel_engine_get_busy_time(engine),
+ p->busy);
+ intel_disable_engine_stats(engine);
+ }
+
+ err = switch_to_kernel_sync(ce, err);
+ p->runtime = intel_context_get_total_runtime_ns(ce);
+ p->count = count;
+
+ intel_context_unpin(ce);
+ intel_context_put(ce);
+ return err;
+}
+
+static int perf_parallel_engines(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ static int (* const func[])(void *arg) = {
+ perf_sync,
+ perf_many,
+ NULL,
+ };
+ const unsigned int nengines = num_uabi_engines(i915);
+ struct intel_engine_cs *engine;
+ int (* const *fn)(void *arg);
+ struct pm_qos_request *qos;
+ struct {
+ struct perf_parallel p;
+ struct task_struct *tsk;
+ } *engines;
+ int err = 0;
+
+ engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
+ if (!engines)
+ return -ENOMEM;
+
+ qos = kzalloc(sizeof(*qos), GFP_KERNEL);
+ if (qos)
+ pm_qos_add_request(qos, PM_QOS_CPU_DMA_LATENCY, 0);
+
+ for (fn = func; *fn; fn++) {
+ char name[KSYM_NAME_LEN];
+ struct igt_live_test t;
+ unsigned int idx;
+
+ snprintf(name, sizeof(name), "%ps", *fn);


Is this any better than just storing the name in local static array?


It's easier for sure, and since the name is already in a static array,
why not use it :)


It looks weird, it needs KSYM_NAME_LEN of stack space and the special 
%ps. But okay.





+ err = igt_live_test_begin(&t, i915, __func__, name);
+ if (err)
+ break;
+
+ atomic_set(&i915->selftest.counter, nengines);
+
+ idx = 0;
+ for_each_uabi_engine(engine, i915) {


For a pure driver overhead test I would suggest this to be a gt live test.


It's a request performance test, so sits above the gt. My thinking is
that this is a more of a high level request/scheduler test than
execlists/guc (though it depends on those backends).


Okay, yeah, it makes sense.

  

+ intel_engine_pm_get(engine);
+
+ memset(&engines[idx].p, 0, sizeof(engines[idx].p));
+ engines[idx].p.engine = engine;
+
+ engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
+"igt:%s", engine->name);


Test will get affected by the host CPU core count. How about we only
measure num_cpu engines? Might be even more important with discrete.


No. We want to be able to fill the GPU with the different processors.
Comparing glk to kbl helps highlight any inefficiencies we have -- we
have to be efficient enough that core count is simply not a critical
factor to offset our submission overhead.

So we can run the same test and see how it scaled with engines vs cpus
just by running it on different machines and look for problems.


Normally you would expect one core per engine is enough to saturate the 
engine. I am afraid adding more combinations will be confusing when 
reading test results. (Same GPU, same engine count, different CPU core 
> count.) How about two subtest variants? One is 1:1 CPU core to engine, 
> and another can be all engines like here?

Re: [Intel-gfx] [PATCH 08/17] drm/i915/selftests: Add request throughput measurement to perf

2020-03-10 Thread Chris Wilson
Quoting Tvrtko Ursulin (2020-03-10 10:38:21)
> 
> On 06/03/2020 13:38, Chris Wilson wrote:
> > +static int perf_many(void *arg)
> > +{
> > + struct perf_parallel *p = arg;
> > + struct intel_engine_cs *engine = p->engine;
> > + struct intel_context *ce;
> > + IGT_TIMEOUT(end_time);
> > + unsigned long count;
> > + int err = 0;
> > + bool busy;
> > +
> > + ce = intel_context_create(engine);
> > + if (IS_ERR(ce))
> > + return PTR_ERR(ce);
> > +
> > + err = intel_context_pin(ce);
> > + if (err) {
> > + intel_context_put(ce);
> > + return err;
> > + }
> > +
> > + busy = false;
> > + if (intel_engine_supports_stats(engine) &&
> > + !intel_enable_engine_stats(engine)) {
> > + p->busy = intel_engine_get_busy_time(engine);
> > + busy = true;
> > + }
> > +
> > + count = 0;
> > + p->time = ktime_get();
> > + do {
> > + struct i915_request *rq;
> > +
> > + rq = i915_request_create(ce);
> > + if (IS_ERR(rq)) {
> > + err = PTR_ERR(rq);
> > + break;
> > + }
> > +
> > + i915_request_add(rq);
> 
> Any concerns on ring size here and maybe managing the wait explicitly?

No concern, the intention is to flood the ring. If we are able to wait
on the ring, we have succeeded in submitting faster than the engine can
retire. (Which might be another issue for us to resolve, as it may be
our own interrupt latency that is then the bottleneck.)

If we did a sync0, sync1, many; that could give us some more insight
into the interrupt latency in comparison to engine latency.

> 
> > + count++;
> > + } while (!__igt_timeout(end_time, NULL));
> > + p->time = ktime_sub(ktime_get(), p->time);
> > +
> > + if (busy) {
> > + p->busy = ktime_sub(intel_engine_get_busy_time(engine),
> > + p->busy);
> > + intel_disable_engine_stats(engine);
> > + }
> > +
> > + err = switch_to_kernel_sync(ce, err);
> > + p->runtime = intel_context_get_total_runtime_ns(ce);
> > + p->count = count;
> > +
> > + intel_context_unpin(ce);
> > + intel_context_put(ce);
> > + return err;
> > +}
> > +
> > +static int perf_parallel_engines(void *arg)
> > +{
> > + struct drm_i915_private *i915 = arg;
> > + static int (* const func[])(void *arg) = {
> > + perf_sync,
> > + perf_many,
> > + NULL,
> > + };
> > + const unsigned int nengines = num_uabi_engines(i915);
> > + struct intel_engine_cs *engine;
> > + int (* const *fn)(void *arg);
> > + struct pm_qos_request *qos;
> > + struct {
> > + struct perf_parallel p;
> > + struct task_struct *tsk;
> > + } *engines;
> > + int err = 0;
> > +
> > + engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
> > + if (!engines)
> > + return -ENOMEM;
> > +
> > + qos = kzalloc(sizeof(*qos), GFP_KERNEL);
> > + if (qos)
> > + pm_qos_add_request(qos, PM_QOS_CPU_DMA_LATENCY, 0);
> > +
> > + for (fn = func; *fn; fn++) {
> > + char name[KSYM_NAME_LEN];
> > + struct igt_live_test t;
> > + unsigned int idx;
> > +
> > + snprintf(name, sizeof(name), "%ps", *fn);
> 
> Is this any better than just storing the name in local static array?

It's easier for sure, and since the name is already in a static array,
why not use it :)

> > + err = igt_live_test_begin(&t, i915, __func__, name);
> > + if (err)
> > + break;
> > +
> > + atomic_set(&i915->selftest.counter, nengines);
> > +
> > + idx = 0;
> > + for_each_uabi_engine(engine, i915) {
> 
> For a pure driver overhead test I would suggest this to be a gt live test.

It's a request performance test, so sits above the gt. My thinking is
that this is a more of a high level request/scheduler test than
execlists/guc (though it depends on those backends).
 
> > + intel_engine_pm_get(engine);
> > +
> > + memset(&engines[idx].p, 0, sizeof(engines[idx].p));
> > + engines[idx].p.engine = engine;
> > +
> > + engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
> > +"igt:%s", 
> > engine->name);
> 
> Test will get affected by the host CPU core count. How about we only 
> measure num_cpu engines? Might be even more important with discrete.

No. We want to be able to fill the GPU with the different processors.
Comparing glk to kbl helps highlight any inefficiencies we have -- we
have to be efficient enough that core count is simply not a critical
factor to offset our submission overhead.

So we can run the same test and see how it scaled with engines vs cpus
just by running it on different machines and look for problems.

Re: [Intel-gfx] [PATCH 08/17] drm/i915/selftests: Add request throughput measurement to perf

2020-03-10 Thread Tvrtko Ursulin



On 06/03/2020 13:38, Chris Wilson wrote:

Under ideal circumstances, the driver should be able to keep the GPU
fully saturated with work. Measure how close to ideal we get under the
harshest of conditions with no user payload.

Signed-off-by: Chris Wilson 
---
  .../drm/i915/selftests/i915_perf_selftests.h  |   1 +
  drivers/gpu/drm/i915/selftests/i915_request.c | 285 +-
  2 files changed, 285 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h 
b/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h
index 3bf7f53e9924..d8da142985eb 100644
--- a/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_perf_selftests.h
@@ -16,5 +16,6 @@
   * Tests are executed in order by igt/i915_selftest
   */
  selftest(engine_cs, intel_engine_cs_perf_selftests)
+selftest(request, i915_request_perf_selftests)
  selftest(blt, i915_gem_object_blt_perf_selftests)
  selftest(region, intel_memory_region_perf_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c 
b/drivers/gpu/drm/i915/selftests/i915_request.c
index f89d9c42f1fa..d4c088cfe4e1 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -23,6 +23,7 @@
   */
  
  #include 

+#include 
  
  #include "gem/i915_gem_pm.h"

  #include "gem/selftests/mock_context.h"
@@ -1233,7 +1234,7 @@ static int live_parallel_engines(void *arg)
struct igt_live_test t;
unsigned int idx;
  
-		snprintf(name, sizeof(name), "%pS", fn);

+   snprintf(name, sizeof(name), "%ps", *fn);
err = igt_live_test_begin(&t, i915, __func__, name);
if (err)
break;
@@ -1470,3 +1471,285 @@ int i915_request_live_selftests(struct drm_i915_private 
*i915)
  
  	return i915_subtests(tests, i915);

  }
+
+struct perf_parallel {
+   struct intel_engine_cs *engine;
+   unsigned long count;
+   ktime_t time;
+   ktime_t busy;
+   u64 runtime;
+};
+
+static int switch_to_kernel_sync(struct intel_context *ce, int err)
+{
+   struct i915_request *rq;
+   struct dma_fence *fence;
+
+   rq = intel_engine_create_kernel_request(ce->engine);
+   if (IS_ERR(rq))
+   return PTR_ERR(rq);
+
+   fence = i915_active_fence_get(&ce->timeline->last_request);
+   if (fence) {
+   i915_request_await_dma_fence(rq, fence);
+   dma_fence_put(fence);
+   }
+
+   rq = i915_request_get(rq);
+   i915_request_add(rq);
+   if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
+   err = -ETIME;
+   i915_request_put(rq);
+
+   while (!err && !intel_engine_is_idle(ce->engine))
+   intel_engine_flush_submission(ce->engine);
+
+   return err;
+}
+
+static int perf_sync(void *arg)
+{
+   struct perf_parallel *p = arg;
+   struct intel_engine_cs *engine = p->engine;
+   struct intel_context *ce;
+   IGT_TIMEOUT(end_time);
+   unsigned long count;
+   bool busy;
+   int err = 0;
+
+   ce = intel_context_create(engine);
+   if (IS_ERR(ce))
+   return PTR_ERR(ce);
+
+   err = intel_context_pin(ce);
+   if (err) {
+   intel_context_put(ce);
+   return err;
+   }
+
+   busy = false;
+   if (intel_engine_supports_stats(engine) &&
+   !intel_enable_engine_stats(engine)) {
+   p->busy = intel_engine_get_busy_time(engine);
+   busy = true;
+   }
+
+   p->time = ktime_get();
+   count = 0;
+   do {
+   struct i915_request *rq;
+
+   rq = i915_request_create(ce);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   break;
+   }
+
+   i915_request_get(rq);
+   i915_request_add(rq);
+
+   err = 0;
+   if (i915_request_wait(rq, 0, HZ / 5) < 0)
+   err = -ETIME;
+   i915_request_put(rq);
+   if (err)
+   break;
+
+   count++;
+   } while (!__igt_timeout(end_time, NULL));
+   p->time = ktime_sub(ktime_get(), p->time);
+
+   if (busy) {
+   p->busy = ktime_sub(intel_engine_get_busy_time(engine),
+   p->busy);
+   intel_disable_engine_stats(engine);
+   }
+
+   err = switch_to_kernel_sync(ce, err);
+   p->runtime = intel_context_get_total_runtime_ns(ce);
+   p->count = count;
+
+   intel_context_unpin(ce);
+   intel_context_put(ce);
+   return err;
+}
+
+static int perf_many(void *arg)
+{
+   struct perf_parallel *p = arg;
+   struct intel_engine_cs *engine = p->engine;
+   struct intel_context *ce;
+   IGT_TIMEOUT(end_time);
+   unsigned long count;
+   int err = 0;
+   bool busy;
+
+   ce = intel_context