[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2019-01-24 Thread Tvrtko Ursulin
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active.

On most platforms there are no issues with this apart from a performance
penality for some media workloads that benefit from running on a partially
powergated GPU. We already prevent RC6 from affecting the programming so
it doesn't sound completely unreasonable to hold on powergating for the
same reason.

On Icelake however there would a functional problem if the slices not-
containing the VME block were left enabled with a running media workload
which explicitly disabled them. To avoid a GPU hang in this case, on
Icelake we lock the enablement to only slices which contain VME blocks.
Downside is that it means degraded GPU performance when OA is active but
there is no known alternative solution for this.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

v8-9:
 * Rebase.

v10:
 * Squashed with ICL support patch.

Bspec: 21140
Co-developed-by: Tvrtko Ursulin 
Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
Cc: Lionel Landwerlin 
Reviewed-by: Chris Wilson  # v9
Reviewed-by: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 ++---
 drivers/gpu/drm/i915/intel_lrc.c | 46 
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 727118301f91..9ebf99f3d8d3 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2098,21 +2103,21 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
if (ret) {
DRM_DEBUG("Unable to enable metric set\n");
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index cdfcc22cd2fd..333c991ede47 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1173,9 +1173,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1189,8 +1186,8 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
 
/* RPCS */
if (engine->class == RENDER_CLASS)
-   

[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2019-01-15 Thread Joonas Lahtinen
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active.

On most platforms there are no issues with this apart from a performance
penality for some media workloads that benefit from running on a partially
powergated GPU. We already prevent RC6 from affecting the programming so
it doesn't sound completely unreasonable to hold on powergating for the
same reason.

On Icelake however there would a functional problem if the slices not-
containing the VME block were left enabled with a running media workload
which explicitly disabled them. To avoid a GPU hang in this case, on
Icelake we lock the enablement to only slices which contain VME blocks.
Downside is that it means degraded GPU performance when OA is active but
there is no known alternative solution for this.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

v8-9:
 * Rebase.

v10:
 * Squashed with ICL support patch.

Bspec: 21140
Co-Developed-by: Tvrtko Ursulin 
Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
Cc: Lionel Landwerlin 
Reviewed-by: Chris Wilson  # v9
Reviewed-by: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 ++---
 drivers/gpu/drm/i915/intel_lrc.c | 46 
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index faff6cf1aaa1..07170e49ecb3 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2098,21 +2103,21 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
if (ret) {
DRM_DEBUG("Unable to enable metric set\n");
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 31bf84b22e61..f32be56ec503 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1169,9 +1169,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1185,8 +1182,8 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
 
/* RPCS */
if (engine->class == RENDER_CLASS)
-   

[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2019-01-14 Thread Tvrtko Ursulin
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active.

On most platforms there are no issues with this apart from a performance
penality for some media workloads that benefit from running on a partially
powergated GPU. We already prevent RC6 from affecting the programming so
it doesn't sound completely unreasonable to hold on powergating for the
same reason.

On Icelake however there would a functional problem if the slices not-
containing the VME block were left enabled with a running media workload
which explicitly disabled them. To avoid a GPU hang in this case, on
Icelake we lock the enablement to only slices which contain VME blocks.
Downside is that it means degraded GPU performance when OA is active but
there is no known alternative solution for this.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

v8-9:
 * Rebase.

v10:
 * Squashed with ICL support patch.

Bspec: 21140
Co-Developed-by: Tvrtko Ursulin 
Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
Cc: Lionel Landwerlin 
Reviewed-by: Chris Wilson  # v9
Reviewed-by: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 ++---
 drivers/gpu/drm/i915/intel_lrc.c | 46 
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5b1ae5ed97b3..67e3fc12e84c 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2098,21 +2103,21 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
if (ret) {
DRM_DEBUG("Unable to enable metric set\n");
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 723aee5ae9d5..fbbfa792d4d7 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1169,9 +1169,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1185,8 +1182,8 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
 
/* RPCS */
if (engine->class == RENDER_CLASS)
-   

[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2019-01-08 Thread Tvrtko Ursulin
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active.

On most platforms there are no issues with this apart from a performance
penality for some media workloads that benefit from running on a partially
powergated GPU. We already prevent RC6 from affecting the programming so
it doesn't sound completely unreasonable to hold on powergating for the
same reason.

On Icelake however there would a functional problem if the slices not-
containing the VME block were left enabled with a running media workload
which explicitly disabled them. To avoid a GPU hang in this case, on
Icelake we lock the enablement to only slices which contain VME blocks.
Downside is that it means degraded GPU performance when OA is active but
there is no known alternative solution for this.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

v8-9:
 * Rebase.

v10:
 * Squashed with ICL support patch.

Bspec: 21140
Co-Developed-by: Tvrtko Ursulin 
Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
Cc: Lionel Landwerlin 
Reviewed-by: Chris Wilson  # v9
Reviewed-by: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 ++---
 drivers/gpu/drm/i915/intel_lrc.c | 46 
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5b1ae5ed97b3..67e3fc12e84c 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2098,21 +2103,21 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
if (ret) {
DRM_DEBUG("Unable to enable metric set\n");
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 6df792bc5067..bca09a497f27 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1170,9 +1170,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1186,8 +1183,8 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
 
/* RPCS */
if (engine->class == RENDER_CLASS)
-   

[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2018-11-13 Thread Tvrtko Ursulin
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active. There is no known issue with this apart from a performance
penality for some media workloads that benefit from running on a
partially powergated GPU. We already prevent RC6 from affecting the
programming so it doesn't sound completely unreasonable to hold on
powergating for the same reason.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

v8-9:
 * Rebase.

Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 +
 drivers/gpu/drm/i915/intel_lrc.c | 31 ---
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 2c2b63be7a6c..627a83af2288 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2098,21 +2103,21 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
if (ret) {
DRM_DEBUG("Unable to enable metric set\n");
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dc1e08b72446..ee02c593a72c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1152,9 +1152,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1168,8 +1165,8 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
 
/* RPCS */
if (engine->class == RENDER_CLASS)
-   regs[CTX_R_PWR_CLK_STATE + 1] = make_rpcs(engine->i915,
- >sseu);
+   regs[CTX_R_PWR_CLK_STATE + 1] = gen8_make_rpcs(engine->i915,
+  >sseu);
 }
 
 static struct intel_context *
@@ -2358,13 +2355,12 @@ int logical_xcs_ring_init(struct intel_engine_cs 
*engine)
return logical_ring_init(engine);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu)
+u32 gen8_make_rpcs(struct drm_i915_private *i915, struct 

Re: [Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2018-09-17 Thread Chris Wilson
Quoting Tvrtko Ursulin (2018-09-17 12:30:55)
> From: Lionel Landwerlin 
> 
> If some of the contexts submitting workloads to the GPU have been
> configured to shutdown slices/subslices, we might loose the NOA
> configurations written in the NOA muxes.
> 
> One possible solution to this problem is to reprogram the NOA muxes
> when we switch to a new context. We initially tried this in the
> workaround batchbuffer but some concerns where raised about the cost
> of reprogramming at every context switch. This solution is also not
> without consequences from the userspace point of view. Reprogramming
> of the muxes can only happen once the powergating configuration has
> changed (which happens after context switch). This means for a window
> of time during the recording, counters recorded by the OA unit might
> be invalid. This requires userspace dealing with OA reports to discard
> the invalid values.
> 
> Minimizing the reprogramming could be implemented by tracking of the
> last programmed configuration somewhere in GGTT and use MI_PREDICATE
> to discard some of the programming commands, but the command streamer
> would still have to parse all the MI_LRI instructions in the
> workaround batchbuffer.
> 
> Another solution, which this change implements, is to simply disregard
> the user requested configuration for the period of time when i915/perf
> is active. There is no known issue with this apart from a performance
> penality for some media workloads that benefit from running on a
> partially powergated GPU. We already prevent RC6 from affecting the
> programming so it doesn't sound completely unreasonable to hold on
> powergating for the same reason.
> 
> v2: Leave RPCS programming in intel_lrc.c (Lionel)
> 
> v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
> More to_intel_context() (Tvrtko)
> s/dev_priv/i915/ (Tvrtko)
> 
> Tvrtko Ursulin:
> 
> v4:
>  * Rebase for make_rpcs changes.
> 
> v5:
>  * Apply OA restriction from make_rpcs directly.
> 
> v6:
>  * Rebase for context image setup changes.
> 
> v7:
>  * Move stream assignment before metric enable.
> 
> v8:
>  * Rebase.
> 
> Signed-off-by: Lionel Landwerlin 
> Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Chris Wilson 
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2018-09-17 Thread Tvrtko Ursulin
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active. There is no known issue with this apart from a performance
penality for some media workloads that benefit from running on a
partially powergated GPU. We already prevent RC6 from affecting the
programming so it doesn't sound completely unreasonable to hold on
powergating for the same reason.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

v8:
 * Rebase.

Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 +
 drivers/gpu/drm/i915/intel_lrc.c | 31 ---
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 664b96bb65a3..46ddd1913fee 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2092,6 +2097,9 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
  stream->oa_config);
if (ret) {
@@ -2099,15 +2107,12 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1491032b675b..cf2e0848fa55 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1305,9 +1305,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1321,8 +1318,8 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
 
/* RPCS */
if (engine->class == RENDER_CLASS)
-   regs[CTX_R_PWR_CLK_STATE + 1] = make_rpcs(engine->i915,
- >sseu);
+   regs[CTX_R_PWR_CLK_STATE + 1] = gen8_make_rpcs(engine->i915,
+  >sseu);
 }
 
 static struct intel_context *
@@ -2507,13 +2504,12 @@ int logical_xcs_ring_init(struct intel_engine_cs 
*engine)
return logical_ring_init(engine);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct 

[Intel-gfx] [PATCH 3/6] drm/i915/perf: lock powergating configuration to default when active

2018-09-14 Thread Tvrtko Ursulin
From: Lionel Landwerlin 

If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes.

One possible solution to this problem is to reprogram the NOA muxes
when we switch to a new context. We initially tried this in the
workaround batchbuffer but some concerns where raised about the cost
of reprogramming at every context switch. This solution is also not
without consequences from the userspace point of view. Reprogramming
of the muxes can only happen once the powergating configuration has
changed (which happens after context switch). This means for a window
of time during the recording, counters recorded by the OA unit might
be invalid. This requires userspace dealing with OA reports to discard
the invalid values.

Minimizing the reprogramming could be implemented by tracking of the
last programmed configuration somewhere in GGTT and use MI_PREDICATE
to discard some of the programming commands, but the command streamer
would still have to parse all the MI_LRI instructions in the
workaround batchbuffer.

Another solution, which this change implements, is to simply disregard
the user requested configuration for the period of time when i915/perf
is active. There is no known issue with this apart from a performance
penality for some media workloads that benefit from running on a
partially powergated GPU. We already prevent RC6 from affecting the
programming so it doesn't sound completely unreasonable to hold on
powergating for the same reason.

v2: Leave RPCS programming in intel_lrc.c (Lionel)

v3: Update for s/union intel_sseu/struct intel_sseu/ (Lionel)
More to_intel_context() (Tvrtko)
s/dev_priv/i915/ (Tvrtko)

Tvrtko Ursulin:

v4:
 * Rebase for make_rpcs changes.

v5:
 * Apply OA restriction from make_rpcs directly.

v6:
 * Rebase for context image setup changes.

v7:
 * Move stream assignment before metric enable.

Signed-off-by: Lionel Landwerlin 
Signed-off-by: Tvrtko Ursulin 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 +
 drivers/gpu/drm/i915/intel_lrc.c | 29 +++--
 drivers/gpu/drm/i915/intel_lrc.h |  2 ++
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 664b96bb65a3..46ddd1913fee 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1677,6 +1677,11 @@ static void gen8_update_reg_state_unlocked(struct 
i915_gem_context *ctx,
 
CTX_REG(reg_state, state_offset, flex_regs[i], value);
}
+
+   CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+   gen8_make_rpcs(dev_priv,
+  _intel_context(ctx,
+dev_priv->engine[RCS])->sseu));
 }
 
 /*
@@ -2092,6 +2097,9 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
if (ret)
goto err_lock;
 
+   stream->ops = _oa_stream_ops;
+   dev_priv->perf.oa.exclusive_stream = stream;
+
ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
  stream->oa_config);
if (ret) {
@@ -2099,15 +2107,12 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
goto err_enable;
}
 
-   stream->ops = _oa_stream_ops;
-
-   dev_priv->perf.oa.exclusive_stream = stream;
-
mutex_unlock(_priv->drm.struct_mutex);
 
return 0;
 
 err_enable:
+   dev_priv->perf.oa.exclusive_stream = NULL;
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
mutex_unlock(_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1c9cf41dab3f..3b379c7295a3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1305,9 +1305,6 @@ static int __context_pin(struct i915_gem_context *ctx, 
struct i915_vma *vma)
return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
-
 static void
 __execlists_update_reg_state(struct intel_engine_cs *engine,
 struct intel_context *ce)
@@ -1322,7 +1319,7 @@ __execlists_update_reg_state(struct intel_engine_cs 
*engine,
/* RPCS */
if (engine->class == RENDER_CLASS) {
ce->lrc_reg_state[CTX_R_PWR_CLK_STATE + 1] =
-   make_rpcs(engine->i915, >sseu);
+   gen8_make_rpcs(engine->i915, >sseu);
}
 }
 
@@ -2508,13 +2505,12 @@ int logical_xcs_ring_init(struct intel_engine_cs 
*engine)
return logical_ring_init(engine);
 }
 
-static u32
-make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu)
+u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *req_sseu)
 {
const struct sseu_dev_info *sseu =