We haven't run into issues writing the global OA/NOA register
configuration from the CPU so far, but HW engineers actually
recommend doing this from the command streamer.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
 1 file changed, 100 insertions(+), 103 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index bf4f5fee6764..7e636463e1f5 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref)
        kfree(oa_config);
 }
 
+static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
+{
+       struct i915_oa_config *oa_config, *next;
+
+       mutex_lock(&i915->perf.metrics_lock);
+       list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, 
vma_link) {
+               list_del(&oa_config->vma_link);
+               i915_gem_object_put(oa_config->obj);
+               oa_config->obj = NULL;
+       }
+       mutex_unlock(&i915->perf.metrics_lock);
+}
+
 static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 
n_regs)
 {
        u32 i;
@@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
        return ret;
 }
 
-static void config_oa_regs(struct drm_i915_private *dev_priv,
-                          const struct i915_oa_reg *regs,
-                          u32 n_regs)
+static int config_oa_regs(struct drm_i915_private *i915,
+                         struct i915_oa_config *oa_config)
 {
-       u32 i;
+       struct i915_request *rq;
+       struct i915_vma *vma;
+       long timeout;
+       u32 *cs;
+       int err;
 
-       for (i = 0; i < n_regs; i++) {
-               const struct i915_oa_reg *reg = regs + i;
+       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
+       if (IS_ERR(rq))
+               return PTR_ERR(rq);
 
-               I915_WRITE(reg->addr, reg->value);
+       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
+                                     rq);
+       if (err) {
+               i915_request_add(rq);
+               return err;
+       }
+
+       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
+       if (unlikely(IS_ERR(vma))) {
+               i915_request_add(rq);
+               return PTR_ERR(vma);
+       }
+
+       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+       if (err) {
+               i915_request_add(rq);
+               return err;
        }
+
+       err = i915_vma_move_to_active(vma, rq, 0);
+       if (err) {
+               i915_vma_unpin(vma);
+               i915_request_add(rq);
+               return err;
+       }
+
+       cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
+       if (IS_ERR(cs)) {
+               i915_vma_unpin(vma);
+               i915_request_add(rq);
+               return PTR_ERR(cs);
+       }
+
+       if (INTEL_GEN(i915) >= 8) {
+               *cs++ = MI_BATCH_BUFFER_START_GEN8;
+               *cs++ = lower_32_bits(vma->node.start);
+               *cs++ = upper_32_bits(vma->node.start);
+               *cs++ = MI_NOOP;
+       } else {
+               *cs++ = MI_BATCH_BUFFER_START;
+               *cs++ = vma->node.start;
+       }
+
+       intel_ring_advance(rq, cs);
+
+       i915_vma_unpin(vma);
+
+       i915_request_add(rq);
+
+       i915_request_get(rq);
+       timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | 
I915_WAIT_LOCKED,
+                                   MAX_SCHEDULE_TIMEOUT);
+       i915_request_put(rq);
+
+       return timeout < 0 ? timeout : 0;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
        struct drm_i915_private *dev_priv = stream->dev_priv;
-       const struct i915_oa_config *oa_config = stream->oa_config;
 
-       /* PRM:
-        *
-        * OA unit is using “crclk” for its functionality. When trunk
-        * level clock gating takes place, OA clock would be gated,
-        * unable to count the events from non-render clock domain.
-        * Render clock gating must be disabled when OA is enabled to
-        * count the events from non-render domain. Unit level clock
-        * gating for RCS should also be disabled.
-        */
        I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
                                    ~GEN7_DOP_CLOCK_GATE_ENABLE));
        I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
                                  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
 
-       config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-       /* It apparently takes a fairly long time for a new MUX
-        * configuration to be be applied after these register writes.
-        * This delay duration was derived empirically based on the
-        * render_basic config but hopefully it covers the maximum
-        * configuration latency.
-        *
-        * As a fallback, the checks in _append_oa_reports() to skip
-        * invalid OA reports do also seem to work to discard reports
-        * generated before this config has completed - albeit not
-        * silently.
-        *
-        * Unfortunately this is essentially a magic number, since we
-        * don't currently know of a reliable mechanism for predicting
-        * how long the MUX config will take to apply and besides
-        * seeing invalid reports we don't know of a reliable way to
-        * explicitly check that the MUX config has landed.
-        *
-        * It's even possible we've miss characterized the underlying
-        * problem - it just seems like the simplest explanation why
-        * a delay at this location would mitigate any invalid reports.
-        */
-       usleep_range(15000, 20000);
-
-       config_oa_regs(dev_priv, oa_config->b_counter_regs,
-                      oa_config->b_counter_regs_len);
-
-       return 0;
+       return config_oa_regs(dev_priv, stream->oa_config);
 }
 
 static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -1978,7 +2010,6 @@ static int gen8_configure_all_contexts(struct 
drm_i915_private *dev_priv,
 {
        unsigned int map_type = i915_coherent_map_type(dev_priv);
        struct i915_gem_context *ctx;
-       struct i915_request *rq;
        int ret;
 
        lockdep_assert_held(&dev_priv->drm.struct_mutex);
@@ -2037,14 +2068,9 @@ static int gen8_configure_all_contexts(struct 
drm_i915_private *dev_priv,
        }
 
        /*
-        * Apply the configuration by doing one context restore of the edited
-        * context image.
+        * The above configuration will be applied when
+        * config_oa_regs() is called.
         */
-       rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
-       if (IS_ERR(rq))
-               return PTR_ERR(rq);
-
-       i915_request_add(rq);
 
        return 0;
 }
@@ -2093,35 +2119,7 @@ static int gen8_enable_metric_set(struct 
i915_perf_stream *stream)
        if (ret)
                return ret;
 
-       config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-       /* It apparently takes a fairly long time for a new MUX
-        * configuration to be be applied after these register writes.
-        * This delay duration was derived empirically based on the
-        * render_basic config but hopefully it covers the maximum
-        * configuration latency.
-        *
-        * As a fallback, the checks in _append_oa_reports() to skip
-        * invalid OA reports do also seem to work to discard reports
-        * generated before this config has completed - albeit not
-        * silently.
-        *
-        * Unfortunately this is essentially a magic number, since we
-        * don't currently know of a reliable mechanism for predicting
-        * how long the MUX config will take to apply and besides
-        * seeing invalid reports we don't know of a reliable way to
-        * explicitly check that the MUX config has landed.
-        *
-        * It's even possible we've miss characterized the underlying
-        * problem - it just seems like the simplest explanation why
-        * a delay at this location would mitigate any invalid reports.
-        */
-       usleep_range(15000, 20000);
-
-       config_oa_regs(dev_priv, oa_config->b_counter_regs,
-                      oa_config->b_counter_regs_len);
-
-       return 0;
+       return config_oa_regs(dev_priv, stream->oa_config);
 }
 
 static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -2292,6 +2290,7 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
                               struct perf_open_properties *props)
 {
        struct drm_i915_private *dev_priv = stream->dev_priv;
+       struct drm_i915_gem_object *obj;
        int format_size;
        int ret;
 
@@ -2376,13 +2375,6 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
                }
        }
 
-       ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
-                                     &stream->oa_config, NULL);
-       if (ret) {
-               DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
-               goto err_config;
-       }
-
        ret = alloc_noa_wait(dev_priv);
        if (ret) {
                DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
@@ -2412,6 +2404,19 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
        if (ret)
                goto err_lock;
 
+       ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
+                                     &stream->oa_config, &obj);
+       if (ret) {
+               DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
+               goto err_config;
+       }
+
+       /*
+        * We just need the buffer to be created, but not our own reference on
+        * it as the oa_config already has one.
+        */
+       i915_gem_object_put(obj);
+
        stream->ops = &i915_oa_stream_ops;
        dev_priv->perf.oa.exclusive_stream = stream;
 
@@ -2430,14 +2435,16 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
 err_enable:
        dev_priv->perf.oa.exclusive_stream = NULL;
        dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+
+err_config:
+       i915_oa_config_put(stream->oa_config);
+       i915_oa_config_dispose_buffers(dev_priv);
        mutex_unlock(&dev_priv->drm.struct_mutex);
 
 err_lock:
        free_oa_buffer(dev_priv);
 
 err_oa_buf_alloc:
-       i915_oa_config_put(stream->oa_config);
-
        intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
        intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
 
@@ -2446,9 +2453,6 @@ static int i915_oa_stream_init(struct i915_perf_stream 
*stream,
        mutex_unlock(&dev_priv->drm.struct_mutex);
 
 err_noa_wait_alloc:
-       i915_oa_config_put(stream->oa_config);
-
-err_config:
        if (stream->ctx)
                oa_put_render_ctx_id(stream);
 
@@ -2810,20 +2814,13 @@ static int i915_perf_release(struct inode *inode, 
struct file *file)
 {
        struct i915_perf_stream *stream = file->private_data;
        struct drm_i915_private *dev_priv = stream->dev_priv;
-       struct i915_oa_config *oa_config, *next;
 
        mutex_lock(&dev_priv->perf.lock);
 
        i915_perf_destroy_locked(stream);
 
        /* Dispose of all oa config batch buffers. */
-       mutex_lock(&dev_priv->perf.metrics_lock);
-       list_for_each_entry_safe(oa_config, next, 
&dev_priv->perf.metrics_buffers, vma_link) {
-               list_del(&oa_config->vma_link);
-               i915_gem_object_put(oa_config->obj);
-               oa_config->obj = NULL;
-       }
-       mutex_unlock(&dev_priv->perf.metrics_lock);
+       i915_oa_config_dispose_buffers(dev_priv);
 
        mutex_unlock(&dev_priv->perf.lock);
 
-- 
2.21.0.392.gf8f6787159e

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to