We can't run into issues with doing writing the global OA/NOA registers configuration from CPU so far, but HW engineers actually recommend doing this from the command streamer.
Since we have a command buffer prepared for the execbuffer side of things, we can reuse that approach here too. Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com> --- drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++---------------- 1 file changed, 100 insertions(+), 103 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index bf4f5fee6764..7e636463e1f5 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref) kfree(oa_config); } +static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915) +{ + struct i915_oa_config *oa_config, *next; + + mutex_lock(&i915->perf.metrics_lock); + list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) { + list_del(&oa_config->vma_link); + i915_gem_object_put(oa_config->obj); + oa_config->obj = NULL; + } + mutex_unlock(&i915->perf.metrics_lock); +} + static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs) { u32 i; @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915) return ret; } -static void config_oa_regs(struct drm_i915_private *dev_priv, - const struct i915_oa_reg *regs, - u32 n_regs) +static int config_oa_regs(struct drm_i915_private *i915, + struct i915_oa_config *oa_config) { - u32 i; + struct i915_request *rq; + struct i915_vma *vma; + long timeout; + u32 *cs; + int err; - for (i = 0; i < n_regs; i++) { - const struct i915_oa_reg *reg = regs + i; + rq = i915_request_create(i915->engine[RCS0]->kernel_context); + if (IS_ERR(rq)) + return PTR_ERR(rq); - I915_WRITE(reg->addr, reg->value); + err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config, + rq); + if (err) { + i915_request_add(rq); + return err; + } + + vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL); + if (unlikely(IS_ERR(vma))) { + i915_request_add(rq); + return PTR_ERR(vma); + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) { + i915_request_add(rq); + return err; } + + err = i915_vma_move_to_active(vma, rq, 0); + if (err) { + i915_vma_unpin(vma); + i915_request_add(rq); + return err; + } + + cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2); + if (IS_ERR(cs)) { + i915_vma_unpin(vma); + i915_request_add(rq); + return PTR_ERR(cs); + } + + if (INTEL_GEN(i915) > 8) { + *cs++ = MI_BATCH_BUFFER_START_GEN8; + *cs++ = lower_32_bits(vma->node.start); + *cs++ = upper_32_bits(vma->node.start); + *cs++ = MI_NOOP; + } else { + *cs++ = MI_BATCH_BUFFER_START; + *cs++ = vma->node.start; + } + + intel_ring_advance(rq, cs); + + i915_vma_unpin(vma); + + i915_request_add(rq); + + i915_request_get(rq); + timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT); + i915_request_put(rq); + + return timeout < 0 ? err : 0; } static int hsw_enable_metric_set(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - const struct i915_oa_config *oa_config = stream->oa_config; - /* PRM: - * - * OA unit is using “crclk” for its functionality. When trunk - * level clock gating takes place, OA clock would be gated, - * unable to count the events from non-render clock domain. - * Render clock gating must be disabled when OA is enabled to - * count the events from non-render domain. Unit level clock - * gating for RCS should also be disabled. - */ I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) & ~GEN7_DOP_CLOCK_GATE_ENABLE)); I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) | GEN6_CSUNIT_CLOCK_GATE_DISABLE)); - config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len); - - /* It apparently takes a fairly long time for a new MUX - * configuration to be be applied after these register writes. - * This delay duration was derived empirically based on the - * render_basic config but hopefully it covers the maximum - * configuration latency. - * - * As a fallback, the checks in _append_oa_reports() to skip - * invalid OA reports do also seem to work to discard reports - * generated before this config has completed - albeit not - * silently. - * - * Unfortunately this is essentially a magic number, since we - * don't currently know of a reliable mechanism for predicting - * how long the MUX config will take to apply and besides - * seeing invalid reports we don't know of a reliable way to - * explicitly check that the MUX config has landed. - * - * It's even possible we've miss characterized the underlying - * problem - it just seems like the simplest explanation why - * a delay at this location would mitigate any invalid reports. - */ - usleep_range(15000, 20000); - - config_oa_regs(dev_priv, oa_config->b_counter_regs, - oa_config->b_counter_regs_len); - - return 0; + return config_oa_regs(dev_priv, stream->oa_config); } static void hsw_disable_metric_set(struct drm_i915_private *dev_priv) @@ -1978,7 +2010,6 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, { unsigned int map_type = i915_coherent_map_type(dev_priv); struct i915_gem_context *ctx; - struct i915_request *rq; int ret; lockdep_assert_held(&dev_priv->drm.struct_mutex); @@ -2037,14 +2068,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv, } /* - * Apply the configuration by doing one context restore of the edited - * context image. + * The above configuration will be applied when called + * config_oa_regs(). */ - rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - i915_request_add(rq); return 0; } @@ -2093,35 +2119,7 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream) if (ret) return ret; - config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len); - - /* It apparently takes a fairly long time for a new MUX - * configuration to be be applied after these register writes. - * This delay duration was derived empirically based on the - * render_basic config but hopefully it covers the maximum - * configuration latency. - * - * As a fallback, the checks in _append_oa_reports() to skip - * invalid OA reports do also seem to work to discard reports - * generated before this config has completed - albeit not - * silently. - * - * Unfortunately this is essentially a magic number, since we - * don't currently know of a reliable mechanism for predicting - * how long the MUX config will take to apply and besides - * seeing invalid reports we don't know of a reliable way to - * explicitly check that the MUX config has landed. - * - * It's even possible we've miss characterized the underlying - * problem - it just seems like the simplest explanation why - * a delay at this location would mitigate any invalid reports. - */ - usleep_range(15000, 20000); - - config_oa_regs(dev_priv, oa_config->b_counter_regs, - oa_config->b_counter_regs_len); - - return 0; + return config_oa_regs(dev_priv, stream->oa_config); } static void gen8_disable_metric_set(struct drm_i915_private *dev_priv) @@ -2292,6 +2290,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, struct perf_open_properties *props) { struct drm_i915_private *dev_priv = stream->dev_priv; + struct drm_i915_gem_object *obj; int format_size; int ret; @@ -2376,13 +2375,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, } } - ret = i915_perf_get_oa_config(dev_priv, props->metrics_set, - &stream->oa_config, NULL); - if (ret) { - DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set); - goto err_config; - } - ret = alloc_noa_wait(dev_priv); if (ret) { DRM_DEBUG("Unable to allocate NOA wait batch buffer\n"); @@ -2412,6 +2404,19 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, if (ret) goto err_lock; + ret = i915_perf_get_oa_config(dev_priv, props->metrics_set, + &stream->oa_config, &obj); + if (ret) { + DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set); + goto err_config; + } + + /* + * We just need the buffer to be created, but not our own reference on + * it as the oa_config already has one. + */ + i915_gem_object_put(obj); + stream->ops = &i915_oa_stream_ops; dev_priv->perf.oa.exclusive_stream = stream; @@ -2430,14 +2435,16 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, err_enable: dev_priv->perf.oa.exclusive_stream = NULL; dev_priv->perf.oa.ops.disable_metric_set(dev_priv); + +err_config: + i915_oa_config_put(stream->oa_config); + i915_oa_config_dispose_buffers(dev_priv); mutex_unlock(&dev_priv->drm.struct_mutex); err_lock: free_oa_buffer(dev_priv); err_oa_buf_alloc: - i915_oa_config_put(stream->oa_config); - intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref); @@ -2446,9 +2453,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, mutex_unlock(&dev_priv->drm.struct_mutex); err_noa_wait_alloc: - i915_oa_config_put(stream->oa_config); - -err_config: if (stream->ctx) oa_put_render_ctx_id(stream); @@ -2810,20 +2814,13 @@ static int i915_perf_release(struct inode *inode, struct file *file) { struct i915_perf_stream *stream = file->private_data; struct drm_i915_private *dev_priv = stream->dev_priv; - struct i915_oa_config *oa_config, *next; mutex_lock(&dev_priv->perf.lock); i915_perf_destroy_locked(stream); /* Dispose of all oa config batch buffers. */ - mutex_lock(&dev_priv->perf.metrics_lock); - list_for_each_entry_safe(oa_config, next, &dev_priv->perf.metrics_buffers, vma_link) { - list_del(&oa_config->vma_link); - i915_gem_object_put(oa_config->obj); - oa_config->obj = NULL; - } - mutex_unlock(&dev_priv->perf.metrics_lock); + i915_oa_config_dispose_buffers(dev_priv); mutex_unlock(&dev_priv->perf.lock); -- 2.21.0.392.gf8f6787159e _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx