[Intel-gfx] [RFC 2/8] drm/i915: Introduce mode for capture of multi ctx OA reports synchronized with RCS
From: Sourab Gupta sourab.gu...@intel.com This patch introduces a mode of capturing OA counter reports belonging to multiple contexts, which can be mapped back to individual contexts. The OA reports captured in this way are synchronized with the Render command stream (RCS). There may be use cases wherein we need more than the periodic OA capture mode, which is currently supported by perf_event. We may need to insert RCS-synchronized commands to capture the OA counter snapshots. This mode is primarily used for two use cases: - Ability to capture system-wide metrics, along with the ability to map the reports back to individual contexts. - Ability to inject tags for work into the reports. This provides visibility into the multiple stages of work within a single context. The OA reports generated in this way will be forwarded to userspace after appending a footer, which will have this metadata information. This will enable the use cases mentioned above. This patch introduces an additional field in the oa attr structure for supporting this capture mode. The data thus captured needs to be stored in a separate buffer, which will be different from the buffer used otherwise for periodic OA capture mode. Also, this buffer address will not need to be mapped to OA unit register addresses such as OASTATUS1, OASTATUS2 and OABUFFER. The subsequent patches introduce the mechanism for forwarding reports to userspace, handling the command synchronization and the mechanism for inserting corresponding commands into the ringbuffer. v2: Following changes: - Save the gtt offset while pinning the buffer, to be retrieved when required. 
- Use spin_lock instead of spin_lock_irqsave Signed-off-by: Sourab Gupta sourab.gu...@intel.com --- drivers/gpu/drm/i915/i915_drv.h | 10 +++ drivers/gpu/drm/i915/i915_oa_perf.c | 170 +++- include/uapi/drm/i915_drm.h | 3 +- 3 files changed, 141 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 3436f3b..050bdda 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1931,6 +1931,7 @@ struct drm_i915_private { bool event_active; bool periodic; + bool multiple_ctx_mode; u32 period_exponent; u32 metrics_set; @@ -1945,6 +1946,15 @@ struct drm_i915_private { int format_size; spinlock_t flush_lock; } oa_buffer; + + /* Fields for multiple context capture mode */ + struct { + struct drm_i915_gem_object *obj; + u32 gtt_offset; + u8 *addr; + int format; + int format_size; + } oa_rcs_buffer; } oa_pmu; #endif diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c index e7e0b2b..fd0c3a0 100644 --- a/drivers/gpu/drm/i915/i915_oa_perf.c +++ b/drivers/gpu/drm/i915/i915_oa_perf.c @@ -166,19 +166,36 @@ static void flush_oa_snapshots(struct drm_i915_private *dev_priv, } static void -oa_buffer_destroy(struct drm_i915_private *i915) +oa_rcs_buffer_destroy(struct drm_i915_private *i915) { mutex_lock(i915-dev-struct_mutex); + vunmap(i915-oa_pmu.oa_rcs_buffer.addr); + i915_gem_object_ggtt_unpin(i915-oa_pmu.oa_rcs_buffer.obj); + drm_gem_object_unreference(i915-oa_pmu.oa_rcs_buffer.obj-base); + mutex_unlock(i915-dev-struct_mutex); + + spin_lock(i915-oa_pmu.lock); + i915-oa_pmu.oa_rcs_buffer.obj = NULL; + i915-oa_pmu.oa_rcs_buffer.gtt_offset = 0; + i915-oa_pmu.oa_rcs_buffer.addr = NULL; + spin_unlock(i915-oa_pmu.lock); +} +static void +oa_buffer_destroy(struct drm_i915_private *i915) +{ + mutex_lock(i915-dev-struct_mutex); vunmap(i915-oa_pmu.oa_buffer.addr); i915_gem_object_ggtt_unpin(i915-oa_pmu.oa_buffer.obj); 
drm_gem_object_unreference(i915-oa_pmu.oa_buffer.obj-base); + mutex_unlock(i915-dev-struct_mutex); + spin_lock(i915-oa_pmu.lock); i915-oa_pmu.oa_buffer.obj = NULL; i915-oa_pmu.oa_buffer.gtt_offset = 0; i915-oa_pmu.oa_buffer.addr = NULL; + spin_unlock(i915-oa_pmu.lock); - mutex_unlock(i915-dev-struct_mutex); } static void i915_oa_event_destroy(struct perf_event *event) @@ -207,6 +224,9 @@ static void i915_oa_event_destroy(struct perf_event *event) I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) ~GT_NOA_ENABLE)); + if (dev_priv-oa_pmu.multiple_ctx_mode) + oa_rcs_buffer_destroy(dev_priv); + oa_buffer_destroy(dev_priv); BUG_ON(dev_priv-oa_pmu.exclusive_event != event); @@ -216,6 +236,59 @@ static void i915_oa_event_destroy(struct perf_event *event)
[Intel-gfx] [RFC 2/8] drm/i915: Introduce mode for capture of multi ctx OA reports synchronized with RCS
From: Sourab Gupta sourab.gu...@intel.com This patch introduces a mode of capturing OA counter reports belonging to multiple contexts, which can be mapped back to individual contexts. The OA reports captured in this way are synchronized with the Render command stream (RCS). There may be use cases wherein we need more than the periodic OA capture mode, which is currently supported by perf_event. We may need to insert RCS-synchronized commands to capture the OA counter snapshots. This mode is primarily used for two use cases: - Ability to capture system-wide metrics, along with the ability to map the reports back to individual contexts. - Ability to inject tags for work into the reports. This provides visibility into the multiple stages of work within a single context. The OA reports generated in this way will be forwarded to userspace after appending a footer, which will have this metadata information. This will enable the use cases mentioned above. This patch introduces an additional field in the oa attr structure for supporting this capture mode. The data thus captured needs to be stored in a separate buffer, which will be different from the buffer used otherwise for periodic OA capture mode. Also, this buffer address will not need to be mapped to OA unit register addresses such as OASTATUS1, OASTATUS2 and OABUFFER. The subsequent patches introduce the mechanism for forwarding reports to userspace, handling the command synchronization and the mechanism for inserting corresponding commands into the ringbuffer. 
Signed-off-by: Sourab Gupta sourab.gu...@intel.com --- drivers/gpu/drm/i915/i915_drv.h | 9 ++ drivers/gpu/drm/i915/i915_oa_perf.c | 173 +++- include/uapi/drm/i915_drm.h | 3 +- 3 files changed, 143 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index baa0234..740148d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1930,6 +1930,7 @@ struct drm_i915_private { bool event_active; bool periodic; + bool multiple_ctx_mode; u32 period_exponent; u32 metrics_set; @@ -1944,6 +1945,14 @@ struct drm_i915_private { int format_size; spinlock_t flush_lock; } oa_buffer; + + /* Fields for multiple context capture mode */ + struct { + struct drm_i915_gem_object *obj; + u8 *addr; + int format; + int format_size; + } oa_rcs_buffer; } oa_pmu; #endif diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c index e7e0b2b..b79582b 100644 --- a/drivers/gpu/drm/i915/i915_oa_perf.c +++ b/drivers/gpu/drm/i915/i915_oa_perf.c @@ -166,19 +166,39 @@ static void flush_oa_snapshots(struct drm_i915_private *dev_priv, } static void -oa_buffer_destroy(struct drm_i915_private *i915) +oa_rcs_buffer_destroy(struct drm_i915_private *i915) { + unsigned long lock_flags; + mutex_lock(i915-dev-struct_mutex); + vunmap(i915-oa_pmu.oa_rcs_buffer.addr); + i915_gem_object_ggtt_unpin(i915-oa_pmu.oa_rcs_buffer.obj); + drm_gem_object_unreference(i915-oa_pmu.oa_rcs_buffer.obj-base); + mutex_unlock(i915-dev-struct_mutex); + spin_lock_irqsave(i915-oa_pmu.lock, lock_flags); + i915-oa_pmu.oa_rcs_buffer.obj = NULL; + i915-oa_pmu.oa_rcs_buffer.addr = NULL; + spin_unlock_irqrestore(i915-oa_pmu.lock, lock_flags); +} + +static void +oa_buffer_destroy(struct drm_i915_private *i915) +{ + unsigned long lock_flags; + + mutex_lock(i915-dev-struct_mutex); vunmap(i915-oa_pmu.oa_buffer.addr); i915_gem_object_ggtt_unpin(i915-oa_pmu.oa_buffer.obj); drm_gem_object_unreference(i915-oa_pmu.oa_buffer.obj-base); + 
mutex_unlock(i915-dev-struct_mutex); + spin_lock_irqsave(i915-oa_pmu.lock, lock_flags); i915-oa_pmu.oa_buffer.obj = NULL; i915-oa_pmu.oa_buffer.gtt_offset = 0; i915-oa_pmu.oa_buffer.addr = NULL; + spin_unlock_irqrestore(i915-oa_pmu.lock, lock_flags); - mutex_unlock(i915-dev-struct_mutex); } static void i915_oa_event_destroy(struct perf_event *event) @@ -207,6 +227,9 @@ static void i915_oa_event_destroy(struct perf_event *event) I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) ~GT_NOA_ENABLE)); + if (dev_priv-oa_pmu.multiple_ctx_mode) + oa_rcs_buffer_destroy(dev_priv); + oa_buffer_destroy(dev_priv); BUG_ON(dev_priv-oa_pmu.exclusive_event != event); @@ -216,6 +239,59 @@ static void i915_oa_event_destroy(struct perf_event *event) intel_runtime_pm_put(dev_priv); } +static int alloc_obj(struct drm_i915_private *dev_priv, +