The engine provides a mirror of the CSB in the HWSP. If we use the cacheable reads from the HWSP, we can shave off a few mmio reads per context-switch interrupt (which are quite frequent!). Just removing a couple of mmio reads is not enough to actually reduce any latency, but it does give a small reduction in overall cpu usage.
Much appreciation for Ben dropping the bombshell that the CSB was in the HWSP and for Michel in digging out the details. v2: Don't be lazy, add the defines for the indices. v3: Include the HWSP in debugfs/i915_engine_info Suggested-by: Ben Widawsky <benjamin.widaw...@intel.com> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk> Cc: Michel Thierry <michel.thie...@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com> Cc: Mika Kuoppala <mika.kuopp...@intel.com> Acked-by: Michel Thierry <michel.thie...@intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 7 +++++-- drivers/gpu/drm/i915/intel_lrc.c | 10 +++++----- drivers/gpu/drm/i915/intel_ringbuffer.h | 2 ++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 620c9218d1c1..5fd01c14a3ec 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3384,6 +3384,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) upper_32_bits(addr), lower_32_bits(addr)); if (i915.enable_execlists) { + const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; u32 ptr, read, write; unsigned int idx; @@ -3404,10 +3405,12 @@ static int i915_engine_info(struct seq_file *m, void *unused) write += GEN8_CSB_ENTRIES; while (read < write) { idx = ++read % GEN8_CSB_ENTRIES; - seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n", + seq_printf(m, "\tExeclist CSB[%d]: 0x%08x [0x%08x in hwsp], context: %d [%d in hwsp]\n", idx, I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)), - I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx))); + hws[idx * 2], + I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)), + hws[idx * 2 + 1]); } rcu_read_lock(); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 3469badedbe0..a887379b004d 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -547,8 +547,9 @@ static void 
intel_lrc_irq_handler(unsigned long data) while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) { u32 __iomem *csb_mmio = dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)); - u32 __iomem *buf = - dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)); + /* The HWSP contains a (cacheable) mirror of the CSB */ + const u32 *buf = + &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; unsigned int head, tail; /* The write will be ordered by the uncached read (itself @@ -590,13 +591,12 @@ static void intel_lrc_irq_handler(unsigned long data) * status notifier. */ - status = readl(buf + 2 * head); + status = buf[2 * head]; if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) continue; /* Check the context/desc id for this event matches */ - GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != - port->context_id); + GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); rq = port_unpack(port, &count); GEM_BUG_ON(count == 0); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index d33c93444c0d..2c55cfa14fb5 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -496,6 +496,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) #define I915_GEM_HWS_SCRATCH_INDEX 0x40 #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT) +#define I915_HWS_CSB_BUF0_INDEX 0x10 + struct intel_ring * intel_engine_create_ring(struct intel_engine_cs *engine, int size); int intel_ring_pin(struct intel_ring *ring, -- 2.13.2 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx