On Mon, Mar 02, 2015 at 11:07:20AM +0000, Arun Siluvery wrote:
> Some workarounds need to be applied during context save but before
> restore, and some at the end of context save/restore but before
> executing the instructions in the ring. These cannot be applied using
> the normal means, so workaround batch buffers are created for the
> purpose; the HW executes them at specific stages during context
> save/restore.
> 
> In this method we initialize a batch buffer with the w/a commands, and
> its address is supplied via the context offset pointers when a context
> is initialized.
> 
> This patch introduces indirect and per-context batch buffers, through
> which the following workarounds are applied. They are required to fix
> issues observed with preemption-related workloads.
> 
> In Indirect context w/a batch buffer,
> +WaDisableCtxRestoreArbitration
> +WaFlushCoherentL3CacheLinesAtContextSwitch
> +WaClearSlmSpaceAtContextSwitch
> 
> In Per context w/a batch buffer,
> +WaDisableCtxRestoreArbitration
> +WaRsRestoreWithPerCtxtBb
> 
> v2: Use GTT address type for all privileged instructions, update as
> per dynamic pinning changes, minor simplifications, rename variables
> as follows to keep lines under 80 chars and rebase.
> s/indirect_ctx_wa_ringbuf/indirect_ctx_wa_bb
> s/per_ctx_wa_ringbuf/per_ctx_wa_bb
> 
> v3: Make WA BB initialization Gen-specific.
> 
> v4: s/PIPE_CONTROL_FLUSH_RO_CACHES/PIPE_CONTROL_FLUSH_L3 (Ville)
> The previous revision modified the definitions of MI_LOAD_REGISTER_MEM
> and MI_LOAD_REGISTER_REG; add GEN8-specific defines for these
> instructions instead, so as to not break any future users of the
> existing definitions (Michel)
> 
> Change-Id: I0cedb536b7f6d9f10ba9e81ba625848e7bab603c
> Signed-off-by: Rafael Barbalho <[email protected]>
> Signed-off-by: Arun Siluvery <[email protected]>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |   3 +
>  drivers/gpu/drm/i915/i915_reg.h         |  28 ++++
>  drivers/gpu/drm/i915/intel_lrc.c        | 231 +++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |   3 +
>  4 files changed, 258 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d42040f..86cdb52 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -774,6 +774,9 @@ struct intel_context {
>  
>       /* Execlists */
>       bool rcs_initialized;
> +     struct intel_ringbuffer *indirect_ctx_wa_bb;
> +     struct intel_ringbuffer *per_ctx_wa_bb;

Why is this per-ctx and not per-engine? Also your patch splitting doesn't
make that much sense: Patch 1 only adds a static function without any
users (resulting in gcc being unhappy). Imo a better split would be:
- wire up wa batch/ring allocation/freeing functions
- wire up the changes to the lrc initial reg state code
- one patch per w/a entry you add

Cheers, Daniel

> +
>       struct {
>               struct drm_i915_gem_object *state;
>               struct intel_ringbuffer *ringbuf;
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 55143cb..3048494 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -347,6 +347,26 @@
>  #define   MI_INVALIDATE_BSD          (1<<7)
>  #define   MI_FLUSH_DW_USE_GTT                (1<<2)
>  #define   MI_FLUSH_DW_USE_PPGTT              (0<<2)
> +#define MI_ATOMIC(len)       MI_INSTR(0x2F, (len-2))
> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22)
> +#define   MI_ATOMIC_INLINE_DATA              (1<<18)
> +#define   MI_ATOMIC_CS_STALL         (1<<17)
> +#define   MI_ATOMIC_RETURN_DATA_CTL  (1<<16)
> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
> +#define MI_ATOMIC_AND        MI_ATOMIC_OP_MASK(0x01)
> +#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02)
> +#define MI_ATOMIC_XOR        MI_ATOMIC_OP_MASK(0x03)
> +#define MI_ATOMIC_MOVE       MI_ATOMIC_OP_MASK(0x04)
> +#define MI_ATOMIC_INC        MI_ATOMIC_OP_MASK(0x05)
> +#define MI_ATOMIC_DEC        MI_ATOMIC_OP_MASK(0x06)
> +#define MI_ATOMIC_ADD        MI_ATOMIC_OP_MASK(0x07)
> +#define MI_ATOMIC_SUB        MI_ATOMIC_OP_MASK(0x08)
> +#define MI_ATOMIC_RSUB       MI_ATOMIC_OP_MASK(0x09)
> +#define MI_ATOMIC_IMAX       MI_ATOMIC_OP_MASK(0x0A)
> +#define MI_ATOMIC_IMIN       MI_ATOMIC_OP_MASK(0x0B)
> +#define MI_ATOMIC_UMAX       MI_ATOMIC_OP_MASK(0x0C)
> +#define MI_ATOMIC_UMIN       MI_ATOMIC_OP_MASK(0x0D)
> +
>  #define MI_BATCH_BUFFER              MI_INSTR(0x30, 1)
>  #define   MI_BATCH_NON_SECURE                (1)
>  /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
> @@ -410,6 +430,7 @@
>  #define   DISPLAY_PLANE_A           (0<<20)
>  #define   DISPLAY_PLANE_B           (1<<20)
>  #define GFX_OP_PIPE_CONTROL(len)     ((0x3<<29)|(0x3<<27)|(0x2<<24)|(len-2))
> +#define   PIPE_CONTROL_FLUSH_L3                              (1<<27)
>  #define   PIPE_CONTROL_GLOBAL_GTT_IVB                        (1<<24) /* gen7+ */
>  #define   PIPE_CONTROL_MMIO_WRITE                    (1<<23)
>  #define   PIPE_CONTROL_STORE_DATA_INDEX                      (1<<21)
> @@ -426,6 +447,7 @@
>  #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE                (1<<9)
>  #define   PIPE_CONTROL_NOTIFY                                (1<<8)
>  #define   PIPE_CONTROL_FLUSH_ENABLE                  (1<<7) /* gen7+ */
> +#define   PIPE_CONTROL_DC_FLUSH_ENABLE                       (1<<5)
>  #define   PIPE_CONTROL_VF_CACHE_INVALIDATE           (1<<4)
>  #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE                (1<<3)
>  #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE                (1<<2)
> @@ -451,6 +473,10 @@
>  #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
>  #define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
>  #define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
> +#define MI_LRM_USE_GLOBAL_GTT (1<<22)
> +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21)
> +#define MI_LOAD_REGISTER_REG_GEN8 MI_INSTR(0x2A, 1)
>  #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>  #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>  #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
> @@ -1520,6 +1546,8 @@ enum skl_disp_power_wells {
>  #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE      (1 << 12)
>  #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE     (1<<10)
>  
> +#define GEN8_RS_PREEMPT_STATUS               0x215C
> +
>  /* Fuse readout registers for GT */
>  #define CHV_FUSE_GT                  (VLV_DISPLAY_BASE + 0x2168)
>  #define   CHV_FGT_EU_DIS_SS0_R0_SHIFT        16
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ea37a56..326539b 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -202,6 +202,7 @@ enum {
>       FAULT_AND_CONTINUE /* Unsupported */
>  };
>  #define GEN8_CTX_ID_SHIFT 32
> +#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
>  
>  static int intel_lr_context_pin(struct intel_engine_cs *ring,
>               struct intel_context *ctx);
> @@ -1139,6 +1140,169 @@ create_wa_bb(struct intel_engine_cs *ring, uint32_t bb_size)
>       return ringbuf;
>  }
>  
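
(create_wa_bb() itself is added by an earlier patch in this series and
is not visible in this hunk. For readers without the whole series, a
minimal sketch of what such a helper might look like, assuming it pairs
with the intel_unpin_ringbuffer_obj()/intel_destroy_ringbuffer_obj()
calls used in the free paths further down; the body below is a guess at
the series' code, not a quote of it:

	/* Sketch only: allocate a page-backed buffer, pinned in the GGTT
	 * and CPU-mapped so that w/a commands can be written into it with
	 * intel_logical_ring_emit(). */
	static struct intel_ringbuffer *
	create_wa_bb(struct intel_engine_cs *ring, uint32_t bb_size)
	{
		struct drm_device *dev = ring->dev;
		struct intel_ringbuffer *ringbuf;

		ringbuf = kzalloc(sizeof(*ringbuf), GFP_KERNEL);
		if (!ringbuf)
			return NULL;

		ringbuf->ring = ring;
		ringbuf->size = roundup(bb_size, PAGE_SIZE);
		ringbuf->effective_size = ringbuf->size;
		ringbuf->head = 0;
		ringbuf->tail = 0;
		ringbuf->space = ringbuf->size;

		if (intel_alloc_ringbuffer_obj(dev, ringbuf)) {
			kfree(ringbuf);
			return NULL;
		}

		if (intel_pin_and_map_ringbuffer_obj(dev, ringbuf)) {
			intel_destroy_ringbuffer_obj(ringbuf);
			kfree(ringbuf);
			return NULL;
		}

		return ringbuf;
	}
)
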
> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
> +                                 struct intel_context *ctx)
> +{
> +     unsigned long flags = 0;
> +     u32 scratch_addr;
> +     struct intel_ringbuffer *ringbuf = NULL;
> +
> +     if (ring->scratch.obj == NULL) {
> +             DRM_ERROR("scratch page not allocated for %s\n", ring->name);
> +             return -EINVAL;
> +     }
> +
> +     ringbuf = create_wa_bb(ring, PAGE_SIZE);
> +     if (!ringbuf)
> +             return -ENOMEM;
> +
> +     ctx->indirect_ctx_wa_bb = ringbuf;
> +
> +     /* WaDisableCtxRestoreArbitration:bdw,chv */
> +     intel_logical_ring_emit(ringbuf, MI_ARB_ON_OFF | MI_ARB_DISABLE);
> +
> +     /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw,chv */
> +     intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
> +     intel_logical_ring_emit(ringbuf, PIPE_CONTROL_DC_FLUSH_ENABLE);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +
> +     /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
> +     flags = PIPE_CONTROL_FLUSH_L3 |
> +             PIPE_CONTROL_GLOBAL_GTT_IVB |
> +             PIPE_CONTROL_CS_STALL |
> +             PIPE_CONTROL_QW_WRITE;
> +
> +     /* Actual scratch location is at 128 bytes offset */
> +     scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
> +     scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
> +
> +     intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
> +     intel_logical_ring_emit(ringbuf, flags);
> +     intel_logical_ring_emit(ringbuf, scratch_addr);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +
> +     /* Padding to align with cache line */
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, 0);
> +
> +     /*
> +      * No MI_BATCH_BUFFER_END is required in Indirect ctx BB because
> +      * execution depends on the size defined in CTX_RCS_INDIRECT_CTX
> +      */
> +
> +     return 0;
> +}
> +
> +static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
> +                            struct intel_context *ctx)
> +{
> +     unsigned long flags = 0;
> +     u32 scratch_addr;
> +     struct intel_ringbuffer *ringbuf = NULL;
> +
> +     if (ring->scratch.obj == NULL) {
> +             DRM_ERROR("scratch page not allocated for %s\n", ring->name);
> +             return -EINVAL;
> +     }
> +
> +     ringbuf = create_wa_bb(ring, PAGE_SIZE);
> +     if (!ringbuf)
> +             return -ENOMEM;
> +
> +     ctx->per_ctx_wa_bb = ringbuf;
> +
> +     /* Actual scratch location is at 128 bytes offset */
> +     scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
> +     scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
> +
> +     /* WaDisableCtxRestoreArbitration:bdw,chv */
> +     intel_logical_ring_emit(ringbuf, MI_ARB_ON_OFF | MI_ARB_ENABLE);
> +
> +     /*
> +      * As per Bspec, to workaround a known HW issue, SW must perform the
> +      * below programming sequence prior to programming MI_BATCH_BUFFER_END.
> +      *
> +      * This is only applicable for Gen8.
> +      */
> +
> +     /* WaRsRestoreWithPerCtxtBb:bdw,chv */
> +     intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
> +     intel_logical_ring_emit(ringbuf, INSTPM);
> +     intel_logical_ring_emit(ringbuf,
> +                             _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
> +
> +     flags = MI_ATOMIC_MEMORY_TYPE_GGTT |
> +             MI_ATOMIC_INLINE_DATA |
> +             MI_ATOMIC_CS_STALL |
> +             MI_ATOMIC_RETURN_DATA_CTL |
> +             MI_ATOMIC_MOVE;
> +
> +     intel_logical_ring_emit(ringbuf, MI_ATOMIC(5) | flags);
> +     intel_logical_ring_emit(ringbuf, scratch_addr);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf,
> +                             _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +     intel_logical_ring_emit(ringbuf,
> +                             _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +
> +     /*
> +      * Bspec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
> +      * MI_BATCH_BUFFER_END instructions in this sequence need to be
> +      * in the same cacheline.
> +      */
> +     while (((unsigned long) ringbuf->tail % CACHELINE_BYTES) != 0)
> +             intel_logical_ring_emit(ringbuf, MI_NOOP);
> +
> +     intel_logical_ring_emit(ringbuf,
> +                             MI_LOAD_REGISTER_MEM_GEN8 |
> +                             MI_LRM_USE_GLOBAL_GTT |
> +                             MI_LRM_ASYNC_MODE_ENABLE);
> +     intel_logical_ring_emit(ringbuf, INSTPM);
> +     intel_logical_ring_emit(ringbuf, 0);
> +     intel_logical_ring_emit(ringbuf, scratch_addr);
> +
> +     /*
> +      * Bspec says there should not be any commands programmed
> +      * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
> +      * do not add any new commands
> +      */
> +     intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_REG_GEN8);
> +     intel_logical_ring_emit(ringbuf, GEN8_RS_PREEMPT_STATUS);
> +     intel_logical_ring_emit(ringbuf, GEN8_RS_PREEMPT_STATUS);
> +
> +     intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_END);
> +
> +     return 0;
> +}
> +
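
(Sanity check on the cacheline requirement above: MI_LOAD_REGISTER_MEM_GEN8
is 4 dwords, MI_LOAD_REGISTER_REG_GEN8 is 3 and MI_BATCH_BUFFER_END is 1,
i.e. 8 dwords = 32 bytes in total, less than CACHELINE_BYTES (64). So
padding the tail to a 64-byte boundary with MI_NOOPs is sufficient to keep
the whole sequence within a single cacheline.)
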
> +static int intel_init_workaround_bb(struct intel_engine_cs *ring,
> +                                 struct intel_context *ctx)
> +{
> +     int ret;
> +     struct drm_device *dev = ring->dev;
> +
> +     WARN_ON(ring->id != RCS);
> +
> +     if (IS_GEN8(dev)) {
> +             ret = gen8_init_indirectctx_bb(ring, ctx);
> +             if (ret)
> +                     return ret;
> +
> +             ret = gen8_init_perctx_bb(ring, ctx);
> +             if (ret)
> +                     return ret;
> +     }
> +
> +     return 0;
> +}
> +
>  static int gen8_init_common_ring(struct intel_engine_cs *ring)
>  {
>       struct drm_device *dev = ring->dev;
> @@ -1490,6 +1654,7 @@ static int logical_render_ring_init(struct drm_device *dev)
>       else
>               ring->init_hw = gen8_init_render_ring;
>       ring->init_context = gen8_init_rcs_context;
> +     ring->init_context_bb = intel_init_workaround_bb;
>       ring->cleanup = intel_fini_pipe_control;
>       ring->get_seqno = gen8_get_seqno;
>       ring->set_seqno = gen8_set_seqno;
> @@ -1500,11 +1665,16 @@ static int logical_render_ring_init(struct drm_device *dev)
>       ring->emit_bb_start = gen8_emit_bb_start;
>  
>       ring->dev = dev;
> +
> +     ret = intel_init_pipe_control(ring);
> +     if (ret)
> +             return ret;
> +
>       ret = logical_ring_init(dev, ring);
>       if (ret)
>               return ret;
>  
> -     return intel_init_pipe_control(ring);
> +     return 0;
>  }
>  
>  static int logical_bsd_ring_init(struct drm_device *dev)
> @@ -1784,15 +1954,29 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>       reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
>       reg_state[CTX_SECOND_BB_STATE+1] = 0;
>       if (ring->id == RCS) {
> -             /* TODO: according to BSpec, the register state context
> -              * for CHV does not have these. OTOH, these registers do
> -              * exist in CHV. I'm waiting for a clarification */
>               reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
> -             reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
> +
> +             if (ctx->per_ctx_wa_bb)
> +                     reg_state[CTX_BB_PER_CTX_PTR + 1] =
> +                             i915_gem_obj_ggtt_offset(
> +                                     ctx->per_ctx_wa_bb->obj) | 0x01;
> +             else
> +                     reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
> +
>               reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
> -             reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
>               reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
> -             reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
> +
> +             if (ctx->indirect_ctx_wa_bb) {
> +                     reg_state[CTX_RCS_INDIRECT_CTX + 1] =
> +                             i915_gem_obj_ggtt_offset(
> +                             ctx->indirect_ctx_wa_bb->obj) | 0x01;
> +
> +                     reg_state[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
> +                             CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
> +             } else {
> +                     reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
> +                     reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
> +             }
>       }
>       reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
>       reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
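
(My reading of the register layout programmed here, worth double-checking
against BSpec: INDIRECT_CTX takes the cacheline-aligned GGTT address of the
w/a BB in bits 31:6 and its size in cachelines in bits 5:0, so the "| 0x01"
means one cacheline, which matches the 16 dwords (1 + 6 + 6 + 3 padding =
16, i.e. 64 bytes) emitted by gen8_init_indirectctx_bb().
INDIRECT_CTX_OFFSET takes its value in bits 11:6, hence the "<< 6" on the
0x17 default, and bit 0 of BB_PER_CTX_PTR is the valid bit.)
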
> @@ -1859,6 +2043,18 @@ void intel_lr_context_free(struct intel_context *ctx)
>                       drm_gem_object_unreference(&ctx_obj->base);
>               }
>       }
> +
> +     if (ctx->indirect_ctx_wa_bb) {
> +             intel_unpin_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +             intel_destroy_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +             kfree(ctx->indirect_ctx_wa_bb);
> +     }
> +
> +     if (ctx->per_ctx_wa_bb) {
> +             intel_unpin_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +             intel_destroy_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +             kfree(ctx->per_ctx_wa_bb);
> +     }
>  }
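
(The unpin/destroy/kfree sequence here is repeated verbatim in the error
path of intel_lr_context_deferred_create() below; a small helper would
avoid the duplication. A sketch, the helper name is hypothetical:

	static void lrc_destroy_wa_bb(struct intel_ringbuffer *ringbuf)
	{
		if (!ringbuf)
			return;

		intel_unpin_ringbuffer_obj(ringbuf);
		intel_destroy_ringbuffer_obj(ringbuf);
		kfree(ringbuf);
	}

with the four call sites reduced to lrc_destroy_wa_bb(ctx->indirect_ctx_wa_bb)
and lrc_destroy_wa_bb(ctx->per_ctx_wa_bb).)
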
>  
>  static uint32_t get_lr_context_size(struct intel_engine_cs *ring)
> @@ -1985,6 +2181,16 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  
>       }
>  
> +     if (ring->id == RCS && !ctx->rcs_initialized) {
> +             if (ring->init_context_bb) {
> +                     ret = ring->init_context_bb(ring, ctx);
> +                     if (ret) {
> +                             DRM_ERROR("ring init context bb: %d\n", ret);
> +                             goto error;
> +                     }
> +             }
> +     }
> +
>       ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
>       if (ret) {
>               DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
> @@ -2013,6 +2219,17 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>       return 0;
>  
>  error:
> +     if (ctx->indirect_ctx_wa_bb) {
> +             intel_unpin_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +             intel_destroy_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +             kfree(ctx->indirect_ctx_wa_bb);
> +     }
> +     if (ctx->per_ctx_wa_bb) {
> +             intel_unpin_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +             intel_destroy_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +             kfree(ctx->per_ctx_wa_bb);
> +     }
> +
>       if (is_global_default_ctx)
>               intel_unpin_ringbuffer_obj(ringbuf);
>  error_destroy_rbuf:
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 39183fc..839d698 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -146,6 +146,9 @@ struct  intel_engine_cs {
>       int             (*init_context)(struct intel_engine_cs *ring,
>                                       struct intel_context *ctx);
>  
> +     int             (*init_context_bb)(struct intel_engine_cs *ring,
> +                                        struct intel_context *ctx);
> +
>       void            (*write_tail)(struct intel_engine_cs *ring,
>                                     u32 value);
>       int __must_check (*flush)(struct intel_engine_cs *ring,
> -- 
> 2.3.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx