On 21 October 2013 17:48, Eric Anholt <e...@anholt.net> wrote: > This avoids a lot of message setup we had to do otherwise. Improves > GLB2.7 performance with register spilling force enabled by 1.6442% +/- > 0.553218% (n=4). > --- > src/mesa/drivers/dri/i965/brw_defines.h | 7 ++++ > src/mesa/drivers/dri/i965/brw_eu.h | 5 +++ > src/mesa/drivers/dri/i965/brw_eu_emit.c | 41 > ++++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_fs.h | 1 + > src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 10 ++++++ > src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 21 +++++++---- > .../drivers/dri/i965/brw_schedule_instructions.cpp | 12 +++++++ > src/mesa/drivers/dri/i965/brw_shader.cpp | 2 ++ > 8 files changed, 93 insertions(+), 6 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h > b/src/mesa/drivers/dri/i965/brw_defines.h > index 72a0891..276ab44 100644 > --- a/src/mesa/drivers/dri/i965/brw_defines.h > +++ b/src/mesa/drivers/dri/i965/brw_defines.h > @@ -776,6 +776,7 @@ enum opcode { > > SHADER_OPCODE_GEN4_SCRATCH_READ, > SHADER_OPCODE_GEN4_SCRATCH_WRITE, > + SHADER_OPCODE_GEN7_SCRATCH_READ, > > FS_OPCODE_DDX, > FS_OPCODE_DDY, > @@ -1135,6 +1136,12 @@ enum brw_message_target { > #define GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE 12 > #define GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE 13 > > +#define GEN7_DATAPORT_SCRATCH_READ ((1 << 18) > | \ > + (0 << 17)) > +#define GEN7_DATAPORT_SCRATCH_WRITE ((1 << 18) > | \ > + (1 << 17)) > +#define GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT 12 > + > /* HSW */ > #define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ 0 > #define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ 1 > diff --git a/src/mesa/drivers/dri/i965/brw_eu.h > b/src/mesa/drivers/dri/i965/brw_eu.h > index 072310d..a307948 100644 > --- a/src/mesa/drivers/dri/i965/brw_eu.h > +++ b/src/mesa/drivers/dri/i965/brw_eu.h > @@ -379,6 +379,11 @@ void brw_oword_block_write_scratch(struct brw_compile > *p, > int num_regs, > GLuint offset); > > +void gen7_block_read_scratch(struct brw_compile *p, > + struct brw_reg dest, > + int num_regs, > + GLuint offset); > + > void brw_shader_time_add(struct brw_compile *p, > struct brw_reg payload, > uint32_t surf_index); > diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c > b/src/mesa/drivers/dri/i965/brw_eu_emit.c > index 8efd679..accf324 100644 > --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c > +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c > @@ -2055,6 +2055,47 @@ brw_oword_block_read_scratch(struct brw_compile *p, > } > } > > +void > +gen7_block_read_scratch(struct brw_compile *p, > + struct brw_reg dest, > + int num_regs, > + GLuint offset) > +{ > + dest = retype(dest, BRW_REGISTER_TYPE_UW); > + > + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); > + > + assert(insn->header.predicate_control == 0); >
Can we please use BRW_PREDICATE_NONE instead of 0 here? > + insn->header.compression_control = BRW_COMPRESSION_NONE; > + > + brw_set_dest(p, insn, dest); > + > + /* The HW requires that the header is present; this is to get the g0.5 > + * scratch offset. > + */ > + bool header_present = true; > + brw_set_src0(p, insn, brw_vec8_grf(0, 0)); > + > + brw_set_message_descriptor(p, insn, > + GEN7_SFID_DATAPORT_DATA_CACHE, > + 1, /* mlen: just g0 */ > + num_regs, > + header_present, > + false); > + > + insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ; > + > + assert(num_regs == 1 || num_regs == 2 || num_regs == 4); > + insn->bits3.ud |= (num_regs - 1) << > GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT; > + > + /* The "HWORD" unit in the docs just happens to mean "the size of a > + * register" > + */ > I had to search the docs for a while to understand what this comment was referring to. How about saying something like this instead? /* According to the docs, offset is "A 12-bit HWord offset into the memory * Immediate Memory buffer as specified by binding table 0xFF." An HWORD * is 32 bytes, which happens to be the size of a register. */ > + offset /= REG_SIZE; > + assert(offset < (1 << 12)); > + insn->bits3.ud |= offset; > +} > + > /** > * Read a float[4] vector from the data port Data Cache (const buffer). > * Location (in buffer) should be a multiple of 16. > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index f9c87c7..432f3df 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -521,6 +521,7 @@ private: > bool negate_value); > void generate_scratch_write(fs_inst *inst, struct brw_reg src); > void generate_scratch_read(fs_inst *inst, struct brw_reg dst); > + void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst); > void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg > dst, > struct brw_reg index, > struct brw_reg offset); > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > index 6aebc41..fa8bccd 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > @@ -732,6 +732,12 @@ fs_generator::generate_scratch_read(fs_inst *inst, > struct brw_reg dst) > } > > void > +fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg > dst) > +{ > + gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset); > +} > + > +void > fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, > struct brw_reg dst, > struct brw_reg index, > @@ -1517,6 +1523,10 @@ fs_generator::generate_code(exec_list *instructions) > generate_scratch_read(inst, dst); > break; > > + case SHADER_OPCODE_GEN7_SCRATCH_READ: > + generate_scratch_read_gen7(inst, dst); > + break; > + > case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: > generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); > break; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > index 75090a6..a78c1af 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > @@ -527,17 +527,25 @@ fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, > uint32_t spill_offset, > int count) > { > for (int i = 0; i < count; i++) { > + /* The gen7 descriptor-based offset is 12 bits of HWORD units. */ > + bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * > REG_SIZE; > + > fs_inst *unspill_inst = > - new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_READ, dst); > + new(mem_ctx) fs_inst(gen7_read ? > + SHADER_OPCODE_GEN7_SCRATCH_READ : > + SHADER_OPCODE_GEN4_SCRATCH_READ, > + dst); > unspill_inst->offset = spill_offset; > unspill_inst->ir = inst->ir; > unspill_inst->annotation = inst->annotation; > > - /* Choose a MRF that won't conflict with an MRF that's live across > the > - * spill. Nothing else will make it up to MRF 14/15. > - */ > - unspill_inst->base_mrf = 14; > - unspill_inst->mlen = 1; /* header contains offset */ > + if (!gen7_read) { > + /* Choose a MRF that won't conflict with an MRF that's live > across > + * the spill. Nothing else will make it up to MRF 14/15. > + */ > + unspill_inst->base_mrf = 14; > + unspill_inst->mlen = 1; /* header contains offset */ > + } > inst->insert_before(unspill_inst); > > dst.reg_offset++; > @@ -605,6 +613,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) > break; > > case SHADER_OPCODE_GEN4_SCRATCH_READ: > + case SHADER_OPCODE_GEN7_SCRATCH_READ: > if (inst->dst.file == GRF) > no_spill[inst->dst.reg] = true; > break; > diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > index 9a480b4..a4e0be3 100644 > --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > @@ -329,6 +329,18 @@ schedule_node::set_latency_gen7(bool is_haswell) > latency = 200; > break; > > + case SHADER_OPCODE_GEN7_SCRATCH_READ: > + /* Testing a load from offset 0, that had been previously written: > + * > + * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { > align1 WE_normal 1Q }; > + * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q }; > + * > + * The cycles spent seemed to be grouped around 40-50 (as low as > 38), > + * then around 140. Presumably this is cache hit vs miss. > + */ > + latency = 50; > + break; > + > default: > /* 2 cycles: > * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal > 1Q }; > diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp > b/src/mesa/drivers/dri/i965/brw_shader.cpp > index 19421ba..81930ae 100644 > --- a/src/mesa/drivers/dri/i965/brw_shader.cpp > +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp > @@ -442,6 +442,8 @@ brw_instruction_name(enum opcode op) > return "gen4_scratch_read"; > case SHADER_OPCODE_GEN4_SCRATCH_WRITE: > return "gen4_scratch_write"; > + case SHADER_OPCODE_GEN7_SCRATCH_READ: > + return "gen7_scratch_read"; > > case FS_OPCODE_DDX: > return "ddx"; > -- > 1.8.4.rc3 > With those changes, this patch is: Reviewed-by: Paul Berry <stereotype...@gmail.com> I sent a comment on patch 2 earlier. The remainder are: Reviewed-by: Paul Berry <stereotype...@gmail.com>
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev