On Fri, 2016-07-29 at 12:59 -0700, Francisco Jerez wrote: > Iago Toral Quiroga <ito...@igalia.com> writes: > > > > > From: "Juan A. Suarez Romero" <jasua...@igalia.com> > > > > Our current data flow analysis does not take into account that > > channels > > on 64-bit operands are 64-bit. This is a problem when the same > > register > > is accessed using both 64-bit and 32-bit channels. This is very > > common > > in operations where we need to access 64-bit data in 32-bit chunks, > > such as the double packing and packing operations. > > > > This patch changes the analysis by checking the bits that each > > source > > or destination datatype needs. Actually, rather than bits, we use > > blocks of 32bits, which is the minimum channel size. > > > > Because a vgrf can contain a dvec4 (256 bits), we reserve 8 > > 32-bit blocks to map the channels. > > --- > > .../dri/i965/brw_vec4_dead_code_eliminate.cpp | 30 > > +++++++++++++----- > > .../drivers/dri/i965/brw_vec4_live_variables.cpp | 36 > > +++++++++++++++++----- > > .../drivers/dri/i965/brw_vec4_live_variables.h | 7 +++-- > > 3 files changed, 55 insertions(+), 18 deletions(-) > > > > diff --git > > a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > index c643212..1f7cd49 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > > @@ -59,10 +59,16 @@ vec4_visitor::dead_code_eliminate() > > bool result_live[4] = { false }; > > > > if (inst->dst.file == VGRF) { > > + const unsigned offs = type_sz(inst->dst.type) == 8 > > ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > I suggest that instead of duplicating this calculation in every > single > user of the live sets you handle this in the var_from_reg helpers by > adding a new argument to specify which dword of the register > component > you want to take, like: > > > > > inline unsigned > > var_from_reg(const simple_allocator &alloc, const src_reg ®, > > unsigned c = 0, unsigned k = 0); > > > > inline unsigned > > var_from_reg(const simple_allocator &alloc, const dst_reg ®, > > unsigned c = 0, unsigned k = 0); > > This would give you the variable index for the k-th dword of the c-th > component of register reg (feel free to write it in a comment for the > sake of documentation). Doing it in a single place would be > advantageous because we are likely to have to change the calculation > in > the future (e.g. to account for strides), and going through every > user > will be a pain. You could implement it as follows: > > > > > inline unsigned > > var_from_reg(const simple_allocator &alloc, const src_reg ®, > > unsigned c = 0, unsigned i = 0) > > { > > /* ... */ > > const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); > > return 8 * (alloc.offsets[reg.nr] + reg.reg_offset) + > > (BRW_GET_SWZ(reg.swizzle, c) + i / csize * 4) * csize + > > i % csize; > > } > > > > > for (unsigned i = 0; i < inst->regs_written; i++) { > > - for (int c = 0; c < 4; c++) > > - result_live[c] |= BITSET_TEST( > > - live, var_from_reg(alloc, offset(inst- > > >dst, i), c)); > > + for (int c = 0; c < 4; c++) { > > + const unsigned v = > > + var_from_reg(alloc, offset(inst->dst, i), > > c); > > + result_live[c] |= BITSET_TEST(live, v); > > + if (offs) > > + result_live[c] |= BITSET_TEST(live, v + > > offs); > > + } > > Then you could simplify this and most of the following hunks > massively, > like: > > > > > for (unsigned i = 0; i < 2 * inst->regs_written; i++) > > { > > for (int c = 0; c < 4; c++) > > result_live[c] |= BITSET_TEST( > > live, var_from_reg(alloc, inst->dst, c, i)); > > } > > Note that the offset call goes away. The factor of two makes sense > because you need to check 4 * 2 * regs_written bits in total, since > you're keeping track of 8 bits per GRF. It would likely make sense > to > represent vec4_instruction::regs_written and ::regs_read with smaller > granularity (e.g. in bytes) to avoid losing accuracy with sub-GRF > writes > and reads, but that can probably be done in a separate change.
Thanks for the suggestion!! I'll proceed as you said. > > > > > } > > } else { > > for (unsigned c = 0; c < 4; c++) > > @@ -110,11 +116,16 @@ vec4_visitor::dead_code_eliminate() > > } > > > > if (inst->dst.file == VGRF && !inst->predicate) { > > + const unsigned offs = type_sz(inst->dst.type) == 8 ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > for (unsigned i = 0; i < inst->regs_written; i++) { > > for (int c = 0; c < 4; c++) { > > if (inst->dst.writemask & (1 << c)) { > > - BITSET_CLEAR(live, var_from_reg(alloc, > > - offset(inst- > > >dst, i), c)); > > + const unsigned v = > > + var_from_reg(alloc, offset(inst->dst, i), > > c); > > + BITSET_CLEAR(live, v); > > + if (offs) > > + BITSET_CLEAR(live, v + offs); > > } > > } > > } > > @@ -132,10 +143,15 @@ vec4_visitor::dead_code_eliminate() > > > > for (int i = 0; i < 3; i++) { > > if (inst->src[i].file == VGRF) { > > + const unsigned offs = type_sz(inst->src[i].type) == > > 8 ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > for (unsigned j = 0; j < inst->regs_read(i); j++) { > > for (int c = 0; c < 4; c++) { > > - BITSET_SET(live, var_from_reg(alloc, > > - offset(inst- > > >src[i], j), c)); > > + const unsigned v = > > + var_from_reg(alloc, offset(inst->src[i], > > j), c); > > + BITSET_SET(live, v); > > + if (offs) > > + BITSET_SET(live, v + offs); > > } > > } > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > index 57d5fbb..bfc5e44 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp > > @@ -76,12 +76,16 @@ vec4_live_variables::setup_def_use() > > /* Set use[] for this instruction */ > > for (unsigned int i = 0; i < 3; i++) { > > if (inst->src[i].file == VGRF) { > > + const unsigned offs = type_sz(inst->dst.type) == 8 > > ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > for (unsigned j = 0; j < inst->regs_read(i); j++) { > > for (int c = 0; c < 4; c++) { > > const unsigned v = > > var_from_reg(alloc, offset(inst->src[i], > > j), c); > > if (!BITSET_TEST(bd->def, v)) > > BITSET_SET(bd->use, v); > > + if (offs && !BITSET_TEST(bd->def, v + offs)) > > + BITSET_SET(bd->use, v + offs); > > } > > } > > } > > @@ -99,6 +103,8 @@ vec4_live_variables::setup_def_use() > > */ > > if (inst->dst.file == VGRF && > > (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) > > { > > + const unsigned offs = type_sz(inst->dst.type) == 8 ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > for (unsigned i = 0; i < inst->regs_written; i++) { > > for (int c = 0; c < 4; c++) { > > if (inst->dst.writemask & (1 << c)) { > > @@ -106,6 +112,8 @@ vec4_live_variables::setup_def_use() > > var_from_reg(alloc, offset(inst->dst, i), > > c); > > if (!BITSET_TEST(bd->use, v)) > > BITSET_SET(bd->def, v); > > + if (offs && !BITSET_TEST(bd->use, v + offs)) > > + BITSET_SET(bd->def, v + offs); > > } > > } > > } > > @@ -188,7 +196,7 @@ vec4_live_variables::vec4_live_variables(const > > simple_allocator &alloc, > > { > > mem_ctx = ralloc_context(NULL); > > > > - num_vars = alloc.total_size * 4; > > + num_vars = alloc.total_size * 8; > > block_data = rzalloc_array(mem_ctx, struct block_data, cfg- > > >num_blocks); > > > > bitset_words = BITSET_WORDS(num_vars); > > @@ -238,14 +246,14 @@ vec4_visitor::calculate_live_intervals() > > if (this->live_intervals) > > return; > > > > - int *start = ralloc_array(mem_ctx, int, this->alloc.total_size > > * 4); > > - int *end = ralloc_array(mem_ctx, int, this->alloc.total_size * > > 4); > > + int *start = ralloc_array(mem_ctx, int, this->alloc.total_size > > * 8); > > + int *end = ralloc_array(mem_ctx, int, this->alloc.total_size * > > 8); > > ralloc_free(this->virtual_grf_start); > > ralloc_free(this->virtual_grf_end); > > this->virtual_grf_start = start; > > this->virtual_grf_end = end; > > > > - for (unsigned i = 0; i < this->alloc.total_size * 4; i++) { > > + for (unsigned i = 0; i < this->alloc.total_size * 8; i++) { > > start[i] = MAX_INSTRUCTION; > > end[i] = -1; > > } > > @@ -257,18 +265,26 @@ vec4_visitor::calculate_live_intervals() > > foreach_block_and_inst(block, vec4_instruction, inst, cfg) { > > for (unsigned int i = 0; i < 3; i++) { > > if (inst->src[i].file == VGRF) { > > + const unsigned offs = type_sz(inst->dst.type) == 8 ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > for (unsigned j = 0; j < inst->regs_read(i); j++) { > > for (int c = 0; c < 4; c++) { > > const unsigned v = > > var_from_reg(alloc, offset(inst->src[i], j), > > c); > > start[v] = MIN2(start[v], ip); > > end[v] = ip; > > + if (offs) { > > + start[v + offs] = MIN2(start[v + offs], ip); > > + end[v + offs] = ip; > > + } > > } > > } > > } > > } > > > > if (inst->dst.file == VGRF) { > > + const unsigned offs = type_sz(inst->dst.type) == 8 ? > > + 1 : (inst->exec_size == 4 ? 0 : 4); > > for (unsigned i = 0; i < inst->regs_written; i++) { > > for (int c = 0; c < 4; c++) { > > if (inst->dst.writemask & (1 << c)) { > > @@ -276,6 +292,10 @@ vec4_visitor::calculate_live_intervals() > > var_from_reg(alloc, offset(inst->dst, i), c); > > start[v] = MIN2(start[v], ip); > > end[v] = ip; > > + if (offs) { > > + start[v + offs] = MIN2(start[v + offs], ip); > > + end[v + offs] = ip; > > + } > > } > > } > > } > > @@ -340,8 +360,8 @@ vec4_visitor::var_range_end(unsigned v, > > unsigned n) const > > bool > > vec4_visitor::virtual_grf_interferes(int a, int b) > > { > > - return !((var_range_end(4 * alloc.offsets[a], 4 * > > alloc.sizes[a]) <= > > - var_range_start(4 * alloc.offsets[b], 4 * > > alloc.sizes[b])) || > > - (var_range_end(4 * alloc.offsets[b], 4 * > > alloc.sizes[b]) <= > > - var_range_start(4 * alloc.offsets[a], 4 * > > alloc.sizes[a]))); > > + return !((var_range_end(8 * alloc.offsets[a], 8 * > > alloc.sizes[a]) <= > > + var_range_start(8 * alloc.offsets[b], 8 * > > alloc.sizes[b])) || > > + (var_range_end(8 * alloc.offsets[b], 8 * > > alloc.sizes[b]) <= > > + var_range_start(8 * alloc.offsets[a], 8 * > > alloc.sizes[a]))); > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > index 12d281e..cdbd7dc 100644 > > --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h > > @@ -84,8 +84,8 @@ var_from_reg(const simple_allocator &alloc, const > > src_reg ®, > > { > > assert(reg.file == VGRF && reg.nr < alloc.count && > > reg.reg_offset < alloc.sizes[reg.nr] && c < 4); > > - return (4 * (alloc.offsets[reg.nr] + reg.reg_offset) + > > - BRW_GET_SWZ(reg.swizzle, c)); > > + return (8 * (alloc.offsets[reg.nr] + reg.reg_offset) + > > + BRW_GET_SWZ(reg.swizzle, c) * MAX2(1, type_sz(reg.type) > > / 4)); > > } > > > > inline unsigned > > @@ -94,7 +94,8 @@ var_from_reg(const simple_allocator &alloc, const > > dst_reg ®, > > { > > assert(reg.file == VGRF && reg.nr < alloc.count && > > reg.reg_offset < alloc.sizes[reg.nr] && c < 4); > > - return 4 * (alloc.offsets[reg.nr] + reg.reg_offset) + c; > > + return 8 * (alloc.offsets[reg.nr] + reg.reg_offset) + > > + c * MAX2(1, type_sz(reg.type) / 4); > > } > > > > } /* namespace brw */ > > I think there are a few cases you haven't updated for this change of > representation of live variables: > > - brw_vec4.cpp:1158 ("var_range_end(var_from_reg(alloc, inst- > >src[0]), 4) > ip") > - brw_vec4_cse.cpp:267 ("var_range_end(var_from_reg(alloc, *src), 4) > < ip") > > BTW, have you tried running shader-db after and before this series, > in > order to make sure that your optimizer changes didn't inadvertently > cause instruction or cycle count regressions? > No, I did not. I will do. Thank you for the notice. J.A. _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev