On Wed, Apr 15, 2015 at 1:31 PM, Matt Turner <matts...@gmail.com> wrote: > On Wed, Apr 1, 2015 at 6:19 PM, Jason Ekstrand <ja...@jlekstrand.net> wrote: >> Instead of the complicated and broken-by-design pile of heuristics we had >> before, we now have a straightforward lowering: >> >> 1) All header sources are copied directly using force_writemask_all and, >> since they are guaranteed to be a single register, there are no >> force_sechalf issues. >> >> 2) All non-header sources are copied using the exact same force_sechalf >> and saturate modifiers as the LOAD_PAYLOAD operation itself. > > Let's not do this. Nothing puts a saturate modifier on LOAD_PAYLOAD > today, and it is kind of confusing about what it means. Can't we have > fbwrites that write depth as well. I wouldn't think we wanted to > saturate that.
Sure. I can drop saturate and just assert that it's not set. We do want to keep force_sechalf and force_writemask_all though. > I don't think it buys us anything. If we just run copy propagation > after lower_load_payload() we'll get the code we want. > >> 3) In order to accommodate older gens that need interleaved colors, >> lower_load_payload detects when the destination is a COMPR4 register >> and automatically interleaves the non-header sources. The >> lower_load_payload pass does the right thing here regardless of whether >> or not the hardware actually supports COMPR4. >> --- >> src/mesa/drivers/dri/i965/brw_fs.cpp | 154 >> ++++++++++++++------------- >> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 - >> 2 files changed, 80 insertions(+), 77 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index f8e26c0..bc45a38 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -3201,93 +3201,99 @@ fs_visitor::lower_load_payload() >> { >> bool progress = false; >> >> - int vgrf_to_reg[alloc.count]; >> - int reg_count = 0; >> - for (unsigned i = 0; i < alloc.count; ++i) { >> - vgrf_to_reg[i] = reg_count; >> - reg_count += alloc.sizes[i]; >> - } >> - >> - struct { >> - bool written:1; /* Whether this register has ever been written */ >> - bool force_writemask_all:1; >> - bool force_sechalf:1; >> - } metadata[reg_count]; >> - memset(metadata, 0, sizeof(metadata)); >> - >> foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { >> - if (inst->dst.file == GRF) { >> - const int dst_reg = vgrf_to_reg[inst->dst.reg] + >> inst->dst.reg_offset; >> - bool force_sechalf = inst->force_sechalf && >> - !inst->force_writemask_all; >> - bool toggle_sechalf = inst->dst.width == 16 && >> - type_sz(inst->dst.type) == 4 && >> - !inst->force_writemask_all; >> - for (int i = 0; i < inst->regs_written; ++i) { >> - metadata[dst_reg + i].written = true; >> - metadata[dst_reg + i].force_sechalf = force_sechalf; >> - metadata[dst_reg + i].force_writemask_all = >> inst->force_writemask_all; >> - force_sechalf = (toggle_sechalf != force_sechalf); >> - } >> - } >> - >> if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { > > We should invert this condition and 'continue' so that we can avoid an > extra level of indentation which only makes the code harder to read. Sure. I can do that. >> assert(inst->dst.file == MRF || inst->dst.file == GRF); >> + >> fs_reg dst = inst->dst; >> >> - for (int i = 0; i < inst->sources; i++) { >> - dst.width = inst->src[i].effective_width; >> - dst.type = inst->src[i].type; >> - >> - if (inst->src[i].file == BAD_FILE) { >> - /* Do nothing but otherwise increment as normal */ >> - } else if (dst.file == MRF && >> - dst.width == 8 && >> - brw->has_compr4 && >> - i + 4 < inst->sources && >> - inst->src[i + 4].equals(horiz_offset(inst->src[i], >> 8))) { >> - fs_reg compr4_dst = dst; >> - compr4_dst.reg += BRW_MRF_COMPR4; >> - compr4_dst.width = 16; >> - fs_reg compr4_src = inst->src[i]; >> - compr4_src.width = 16; >> - fs_inst *mov = MOV(compr4_dst, compr4_src); >> + /* Get rid of COMPR4. We'll add it back in if we need it */ >> + if (dst.file == MRF && dst.reg & BRW_MRF_COMPR4) >> + dst.reg = dst.reg & ~BRW_MRF_COMPR4; > > No point in checking whether the BRW_MRF_COMPR4 bit is set before > clearing it. Just clear it. Sure. >> + >> + dst.width = 8; >> + for (uint8_t i = 0; i < inst->header_size; i++) { >> + if (inst->src[i].file != BAD_FILE) { >> + fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); >> + fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); >> + mov_src.width = 8; >> + fs_inst *mov = MOV(mov_dst, mov_src); >> mov->force_writemask_all = true; >> inst->insert_before(block, mov); >> - /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */ >> - inst->src[i + 4].file = BAD_FILE; >> - } else { >> - fs_inst *mov = MOV(dst, inst->src[i]); >> - if (inst->src[i].file == GRF) { >> - int src_reg = vgrf_to_reg[inst->src[i].reg] + >> - inst->src[i].reg_offset; >> - mov->force_sechalf = metadata[src_reg].force_sechalf; >> - mov->force_writemask_all = >> metadata[src_reg].force_writemask_all; >> - } else { >> - /* We don't have any useful metadata for immediates or >> - * uniforms. Assume that any of the channels of the >> - * destination may be used. >> - */ >> - assert(inst->src[i].file == IMM || >> - inst->src[i].file == UNIFORM); >> - mov->force_writemask_all = true; >> - } >> + } >> + dst = offset(dst, 1); >> + } >> >> - if (dst.file == GRF) { >> - const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset; >> - const bool force_writemask = mov->force_writemask_all; >> - metadata[dst_reg].force_writemask_all = force_writemask; >> - metadata[dst_reg].force_sechalf = mov->force_sechalf; >> - if (dst.width * type_sz(dst.type) > 32) { >> - assert(!mov->force_sechalf); >> - metadata[dst_reg + 1].force_writemask_all = >> force_writemask; >> - metadata[dst_reg + 1].force_sechalf = !force_writemask; >> + dst.width = inst->exec_size; >> + if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) && >> + inst->exec_size > 8) { >> + /* In this case, the payload portion of the LOAD_PAYLOAD isn't >> + * a straightforward copy. Instead, the result of the >> + * LOAD_PAYLOAD is treated as interlaced and the first four > > I'd prefer it if you changed 'interlaced' to 'interleaved', like > you've said elsewhere. I don't know... just something about the word > interlace that doesn't feel right for this. (Same thing applies to > other patches) That's fine. I can do that >> + * non-header sources are unpacked as: >> + * >> + * m + 0: r0 >> + * m + 1: g0 >> + * m + 2: b0 >> + * m + 3: a0 >> + * m + 4: r1 >> + * m + 5: g1 >> + * m + 6: b1 >> + * m + 7: a1 >> + * >> + * This is used for gen <= 5 fb writes. >> + */ >> + assert(inst->exec_size == 16); >> + assert(inst->header_size + 4 <= inst->sources); >> + for (uint8_t i = inst->header_size; i < inst->header_size + 4; >> i++) { >> + if (inst->src[i].file != BAD_FILE) { >> + if (brw->has_compr4) { >> + fs_reg compr4_dst = retype(dst, inst->src[i].type); >> + compr4_dst.reg |= BRW_MRF_COMPR4; >> + >> + fs_inst *mov = MOV(compr4_dst, inst->src[i]); >> + mov->force_writemask_all = inst->force_writemask_all; >> + mov->saturate = inst->saturate; >> + inst->insert_before(block, mov); >> + } else { >> + /* Platform doesn't have COMPR4. We have to fake it */ >> + fs_reg mov_dst = retype(dst, inst->src[i].type); >> + mov_dst.width = 8; >> + >> + fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0)); >> + mov->force_writemask_all = inst->force_writemask_all; >> + mov->saturate = inst->saturate; >> + inst->insert_before(block, mov); >> + >> + mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1)); >> + mov->force_writemask_all = inst->force_writemask_all; >> + mov->saturate = inst->saturate; >> + mov->force_sechalf = true; >> + inst->insert_before(block, mov); >> } >> } >> >> - inst->insert_before(block, mov); >> + dst.reg++; >> } >> >> + dst.reg += 4; >> + >> + /* The COMPR4 code took care of the first 4 sources. We'll let >> + * the regular path handle any remaining sources. Yes, we are >> + * modifying the instruction but we're about to delete it so >> + * this really doesn't hurt anything. >> + */ >> + inst->header_size += 4; > > I mean, while the comment is a true statement, why is doing this any > better than just... > >> + } >> + >> + for (uint8_t i = inst->header_size; i < inst->sources; i++) { > > ... changing this to inst->header_size + 4? Because the inst->header_size += 4 is predicated on it being a COMPR4 destination while the code below handles both the remaining sources (in the COMPR4 case) and the regular non-COMPR4 case. > Seems like if we get to replace a four line comment (and some actual > code) with '+ 4' that's better. > >> + if (inst->src[i].file != BAD_FILE) { >> + fs_inst *mov = MOV(retype(dst, inst->src[i].type), >> + inst->src[i]); >> + mov->force_writemask_all = inst->force_writemask_all; >> + mov->saturate = inst->saturate; >> + inst->insert_before(block, mov); >> + } >> dst = offset(dst, 1); > > Here and above you could move the dst = ... into the for-loop > increment step, which would allow you to change the if statement to if > (BAD_FILE) continue and unindent the block. > > No preference. I guess. But that also hides the fact that we're doing something to dst. Meh >> } >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> index b7eeb47..a0d8b798 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp >> @@ -3471,9 +3471,6 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg >> color, unsigned components, >> color.type, color.width); >> inst = emit(MOV(dst[len], offset(color, i))); >> inst->saturate = key->clamp_fragment_color; >> - } else if (color.width == 16) { >> - /* We need two BAD_FILE slots for a 16-wide color */ >> - len++; >> } >> len++; >> } >> -- >> 2.3.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev