For our own edification, we should add some feedback in the INTEL_DEBUG=perf case. If there is any case that an ADDC (or SUBB) from the frontend doesn't get merged with an add, we should generate a perf warning. This probably indicates a failing of the optimization pass.
That could probably be a follow-on patch... On 09/23/2013 04:13 PM, Matt Turner wrote: > --- > src/mesa/drivers/dri/i965/brw_fs.h | 1 + > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 101 > +++++++++++++++++++++++++++ > 2 files changed, 102 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index a56f561..d776b5a 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -373,6 +373,7 @@ public: > fs_reg src0, fs_reg src1); > bool try_emit_saturate(ir_expression *ir); > bool try_emit_mad(ir_expression *ir, int mul_arg); > + void try_combine_add_with_addc_subb(); > void try_replace_with_sel(); > void emit_bool_to_cond_code(ir_rvalue *condition); > void emit_if_gen6(ir_if *ir); > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index 23cdc2e..531ef52 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -313,9 +313,104 @@ fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg) > return true; > } > > +/** > + * The addition and carry in the uaddCarry() built-in function are > implemented > + * separately as ir_binop_add and ir_binop_carry respectively. i965 generates > + * ADDC and a MOV from the accumulator for the carry. > + * > + * The generated code for uaddCarry(uint x, uint y, out uint carry) would > look > + * like this: > + * > + * addc null, x, y > + * mov carry, acc0 > + * add sum, x, y > + * > + * This peephole pass optimizes this into > + * > + * addc sum, x, y > + * mov carry, acc0 > + * > + * usubBorrow() works in the same fashion. > + */ > +void > +fs_visitor::try_combine_add_with_addc_subb() > +{ > + /* ADDC/SUBB was introduced in gen7. */ > + if (brw->gen < 7) > + return; > + > + fs_inst *add_inst = (fs_inst *) instructions.get_tail(); > + assert(add_inst->opcode == BRW_OPCODE_ADD); > + > + /* ADDC/SUBB only operates on UD. */ > + if (add_inst->dst.type != BRW_REGISTER_TYPE_UD || > + add_inst->src[0].type != BRW_REGISTER_TYPE_UD || > + add_inst->src[1].type != BRW_REGISTER_TYPE_UD) > + return; > + > + bool found = false; > + fs_inst *match = (fs_inst *) add_inst->prev; > + /* The ADDC should appear within 8 instructions of ADD for a vec4. SUBB > + * should appear farther away because of the extra MOV negates. > + */ > + for (int i = 0; i < 16; i++, match = (fs_inst *) match->prev) { > + if (match->is_head_sentinel()) > + return; > + > + /* Look for an ADDC/SUBB instruction whose destination is the null > + * register (ir_binop_carry emits ADDC with null destination; same for > + * ir_binop_borrow with SUBB) and whose sources are identical to those > + * of the ADD. > + */ > + if (match->opcode != BRW_OPCODE_ADDC && match->opcode != > BRW_OPCODE_SUBB) > + continue; > + > + /* Only look for newly emitted ADDC/SUBB with null destination. */ > + if (match->dst.file != ARF || match->dst.reg != BRW_ARF_NULL) > + continue; > + > + fs_reg *src0 = &add_inst->src[0]; > + fs_reg *src1 = &add_inst->src[1]; > + > + /* For SUBB, the ADD's second source will contain a negate modifier > + * which at this point will be in the form of a > + * > + * MOV dst, -src > + * > + * instruction, so src[1].file will be GRF, even if it's a uniform push > + * constant. > + */ > + if (match->src[1].reg != add_inst->src[1].reg) { > + /* The negating MOV should be immediately before the ADD. */ > + fs_inst *mov_inst = (fs_inst *) add_inst->prev; > + if (mov_inst->opcode != BRW_OPCODE_MOV) > + continue; > + > + src1 = &mov_inst->src[0]; > + } > + > + /* If everything matches, we're done. */ > + if (match->src[0].file == src0->file && > + match->src[1].file == src1->file && > + match->src[0].reg == src0->reg && > + match->src[1].reg == src1->reg && > + match->src[0].reg_offset == src0->reg_offset && > + match->src[1].reg_offset == src1->reg_offset) { > + found = true; > + break; > + } > + } > + > + if (found) { > + match->dst = add_inst->dst; > + add_inst->remove(); > + } > +} > + > void > fs_visitor::visit(ir_expression *ir) > { > + static bool emitted_addc_or_subb = false; > unsigned int operand; > fs_reg op[3], temp; > fs_inst *inst; > @@ -415,6 +510,8 @@ fs_visitor::visit(ir_expression *ir) > > case ir_binop_add: > emit(ADD(this->result, op[0], op[1])); > + if (emitted_addc_or_subb) > + try_combine_add_with_addc_subb(); > break; > case ir_binop_sub: > assert(!"not reached: should be handled by ir_sub_to_add_neg"); > @@ -451,6 +548,8 @@ fs_visitor::visit(ir_expression *ir) > if (brw->gen >= 7 && dispatch_width == 16) > fail("16-wide explicit accumulator operands unsupported\n"); > > + emitted_addc_or_subb = true; > + > struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); > > emit(ADDC(reg_null_ud, op[0], op[1])); > @@ -461,6 +560,8 @@ fs_visitor::visit(ir_expression *ir) > if (brw->gen >= 7 && dispatch_width == 16) > fail("16-wide explicit accumulator operands unsupported\n"); > > + emitted_addc_or_subb = true; > + > struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD); > > emit(SUBB(reg_null_ud, op[0], op[1])); > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev