On Wed, Aug 5, 2015 at 10:52 AM, Francisco Jerez <curroje...@riseup.net> wrote: > In order to make room for the code that will lower the MULH virtual > instruction. Also move the hardware generation and execution type > checks into the same branch, they are going to have to be different > for MULH. > --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 265 > ++++++++++++++++++----------------- > 1 file changed, 136 insertions(+), 129 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index fc9f007..911e32b 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -3125,155 +3125,162 @@ fs_visitor::lower_integer_multiplication() > { > bool progress = false; > > - /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation > - * directly, but CHV/BXT cannot. > - */ > - if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton) > - return false; > - > foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > - if (inst->opcode != BRW_OPCODE_MUL || > - inst->dst.is_accumulator() || > - (inst->dst.type != BRW_REGISTER_TYPE_D && > - inst->dst.type != BRW_REGISTER_TYPE_UD)) > - continue; > - > const fs_builder ibld(this, block, inst); > > - /* The MUL instruction isn't commutative. On Gen <= 6, only the low > - * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of > - * src1 are used. > - * > - * If multiplying by an immediate value that fits in 16-bits, do a > - * single MUL instruction with that value in the proper location. > - */ > - if (inst->src[1].file == IMM && > - inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { > - if (devinfo->gen < 7) { > - fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), > - inst->dst.type); > - ibld.MOV(imm, inst->src[1]); > - ibld.MUL(inst->dst, imm, inst->src[0]); > - } else { > - ibld.MUL(inst->dst, inst->src[0], inst->src[1]); > - } > - } else { > - /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot > - * do 32-bit integer multiplication in one instruction, but instead > - * must do a sequence (which actually calculates a 64-bit result): > - * > - * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D > - * mach(8) null g3<8,8,1>D g4<8,8,1>D > - * mov(8) g2<1>D acc0<8,8,1>D > - * > - * But on Gen > 6, the ability to use second accumulator register > - * (acc1) for non-float data types was removed, preventing a simple > - * implementation in SIMD16. A 16-channel result can be calculated > by > - * executing the three instructions twice in SIMD8, once with > quarter > - * control of 1Q for the first eight channels and again with 2Q for > - * the second eight channels. > - * > - * Which accumulator register is implicitly accessed (by AccWrEnable > - * for instance) is determined by the quarter control. Unfortunately > - * Ivybridge (and presumably Baytrail) has a hardware bug in which > an > - * implicit accumulator access by an instruction with 2Q will access > - * acc1 regardless of whether the data type is usable in acc1. > - * > - * Specifically, the 2Q mach(8) writes acc1 which does not exist for > - * integer data types. > - * > - * Since we only want the low 32-bits of the result, we can do two > - * 32-bit x 16-bit multiplies (like the mul and mach are doing), and > - * adjust the high result and add them (like the mach is doing): > - * > - * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW > - * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW > - * shl(8) g9<1>D g8<8,8,1>D 16D > - * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D > - * > - * We avoid the shl instruction by realizing that we only want to > add > - * the low 16-bits of the "high" result to the high 16-bits of the > - * "low" result and using proper regioning on the add: > - * > - * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW > - * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW > - * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW > - * > - * Since it does not use the (single) accumulator register, we can > - * schedule multi-component multiplications much better. > + if (inst->opcode == BRW_OPCODE_MUL) { > + if (inst->dst.is_accumulator() || > + (inst->dst.type != BRW_REGISTER_TYPE_D && > + inst->dst.type != BRW_REGISTER_TYPE_UD)) > + continue; > + > + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit > + * operation directly, but CHV/BXT cannot. > */ > + if (devinfo->gen >= 8 && > + !devinfo->is_cherryview && !devinfo->is_broxton) > + continue; > > - if (inst->conditional_mod && inst->dst.is_null()) { > - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), > - inst->dst.type); > - } > - fs_reg low = inst->dst; > - fs_reg high(GRF, alloc.allocate(dispatch_width / 8), > - inst->dst.type); > + if (inst->src[1].file == IMM && > + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { > + /* The MUL instruction isn't commutative. On Gen <= 6, only the > low > + * 16-bits of src0 are read, and on Gen >= 7 only the low > 16-bits of > + * src1 are used. > + * > + * If multiplying by an immediate value that fits in 16-bits, do > a > + * single MUL instruction with that value in the proper location. > + */ > + if (devinfo->gen < 7) { > + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), > + inst->dst.type); > + ibld.MOV(imm, inst->src[1]); > + ibld.MUL(inst->dst, imm, inst->src[0]); > + } else { > + ibld.MUL(inst->dst, inst->src[0], inst->src[1]); > + } > + } else { > + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) > cannot > + * do 32-bit integer multiplication in one instruction, but > instead > + * must do a sequence (which actually calculates a 64-bit > result): > + * > + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D > + * mach(8) null g3<8,8,1>D g4<8,8,1>D > + * mov(8) g2<1>D acc0<8,8,1>D > + * > + * But on Gen > 6, the ability to use second accumulator register > + * (acc1) for non-float data types was removed, preventing a > simple > + * implementation in SIMD16. A 16-channel result can be > calculated by > + * executing the three instructions twice in SIMD8, once with > quarter > + * control of 1Q for the first eight channels and again with 2Q > for > + * the second eight channels. > + * > + * Which accumulator register is implicitly accessed (by > AccWrEnable > + * for instance) is determined by the quarter control. > Unfortunately > + * Ivybridge (and presumably Baytrail) has a hardware bug in > which an > + * implicit accumulator access by an instruction with 2Q will > access > + * acc1 regardless of whether the data type is usable in acc1. > + * > + * Specifically, the 2Q mach(8) writes acc1 which does not exist > for > + * integer data types. > + * > + * Since we only want the low 32-bits of the result, we can do > two > + * 32-bit x 16-bit multiplies (like the mul and mach are doing), > and > + * adjust the high result and add them (like the mach is doing): > + * > + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW > + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW > + * shl(8) g9<1>D g8<8,8,1>D 16D > + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D > + * > + * We avoid the shl instruction by realizing that we only want > to add > + * the low 16-bits of the "high" result to the high 16-bits of > the > + * "low" result and using proper regioning on the add: > + * > + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW > + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW > + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW > + * > + * Since it does not use the (single) accumulator register, we > can > + * schedule multi-component multiplications much better. > + */ > + > + if (inst->conditional_mod && inst->dst.is_null()) { > + inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), > + inst->dst.type); > + } > + fs_reg low = inst->dst; > + fs_reg high(GRF, alloc.allocate(dispatch_width / 8), > + inst->dst.type); > > - if (devinfo->gen >= 7) { > - fs_reg src1_0_w = inst->src[1]; > - fs_reg src1_1_w = inst->src[1]; > + if (devinfo->gen >= 7) { > + fs_reg src1_0_w = inst->src[1]; > + fs_reg src1_1_w = inst->src[1]; > > - if (inst->src[1].file == IMM) { > - src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; > - src1_1_w.fixed_hw_reg.dw1.ud >>= 16; > - } else { > - src1_0_w.type = BRW_REGISTER_TYPE_UW; > - if (src1_0_w.stride != 0) { > - assert(src1_0_w.stride == 1); > - src1_0_w.stride = 2; > + if (inst->src[1].file == IMM) { > + src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; > + src1_1_w.fixed_hw_reg.dw1.ud >>= 16; > + } else { > + src1_0_w.type = BRW_REGISTER_TYPE_UW; > + if (src1_0_w.stride != 0) { > + assert(src1_0_w.stride == 1); > + src1_0_w.stride = 2; > + } > + > + src1_1_w.type = BRW_REGISTER_TYPE_UW; > + if (src1_1_w.stride != 0) { > + assert(src1_1_w.stride == 1); > + src1_1_w.stride = 2; > + } > + src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); > } > + ibld.MUL(low, inst->src[0], src1_0_w); > + ibld.MUL(high, inst->src[0], src1_1_w); > + } else { > + fs_reg src0_0_w = inst->src[0]; > + fs_reg src0_1_w = inst->src[0]; > > - src1_1_w.type = BRW_REGISTER_TYPE_UW; > - if (src1_1_w.stride != 0) { > - assert(src1_1_w.stride == 1); > - src1_1_w.stride = 2; > + src0_0_w.type = BRW_REGISTER_TYPE_UW; > + if (src0_0_w.stride != 0) { > + assert(src0_0_w.stride == 1); > + src0_0_w.stride = 2; > } > - src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); > - } > - ibld.MUL(low, inst->src[0], src1_0_w); > - ibld.MUL(high, inst->src[0], src1_1_w); > - } else { > - fs_reg src0_0_w = inst->src[0]; > - fs_reg src0_1_w = inst->src[0]; > > - src0_0_w.type = BRW_REGISTER_TYPE_UW; > - if (src0_0_w.stride != 0) { > - assert(src0_0_w.stride == 1); > - src0_0_w.stride = 2; > - } > + src0_1_w.type = BRW_REGISTER_TYPE_UW; > + if (src0_1_w.stride != 0) { > + assert(src0_1_w.stride == 1); > + src0_1_w.stride = 2; > + } > + src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); > > - src0_1_w.type = BRW_REGISTER_TYPE_UW; > - if (src0_1_w.stride != 0) { > - assert(src0_1_w.stride == 1); > - src0_1_w.stride = 2; > + ibld.MUL(low, src0_0_w, inst->src[1]); > + ibld.MUL(high, src0_1_w, inst->src[1]); > } > - src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); > > - ibld.MUL(low, src0_0_w, inst->src[1]); > - ibld.MUL(high, src0_1_w, inst->src[1]); > - } > + fs_reg dst = inst->dst; > + dst.type = BRW_REGISTER_TYPE_UW; > + dst.subreg_offset = 2; > + dst.stride = 2; > > - fs_reg dst = inst->dst; > - dst.type = BRW_REGISTER_TYPE_UW; > - dst.subreg_offset = 2; > - dst.stride = 2; > + high.type = BRW_REGISTER_TYPE_UW; > + high.stride = 2; > > - high.type = BRW_REGISTER_TYPE_UW; > - high.stride = 2; > + low.type = BRW_REGISTER_TYPE_UW; > + low.subreg_offset = 2; > + low.stride = 2; > > - low.type = BRW_REGISTER_TYPE_UW; > - low.subreg_offset = 2; > - low.stride = 2; > + ibld.ADD(dst, low, high); > > - ibld.ADD(dst, low, high); > + if (inst->conditional_mod) { > + fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); > + set_condmod(inst->conditional_mod, > + ibld.MOV(null, inst->dst)); > + } > + } > > - if (inst->conditional_mod) { > - fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); > - set_condmod(inst->conditional_mod, > - ibld.MOV(null, inst->dst)); > } > +
Extra newline before the else. > + } else { > + continue; > } There's a build-break here, probably a rebase problem in splitting this from the next patch. There seems to be an extra } that should be in the next patch. With that fixed, these five patches are Reviewed-by: Matt Turner <matts...@gmail.com> _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev