Strongly mitigates the harm from the previous commit, which made many integer multiplications much heavier in terms of register usage and instruction count.
total instructions in shared programs : 5294693 -> 5268293 (-0.50%) total gprs used in shared programs : 624962 -> 624196 (-0.12%) total shared used in shared programs : 360704 -> 360704 (0.00%) total local used in shared programs : 21048 -> 20952 (-0.46%) local shared gpr inst bytes helped 1 0 368 1772 1772 hurt 0 0 74 23 23 Signed-off-by: Rhys Perry <pendingchao...@gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 135 ++++++++++++++++++--- 1 file changed, 121 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 14cc4b32d4..5b23b816b3 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -378,6 +378,10 @@ private: CmpInstruction *findOriginForTestWithZero(Value *); + Value *createMulMethod1(Value *a, unsigned b, Value *c); + Value *createMulMethod2(Value *a, unsigned b, Value *c); + Value *createMul(Value *a, unsigned b, Value *c); + unsigned int foldCount; BuildUtil bld; @@ -952,6 +956,97 @@ ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2) } } +Value * +ConstantFolding::createMulMethod1(Value *a, unsigned b, Value *c) +{ + if (b == 1) + return a; + + // Basically constant folded shift and add multiplication. + Value *res = c ? 
c : bld.loadImm(NULL, 0u); + bool resZero = !c; + unsigned ashift = 0; + while (b) { + if ((b & 1) && ashift) { + if (resZero) + res = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, bld.mkImm(ashift)); + else + res = bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, bld.mkImm(ashift), res); + resZero = false; + } else if (b & 1) { + if (resZero) + res = a; + else + res = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), res, a); + resZero = false; + } + b >>= 1; + ashift++; + } + return res; +} + +Value * +ConstantFolding::createMulMethod2(Value *a, unsigned b, Value *c) +{ + uint64_t b2 = u_next_power_of_two(b); + unsigned b2shift = ffsll(b2) - 1; + if (b2 != b) { // a * b2 - a * (b2 - b) + // mul1 = a * (b2 - b) + Value *mul1 = createMulMethod1(a, b2 - b, NULL); + + if (b2shift < 32 && c) { // a * b2 - mul1 + c (implemented as a * b2 + c - mul1) + return bld.mkOp2v(OP_SUB, TYPE_U32, bld.getSSA(), + bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), + a, bld.mkImm(b2shift), c), + mul1); + } else + if (b2shift < 32) { // a * b2 - mul1 + Value *res = bld.getSSA(); + Instruction *i = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(b2shift), mul1); + if (bld.getProgram()->getTarget()->isModSupported(i, 2, NV50_IR_MOD_NEG)) + i->src(2).mod *= Modifier(NV50_IR_MOD_NEG); + else + i->setSrc(2, bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1)); + return res; + } else + if (c) { // - mul1 + c (implemented as c - mul1) + return bld.mkOp2v(OP_SUB, TYPE_U32, bld.getSSA(), c, mul1); + } else { // - mul1 + return bld.mkOp1v(OP_NEG, TYPE_U32, bld.getSSA(), mul1); + } + } else { + if (c) // a * b2 + c + return bld.mkOp3v(OP_SHLADD, TYPE_U32, bld.getSSA(), a, bld.mkImm(b2shift), c); + else // a * b2 + return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), a, bld.loadImm(NULL, b2shift)); + } +} + +Value * +ConstantFolding::createMul(Value *a, unsigned b, Value *c) +{ + unsigned cost[2]; + + // Estimate cost for first method (a << i) + (b << j) + ... 
+ cost[0] = u_bit_count64(b >> 1); + + // Estimate cost for second method (a << i) - ((a << j) + (a << k) + ...) + uint64_t rounded_b = u_next_power_of_two(b); + cost[1] = rounded_b == b ? 1 : (u_bit_count64((rounded_b - b) >> 1) + 2); + if (c) cost[1]++; + + // The general method, multiplication by XMADs, costs three instructions. + // So nothing larger than that or it could be making things worse. + if (cost[0] > 3 && cost[1] > 3) + return NULL; + + if (cost[0] < cost[1]) + return createMulMethod1(a, b, c); + else + return createMulMethod2(a, b, c); +} + void ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) { @@ -1039,13 +1134,25 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->setSrc(s, i->getSrc(t)); i->src(s).mod = i->src(t).mod; } else - if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) { - i->op = OP_SHL; - imm0.applyLog2(); - i->setSrc(0, i->getSrc(t)); - i->src(0).mod = i->src(t).mod; - i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); - i->src(1).mod = 0; + if (!isFloatType(i->dType)) { + bool optimized = false; + if (target->isOpSupported(OP_SHLADD, TYPE_U32)) { + bld.setPosition(i, false); + Value *val = createMul(i->getSrc(t), imm0.reg.data.u32, NULL); + if (val) { + i->def(0).replace(val, false); + delete_Instruction(prog, i); + optimized = true; + } + } + if (!optimized && !imm0.isNegative() && imm0.isPow2()) { + i->op = OP_SHL; + imm0.applyLog2(); + i->setSrc(0, i->getSrc(t)); + i->src(0).mod = i->src(t).mod; + i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); + i->src(1).mod = 0; + } } else if (i->postFactor && i->sType == TYPE_F32) { /* Can't emit a postfactor with an immediate, have to fold it in */ @@ -1078,13 +1185,13 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->setSrc(2, NULL); i->op = OP_ADD; } else - if (s == 1 && !imm0.isNegative() && imm0.isPow2() && - !isFloatType(i->dType) && - target->isOpSupported(OP_SHLADD, i->dType) && - 
!i->subOp) { - i->op = OP_SHLADD; - imm0.applyLog2(); - i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); + if (!isFloatType(i->dType) && target->isOpSupported(OP_SHLADD, TYPE_U32) && !i->subOp) { + bld.setPosition(i, false); + Value *val = createMul(i->getSrc(t), imm0.reg.data.u32, i->getSrc(2)); + if (val) { + i->def(0).replace(val, false); + delete_Instruction(prog, i); + } } break; case OP_SUB: -- 2.14.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev