Strongly mitigates the harm from the previous commit, which made many integer multiplications much heavier in terms of register usage and instruction count.
total instructions in shared programs : 5820882 -> 5788434 (-0.56%) total gprs used in shared programs : 670595 -> 669996 (-0.09%) total shared used in shared programs : 548832 -> 548832 (0.00%) total local used in shared programs : 21164 -> 21068 (-0.45%) local shared gpr inst bytes helped 1 0 388 2500 2500 hurt 0 0 94 11 11 v4: rework createMul() (has a side-effect that mad/fma is optimized to shl+add on nv50) Signed-off-by: Rhys Perry <pendingchao...@gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 81 ++++++++++++++++++---- 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index dc7bf24ba2..d7eb6b503e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -379,6 +379,8 @@ private: CmpInstruction *findOriginForTestWithZero(Value *); + bool createMul(Value *def, Value *a, int32_t b, Value *c); + unsigned int foldCount; BuildUtil bld; @@ -953,10 +955,65 @@ ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2) } } +bool +ConstantFolding::createMul(Value *def, Value *a, int32_t b, Value *c) +{ + const Target *target = prog->getTarget(); + int64_t absB = llabs(b); + + //a * (2^shl) -> a << shl + if (b >= 0 && util_is_power_of_two_or_zero64(b)) { + int shl = util_logbase2_64(b); + + if (c && target->isOpSupported(OP_SHLADD, TYPE_U32)) + return bld.mkOp3(OP_SHLADD, TYPE_U32, def, a, bld.mkImm(shl), c); + + Value *res = c ? 
bld.getSSA() : def; + bld.mkOp2(OP_SHL, TYPE_U32, res, a, bld.mkImm(shl)); + if (c) + bld.mkOp2(OP_ADD, TYPE_U32, def, res, c); + + return true; + } + + //a * (2^shl + 1) -> a << shl + a + //a * -(2^shl + 1) -> -a << shl + a + //a * (2^shl - 1) -> a << shl - a + //a * -(2^shl - 1) -> -a << shl - a + if ((util_is_power_of_two_or_zero64(absB - 1) || + util_is_power_of_two_or_zero64(absB + 1)) && + target->isOpSupported(OP_SHLADD, TYPE_U32)) { + bool subA = util_is_power_of_two_or_zero64(absB + 1); + int shl = subA ? util_logbase2_64(absB + 1) : util_logbase2_64(absB - 1); + + Value *res = c ? bld.getSSA() : def; + Instruction *insn = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(shl), a); + if (b < 0) + insn->src(0).mod = Modifier(NV50_IR_MOD_NEG); + if (subA) + insn->src(2).mod = Modifier(NV50_IR_MOD_NEG); + + if (c) + bld.mkOp2(OP_ADD, TYPE_U32, def, res, c); + + return true; + } + + if (b >= 0 && b <= 0xffff && target->isOpSupported(OP_XMAD, TYPE_U32)) { + Value *tmp = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(), + a, bld.mkImm(b), c ? 
c : bld.mkImm(0)); + bld.mkOp3(OP_XMAD, TYPE_U32, def, a, bld.mkImm(b), tmp)->subOp = + NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_H1(0); + + return true; + } + + return false; +} + void ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) { - const Target *target = prog->getTarget(); const int t = !s; const operation op = i->op; Instruction *newi = i; @@ -1040,13 +1097,10 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->setSrc(s, i->getSrc(t)); i->src(s).mod = i->src(t).mod; } else - if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) { - i->op = OP_SHL; - imm0.applyLog2(); - i->setSrc(0, i->getSrc(t)); - i->src(0).mod = i->src(t).mod; - i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); - i->src(1).mod = 0; + if (!isFloatType(i->dType) && !i->src(t).mod) { + bld.setPosition(i, false); + if (createMul(i->getDef(0), i->getSrc(t), imm0.reg.data.s32, NULL)) + delete_Instruction(prog, i); } else if (i->postFactor && i->sType == TYPE_F32) { /* Can't emit a postfactor with an immediate, have to fold it in */ @@ -1079,13 +1133,10 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->setSrc(2, NULL); i->op = OP_ADD; } else - if (s == 1 && !imm0.isNegative() && imm0.isPow2() && - !isFloatType(i->dType) && - target->isOpSupported(OP_SHLADD, i->dType) && - !i->subOp) { - i->op = OP_SHLADD; - imm0.applyLog2(); - i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); + if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) { + bld.setPosition(i, false); + if (createMul(i->getDef(0), i->getSrc(t), imm0.reg.data.s32, i->getSrc(2))) + delete_Instruction(prog, i); } break; case OP_SUB: -- 2.14.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev