Re: [Mesa-dev] [PATCH v2 3/5] nv50/ir: optimize imul/imad to xmads

2018-07-18 Thread Karol Herbst
some nitpicks, but with those fixed:

Reviewed-by: Karol Herbst 

On Wed, Jul 18, 2018 at 7:05 PM, Rhys Perry  wrote:
> This hits the shader-db numbers a good bit, though a few xmads is way
> faster than an imul or imad and the cost is mitigated by the next commit,
> which optimizes many multiplications by immediates into shorter and less
> register heavy instructions than the xmads.
>
> total instructions in shared programs : 5256901 -> 5294693 (0.72%)
> total gprs used in shared programs: 624328 -> 624962 (0.10%)
> total shared used in shared programs  : 360704 -> 360704 (0.00%)
> total local used in shared programs   : 20952 -> 21048 (0.46%)
>
> local sharedgpr   inst  bytes
> helped   0   0  39   0   0
>   hurt   1   0 33422772277
>
> Signed-off-by: Rhys Perry 
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 51 
> ++
>  .../nouveau/codegen/nv50_ir_target_gm107.cpp   |  1 -
>  2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
> b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index 5fc1fba970..14cc4b32d4 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2291,13 +2291,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
>  // 
> =
>
>  // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
>  class LateAlgebraicOpt : public Pass
>  {
>  private:
> virtual bool visit(Instruction *);
>
> void handleADD(Instruction *);
> +   void handleMULMAD(Instruction *);
> bool tryADDToSHLADD(Instruction *);
> +
> +   BuildUtil bld;
>  };
>
>  void
> @@ -2357,6 +2362,47 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
>
> return true;
>  }
> +
> +// MUL(a, b) -> a few XMADs
> +// MAD/FMA(a, b, c) -> a few XMADs
> +void
> +LateAlgebraicOpt::handleMULMAD(Instruction *i)
> +{
> +   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
> +   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
> +  return;
> +   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
> +  return;
> +   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
> +  return;
> +
> +   assert(!i->src(0).mod);
> +   assert(!i->src(1).mod);
> +   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
> +
> +   bld.setPosition(i, true);
> +
> +   Value *a = i->getSrc(0);
> +   Value *b = i->getSrc(1);
> +   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
> +
> +   Value *tmp0 = bld.getSSA();
> +   Value *tmp1 = bld.getSSA();
> +
> +   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
> +   insn->setPredicate(i->cc, i->getPredicate());
> +
> +   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
> +   insn->setPredicate(i->cc, i->getPredicate());
> +   insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
> +
> +   insn = bld.mkOp3(OP_XMAD, TYPE_U32, i->getDef(0), b, tmp1, tmp0);
> +   insn->setPredicate(i->cc, i->getPredicate());
> +   insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
> +   insn->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
> +
> +   delete_Instruction(prog, i);

I think you can simply adjust the current op instead, then we don't
need to set the predicate, create and delete an instruction:

insn->op = OP_XMAD;
insn->setSrc(0, b);
insn->setSrc(1, tmp1);
insn->setSrc(2, tmp0);
insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
insn->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);

> +}
>
>  bool
>  LateAlgebraicOpt::visit(Instruction *i)
> @@ -2365,6 +2411,11 @@ LateAlgebraicOpt::visit(Instruction *i)
> case OP_ADD:
>handleADD(i);
>break;
> +   case OP_MUL:
> +   case OP_MAD:
> +   case OP_FMA:
> +  handleMULMAD(i);
> +  break;
> default:
>break;
> }
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp 
> b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> index f918fbfdd3..571d8a67c2 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
> @@ -165,7 +165,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) 
> const
>}
>break;
> case OPCLASS_ARITH:
> -  // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
>if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
>!isFloatType(insn->dType))
>   return true;
> --
> 2.14.4
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___

[Mesa-dev] [PATCH v2 3/5] nv50/ir: optimize imul/imad to xmads

2018-07-18 Thread Rhys Perry
This hits the shader-db numbers a good bit, though a few xmads is way
faster than an imul or imad and the cost is mitigated by the next commit,
which optimizes many multiplications by immediates into shorter and less
register heavy instructions than the xmads.

total instructions in shared programs : 5256901 -> 5294693 (0.72%)
total gprs used in shared programs: 624328 -> 624962 (0.10%)
total shared used in shared programs  : 360704 -> 360704 (0.00%)
total local used in shared programs   : 20952 -> 21048 (0.46%)

local sharedgpr   inst  bytes
helped   0   0  39   0   0
  hurt   1   0 33422772277

Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 51 ++
 .../nouveau/codegen/nv50_ir_target_gm107.cpp   |  1 -
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 5fc1fba970..14cc4b32d4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2291,13 +2291,18 @@ AlgebraicOpt::visit(BasicBlock *bb)
 // 
=
 
 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
 class LateAlgebraicOpt : public Pass
 {
 private:
virtual bool visit(Instruction *);
 
void handleADD(Instruction *);
+   void handleMULMAD(Instruction *);
bool tryADDToSHLADD(Instruction *);
+
+   BuildUtil bld;
 };
 
 void
@@ -2357,6 +2362,47 @@ LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
 
return true;
 }
+ 
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
+void
+LateAlgebraicOpt::handleMULMAD(Instruction *i)
+{
+   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
+   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
+  return;
+   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
+  return;
+   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
+  return;
+
+   assert(!i->src(0).mod);
+   assert(!i->src(1).mod);
+   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
+
+   bld.setPosition(i, true);
+
+   Value *a = i->getSrc(0);
+   Value *b = i->getSrc(1);
+   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
+
+   Value *tmp0 = bld.getSSA();
+   Value *tmp1 = bld.getSSA();
+
+   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
+   insn->setPredicate(i->cc, i->getPredicate());
+
+   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
+   insn->setPredicate(i->cc, i->getPredicate());
+   insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
+
+   insn = bld.mkOp3(OP_XMAD, TYPE_U32, i->getDef(0), b, tmp1, tmp0);
+   insn->setPredicate(i->cc, i->getPredicate());
+   insn->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
+   insn->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
+
+   delete_Instruction(prog, i);
+}
 
 bool
 LateAlgebraicOpt::visit(Instruction *i)
@@ -2365,6 +2411,11 @@ LateAlgebraicOpt::visit(Instruction *i)
case OP_ADD:
   handleADD(i);
   break;
+   case OP_MUL:
+   case OP_MAD:
+   case OP_FMA:
+  handleMULMAD(i);
+  break;
default:
   break;
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index f918fbfdd3..571d8a67c2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -165,7 +165,6 @@ TargetGM107::isBarrierRequired(const Instruction *insn) 
const
   }
   break;
case OPCLASS_ARITH:
-  // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
   if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
   !isFloatType(insn->dType))
  return true;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev