Does this give correct results for special floats (0, infs)?
We tried to improve x86 rcp (for single floats) in llvmpipe with
Newton-Raphson, but unfortunately it could not give correct results
for these two cases (without even more additional code), so it all got
disabled in the end (you can still see that code in the driver), since
those problems are at least as bad as the ones due to bad accuracy...

Roland

Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>
> ---
> 
> Not sure how many steps are needed for the necessary accuracy. Just
> doing 2 because that seems like a reasonable number.
> 
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 42 ++++++++++++++++++++--
>  1 file changed, 39 insertions(+), 3 deletions(-)
> 
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 87e75e1..9767566 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>     bld.setPosition(i, false);
>  
>     // 1. Take the source and it up.
> -   Value *src[2], *dst[2], *def = i->getDef(0);
> -   bld.mkSplit(src, 4, i->getSrc(0));
> +   Value *input = i->getSrc(0);
> +   Value *src[2], *dst[2], *guess, *def = i->getDef(0);
> +   bld.mkSplit(src, 4, input);
>  
>     // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
>     dst[0] = bld.loadImm(NULL, 0);
> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>  
>     // 4. Recombine the two dst pieces back into the original destination.
>     bld.setPosition(i, true);
> -   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
> +   guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
> +
> +   // 5. Perform 2 Newton-Raphson steps
> +   if (i->op == OP_RCP) {
> +      // RCP: x_{n+1} = 2 * x_n - input * x_n^2
> +      Value *two = bld.getSSA(8);
> +
> +      bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
> +
> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
> +   } else {
> +      // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
> +      Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
> +      bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
> +      bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
> +
> +      half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
> +      // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
> +                                    three_half));
> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
> +                                    three_half));
> +   }
> +
> +   bld.mkMov(def, guess);
>  }
>  
>  bool
> 

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to