================
@@ -882,6 +890,107 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
return emitFrexpDiv(Builder, Num, Den, DivFMF);
}
+bool AMDGPUCodeGenPrepareImpl::expandReciprocalNewtonRaphson(
+ BinaryOperator &FDiv) {
+ if (!EnableFP32ReciprocalNewtonRaphson)
+ return false;
+
+ Type *Ty = FDiv.getType();
+ if (!Ty->isFloatTy())
+ return false;
+
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
+
+ const APFloat *NumVal;
+ if (!match(Num, m_APFloat(NumVal)) || !NumVal->isExactlyValue(1.0))
+ return false;
+
+ // Skip fdivs that were created in the slow path.
+ if (FDiv.getMetadata("amdgpu.no.rcp.transform"))
+ return false;
+
+ // 2^126: largest FP32 magnitude whose reciprocal is still normal.
+ constexpr float MaxNormalWithNormalRcp = 0x1.0p126f;
+ // Scale factor for large normals.
+ constexpr float LargeNormalScaleFactor = 0x1.0p-32f;
+
+ // Emit rcp + one Newton-Raphson iteration.
+ // In IEEE denormal mode, v_rcp_f32 flushes subnormal outputs to zero,
+ // so for |x| > 2^126 we scale x down before rcp and scale the result
+ // back. In FTZ mode the flush is expected, so scaling is unnecessary.
+ auto EmitNRPath = [&](IRBuilder<> &B, Value *D) -> Value * {
+ Value *DForRcp = D;
+ Value *Scale = nullptr;
+ if (!HasFP32DenormalFlush) {
+ Value *AbsD = B.CreateUnaryIntrinsic(Intrinsic::fabs, D);
+ Value *IsLarge =
+ B.CreateFCmpOGT(AbsD, ConstantFP::get(Ty, MaxNormalWithNormalRcp));
+ Scale =
+ B.CreateSelect(IsLarge, ConstantFP::get(Ty, LargeNormalScaleFactor),
+ ConstantFP::get(Ty, 1.0));
+ DForRcp = B.CreateFMul(D, Scale);
+ }
+ Value *Y0 = B.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, DForRcp);
+ Value *Err = B.CreateIntrinsic(Intrinsic::fma, {Ty},
+ {DForRcp, Y0, ConstantFP::get(Ty, -1.0)});
+ Value *NegErr = B.CreateFNeg(Err);
+ Value *Y1 = B.CreateIntrinsic(Intrinsic::fma, {Ty}, {Y0, NegErr, Y0});
+ if (!HasFP32DenormalFlush)
+ return B.CreateFMul(Y1, Scale);
+ return Y1;
+ };
+
+ KnownFPClass Known = computeKnownFPClass(Den, fcAllFlags, &FDiv);
+ bool DenIsKnownNormal = (Known.KnownFPClasses & ~fcNormal) == fcNone;
+
+ Value *Result;
+ if (DenIsKnownNormal) {
+ // Denominator is provably normal -- emit NR inline without a branch.
+ IRBuilder<> Builder(&FDiv);
+ Result = EmitNRPath(Builder, Den);
+ } else {
+ BasicBlock *OrigBB = FDiv.getParent();
+ Function *Fn = OrigBB->getParent();
+ LLVMContext &Ctx = Fn->getContext();
+
+ BasicBlock *TailBB =
+ OrigBB->splitBasicBlock(FDiv.getIterator(), "fdiv.tail");
+ OrigBB->getTerminator()->eraseFromParent();
+
+ BasicBlock *FastBB = BasicBlock::Create(Ctx, "fdiv.fast", Fn, TailBB);
+ BasicBlock *SlowBB = BasicBlock::Create(Ctx, "fdiv.slow", Fn, TailBB);
+
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(OrigBB);
+ Value *IsNormal = Builder.CreateIntrinsic(
+ Intrinsic::is_fpclass, {Ty}, {Den, Builder.getInt32(fcNormal)});
+ Builder.CreateCondBr(IsNormal, FastBB, SlowBB);
+
+ Builder.SetInsertPoint(FastBB);
+ Value *FastResult = EmitNRPath(Builder, Den);
+ Builder.CreateBr(TailBB);
+
+ Builder.SetInsertPoint(SlowBB);
+ Value *SlowResult = Builder.CreateFDiv(Num, Den);
----------------
carlobertolli wrote:
Good catch! I added both the fast-math metadata and the debug location, which
were missing. There is another place in the same file that does the same thing,
so it was easy to do.
https://github.com/llvm/llvm-project/pull/194716
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits