================
@@ -882,6 +890,107 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
   return emitFrexpDiv(Builder, Num, Den, DivFMF);
 }
 
+bool AMDGPUCodeGenPrepareImpl::expandReciprocalNewtonRaphson(
+    BinaryOperator &FDiv) {
+  if (!EnableFP32ReciprocalNewtonRaphson)
+    return false;
+
+  Type *Ty = FDiv.getType();
+  if (!Ty->isFloatTy())
+    return false;
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  const APFloat *NumVal;
+  if (!match(Num, m_APFloat(NumVal)) || !NumVal->isExactlyValue(1.0))
+    return false;
+
+  // Skip fdivs that were created in the slow path.
+  if (FDiv.getMetadata("amdgpu.no.rcp.transform"))
+    return false;
+
+  // 2^126: largest FP32 magnitude whose reciprocal is still normal.
+  constexpr float MaxNormalWithNormalRcp = 0x1.0p126f;
+  // Scale factor for large normals.
+  constexpr float LargeNormalScaleFactor = 0x1.0p-32f;
+
+  // Emit rcp + one Newton-Raphson iteration.
+  // In IEEE denormal mode, v_rcp_f32 flushes subnormal outputs to zero,
+  // so for |x| > 2^126 we scale x down before rcp and scale the result
+  // back. In FTZ mode the flush is expected, so scaling is unnecessary.
+  auto EmitNRPath = [&](IRBuilder<> &B, Value *D) -> Value * {
+    Value *DForRcp = D;
+    Value *Scale = nullptr;
+    if (!HasFP32DenormalFlush) {
+      Value *AbsD = B.CreateUnaryIntrinsic(Intrinsic::fabs, D);
+      Value *IsLarge =
+          B.CreateFCmpOGT(AbsD, ConstantFP::get(Ty, MaxNormalWithNormalRcp));
+      Scale =
+          B.CreateSelect(IsLarge, ConstantFP::get(Ty, LargeNormalScaleFactor),
+                         ConstantFP::get(Ty, 1.0));
+      DForRcp = B.CreateFMul(D, Scale);
+    }
+    Value *Y0 = B.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, DForRcp);
+    Value *Err = B.CreateIntrinsic(Intrinsic::fma, {Ty},
+                                   {DForRcp, Y0, ConstantFP::get(Ty, -1.0)});
+    Value *NegErr = B.CreateFNeg(Err);
+    Value *Y1 = B.CreateIntrinsic(Intrinsic::fma, {Ty}, {Y0, NegErr, Y0});
+    if (!HasFP32DenormalFlush)
+      return B.CreateFMul(Y1, Scale);
+    return Y1;
+  };
+
+  KnownFPClass Known = computeKnownFPClass(Den, fcAllFlags, &FDiv);
+  bool DenIsKnownNormal = (Known.KnownFPClasses & ~fcNormal) == fcNone;
+
+  Value *Result;
+  if (DenIsKnownNormal) {
+    // Denominator is provably normal -- emit NR inline without a branch.
+    IRBuilder<> Builder(&FDiv);
+    Result = EmitNRPath(Builder, Den);
+  } else {
+    BasicBlock *OrigBB = FDiv.getParent();
+    Function *Fn = OrigBB->getParent();
+    LLVMContext &Ctx = Fn->getContext();
+
+    BasicBlock *TailBB =
+        OrigBB->splitBasicBlock(FDiv.getIterator(), "fdiv.tail");
+    OrigBB->getTerminator()->eraseFromParent();
+
+    BasicBlock *FastBB = BasicBlock::Create(Ctx, "fdiv.fast", Fn, TailBB);
+    BasicBlock *SlowBB = BasicBlock::Create(Ctx, "fdiv.slow", Fn, TailBB);
+
+    IRBuilder<> Builder(Ctx);
+    Builder.SetInsertPoint(OrigBB);
+    Value *IsNormal = Builder.CreateIntrinsic(
+        Intrinsic::is_fpclass, {Ty}, {Den, Builder.getInt32(fcNormal)});
+    Builder.CreateCondBr(IsNormal, FastBB, SlowBB);
+
+    Builder.SetInsertPoint(FastBB);
+    Value *FastResult = EmitNRPath(Builder, Den);
+    Builder.CreateBr(TailBB);
+
+    Builder.SetInsertPoint(SlowBB);
+    Value *SlowResult = Builder.CreateFDiv(Num, Den);
----------------
carlobertolli wrote:

Good catch! I added both the fast-math metadata and the debug location, which 
were missing. The same thing is already done in another place in this file, so 
it was easy to do.

https://github.com/llvm/llvm-project/pull/194716
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to