llvmbot wrote:


@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Handle this here instead of in DAGCombine, mostly because the f32 case is
already handled here due to its dependency on !fpmath. We can also take
advantage of computeKnownFPClass.

---

Patch is 604.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172053.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+129-12) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll (+309-56) 
- (modified) llvm/test/CodeGen/AMDGPU/rsq.f64.ll (+5716-4375) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 71ea9ef6fc050..e45d0652a65ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -236,6 +236,9 @@ class AMDGPUCodeGenPrepareImpl
                       FastMathFlags FMF) const;
   Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                           FastMathFlags FMF) const;
+  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
+                    FastMathFlags DivFMF, const Instruction *CtxI,
+                    bool IsNegative) const;
 
   bool tryNarrowMathIfNoOverflow(Instruction *I);
 
@@ -605,6 +608,94 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
   return Builder.CreateFMul(Rsq, OutputScaleFactor);
 }
 
+/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
+/// v_rsq_f64. This should give a 1ulp result.
+Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
+                                            FastMathFlags SqrtFMF,
+                                            FastMathFlags DivFMF,
+                                            const Instruction *CtxI,
+                                            bool IsNegative) const {
+  // rsq(x):
+  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
+  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
+  //   return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
+  //
+  // The rsq instruction handles the special cases correctly. We need to check
+  // for the edge case conditions to ensure the special case propagates through
+  // the later instructions.
+
+  Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
+
+  // Try to elide the edge case check.
+  //
+  // Fast math flags imply:
+  //   sqrt ninf => !isinf(x)
+  //   sqrt nnan => not helpful
+  //   fdiv ninf => x != 0, !isinf(x)
+  //   fdiv nnan => x != 0
+  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
+  bool MaybeZero = !DivFMF.noInfs() && !DivFMF.noNaNs();
+
+  DenormalMode DenormMode;
+  FPClassTest Interested = fcNone;
+  if (MaybeZero)
+    Interested = fcZero;
+  if (MaybePosInf)
+    Interested = fcPosInf;
+
+  if (Interested != fcNone) {
+    KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
+    if (KnownSrc.isKnownNeverPosInfinity())
+      MaybePosInf = false;
+
+    DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
+    if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
+      MaybeZero = false;
+  }
+
+  Value *SpecialOrRsq = Y0;
+  if (MaybeZero || MaybePosInf) {
+    Value *Cond;
+    if (MaybePosInf && MaybeZero) {
+      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
+        FPClassTest TestMask = fcPosInf | fcZero;
+        if (DenormMode.inputsAreZero())
+          TestMask |= fcSubnormal;
+
+        Cond = Builder.createIsFPClass(X, TestMask);
+      } else {
+        // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
+        // doesn't respect the floating-point environment.
+        Value *IsZero =
+            Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
+        Value *IsInf =
+            Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
+        Cond = Builder.CreateOr(IsZero, IsInf);
+      }
+    } else if (MaybeZero) {
+      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
+    } else {
+      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
+    }
+
+    SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
+  }
+
+  Value *NegY0 = Builder.CreateFNeg(Y0);
+  Value *NegXY0 = Builder.CreateFMul(NegY0, SpecialOrRsq);
+
+  // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
+  Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
+  Value *Y0E = Builder.CreateFMul(Y0, E);
+
+  Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
+                                  ConstantFP::get(X->getType(), 0.5));
+  if (IsNegative)
+    EFMA = Builder.CreateFNeg(EFMA);
+
+  return Builder.CreateFMA(Y0E, EFMA, Y0);
+}
+
 bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                   FastMathFlags DivFMF,
                                                   FastMathFlags SqrtFMF) const {
@@ -612,8 +703,22 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
   if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
     return false;
 
-  // v_rsq_f32 gives 1ulp
-  return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
+  Type *EltTy = SqrtOp->getType()->getScalarType();
+  switch (EltTy->getTypeID()) {
+  case Type::FloatTyID:
+    // v_rsq_f32 gives 1ulp
+    // Separate correctly rounded fdiv + sqrt give ~1.81 ulp.
+
+    // FIXME: rsq formation should not depend on approx func or the fpmath
+    // accuracy. This strictly improves precision.
+    return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
+  case Type::DoubleTyID:
+    return true;
+  default:
+    return false;
+  }
+
+  llvm_unreachable("covered switch");
 }
 
 Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -629,8 +734,6 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
   if (!CLHS)
     return nullptr;
 
-  assert(Den->getType()->isFloatTy());
-
   bool IsNegative = false;
 
   // TODO: Handle other numerator values with arcp.
@@ -639,14 +742,20 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
     IRBuilder<>::FastMathFlagGuard Guard(Builder);
     Builder.setFastMathFlags(DivFMF | SqrtFMF);
 
-    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
-        canIgnoreDenormalInput(Den, CtxI)) {
-      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
-      // -1.0 / sqrt(x) -> fneg(rsq(x))
-      return IsNegative ? Builder.CreateFNeg(Result) : Result;
+    if (Den->getType()->isFloatTy()) {
+      if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
+          canIgnoreDenormalInput(Den, CtxI)) {
+        Value *Result =
+            Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
+        // -1.0 / sqrt(x) -> fneg(rsq(x))
+        return IsNegative ? Builder.CreateFNeg(Result) : Result;
+      }
+
+      return emitRsqIEEE1ULP(Builder, Den, IsNegative);
     }
 
-    return emitRsqIEEE1ULP(Builder, Den, IsNegative);
+    if (Den->getType()->isDoubleTy())
+      return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
   }
 
   return nullptr;
@@ -758,6 +867,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
       return Rsq;
   }
 
+  if (!Num->getType()->isFloatTy())
+    return nullptr;
+
   Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
   if (Rcp)
     return Rcp;
@@ -793,7 +905,8 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
     return false;
 
   Type *Ty = FDiv.getType()->getScalarType();
-  if (!Ty->isFloatTy())
+  const bool IsFloat = Ty->isFloatTy();
+  if (!IsFloat && !Ty->isDoubleTy())
     return false;
 
   // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
@@ -818,6 +931,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
       RsqOp = SqrtOp->getOperand(0);
   }
 
+  // rcp path not yet implemented for f64.
+  if (!IsFloat && !RsqOp)
+    return false;
+
   // Inaccurate rcp is allowed with afn.
   //
   // Defer to codegen to handle this.
@@ -832,7 +949,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
     return false;
 
   // Defer the correct implementations to codegen.
-  if (ReqdAccuracy < 1.0f)
+  if (IsFloat && ReqdAccuracy < 1.0f)
     return false;
 
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll
index b97cd91f2ab32..764b10a7d1987 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.f64.ll
@@ -4,8 +4,15 @@
 define double @rsq_f64(double %x) {
 ; CHECK-LABEL: define double @rsq_f64(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT:    [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul contract double [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul contract double [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -16,8 +23,16 @@ define double @rsq_f64(double %x) {
 define double @neg_rsq_f64(double %x) {
 ; CHECK-LABEL: define double @neg_rsq_f64(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f64(double [[X]], i32 608)
+; CHECK-NEXT:    [[TMP3:%.*]] = select contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fneg contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul contract double [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul contract double [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[TMP9:%.*]] = fneg contract double [[TMP8]]
+; CHECK-NEXT:    [[FDIV:%.*]] = call contract double @llvm.fma.f64(double [[TMP7]], double [[TMP9]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract double @llvm.sqrt.f64(double %x)
@@ -28,8 +43,15 @@ define double @neg_rsq_f64(double %x) {
 define double @rsq_f64_nnan(double %x) {
 ; CHECK-LABEL: define double @rsq_f64_nnan(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call nnan contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv nnan contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp nnan contract oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT:    [[TMP3:%.*]] = select nnan contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fneg nnan contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan contract double [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul nnan contract double [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[FDIV:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract nnan double @llvm.sqrt.f64(double %x)
@@ -40,8 +62,16 @@ define double @rsq_f64_nnan(double %x) {
 define double @neg_rsq_f64_nnan(double %x) {
 ; CHECK-LABEL: define double @neg_rsq_f64_nnan(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call nnan contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv nnan contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp nnan contract oeq double [[X]], 0x7FF0000000000000
+; CHECK-NEXT:    [[TMP3:%.*]] = select nnan contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fneg nnan contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan contract double [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul nnan contract double [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[TMP9:%.*]] = fneg nnan contract double [[TMP8]]
+; CHECK-NEXT:    [[FDIV:%.*]] = call nnan contract double @llvm.fma.f64(double [[TMP7]], double [[TMP9]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract nnan double @llvm.sqrt.f64(double %x)
@@ -52,8 +82,13 @@ define double @neg_rsq_f64_nnan(double %x) {
 define double @rsq_f64_ninf(double %x) {
 ; CHECK-LABEL: define double @rsq_f64_ninf(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg ninf contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul ninf contract double [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul ninf contract double [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[FDIV:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract ninf double @llvm.sqrt.f64(double %x)
@@ -64,8 +99,14 @@ define double @rsq_f64_ninf(double %x) {
 define double @neg_rsq_f64_ninf(double %x) {
 ; CHECK-LABEL: define double @neg_rsq_f64_ninf(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv ninf contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg ninf contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul ninf contract double [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul ninf contract double [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[TMP7:%.*]] = fneg ninf contract double [[TMP6]]
+; CHECK-NEXT:    [[FDIV:%.*]] = call ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP7]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract ninf double @llvm.sqrt.f64(double %x)
@@ -76,8 +117,13 @@ define double @neg_rsq_f64_ninf(double %x) {
 define double @rsq_f64_nnan_ninf(double %x) {
 ; CHECK-LABEL: define double @rsq_f64_nnan_ninf(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call nnan ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv nnan ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul nnan ninf contract double [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract nnan ninf double @llvm.sqrt.f64(double %x)
@@ -88,8 +134,14 @@ define double @rsq_f64_nnan_ninf(double %x) {
 define double @neg_rsq_f64_nnan_ninf(double %x) {
 ; CHECK-LABEL: define double @neg_rsq_f64_nnan_ninf(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call nnan ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv nnan ninf contract double -1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul nnan ninf contract double [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP3]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP4]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[TMP7:%.*]] = fneg nnan ninf contract double [[TMP6]]
+; CHECK-NEXT:    [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP7]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract nnan ninf double @llvm.sqrt.f64(double %x)
@@ -100,8 +152,15 @@ define double @neg_rsq_f64_nnan_ninf(double %x) {
 define double @rsq_f64_sqrt_nnan_ninf(double %x) {
 ; CHECK-LABEL: define double @rsq_f64_sqrt_nnan_ninf(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call nnan ninf contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp nnan ninf contract oeq double [[X]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = select nnan ninf contract i1 [[TMP2]], double [[TMP1]], double [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fneg nnan ninf contract double [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul nnan ninf contract double [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP5]], double [[TMP1]], double 1.000000e+00)
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul nnan ninf contract double [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP6]], double 3.750000e-01, double 5.000000e-01)
+; CHECK-NEXT:    [[FDIV:%.*]] = call nnan ninf contract double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP1]])
 ; CHECK-NEXT:    ret double [[FDIV]]
 ;
   %sqrt.x = call contract nnan ninf double @llvm.sqrt.f64(double %x)
@@ -112,8 +171,13 @@ define double @rsq_f64_sqrt_nnan_ninf(double %x) {
 define double @rsq_f64_fdiv_nnan_ninf(double %x) {
 ; CHECK-LABEL: define double @rsq_f64_fdiv_nnan_ninf(
 ; CHECK-SAME: double [[X:%.*]]) {
-; CHECK-NEXT:    [[SQRT_X:%.*]] = call contract double @llvm.sqrt.f64(double [[X]])
-; CHECK-NEXT:    [[FDIV:%.*]] = fdiv nnan ninf contract double 1.000000e+00, [[SQRT_X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf contract double @llvm.amdgcn.rsq.f64(double [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg nnan ...
[truncated]

``````````
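
For readers skimming the diff: the correction sequence emitted by `emitRsqF64` is a degree-2 polish of the hardware estimate. With `e = 1 - x*y0^2`, we have `1/sqrt(x) = y0*(1 - e)^(-1/2) ≈ y0*(1 + e/2 + 3e^2/8) = y0 + (y0*e)*(0.5 + 0.375*e)`, which is exactly the pair of `llvm.fma.f64` calls in the CHECK lines. The `i32 608` mask passed to `llvm.is.fpclass` is `fcPosInf (0x200) | fcPosZero (0x40) | fcNegZero (0x20)`, matching the `fcPosInf | fcZero` test in the patch. Below is a minimal standalone model of the sequence, not the patch's code: `rsqF64Model` is a made-up name for illustration, and a correctly rounded `1.0 / sqrt(x)` stands in for `v_rsq_f64`, so the sketch only demonstrates the special-case plumbing, not the accuracy gain.

```cpp
#include <cmath>
#include <cstdio>
#include <limits>

// Illustrative model of the emitted f64 rsq expansion (not an LLVM API).
static double rsqF64Model(double x) {
  double y0 = 1.0 / std::sqrt(x); // stand-in for the v_rsq_f64 estimate

  // Mirror of the SpecialOrRsq select: for x == 0 or x == +inf, feed y0 back
  // into the correction instead of x. Otherwise e would be computed from
  // inf * 0 and the whole chain would produce NaN rather than propagating
  // the already-correct special-case result from the estimate.
  double special = (x == 0.0 || std::isinf(x)) ? y0 : x;

  // e = 1 - x*y0^2, fused as in the patch.
  double e = std::fma(-y0 * special, y0, 1.0);

  // y0 + (y0*e)*(0.5 + 0.375*e) = y0*(1 + e/2 + 3e^2/8).
  return std::fma(y0 * e, std::fma(e, 0.375, 0.5), y0);
}

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  for (double x : {4.0, 2.0, 1e-300, 0.0, inf})
    std::printf("rsq(%g) = %.17g\n", x, rsqF64Model(x));
  return 0;
}
```

Dropping the `special` select and using `x` directly makes the `0.0` and `inf` cases return NaN in this model; that is the failure mode the `llvm.is.fpclass`/`fcmp oeq` guards in the patch exist to avoid, and the fast-math-flag reasoning in `emitRsqF64` is about proving those guards unnecessary.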

</details>


https://github.com/llvm/llvm-project/pull/172053