[clang] [llvm] [AMDGPU] Use 1-iteration Newton-Raphson refinement for FP32 reciprocal. (PR #194716)

via cfe-commits Sat, 09 May 2026 07:09:19 -0700

https://github.com/carlobertolli updated 
https://github.com/llvm/llvm-project/pull/194716


>From 2228d94c352b77194542cf57d293989fc33694f8 Mon Sep 17 00:00:00 2001
From: Carlo Bertolli <[email protected]>
Date: Wed, 15 Apr 2026 23:39:37 +0000
Subject: [PATCH] [AMDGPU] Use 1-iteration Newton-Raphson refinement for FP32
 reciprocal. The AMDGPU backend lowers the following code

r = 1/x;

to a FP32 division, i.e. a 12-instruction code sequence that handles all 
denominator cases (normal, denormal, inf, NaNs).

For normal denominators, we can lower it to a simple Newton-Raphson computation.
Large normals, whose reciprocal is subnormal and flushed to 0 by v_rcp_f32,
are scaled to a small normal before using v_rcp_f32 and then scaled back.
Max ULP is 1.

When using this algorithm, we see ~1.50x performance improvements on
arithmetic-intensive kernels.
However, the patch introduces an if-then-else (see below) structure that
may prevent overlapping of load latencies with useful computation in 
memory-intensive kernels.

This patch introduces a runtime check on the denominator and
1. If the denominator is neither normal nor NaN, it executes the current code gen scheme.
2. If the denominator is normal or NaN, it executes the new code gen scheme described 
above.

This behavior is hidden behind a flag (turned off by default)
-enable-fp32-recip-newton-raphson

Assisted-by: Cursor (Claude)
---
 clang/include/clang/Options/Options.td        |   9 +
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |   5 +
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   8 +
 .../amdgpu-fp32-recip-newton-raphson.hip      |  30 +
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 170 ++++-
 .../AMDGPU/llvm.amdgcn.rcp.normal.f32.ll      | 705 ++++++++++++++++++
 6 files changed, 926 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.normal.f32.ll

diff --git a/clang/include/clang/Options/Options.td 
b/clang/include/clang/Options/Options.td
index c64ebba6f3dbf..d117114260ae0 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5820,6 +5820,15 @@ defm amdgpu_expand_waitcnt_profiling : 
BoolMOption<"amdgpu-expand-waitcnt-profil
   "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
   NegFlag<SetFalse, [], [ClangOption]>>;
 
+def mamdgpu_fp32_recip_newton_raphson : Flag<["-"], 
"mamdgpu-fp32-recip-newton-raphson">,
+  Group<m_Group>,
+  HelpText<"Use Newton-Raphson refinement for FP32 1.0f/x reciprocal when the 
denominator "
+  "is a normal float or NaN, falling back to the full division sequence for "
+  "denormals/inf/zero. (AMDGPU only)">;
+def mno_amdgpu_fp32_recip_newton_raphson : Flag<["-"], 
"mno-amdgpu-fp32-recip-newton-raphson">,
+  Group<m_Group>,
+  HelpText<"Disable Newton-Raphson refinement for FP32 reciprocal (AMDGPU 
only)">;
+
 def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, 
Group<m_Group>,
   HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
   Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp 
b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 44b2e13ae128c..82a0598cf8527 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -868,6 +868,11 @@ void AMDGPUToolChain::addClangTargetOptions(
       !DriverArgs.hasArg(options::OPT_disable_llvm_optzns))
     CC1Args.push_back("-disable-llvm-optzns");
 
+  if (DriverArgs.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson,
+                         options::OPT_mno_amdgpu_fp32_recip_newton_raphson,
+                         false))
+    CC1Args.append({"-mllvm", "-enable-fp32-recip-newton-raphson"});
+
   if (DeviceOffloadingKind == Action::OFK_None)
     addOpenCLBuiltinsLib(getDriver(), getTriple(), DriverArgs, CC1Args);
 }
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index b4ff90c1d61f0..00a89cfccc55d 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -66,6 +66,9 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, 
const JobAction &JA,
                         "-plugin-opt=-amdgpu-internalize-symbols"};
   if (Args.hasArg(options::OPT_hipstdpar))
     LldArgs.push_back("-plugin-opt=-amdgpu-enable-hipstdpar");
+  if (Args.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson,
+                   options::OPT_mno_amdgpu_fp32_recip_newton_raphson, false))
+    LldArgs.push_back("-plugin-opt=-enable-fp32-recip-newton-raphson");
 
   auto &TC = getToolChain();
   auto &D = TC.getDriver();
@@ -250,6 +253,11 @@ void HIPAMDToolChain::addClangTargetOptions(
       CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
   }
 
+  if (DriverArgs.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson,
+                         options::OPT_mno_amdgpu_fp32_recip_newton_raphson,
+                         false))
+    CC1Args.append({"-mllvm", "-enable-fp32-recip-newton-raphson"});
+
   StringRef MaxThreadsPerBlock =
       DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
   if (!MaxThreadsPerBlock.empty()) {
diff --git a/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip 
b/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip
new file mode 100644
index 0000000000000..3ebf42c7f504f
--- /dev/null
+++ b/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip
@@ -0,0 +1,30 @@
+// REQUIRES: amdgpu-registered-target
+
+// Check that -mamdgpu-fp32-recip-newton-raphson passes -mllvm flag to device 
cc1.
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 -mamdgpu-fp32-recip-newton-raphson %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=ON %s
+
+// ON: "-cc1" {{.*}} "-fcuda-is-device" {{.*}} "-mllvm" 
"-enable-fp32-recip-newton-raphson"
+
+// Check that -mno-amdgpu-fp32-recip-newton-raphson does not pass the flag.
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 -mno-amdgpu-fp32-recip-newton-raphson %s 2>&1 
\
+// RUN:   | FileCheck -check-prefix=OFF %s
+
+// OFF-NOT: "-enable-fp32-recip-newton-raphson"
+
+// Check that -mno overrides -m (last flag wins).
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 -mamdgpu-fp32-recip-newton-raphson \
+// RUN:   -mno-amdgpu-fp32-recip-newton-raphson %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=OFF %s
+
+// Check default (off).
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=OFF %s
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 915d2116bd268..e60e285bdd5af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -94,6 +94,12 @@ static cl::opt<bool> DisableFDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
+static cl::opt<bool> EnableFP32ReciprocalNewtonRaphson(
+    "enable-fp32-recip-newton-raphson", cl::Hidden, cl::init(false),
+    cl::desc("Use Newton-Raphson refinement for 1.0f/x when the denominator "
+             "is a normal float or NaN, falling back to the full division "
+             "sequence for denormals/inf/zero."));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -248,6 +254,16 @@ class AMDGPUCodeGenPrepareImpl
 
   bool tryNarrowMathIfNoOverflow(Instruction *I);
 
+  // Expand an f32 (scalar or vector) fdiv whose numerator is 1.0 or -1.0
+  // into amdgcn.rcp plus a single Newton-Raphson (NR) refinement step
+  // instead of a full division. NR is only used for normal and NaN
+  // denominators: unless the denominator is known to be in those classes,
+  // an if-then-else is emitted that tests it at run time and executes a
+  // regular fdiv otherwise. Gated by -enable-fp32-recip-newton-raphson.
+  // Returns true if FDiv was replaced (and erased), false if it was
+  // left untouched.
+  bool expandReciprocalNewtonRaphson(BinaryOperator &FDiv);
+
 public:
   bool visitFDiv(BinaryOperator &I);
 
@@ -277,7 +293,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
 
     // FIXME: Division expansion needs to preserve the dominator tree.
-    if (!ExpandDiv64InIR)
+    if (!ExpandDiv64InIR && !EnableFP32ReciprocalNewtonRaphson)
       AU.setPreservesAll();
   }
   bool runOnFunction(Function &F) override;
@@ -853,6 +869,155 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
   return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
 }
 
+bool AMDGPUCodeGenPrepareImpl::expandReciprocalNewtonRaphson(
+    BinaryOperator &FDiv) {
+  if (!EnableFP32ReciprocalNewtonRaphson)
+    return false;
+
+  Type *Ty = FDiv.getType();
+  if (Ty->getScalarType() != Type::getFloatTy(FDiv.getContext()))
+    return false;
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  const APFloat *NumVal;
+  if (!match(Num, m_APFloat(NumVal)) ||
+      (!NumVal->isExactlyValue(1.0) && !NumVal->isExactlyValue(-1.0)))
+    return false;
+
+  bool IsNegative = NumVal->isExactlyValue(-1.0);
+
+  // Skip the fallback fdivs this expansion emits itself (tagged below).
+  if (FDiv.getMetadata("amdgpu.no.rcp.transform"))
+    return false;
+
+  const FastMathFlags FMF = FDiv.getFastMathFlags();
+  const DebugLoc DL = FDiv.getDebugLoc();
+
+  // For vector types, scalarize into per-element scalar fdivs, then expand
+  // each one. The pass iterates in reverse so newly created instructions
+  // before the current one would be missed; expand them explicitly here.
+  if (Ty->isVectorTy()) {
+    IRBuilder<> Builder(&FDiv);
+    Builder.setFastMathFlags(FMF);
+    Builder.SetCurrentDebugLocation(DL);
+    SmallVector<Value *, 4> NumVals, DenVals;
+    extractValues(Builder, NumVals, Num);
+    extractValues(Builder, DenVals, Den);
+
+    SmallVector<Value *, 4> ResultVals(NumVals.size());
+    SmallVector<BinaryOperator *, 4> ScalarDivs;
+    for (int I = 0, E = NumVals.size(); I != E; ++I) {
+      Value *EltDiv = Builder.CreateFDiv(NumVals[I], DenVals[I]);
+      ScalarDivs.push_back(cast<BinaryOperator>(EltDiv));
+      ResultVals[I] = EltDiv;
+    }
+
+    Value *Result = insertValues(Builder, Ty, ResultVals);
+    FDiv.replaceAllUsesWith(Result);
+    FDiv.eraseFromParent();
+    for (BinaryOperator *SD : ScalarDivs)
+      expandReciprocalNewtonRaphson(*SD);
+    return true;
+  }
+
+  // -1.0 / x -> 1.0 / (fneg x)
+  // Negate the denominator so the NR path computes rcp(-x) = -1/x directly.
+  IRBuilder<> Builder(&FDiv);
+  Builder.setFastMathFlags(FMF);
+  Builder.SetCurrentDebugLocation(DL);
+  if (IsNegative)
+    Den = Builder.CreateFNeg(Den);
+
+  // 2^126: largest FP32 magnitude whose reciprocal (2^-126) is still normal.
+  constexpr float MaxNormalWithNormalRcp = 0x1.0p126f;
+  // 2^-32: applied to |x| > 2^126 before rcp and again to the refined result.
+  constexpr float LargeNormalScaleFactor = 0x1.0p-32f;
+
+  // Emit rcp + one Newton-Raphson iteration.
+  // In IEEE denormal mode, v_rcp_f32 flushes subnormal outputs to zero,
+  // so for |x| > 2^126 we scale x down before rcp and scale the result
+  // back. In FTZ mode the flush is expected, so scaling is unnecessary.
+  auto EmitNRPath = [&](IRBuilder<> &B, Value *D) -> Value * {
+    Value *DForRcp = D;
+    Value *Scale = nullptr;
+    if (!HasFP32DenormalFlush) {
+      Value *AbsD = B.CreateUnaryIntrinsic(Intrinsic::fabs, D);
+      Value *IsLarge =
+          B.CreateFCmpOGT(AbsD, ConstantFP::get(Ty, MaxNormalWithNormalRcp));
+      Scale =
+          B.CreateSelect(IsLarge, ConstantFP::get(Ty, LargeNormalScaleFactor),
+                         ConstantFP::get(Ty, 1.0));
+      DForRcp = B.CreateFMul(D, Scale);
+    }
+    Value *Y0 = B.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, DForRcp);
+    Value *Err = B.CreateIntrinsic(Intrinsic::fma, {Ty},
+                                   {DForRcp, Y0, ConstantFP::get(Ty, -1.0)});
+    Value *NegErr = B.CreateFNeg(Err);
+    Value *Y1 = B.CreateIntrinsic(Intrinsic::fma, {Ty}, {Y0, NegErr, Y0});
+    if (!HasFP32DenormalFlush)
+      return B.CreateFMul(Y1, Scale);
+    return Y1;
+  };
+
+  KnownFPClass Known = computeKnownFPClass(Den, fcAllFlags, &FDiv);
+  bool DenIsKnownNormalOrNaN = Known.isKnownNever(~(fcNormal | fcNan));
+
+  Value *Result;
+  if (DenIsKnownNormalOrNaN) {
+    // Denominator is provably normal or NaN -- emit NR inline without a 
branch.
+    Result = EmitNRPath(Builder, Den);
+  } else {
+    // Code generation scheme:
+    // if (denominator is normal or NaN) // fdiv.fast
+    //   one-iteration Newton-Raphson refinement
+    // else // fdiv.slow
+    //   regular division of 1/denominator
+    BasicBlock *OrigBB = FDiv.getParent();
+    Function *Fn = OrigBB->getParent();
+    LLVMContext &Ctx = Fn->getContext();
+
+    BasicBlock *TailBB =
+        OrigBB->splitBasicBlock(FDiv.getIterator(), "fdiv.tail");
+    OrigBB->getTerminator()->eraseFromParent();
+
+    BasicBlock *FastBB = BasicBlock::Create(Ctx, "fdiv.fast", Fn, TailBB);
+    BasicBlock *SlowBB = BasicBlock::Create(Ctx, "fdiv.slow", Fn, TailBB);
+
+    IRBuilder<> Builder(Ctx);
+    Builder.setFastMathFlags(FMF);
+    Builder.SetCurrentDebugLocation(DL);
+    Builder.SetInsertPoint(OrigBB);
+    Value *IsNormalOrNaN = Builder.CreateIntrinsic(
+        Intrinsic::is_fpclass, {Ty}, {Den, Builder.getInt32(fcNormal | 
fcNan)});
+    Builder.CreateCondBr(IsNormalOrNaN, FastBB, SlowBB);
+
+    Builder.SetInsertPoint(FastBB);
+    Value *FastResult = EmitNRPath(Builder, Den);
+    Builder.CreateBr(TailBB);
+
+    Builder.SetInsertPoint(SlowBB);
+    Value *SlowNum = ConstantFP::get(Ty, 1.0);
+    Value *SlowResult = Builder.CreateFDiv(SlowNum, Den);
+    cast<Instruction>(SlowResult)
+        ->setMetadata("amdgpu.no.rcp.transform", MDNode::get(Ctx, {}));
+    Builder.CreateBr(TailBB);
+
+    Builder.SetInsertPoint(TailBB, TailBB->begin());
+    PHINode *Phi = Builder.CreatePHI(Ty, 2);
+    Phi->addIncoming(FastResult, FastBB);
+    Phi->addIncoming(SlowResult, SlowBB);
+    Result = Phi;
+
+    FlowChanged = true;
+  }
+
+  FDiv.replaceAllUsesWith(Result);
+  FDiv.eraseFromParent();
+  return true;
+}
+
 Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
     IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
     FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
@@ -901,6 +1066,9 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator 
&FDiv) {
   if (DisableFDivExpand)
     return false;
 
+  if (expandReciprocalNewtonRaphson(FDiv))
+    return true;
+
   Type *Ty = FDiv.getType()->getScalarType();
   const bool IsFloat = Ty->isFloatTy();
   if (!IsFloat && !Ty->isDoubleTy())
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.normal.f32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.normal.f32.ll
new file mode 100644
index 0000000000000..c4d9a8f7f25b1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.normal.f32.ll
@@ -0,0 +1,705 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare \
+; RUN:   -enable-fp32-recip-newton-raphson %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare \
+; RUN:   -enable-fp32-recip-newton-raphson -denormal-fp-math-f32=preserve-sign 
\
+; RUN:   %s | FileCheck -check-prefix=OPT-FTZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -enable-fp32-recip-newton-raphson < %s 
\
+; RUN:   | FileCheck -check-prefix=GFX9 %s
+
+; IEEE mode (default): branch + NR with large-normal scaling.
+; FTZ mode: branch + NR without scaling.
+define amdgpu_kernel void @test_fdiv_recip_f32(ptr addrspace(1) %out, float 
%x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 
267)
+; OPT-NEXT:    br i1 [[TMP1]], label %[[FDIV_FAST:.*]], label %[[FDIV_SLOW:.*]]
+; OPT:       [[FDIV_FAST]]:
+; OPT-NEXT:    [[TMP2:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT:    [[TMP3:%.*]] = fcmp ogt float [[TMP2]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP5:%.*]] = fmul float [[X]], [[TMP4]]
+; OPT-NEXT:    [[TMP6:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP5]])
+; OPT-NEXT:    [[TMP7:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float 
[[TMP6]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP8:%.*]] = fneg float [[TMP7]]
+; OPT-NEXT:    [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float 
[[TMP8]], float [[TMP6]])
+; OPT-NEXT:    [[TMP10:%.*]] = fmul float [[TMP9]], [[TMP4]]
+; OPT-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT:       [[FDIV_SLOW]]:
+; OPT-NEXT:    [[TMP11:%.*]] = fdiv float 1.000000e+00, [[X]], 
!amdgpu.no.rcp.transform [[META0:![0-9]+]]
+; OPT-NEXT:    br label %[[FDIV_TAIL]]
+; OPT:       [[FDIV_TAIL]]:
+; OPT-NEXT:    [[TMP12:%.*]] = phi float [ [[TMP10]], %[[FDIV_FAST]] ], [ 
[[TMP11]], %[[FDIV_SLOW]] ]
+; OPT-NEXT:    store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) 
#[[ATTR0:[0-9]+]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], 
i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP1]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT-FTZ:       [[FDIV_FAST]]:
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call float @llvm.fma.f32(float [[X]], float 
[[TMP2]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP2]], 
float [[TMP4]], float [[TMP2]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT-FTZ:       [[FDIV_SLOW]]:
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = fdiv float 1.000000e+00, [[X]], 
!amdgpu.no.rcp.transform [[META0:![0-9]+]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL]]
+; OPT-FTZ:       [[FDIV_TAIL]]:
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = phi float [ [[TMP5]], %[[FDIV_FAST]] ], [ 
[[TMP6]], %[[FDIV_SLOW]] ]
+; OPT-FTZ-NEXT:    store float [[TMP7]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+; GFX9-LABEL: test_fdiv_recip_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x10b
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, vcc
+; GFX9-NEXT:    s_cbranch_vccz .LBB0_2
+; GFX9-NEXT:  ; %bb.1: ; %fdiv.slow
+; GFX9-NEXT:    v_div_scale_f32 v0, s[0:1], s2, s2, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v1, vcc, 1.0, s2, 1.0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX9-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
+; GFX9-NEXT:    v_fma_f32 v2, v3, v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_fma_f32 v4, -v0, v3, v1
+; GFX9-NEXT:    v_fma_f32 v3, v4, v2, v3
+; GFX9-NEXT:    v_fma_f32 v0, -v0, v3, v1
+; GFX9-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
+; GFX9-NEXT:    v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX9-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-NEXT:    s_branch .LBB0_4
+; GFX9-NEXT:  .LBB0_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr0
+; GFX9-NEXT:  .LBB0_3: ; %fdiv.fast
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x2f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |s2|, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, v2, -1.0
+; GFX9-NEXT:    v_fma_f32 v1, v2, -v1, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:  .LBB0_4: ; %fdiv.tail
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %fdiv = fdiv float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Negative reciprocal: -1.0/x should also be transformed (fneg + rcp).
+define amdgpu_kernel void @test_fdiv_neg_recip_f32(ptr addrspace(1) %out, 
float %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = fneg float [[X]]
+; OPT-NEXT:    [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP1]], i32 
267)
+; OPT-NEXT:    br i1 [[TMP2]], label %[[FDIV_FAST:.*]], label %[[FDIV_SLOW:.*]]
+; OPT:       [[FDIV_FAST]]:
+; OPT-NEXT:    [[TMP3:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
+; OPT-NEXT:    [[TMP4:%.*]] = fcmp ogt float [[TMP3]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP6:%.*]] = fmul float [[TMP1]], [[TMP5]]
+; OPT-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP6]])
+; OPT-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float 
[[TMP7]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP9:%.*]] = fneg float [[TMP8]]
+; OPT-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float 
[[TMP9]], float [[TMP7]])
+; OPT-NEXT:    [[TMP11:%.*]] = fmul float [[TMP10]], [[TMP5]]
+; OPT-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT:       [[FDIV_SLOW]]:
+; OPT-NEXT:    [[TMP12:%.*]] = fdiv float 1.000000e+00, [[TMP1]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL]]
+; OPT:       [[FDIV_TAIL]]:
+; OPT-NEXT:    [[TMP13:%.*]] = phi float [ [[TMP11]], %[[FDIV_FAST]] ], [ 
[[TMP12]], %[[FDIV_SLOW]] ]
+; OPT-NEXT:    store float [[TMP13]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = fneg float [[X]]
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP1]], 
i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP2]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT-FTZ:       [[FDIV_FAST]]:
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], 
float [[TMP3]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = fneg float [[TMP4]]
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], 
float [[TMP5]], float [[TMP3]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT-FTZ:       [[FDIV_SLOW]]:
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = fdiv float 1.000000e+00, [[TMP1]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL]]
+; OPT-FTZ:       [[FDIV_TAIL]]:
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = phi float [ [[TMP6]], %[[FDIV_FAST]] ], [ 
[[TMP7]], %[[FDIV_SLOW]] ]
+; OPT-FTZ-NEXT:    store float [[TMP8]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+; GFX9-LABEL: test_fdiv_neg_recip_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x10b
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], -s2, v0
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB1_2
+; GFX9-NEXT:  ; %bb.1: ; %fdiv.slow
+; GFX9-NEXT:    v_div_scale_f32 v0, s[0:1], s2, s2, -1.0
+; GFX9-NEXT:    v_div_scale_f32 v1, vcc, -1.0, s2, -1.0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX9-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
+; GFX9-NEXT:    v_fma_f32 v2, v3, v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_fma_f32 v4, -v0, v3, v1
+; GFX9-NEXT:    v_fma_f32 v3, v4, v2, v3
+; GFX9-NEXT:    v_fma_f32 v0, -v0, v3, v1
+; GFX9-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
+; GFX9-NEXT:    v_div_fixup_f32 v0, v0, s2, -1.0
+; GFX9-NEXT:    s_cbranch_execz .LBB1_3
+; GFX9-NEXT:    s_branch .LBB1_4
+; GFX9-NEXT:  .LBB1_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr0
+; GFX9-NEXT:  .LBB1_3: ; %fdiv.fast
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x2f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |s2|, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e64 v1, -s2, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, v2, -1.0
+; GFX9-NEXT:    v_fma_f32 v1, v2, -v1, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:  .LBB1_4: ; %fdiv.tail
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %fdiv = fdiv float -1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Non-reciprocal division should not be transformed.
+define amdgpu_kernel void @test_fdiv_non_recip_f32(ptr addrspace(1) %out, 
float %x, float %y) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_non_recip_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]]) 
#[[ATTR0]] {
+; OPT-NEXT:    [[FDIV:%.*]] = fdiv float [[Y]], [[X]]
+; OPT-NEXT:    store float [[FDIV]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_non_recip_f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float 
[[Y:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[FDIV:%.*]] = fdiv float [[Y]], [[X]]
+; OPT-FTZ-NEXT:    store float [[FDIV]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+; GFX9-LABEL: test_fdiv_non_recip_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_div_scale_f32 v1, s[4:5], s2, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_div_scale_f32 v2, vcc, s3, v2, s3
+; GFX9-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX9-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX9-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX9-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX9-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX9-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX9-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_div_fixup_f32 v0, v1, s2, v0
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %fdiv = fdiv float %y, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Known-normal denominator (IEEE mode): NR inline with scaling, no branch.
+; Known-normal denominator (FTZ mode): NR inline, no scaling, no branch.
+define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(ptr addrspace(1) 
%out, float nofpclass(nan inf zero sub) %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan inf zero sub) 
[[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT:    [[TMP2:%.*]] = fcmp ogt float [[TMP1]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP4:%.*]] = fmul float [[X]], [[TMP3]]
+; OPT-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP4]])
+; OPT-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float 
[[TMP5]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP7:%.*]] = fneg float [[TMP6]]
+; OPT-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float 
[[TMP7]], float [[TMP5]])
+; OPT-NEXT:    [[TMP9:%.*]] = fmul float [[TMP8]], [[TMP3]]
+; OPT-NEXT:    store float [[TMP9]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan inf zero 
sub) [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call float @llvm.fma.f32(float [[X]], float 
[[TMP1]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = fneg float [[TMP2]]
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], 
float [[TMP3]], float [[TMP1]])
+; OPT-FTZ-NEXT:    store float [[TMP4]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+; GFX9-LABEL: test_fdiv_recip_f32_known_normal:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x2f800000
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_fma_f32 v1, v1, v2, -1.0
+; GFX9-NEXT:    v_fma_f32 v1, v2, -v1, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %fdiv = fdiv float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
+; Vector reciprocal: <2 x float> should be scalarized and each lane 
transformed.
+define amdgpu_kernel void @test_fdiv_recip_v2f32(ptr addrspace(1) %out, <2 x 
float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT:    [[TMP13:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP1]], 
i32 267)
+; OPT-NEXT:    br i1 [[TMP13]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT:       [[FDIV_FAST]]:
+; OPT-NEXT:    [[TMP14:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
+; OPT-NEXT:    [[TMP15:%.*]] = fcmp ogt float [[TMP14]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP27:%.*]] = select i1 [[TMP15]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP28:%.*]] = fmul float [[TMP1]], [[TMP27]]
+; OPT-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP28]])
+; OPT-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float 
[[TMP7]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP9:%.*]] = fneg float [[TMP8]]
+; OPT-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float 
[[TMP9]], float [[TMP7]])
+; OPT-NEXT:    [[TMP11:%.*]] = fmul float [[TMP10]], [[TMP27]]
+; OPT-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT:       [[FDIV_SLOW]]:
+; OPT-NEXT:    [[TMP26:%.*]] = fdiv float 1.000000e+00, [[TMP1]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL]]
+; OPT:       [[FDIV_TAIL]]:
+; OPT-NEXT:    [[TMP29:%.*]] = phi float [ [[TMP11]], %[[FDIV_FAST]] ], [ 
[[TMP26]], %[[FDIV_SLOW]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP4]], 
i32 267)
+; OPT-NEXT:    br i1 [[TMP16]], label %[[FDIV_FAST2:.*]], label 
%[[FDIV_SLOW3:.*]]
+; OPT:       [[FDIV_FAST2]]:
+; OPT-NEXT:    [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP4]])
+; OPT-NEXT:    [[TMP18:%.*]] = fcmp ogt float [[TMP17]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP20:%.*]] = fmul float [[TMP4]], [[TMP19]]
+; OPT-NEXT:    [[TMP21:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]])
+; OPT-NEXT:    [[TMP22:%.*]] = call float @llvm.fma.f32(float [[TMP20]], float 
[[TMP21]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP23:%.*]] = fneg float [[TMP22]]
+; OPT-NEXT:    [[TMP24:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float 
[[TMP23]], float [[TMP21]])
+; OPT-NEXT:    [[TMP25:%.*]] = fmul float [[TMP24]], [[TMP19]]
+; OPT-NEXT:    br label %[[FDIV_TAIL1:.*]]
+; OPT:       [[FDIV_SLOW3]]:
+; OPT-NEXT:    [[TMP30:%.*]] = fdiv float 1.000000e+00, [[TMP4]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL1]]
+; OPT:       [[FDIV_TAIL1]]:
+; OPT-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP25]], %[[FDIV_FAST2]] ], [ 
[[TMP30]], %[[FDIV_SLOW3]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float 
[[TMP29]], i64 0
+; OPT-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float 
[[TMP5]], i64 1
+; OPT-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) 
#[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT:    [[TMP13:%.*]] = call i1 @llvm.is.fpclass.f32(float 
[[TMP1]], i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP13]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT-FTZ:       [[FDIV_FAST]]:
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP1]], 
float [[TMP7]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP9:%.*]] = fneg float [[TMP8]]
+; OPT-FTZ-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], 
float [[TMP9]], float [[TMP7]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT-FTZ:       [[FDIV_SLOW]]:
+; OPT-FTZ-NEXT:    [[TMP11:%.*]] = fdiv float 1.000000e+00, [[TMP1]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL]]
+; OPT-FTZ:       [[FDIV_TAIL]]:
+; OPT-FTZ-NEXT:    [[TMP12:%.*]] = phi float [ [[TMP10]], %[[FDIV_FAST]] ], [ 
[[TMP11]], %[[FDIV_SLOW]] ]
+; OPT-FTZ-NEXT:    [[TMP16:%.*]] = call i1 @llvm.is.fpclass.f32(float 
[[TMP4]], i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP16]], label %[[FDIV_FAST2:.*]], label 
%[[FDIV_SLOW3:.*]]
+; OPT-FTZ:       [[FDIV_FAST2]]:
+; OPT-FTZ-NEXT:    [[TMP21:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP4]])
+; OPT-FTZ-NEXT:    [[TMP22:%.*]] = call float @llvm.fma.f32(float [[TMP4]], 
float [[TMP21]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP23:%.*]] = fneg float [[TMP22]]
+; OPT-FTZ-NEXT:    [[TMP24:%.*]] = call float @llvm.fma.f32(float [[TMP21]], 
float [[TMP23]], float [[TMP21]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL1:.*]]
+; OPT-FTZ:       [[FDIV_SLOW3]]:
+; OPT-FTZ-NEXT:    [[TMP15:%.*]] = fdiv float 1.000000e+00, [[TMP4]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL1]]
+; OPT-FTZ:       [[FDIV_TAIL1]]:
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP24]], %[[FDIV_FAST2]] ], [ 
[[TMP15]], %[[FDIV_SLOW3]] ]
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float 
[[TMP12]], i64 0
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float 
[[TMP5]], i64 1
+; OPT-FTZ-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[OUT]], align 
8
+; OPT-FTZ-NEXT:    ret void
+;
+; GFX9-LABEL: test_fdiv_recip_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x10b
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, vcc
+; GFX9-NEXT:    s_cbranch_vccz .LBB4_2
+; GFX9-NEXT:  ; %bb.1: ; %fdiv.slow
+; GFX9-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v1, vcc, 1.0, s0, 1.0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX9-NEXT:    v_fma_f32 v3, -v0, v2, 1.0
+; GFX9-NEXT:    v_fma_f32 v2, v3, v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_fma_f32 v4, -v0, v3, v1
+; GFX9-NEXT:    v_fma_f32 v3, v4, v2, v3
+; GFX9-NEXT:    v_fma_f32 v0, -v0, v3, v1
+; GFX9-NEXT:    v_div_fmas_f32 v0, v0, v2, v3
+; GFX9-NEXT:    v_div_fixup_f32 v0, v0, s0, 1.0
+; GFX9-NEXT:    s_cbranch_execz .LBB4_3
+; GFX9-NEXT:    s_branch .LBB4_4
+; GFX9-NEXT:  .LBB4_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr0
+; GFX9-NEXT:  .LBB4_3: ; %fdiv.fast
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e800000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x2f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |s0|, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, v2, -1.0
+; GFX9-NEXT:    v_fma_f32 v1, v2, -v1, v2
+; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:  .LBB4_4: ; %fdiv.tail
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x10b
+; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, s1, v1
+; GFX9-NEXT:    s_andn2_b64 vcc, exec, vcc
+; GFX9-NEXT:    s_cbranch_vccz .LBB4_6
+; GFX9-NEXT:  ; %bb.5: ; %fdiv.slow3
+; GFX9-NEXT:    v_div_scale_f32 v1, s[2:3], s1, s1, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v2, vcc, 1.0, s1, 1.0
+; GFX9-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX9-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX9-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX9-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX9-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX9-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX9-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX9-NEXT:    v_div_fixup_f32 v1, v1, s1, 1.0
+; GFX9-NEXT:    s_cbranch_execz .LBB4_7
+; GFX9-NEXT:    s_branch .LBB4_8
+; GFX9-NEXT:  .LBB4_6:
+; GFX9-NEXT:  .LBB4_7: ; %fdiv.fast2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e800000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |s1|, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v2, s1, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX9-NEXT:    v_fma_f32 v2, v2, v3, -1.0
+; GFX9-NEXT:    v_fma_f32 v2, v3, -v2, v3
+; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX9-NEXT:  .LBB4_8: ; %fdiv.tail1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+
+; Negative vector reciprocal: <-1.0, -1.0>/x should be scalarized and each
+; lane transformed.
+;
+; Expected IR shape, per lane: the extracted element is negated (fneg) and the
+; division is expanded as 1.0 / (-x). An llvm.is.fpclass test with mask 267
+; (0x10b: sNaN | qNaN | -normal | +normal in the LangRef bit layout) branches
+; to either
+;   * fdiv.fast: rcp followed by two fma instructions (one refinement step), or
+;   * fdiv.slow: the plain fdiv, tagged with !amdgpu.no.rcp.transform.
+; The two results are merged by a phi in fdiv.tail.
+;
+; In the OPT checks the fast path additionally multiplies the input by
+; 0x3DF0000000000000 (2^-32) when |x| > 0x47D0000000000000 (2^126) and
+; multiplies the refined result by the same factor; the OPT-FTZ checks contain
+; no such scaling (presumably the flush-denormals RUN configuration -- confirm
+; against the RUN lines at the top of the file).
+;
+; NOTE(review): no GFX9 assertions are present for this function, unlike
+; test_fdiv_recip_f64 below -- confirm whether that is intentional or whether
+; the llc checks need to be regenerated.
+define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(ptr addrspace(1) %out, <2 
x float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT:    [[TMP3:%.*]] = fneg float [[TMP1]]
+; OPT-NEXT:    [[TMP4:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP3]], i32 
267)
+; OPT-NEXT:    br i1 [[TMP4]], label %[[FDIV_FAST:.*]], label %[[FDIV_SLOW:.*]]
+; OPT:       [[FDIV_FAST]]:
+; OPT-NEXT:    [[TMP5:%.*]] = call float @llvm.fabs.f32(float [[TMP3]])
+; OPT-NEXT:    [[TMP6:%.*]] = fcmp ogt float [[TMP5]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP8:%.*]] = fmul float [[TMP3]], [[TMP7]]
+; OPT-NEXT:    [[TMP9:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]])
+; OPT-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP8]], float 
[[TMP9]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP11:%.*]] = fneg float [[TMP10]]
+; OPT-NEXT:    [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float 
[[TMP11]], float [[TMP9]])
+; OPT-NEXT:    [[TMP13:%.*]] = fmul float [[TMP12]], [[TMP7]]
+; OPT-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT:       [[FDIV_SLOW]]:
+; OPT-NEXT:    [[TMP14:%.*]] = fdiv float 1.000000e+00, [[TMP3]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL]]
+; OPT:       [[FDIV_TAIL]]:
+; OPT-NEXT:    [[TMP15:%.*]] = phi float [ [[TMP13]], %[[FDIV_FAST]] ], [ 
[[TMP14]], %[[FDIV_SLOW]] ]
+; OPT-NEXT:    [[TMP16:%.*]] = fneg float [[TMP2]]
+; OPT-NEXT:    [[TMP17:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP16]], 
i32 267)
+; OPT-NEXT:    br i1 [[TMP17]], label %[[FDIV_FAST2:.*]], label 
%[[FDIV_SLOW3:.*]]
+; OPT:       [[FDIV_FAST2]]:
+; OPT-NEXT:    [[TMP18:%.*]] = call float @llvm.fabs.f32(float [[TMP16]])
+; OPT-NEXT:    [[TMP19:%.*]] = fcmp ogt float [[TMP18]], 0x47D0000000000000
+; OPT-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float 0x3DF0000000000000, 
float 1.000000e+00
+; OPT-NEXT:    [[TMP21:%.*]] = fmul float [[TMP16]], [[TMP20]]
+; OPT-NEXT:    [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP21]])
+; OPT-NEXT:    [[TMP23:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float 
[[TMP22]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP24:%.*]] = fneg float [[TMP23]]
+; OPT-NEXT:    [[TMP25:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float 
[[TMP24]], float [[TMP22]])
+; OPT-NEXT:    [[TMP26:%.*]] = fmul float [[TMP25]], [[TMP20]]
+; OPT-NEXT:    br label %[[FDIV_TAIL1:.*]]
+; OPT:       [[FDIV_SLOW3]]:
+; OPT-NEXT:    [[TMP27:%.*]] = fdiv float 1.000000e+00, [[TMP16]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL1]]
+; OPT:       [[FDIV_TAIL1]]:
+; OPT-NEXT:    [[TMP28:%.*]] = phi float [ [[TMP26]], %[[FDIV_FAST2]] ], [ 
[[TMP27]], %[[FDIV_SLOW3]] ]
+; OPT-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> poison, float 
[[TMP15]], i64 0
+; OPT-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float 
[[TMP28]], i64 1
+; OPT-NEXT:    store <2 x float> [[TMP30]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) 
#[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = fneg float [[TMP1]]
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP3]], 
i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP4]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT-FTZ:       [[FDIV_FAST]]:
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP3]])
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], 
float [[TMP5]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = fneg float [[TMP6]]
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], 
float [[TMP7]], float [[TMP5]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT-FTZ:       [[FDIV_SLOW]]:
+; OPT-FTZ-NEXT:    [[TMP9:%.*]] = fdiv float 1.000000e+00, [[TMP3]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL]]
+; OPT-FTZ:       [[FDIV_TAIL]]:
+; OPT-FTZ-NEXT:    [[TMP10:%.*]] = phi float [ [[TMP8]], %[[FDIV_FAST]] ], [ 
[[TMP9]], %[[FDIV_SLOW]] ]
+; OPT-FTZ-NEXT:    [[TMP11:%.*]] = fneg float [[TMP2]]
+; OPT-FTZ-NEXT:    [[TMP12:%.*]] = call i1 @llvm.is.fpclass.f32(float 
[[TMP11]], i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP12]], label %[[FDIV_FAST2:.*]], label 
%[[FDIV_SLOW3:.*]]
+; OPT-FTZ:       [[FDIV_FAST2]]:
+; OPT-FTZ-NEXT:    [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP11]])
+; OPT-FTZ-NEXT:    [[TMP14:%.*]] = call float @llvm.fma.f32(float [[TMP11]], 
float [[TMP13]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP15:%.*]] = fneg float [[TMP14]]
+; OPT-FTZ-NEXT:    [[TMP16:%.*]] = call float @llvm.fma.f32(float [[TMP13]], 
float [[TMP15]], float [[TMP13]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL1:.*]]
+; OPT-FTZ:       [[FDIV_SLOW3]]:
+; OPT-FTZ-NEXT:    [[TMP17:%.*]] = fdiv float 1.000000e+00, [[TMP11]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL1]]
+; OPT-FTZ:       [[FDIV_TAIL1]]:
+; OPT-FTZ-NEXT:    [[TMP18:%.*]] = phi float [ [[TMP16]], %[[FDIV_FAST2]] ], [ 
[[TMP17]], %[[FDIV_SLOW3]] ]
+; OPT-FTZ-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> poison, float 
[[TMP10]], i64 0
+; OPT-FTZ-NEXT:    [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float 
[[TMP18]], i64 1
+; OPT-FTZ-NEXT:    store <2 x float> [[TMP20]], ptr addrspace(1) [[OUT]], 
align 8
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv <2 x float> <float -1.0, float -1.0>, %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; f64 should not be transformed (only f32).
+;
+; Both OPT and OPT-FTZ expect the `fdiv double` to be left unchanged: no
+; llvm.is.fpclass branch, no fast/slow blocks and no !amdgpu.no.rcp.transform
+; metadata. GFX9 expects the straight-line f64 division sequence
+; (v_div_scale_f64 / v_rcp_f64 / v_fma_f64 / v_div_fmas_f64 / v_div_fixup_f64)
+; with no branches.
+define amdgpu_kernel void @test_fdiv_recip_f64(ptr addrspace(1) %out, double 
%x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f64(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], double [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[FDIV:%.*]] = fdiv double 1.000000e+00, [[X]]
+; OPT-NEXT:    store double [[FDIV]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f64(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], double [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[FDIV:%.*]] = fdiv double 1.000000e+00, [[X]]
+; OPT-FTZ-NEXT:    store double [[FDIV]], ptr addrspace(1) [[OUT]], align 8
+; OPT-FTZ-NEXT:    ret void
+;
+; GFX9-LABEL: test_fdiv_recip_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0
+; GFX9-NEXT:    v_div_scale_f64 v[6:7], vcc, 1.0, s[2:3], 1.0
+; GFX9-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3]
+; GFX9-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], v[6:7]
+; GFX9-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %fdiv = fdiv double 1.0, %x
+  store double %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; Verify that fast-math flags on the original fdiv are preserved on
+; the instructions emitted by the NR expansion.
+;
+; The input fdiv carries `nnan ninf`. The checks expect those flags on every
+; fast-path instruction (fabs, fcmp, select, fmul, rcp, fma, fneg), on the
+; slow-path fdiv (which also gets !amdgpu.no.rcp.transform) and on the merging
+; phi. The llvm.is.fpclass guard itself is expected without flags.
+;
+; NOTE(review): no GFX9 assertions are present for this function, unlike
+; test_fdiv_recip_f64 above -- confirm whether that is intentional or whether
+; the llc checks need to be regenerated.
+define amdgpu_kernel void @test_fdiv_recip_f32_fmf(ptr addrspace(1) %out, 
float %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_fmf(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], i32 
267)
+; OPT-NEXT:    br i1 [[TMP1]], label %[[FDIV_FAST:.*]], label %[[FDIV_SLOW:.*]]
+; OPT:       [[FDIV_FAST]]:
+; OPT-NEXT:    [[TMP2:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT:    [[TMP3:%.*]] = fcmp nnan ninf ogt float [[TMP2]], 
0x47D0000000000000
+; OPT-NEXT:    [[TMP4:%.*]] = select nnan ninf i1 [[TMP3]], float 
0x3DF0000000000000, float 1.000000e+00
+; OPT-NEXT:    [[TMP5:%.*]] = fmul nnan ninf float [[X]], [[TMP4]]
+; OPT-NEXT:    [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float 
[[TMP5]])
+; OPT-NEXT:    [[TMP7:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP5]], float [[TMP6]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP8:%.*]] = fneg nnan ninf float [[TMP7]]
+; OPT-NEXT:    [[TMP9:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP6]], float [[TMP8]], float [[TMP6]])
+; OPT-NEXT:    [[TMP10:%.*]] = fmul nnan ninf float [[TMP9]], [[TMP4]]
+; OPT-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT:       [[FDIV_SLOW]]:
+; OPT-NEXT:    [[TMP11:%.*]] = fdiv nnan ninf float 1.000000e+00, [[X]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL]]
+; OPT:       [[FDIV_TAIL]]:
+; OPT-NEXT:    [[TMP12:%.*]] = phi nnan ninf float [ [[TMP10]], %[[FDIV_FAST]] 
], [ [[TMP11]], %[[FDIV_SLOW]] ]
+; OPT-NEXT:    store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_fmf(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f32(float [[X]], 
i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP1]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT-FTZ:       [[FDIV_FAST]]:
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call nnan ninf float 
@llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[X]], float [[TMP2]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = fneg nnan ninf float [[TMP3]]
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP2]], float [[TMP4]], float [[TMP2]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT-FTZ:       [[FDIV_SLOW]]:
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = fdiv nnan ninf float 1.000000e+00, [[X]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL]]
+; OPT-FTZ:       [[FDIV_TAIL]]:
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = phi nnan ninf float [ [[TMP5]], 
%[[FDIV_FAST]] ], [ [[TMP6]], %[[FDIV_SLOW]] ]
+; OPT-FTZ-NEXT:    store float [[TMP7]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv nnan ninf float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Verify that fast-math flags are preserved through vector scalarization.
+;
+; The input is a <2 x float> fdiv of splat(1.0) by %x carrying `nnan ninf`.
+; The checks expect each lane to be extracted and expanded separately, with
+; the flags present on every fast-path instruction, on the per-lane slow-path
+; fdiv (tagged !amdgpu.no.rcp.transform) and on each merging phi. The
+; extractelement / insertelement instructions and the llvm.is.fpclass guards
+; are expected without flags.
+;
+; NOTE(review): no GFX9 assertions are present for this function, unlike
+; test_fdiv_recip_f64 above -- confirm whether that is intentional or whether
+; the llc checks need to be regenerated.
+define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(ptr addrspace(1) %out, <2 
x float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT:    [[TMP3:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP1]], i32 
267)
+; OPT-NEXT:    br i1 [[TMP3]], label %[[FDIV_FAST:.*]], label %[[FDIV_SLOW:.*]]
+; OPT:       [[FDIV_FAST]]:
+; OPT-NEXT:    [[TMP4:%.*]] = call nnan ninf float @llvm.fabs.f32(float 
[[TMP1]])
+; OPT-NEXT:    [[TMP5:%.*]] = fcmp nnan ninf ogt float [[TMP4]], 
0x47D0000000000000
+; OPT-NEXT:    [[TMP6:%.*]] = select nnan ninf i1 [[TMP5]], float 
0x3DF0000000000000, float 1.000000e+00
+; OPT-NEXT:    [[TMP7:%.*]] = fmul nnan ninf float [[TMP1]], [[TMP6]]
+; OPT-NEXT:    [[TMP8:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float 
[[TMP7]])
+; OPT-NEXT:    [[TMP9:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP7]], float [[TMP8]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP10:%.*]] = fneg nnan ninf float [[TMP9]]
+; OPT-NEXT:    [[TMP11:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP8]], float [[TMP10]], float [[TMP8]])
+; OPT-NEXT:    [[TMP12:%.*]] = fmul nnan ninf float [[TMP11]], [[TMP6]]
+; OPT-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT:       [[FDIV_SLOW]]:
+; OPT-NEXT:    [[TMP13:%.*]] = fdiv nnan ninf float 1.000000e+00, [[TMP1]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL]]
+; OPT:       [[FDIV_TAIL]]:
+; OPT-NEXT:    [[TMP14:%.*]] = phi nnan ninf float [ [[TMP12]], %[[FDIV_FAST]] 
], [ [[TMP13]], %[[FDIV_SLOW]] ]
+; OPT-NEXT:    [[TMP15:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP2]], 
i32 267)
+; OPT-NEXT:    br i1 [[TMP15]], label %[[FDIV_FAST2:.*]], label 
%[[FDIV_SLOW3:.*]]
+; OPT:       [[FDIV_FAST2]]:
+; OPT-NEXT:    [[TMP16:%.*]] = call nnan ninf float @llvm.fabs.f32(float 
[[TMP2]])
+; OPT-NEXT:    [[TMP17:%.*]] = fcmp nnan ninf ogt float [[TMP16]], 
0x47D0000000000000
+; OPT-NEXT:    [[TMP18:%.*]] = select nnan ninf i1 [[TMP17]], float 
0x3DF0000000000000, float 1.000000e+00
+; OPT-NEXT:    [[TMP19:%.*]] = fmul nnan ninf float [[TMP2]], [[TMP18]]
+; OPT-NEXT:    [[TMP20:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float 
[[TMP19]])
+; OPT-NEXT:    [[TMP21:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP19]], float [[TMP20]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP22:%.*]] = fneg nnan ninf float [[TMP21]]
+; OPT-NEXT:    [[TMP23:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP20]], float [[TMP22]], float [[TMP20]])
+; OPT-NEXT:    [[TMP24:%.*]] = fmul nnan ninf float [[TMP23]], [[TMP18]]
+; OPT-NEXT:    br label %[[FDIV_TAIL1:.*]]
+; OPT:       [[FDIV_SLOW3]]:
+; OPT-NEXT:    [[TMP25:%.*]] = fdiv nnan ninf float 1.000000e+00, [[TMP2]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-NEXT:    br label %[[FDIV_TAIL1]]
+; OPT:       [[FDIV_TAIL1]]:
+; OPT-NEXT:    [[TMP26:%.*]] = phi nnan ninf float [ [[TMP24]], 
%[[FDIV_FAST2]] ], [ [[TMP25]], %[[FDIV_SLOW3]] ]
+; OPT-NEXT:    [[TMP27:%.*]] = insertelement <2 x float> poison, float 
[[TMP14]], i64 0
+; OPT-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float 
[[TMP26]], i64 1
+; OPT-NEXT:    store <2 x float> [[TMP28]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) 
#[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call i1 @llvm.is.fpclass.f32(float [[TMP1]], 
i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP3]], label %[[FDIV_FAST:.*]], label 
%[[FDIV_SLOW:.*]]
+; OPT-FTZ:       [[FDIV_FAST]]:
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call nnan ninf float 
@llvm.amdgcn.rcp.f32(float [[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP1]], float [[TMP4]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = fneg nnan ninf float [[TMP5]]
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP4]], float [[TMP6]], float [[TMP4]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL:.*]]
+; OPT-FTZ:       [[FDIV_SLOW]]:
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = fdiv nnan ninf float 1.000000e+00, [[TMP1]], 
!amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL]]
+; OPT-FTZ:       [[FDIV_TAIL]]:
+; OPT-FTZ-NEXT:    [[TMP9:%.*]] = phi nnan ninf float [ [[TMP7]], 
%[[FDIV_FAST]] ], [ [[TMP8]], %[[FDIV_SLOW]] ]
+; OPT-FTZ-NEXT:    [[TMP10:%.*]] = call i1 @llvm.is.fpclass.f32(float 
[[TMP2]], i32 267)
+; OPT-FTZ-NEXT:    br i1 [[TMP10]], label %[[FDIV_FAST2:.*]], label 
%[[FDIV_SLOW3:.*]]
+; OPT-FTZ:       [[FDIV_FAST2]]:
+; OPT-FTZ-NEXT:    [[TMP11:%.*]] = call nnan ninf float 
@llvm.amdgcn.rcp.f32(float [[TMP2]])
+; OPT-FTZ-NEXT:    [[TMP12:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP2]], float [[TMP11]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP13:%.*]] = fneg nnan ninf float [[TMP12]]
+; OPT-FTZ-NEXT:    [[TMP14:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP11]], float [[TMP13]], float [[TMP11]])
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL1:.*]]
+; OPT-FTZ:       [[FDIV_SLOW3]]:
+; OPT-FTZ-NEXT:    [[TMP15:%.*]] = fdiv nnan ninf float 1.000000e+00, 
[[TMP2]], !amdgpu.no.rcp.transform [[META0]]
+; OPT-FTZ-NEXT:    br label %[[FDIV_TAIL1]]
+; OPT-FTZ:       [[FDIV_TAIL1]]:
+; OPT-FTZ-NEXT:    [[TMP16:%.*]] = phi nnan ninf float [ [[TMP14]], 
%[[FDIV_FAST2]] ], [ [[TMP15]], %[[FDIV_SLOW3]] ]
+; OPT-FTZ-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> poison, float 
[[TMP9]], i64 0
+; OPT-FTZ-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float 
[[TMP16]], i64 1
+; OPT-FTZ-NEXT:    store <2 x float> [[TMP18]], ptr addrspace(1) [[OUT]], 
align 8
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv nnan ninf <2 x float> splat (float 1.0), %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; Attribute group #0 (nounwind only), referenced by the kernel definitions above.
+attributes #0 = { nounwind }
+;.
+; OPT: [[META0]] = !{}
+;.
+; OPT-FTZ: [[META0]] = !{}
+;.

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [AMDGPU] Use 1-iteration Newton-Raphson refinement for FP32 reciprocal. (PR #194716)

Reply via email to