https://github.com/carlobertolli updated 
https://github.com/llvm/llvm-project/pull/194716

>From 63eb24ce7e4ba44c659e47c5b0bab6d40da8e751 Mon Sep 17 00:00:00 2001
From: Carlo Bertolli <[email protected]>
Date: Wed, 15 Apr 2026 23:39:37 +0000
Subject: [PATCH] [AMDGPU] Use 1-iteration Newton-Raphson refinement for FP32
 reciprocal

The AMDGPU backend lowers the following code

r = 1/x;

to an FP32 division, i.e. a 12-instruction code sequence that handles all
denominator cases (normal, denormal, inf, NaN).

This patch lowers reciprocals to a single iteration Newton-Raphson refinement.
All FP32 cases are supported: normal, NaN, subnormal, infinity, and 0.
Large normals, whose reciprocal is subnormal and flushed to 0 by v_rcp_f32,
are scaled to a small normal before using v_rcp_f32 and then scaled back.
Subnormals are scaled before and after v_rcp_f32, except when
denormals are flushed to zero.
The maximum error is 1 ULP, and it occurs for only 0.15% of all possible FP32
inputs.
The alternative solution in emitRcpIEEE1ULP is also accurate to 1 ULP, but
reaches that error for 10.76% of all FP32 values, making this implementation
more accurate.

When using this algorithm, we see ~7% performance improvements over full 
division for a kernel only executing reciprocals.

This behavior is hidden behind an LLVM flag that is off by default:
-amdgpu-enable-fp32-recip-newton-raphson
The clang driver passes it when -mamdgpu-fp32-recip-newton-raphson is given.

Assisted-by: Cursor (Claude)
---
 clang/include/clang/Options/Options.td        |   9 +
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |   5 +
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   8 +
 .../amdgpu-fp32-recip-newton-raphson.hip      |  30 ++
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 144 +++++++
 .../llvm.amdgcn.rcp.newtonraphson.f32.ll      | 370 ++++++++++++++++++
 6 files changed, 566 insertions(+)
 create mode 100644 clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip
 create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll

diff --git a/clang/include/clang/Options/Options.td 
b/clang/include/clang/Options/Options.td
index 6fc8806ba683c..5f6b7885d55b6 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -5831,6 +5831,15 @@ defm amdgpu_expand_waitcnt_profiling : 
BoolMOption<"amdgpu-expand-waitcnt-profil
   "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">,
   NegFlag<SetFalse, [], [ClangOption]>>;
 
+def mamdgpu_fp32_recip_newton_raphson : Flag<["-"], "mamdgpu-fp32-recip-newton-raphson">,
+  Group<m_Group>,
+  HelpText<"Use a single Newton-Raphson refinement step to compute FP32 "
+  "reciprocals (1.0f/x and -1.0f/x) instead of the full division sequence. "
+  "(AMDGPU only)">;
+def mno_amdgpu_fp32_recip_newton_raphson : Flag<["-"], "mno-amdgpu-fp32-recip-newton-raphson">,
+  Group<m_Group>,
+  HelpText<"Disable Newton-Raphson refinement for FP32 reciprocal (AMDGPU only)">;
+
 def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, 
Group<m_Group>,
   HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">,
   Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp 
b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 631eb0b98df7e..923fa38d4edeb 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -865,6 +865,11 @@ void AMDGPUToolChain::addClangTargetOptions(
       !DriverArgs.hasArg(options::OPT_disable_llvm_optzns))
     CC1Args.push_back("-disable-llvm-optzns");
 
+  if (DriverArgs.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson,
+                         options::OPT_mno_amdgpu_fp32_recip_newton_raphson,
+                         false))
+    CC1Args.append({"-mllvm", "-amdgpu-enable-fp32-recip-newton-raphson"});
+
   if (DeviceOffloadingKind == Action::OFK_None)
     addOpenCLBuiltinsLib(getDriver(), getTriple(), DriverArgs, CC1Args);
 }
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp 
b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 9211803aa6a2f..4bb11021a1fa7 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -67,6 +67,9 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, 
const JobAction &JA,
                         "-plugin-opt=-amdgpu-internalize-symbols"};
   if (Args.hasArg(options::OPT_hipstdpar))
     LldArgs.push_back("-plugin-opt=-amdgpu-enable-hipstdpar");
+  if (Args.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson,
+                   options::OPT_mno_amdgpu_fp32_recip_newton_raphson, false))
+    LldArgs.push_back("-plugin-opt=-amdgpu-enable-fp32-recip-newton-raphson");
 
   auto &TC = getToolChain();
   auto &D = TC.getDriver();
@@ -249,6 +252,11 @@ void HIPAMDToolChain::addClangTargetOptions(
       CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"});
   }
 
+  if (DriverArgs.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson,
+                         options::OPT_mno_amdgpu_fp32_recip_newton_raphson,
+                         false))
+    CC1Args.append({"-mllvm", "-amdgpu-enable-fp32-recip-newton-raphson"});
+
   StringRef MaxThreadsPerBlock =
       DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
   if (!MaxThreadsPerBlock.empty()) {
diff --git a/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip 
b/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip
new file mode 100644
index 0000000000000..d2af25ae760fe
--- /dev/null
+++ b/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip
@@ -0,0 +1,30 @@
+// REQUIRES: amdgpu-registered-target
+
+// Check that -mamdgpu-fp32-recip-newton-raphson passes -mllvm flag to device 
cc1.
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 -mamdgpu-fp32-recip-newton-raphson %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=ON %s
+
+// ON: "-cc1" {{.*}} "-fcuda-is-device" {{.*}} "-mllvm" 
"-amdgpu-enable-fp32-recip-newton-raphson"
+
+// Check that -mno-amdgpu-fp32-recip-newton-raphson does not pass the flag.
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 -mno-amdgpu-fp32-recip-newton-raphson %s 2>&1 
\
+// RUN:   | FileCheck -check-prefix=OFF %s
+
+// OFF-NOT: "-amdgpu-enable-fp32-recip-newton-raphson"
+
+// Check that -mno overrides -m (last flag wins).
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 -mamdgpu-fp32-recip-newton-raphson \
+// RUN:   -mno-amdgpu-fp32-recip-newton-raphson %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=OFF %s
+
+// Check default (off).
+
+// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=OFF %s
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index ffc4b484de072..ae046f77cf169 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -94,6 +94,12 @@ static cl::opt<bool> DisableFDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
+static cl::opt<bool> EnableFP32ReciprocalNewtonRaphson(
+    "amdgpu-enable-fp32-recip-newton-raphson", cl::Hidden, cl::init(false),
+    cl::desc("Expand f32 1.0/x and -1.0/x to rcp plus one Newton-Raphson "
+             "refinement step and a div_fixup instead of the full division "
+             "sequence."));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -248,6 +254,16 @@ class AMDGPUCodeGenPrepareImpl
 
   bool tryNarrowMathIfNoOverflow(Instruction *I);
 
+  // Expand an f32 fdiv whose numerator is the constant 1.0 or -1.0 into
+  // amdgcn.rcp plus a single Newton-Raphson (NR) refinement iteration,
+  // followed by amdgcn.div_fixup, instead of a full division. The emitted
+  // code is branchless; div_fixup supplies the result for zero, inf and
+  // NaN denominators. Unless f32 denormals are flushed, the denominator is
+  // scaled before rcp and the result scaled back. Vector fdivs are
+  // scalarized first. Returns true if FDiv was replaced and erased; does
+  // nothing unless -amdgpu-enable-fp32-recip-newton-raphson is set.
+  bool expandReciprocalNewtonRaphson(BinaryOperator &FDiv);
+
 public:
   bool visitFDiv(BinaryOperator &I);
 
@@ -855,6 +871,131 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
   return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
 }
 
+bool AMDGPUCodeGenPrepareImpl::expandReciprocalNewtonRaphson(
+    BinaryOperator &FDiv) {
+  if (!EnableFP32ReciprocalNewtonRaphson)
+    return false;
+
+  Type *Ty = FDiv.getType();
+  if (Ty->getScalarType() != Type::getFloatTy(FDiv.getContext()))
+    return false;
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  const APFloat *NumVal;
+  if (!match(Num, m_APFloat(NumVal)) ||
+      (!NumVal->isExactlyValue(1.0) && !NumVal->isExactlyValue(-1.0)))
+    return false;
+
+  bool IsNegative = NumVal->isExactlyValue(-1.0);
+
+  // Skip fdivs carrying this marker. NOTE(review): no visible code sets it.
+  if (FDiv.getMetadata("amdgpu.no.rcp.transform"))
+    return false;
+
+  const FastMathFlags FMF = FDiv.getFastMathFlags();
+  const DebugLoc DL = FDiv.getDebugLoc();
+
+  // For vector types, scalarize into per-element scalar fdivs, then expand
+  // each one. The pass iterates in reverse so newly created instructions
+  // before the current one would be missed; expand them explicitly here.
+  if (Ty->isVectorTy()) {
+    IRBuilder<> Builder(&FDiv);
+    Builder.setFastMathFlags(FMF);
+    Builder.SetCurrentDebugLocation(DL);
+    SmallVector<Value *, 4> NumVals, DenVals;
+    extractValues(Builder, NumVals, Num);
+    extractValues(Builder, DenVals, Den);
+
+    SmallVector<Value *, 4> ResultVals(NumVals.size());
+    SmallVector<BinaryOperator *, 4> ScalarDivs;
+    for (int I = 0, E = NumVals.size(); I != E; ++I) {
+      Value *EltDiv = Builder.CreateFDiv(NumVals[I], DenVals[I]);
+      ScalarDivs.push_back(cast<BinaryOperator>(EltDiv));
+      ResultVals[I] = EltDiv;
+    }
+
+    Value *Result = insertValues(Builder, Ty, ResultVals);
+    FDiv.replaceAllUsesWith(Result);
+    FDiv.eraseFromParent();
+    for (BinaryOperator *SD : ScalarDivs)
+      expandReciprocalNewtonRaphson(*SD);
+    return true;
+  }
+
+  // -1.0 / x -> 1.0 / (fneg x)
+  // Negate the denominator so the NR path computes rcp(-x) = -1/x directly.
+  IRBuilder<> Builder(&FDiv);
+  Builder.setFastMathFlags(FMF);
+  Builder.SetCurrentDebugLocation(DL);
+  if (IsNegative)
+    Den = Builder.CreateFNeg(Den);
+
+  // 2^126: largest FP32 magnitude whose reciprocal (2^-126) is still normal.
+  constexpr float MaxNormalWithNormalRcp = 0x1.0p126f;
+  // Scale factor for large normals whose reciprocal would be subnormal.
+  constexpr float LargeNormalScaleFactor = 0x1.0p-32f;
+  // Smallest positive normal FP32 value (FLT_MIN).
+  constexpr float SmallestNormal = 0x1.0p-126f;
+  // Scale factor for subnormals to bring them into normal range before rcp.
+  constexpr float SubnormalScaleFactor = 0x1.0p32f;
+
+  // Code generation scheme (branchless):
+  //   d = x * s; y = rcp(d)        [s only in IEEE denormal mode, else d = x]
+  //   NR fixup: y = y - y * (d*y - 1)
+  //   y = div_fixup(y * s, x, 1.0) [y * s only in IEEE denormal mode]
+  //
+  // The NR step improves accuracy for finite nonzero values; for zero, inf and
+  // NaN inputs div_fixup replaces the NR result with the correct special value.
+  // NOTE(review): the fdiv's fast-math flags are also set on the intermediate
+  // ops - confirm nnan/ninf are safe on NR values div_fixup must discard.
+  //
+  // In IEEE denormal mode the denominator is scaled before rcp and the result
+  // is scaled after NR, to handle subnormals and large normals whose reciprocal
+  // would be subnormal. In FTZ mode no scaling is needed.
+
+  // Emit NR fixup: given the rcp approximation Y0 and the (possibly scaled)
+  // denominator DForRcp, compute one NR iteration.
+  auto EmitNRFixup = [&](IRBuilder<> &B, Value *Y0, Value *DForRcp) -> Value * 
{
+    Value *Err = B.CreateIntrinsic(Intrinsic::fma, {Ty},
+                                   {DForRcp, Y0, ConstantFP::get(Ty, -1.0)});
+    Value *NegErr = B.CreateFNeg(Err);
+    return B.CreateIntrinsic(Intrinsic::fma, {Ty}, {Y0, NegErr, Y0});
+  };
+
+  Value *DForRcp = Den;
+  Value *Scale = nullptr;
+  if (!HasFP32DenormalFlush) {
+    Value *AbsD = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Den);
+    Value *IsLarge =
+        Builder.CreateFCmpOGT(AbsD, ConstantFP::get(Ty, 
MaxNormalWithNormalRcp));
+    Scale =
+        Builder.CreateSelect(IsLarge, ConstantFP::get(Ty, 
LargeNormalScaleFactor),
+                             ConstantFP::get(Ty, 1.0));
+    Value *IsSubnormal =
+        Builder.CreateFCmpOLT(AbsD, ConstantFP::get(Ty, SmallestNormal));
+    Scale = Builder.CreateSelect(IsSubnormal,
+                                 ConstantFP::get(Ty, SubnormalScaleFactor), 
Scale);
+    DForRcp = Builder.CreateFMul(Den, Scale);
+  }
+
+  Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, DForRcp);
+  Value *Y1 = EmitNRFixup(Builder, Y0, DForRcp);
+
+  Value *NRResult = Y1;
+  if (!HasFP32DenormalFlush)
+    NRResult = Builder.CreateFMul(Y1, Scale);
+
+  Value *Result = Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_div_fixup, {Ty},
+      {NRResult, Den, ConstantFP::get(Ty, 1.0)});
+
+  FDiv.replaceAllUsesWith(Result);
+  FDiv.eraseFromParent();
+  return true;
+}
+
 Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
     IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
     FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
@@ -903,6 +1044,9 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator 
&FDiv) {
   if (DisableFDivExpand)
     return false;
 
+  if (expandReciprocalNewtonRaphson(FDiv))
+    return true;
+
   Type *Ty = FDiv.getType()->getScalarType();
   const bool IsFloat = Ty->isFloatTy();
   if (!IsFloat && !Ty->isDoubleTy())
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll
new file mode 100644
index 0000000000000..cab7b57538c01
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare \
+; RUN:   -amdgpu-enable-fp32-recip-newton-raphson %s | FileCheck 
-check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare \
+; RUN:   -amdgpu-enable-fp32-recip-newton-raphson 
-denormal-fp-math-f32=preserve-sign \
+; RUN:   %s | FileCheck -check-prefix=OPT-FTZ %s
+
+; IEEE mode (default): branchless NR with subnormal/large-normal scaling.
+; FTZ mode: branchless NR without scaling.
+define amdgpu_kernel void @test_fdiv_recip_f32(ptr addrspace(1) %out, float 
%x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:    [[TMP2:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT:    [[TMP10:%.*]] = fcmp ogt float [[TMP2]], f0x7E800000
+; OPT-NEXT:    [[TMP3:%.*]] = select i1 [[TMP10]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP4:%.*]] = fcmp olt float [[TMP2]], f0x00800000
+; OPT-NEXT:    [[TMP11:%.*]] = select i1 [[TMP4]], float f0x4F800000, float 
[[TMP3]]
+; OPT-NEXT:    [[TMP5:%.*]] = fmul float [[X]], [[TMP11]]
+; OPT-NEXT:    [[TMP6:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP5]])
+; OPT-NEXT:    [[TMP7:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float 
[[TMP6]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP8:%.*]] = fneg float [[TMP7]]
+; OPT-NEXT:    [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float 
[[TMP8]], float [[TMP6]])
+; OPT-NEXT:    [[TMP14:%.*]] = fmul float [[TMP9]], [[TMP11]]
+; OPT-NEXT:    [[TMP12:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP14]], float [[X]], float 1.000000e+00)
+; OPT-NEXT:    store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) 
#[[ATTR0:[0-9]+]] {
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call float @llvm.fma.f32(float [[X]], float 
[[TMP2]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP2]], 
float [[TMP4]], float [[TMP2]])
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP5]], float [[X]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    store float [[TMP7]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Negative reciprocal: -1.0/x should also be transformed (fneg + rcp).
+define amdgpu_kernel void @test_fdiv_neg_recip_f32(ptr addrspace(1) %out, 
float %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = fneg float [[X]]
+; OPT-NEXT:    [[TMP3:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
+; OPT-NEXT:    [[TMP11:%.*]] = fcmp ogt float [[TMP3]], f0x7E800000
+; OPT-NEXT:    [[TMP4:%.*]] = select i1 [[TMP11]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP5:%.*]] = fcmp olt float [[TMP3]], f0x00800000
+; OPT-NEXT:    [[TMP12:%.*]] = select i1 [[TMP5]], float f0x4F800000, float 
[[TMP4]]
+; OPT-NEXT:    [[TMP6:%.*]] = fmul float [[TMP1]], [[TMP12]]
+; OPT-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP6]])
+; OPT-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float 
[[TMP7]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP9:%.*]] = fneg float [[TMP8]]
+; OPT-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float 
[[TMP9]], float [[TMP7]])
+; OPT-NEXT:    [[TMP15:%.*]] = fmul float [[TMP10]], [[TMP12]]
+; OPT-NEXT:    [[TMP13:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP15]], float [[TMP1]], float 1.000000e+00)
+; OPT-NEXT:    store float [[TMP13]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = fneg float [[X]]
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], 
float [[TMP3]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = fneg float [[TMP4]]
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], 
float [[TMP5]], float [[TMP3]])
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP6]], float [[TMP1]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    store float [[TMP8]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv float -1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Non-reciprocal division should not be transformed.
+define amdgpu_kernel void @test_fdiv_non_recip_f32(ptr addrspace(1) %out, 
float %x, float %y) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_non_recip_f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]]) 
#[[ATTR0]] {
+; OPT-NEXT:    [[FDIV:%.*]] = fdiv float [[Y]], [[X]]
+; OPT-NEXT:    store float [[FDIV]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_non_recip_f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float 
[[Y:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[FDIV:%.*]] = fdiv float [[Y]], [[X]]
+; OPT-FTZ-NEXT:    store float [[FDIV]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv float %y, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Known-normal denominator (IEEE mode): NR inline with scaling, no branch.
+; Known-normal denominator (FTZ mode): NR inline, no scaling, no branch.
+define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(ptr addrspace(1) 
%out, float nofpclass(nan inf zero sub) %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan inf zero sub) 
[[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT:    [[TMP2:%.*]] = fcmp ogt float [[TMP1]], f0x7E800000
+; OPT-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP10:%.*]] = fcmp olt float [[TMP1]], f0x00800000
+; OPT-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float f0x4F800000, float 
[[TMP3]]
+; OPT-NEXT:    [[TMP4:%.*]] = fmul float [[X]], [[TMP11]]
+; OPT-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP4]])
+; OPT-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float 
[[TMP5]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP7:%.*]] = fneg float [[TMP6]]
+; OPT-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float 
[[TMP7]], float [[TMP5]])
+; OPT-NEXT:    [[TMP9:%.*]] = fmul float [[TMP8]], [[TMP11]]
+; OPT-NEXT:    [[TMP12:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP9]], float [[X]], float 1.000000e+00)
+; OPT-NEXT:    store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan inf zero 
sub) [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call float @llvm.fma.f32(float [[X]], float 
[[TMP1]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = fneg float [[TMP2]]
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], 
float [[TMP3]], float [[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP4]], float [[X]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    store float [[TMP5]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
+; Vector reciprocal: <2 x float> should be scalarized and each lane 
transformed.
+define amdgpu_kernel void @test_fdiv_recip_v2f32(ptr addrspace(1) %out, <2 x 
float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT:    [[TMP14:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
+; OPT-NEXT:    [[TMP11:%.*]] = fcmp ogt float [[TMP14]], f0x7E800000
+; OPT-NEXT:    [[TMP13:%.*]] = select i1 [[TMP11]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP15:%.*]] = fcmp olt float [[TMP14]], f0x00800000
+; OPT-NEXT:    [[TMP12:%.*]] = select i1 [[TMP15]], float f0x4F800000, float 
[[TMP13]]
+; OPT-NEXT:    [[TMP28:%.*]] = fmul float [[TMP1]], [[TMP12]]
+; OPT-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP28]])
+; OPT-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float 
[[TMP7]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP9:%.*]] = fneg float [[TMP8]]
+; OPT-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float 
[[TMP9]], float [[TMP7]])
+; OPT-NEXT:    [[TMP25:%.*]] = fmul float [[TMP10]], [[TMP12]]
+; OPT-NEXT:    [[TMP29:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP25]], float [[TMP1]], float 1.000000e+00)
+; OPT-NEXT:    [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP4]])
+; OPT-NEXT:    [[TMP26:%.*]] = fcmp ogt float [[TMP17]], f0x7E800000
+; OPT-NEXT:    [[TMP18:%.*]] = select i1 [[TMP26]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP19:%.*]] = fcmp olt float [[TMP17]], f0x00800000
+; OPT-NEXT:    [[TMP31:%.*]] = select i1 [[TMP19]], float f0x4F800000, float 
[[TMP18]]
+; OPT-NEXT:    [[TMP20:%.*]] = fmul float [[TMP4]], [[TMP31]]
+; OPT-NEXT:    [[TMP21:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]])
+; OPT-NEXT:    [[TMP22:%.*]] = call float @llvm.fma.f32(float [[TMP20]], float 
[[TMP21]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP23:%.*]] = fneg float [[TMP22]]
+; OPT-NEXT:    [[TMP24:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float 
[[TMP23]], float [[TMP21]])
+; OPT-NEXT:    [[TMP32:%.*]] = fmul float [[TMP24]], [[TMP31]]
+; OPT-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP32]], float [[TMP4]], float 1.000000e+00)
+; OPT-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float 
[[TMP29]], i64 0
+; OPT-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float 
[[TMP5]], i64 1
+; OPT-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) 
#[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP1]], 
float [[TMP7]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP9:%.*]] = fneg float [[TMP8]]
+; OPT-FTZ-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], 
float [[TMP9]], float [[TMP7]])
+; OPT-FTZ-NEXT:    [[TMP12:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP10]], float [[TMP1]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP21:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP4]])
+; OPT-FTZ-NEXT:    [[TMP22:%.*]] = call float @llvm.fma.f32(float [[TMP4]], 
float [[TMP21]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP23:%.*]] = fneg float [[TMP22]]
+; OPT-FTZ-NEXT:    [[TMP24:%.*]] = call float @llvm.fma.f32(float [[TMP21]], 
float [[TMP23]], float [[TMP21]])
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP24]], float [[TMP4]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float 
[[TMP12]], i64 0
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float 
[[TMP5]], i64 1
+; OPT-FTZ-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[OUT]], align 
8
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+
+; Negative vector reciprocal: <-1.0, -1.0>/x should be scalarized and each 
lane transformed.
+define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(ptr addrspace(1) %out, <2 
x float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT:    [[TMP3:%.*]] = fneg float [[TMP1]]
+; OPT-NEXT:    [[TMP5:%.*]] = call float @llvm.fabs.f32(float [[TMP3]])
+; OPT-NEXT:    [[TMP13:%.*]] = fcmp ogt float [[TMP5]], f0x7E800000
+; OPT-NEXT:    [[TMP6:%.*]] = select i1 [[TMP13]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP7:%.*]] = fcmp olt float [[TMP5]], f0x00800000
+; OPT-NEXT:    [[TMP14:%.*]] = select i1 [[TMP7]], float f0x4F800000, float 
[[TMP6]]
+; OPT-NEXT:    [[TMP8:%.*]] = fmul float [[TMP3]], [[TMP14]]
+; OPT-NEXT:    [[TMP9:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]])
+; OPT-NEXT:    [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP8]], float 
[[TMP9]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP11:%.*]] = fneg float [[TMP10]]
+; OPT-NEXT:    [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float 
[[TMP11]], float [[TMP9]])
+; OPT-NEXT:    [[TMP26:%.*]] = fmul float [[TMP12]], [[TMP14]]
+; OPT-NEXT:    [[TMP15:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP26]], float [[TMP3]], float 1.000000e+00)
+; OPT-NEXT:    [[TMP16:%.*]] = fneg float [[TMP2]]
+; OPT-NEXT:    [[TMP18:%.*]] = call float @llvm.fabs.f32(float [[TMP16]])
+; OPT-NEXT:    [[TMP19:%.*]] = fcmp ogt float [[TMP18]], f0x7E800000
+; OPT-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float f0x2F800000, float 
1.000000e+00
+; OPT-NEXT:    [[TMP27:%.*]] = fcmp olt float [[TMP18]], f0x00800000
+; OPT-NEXT:    [[TMP33:%.*]] = select i1 [[TMP27]], float f0x4F800000, float 
[[TMP20]]
+; OPT-NEXT:    [[TMP21:%.*]] = fmul float [[TMP16]], [[TMP33]]
+; OPT-NEXT:    [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP21]])
+; OPT-NEXT:    [[TMP23:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float 
[[TMP22]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP24:%.*]] = fneg float [[TMP23]]
+; OPT-NEXT:    [[TMP25:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float 
[[TMP24]], float [[TMP22]])
+; OPT-NEXT:    [[TMP34:%.*]] = fmul float [[TMP25]], [[TMP33]]
+; OPT-NEXT:    [[TMP28:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP34]], float [[TMP16]], float 1.000000e+00)
+; OPT-NEXT:    [[TMP29:%.*]] = insertelement <2 x float> poison, float 
[[TMP15]], i64 0
+; OPT-NEXT:    [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float 
[[TMP28]], i64 1
+; OPT-NEXT:    store <2 x float> [[TMP30]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) 
#[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = fneg float [[TMP1]]
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP3]])
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], 
float [[TMP5]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = fneg float [[TMP6]]
+; OPT-FTZ-NEXT:    [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], 
float [[TMP7]], float [[TMP5]])
+; OPT-FTZ-NEXT:    [[TMP10:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP8]], float [[TMP3]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP11:%.*]] = fneg float [[TMP2]]
+; OPT-FTZ-NEXT:    [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float 
[[TMP11]])
+; OPT-FTZ-NEXT:    [[TMP14:%.*]] = call float @llvm.fma.f32(float [[TMP11]], 
float [[TMP13]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP15:%.*]] = fneg float [[TMP14]]
+; OPT-FTZ-NEXT:    [[TMP16:%.*]] = call float @llvm.fma.f32(float [[TMP13]], 
float [[TMP15]], float [[TMP13]])
+; OPT-FTZ-NEXT:    [[TMP18:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float 
[[TMP16]], float [[TMP11]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP19:%.*]] = insertelement <2 x float> poison, float 
[[TMP10]], i64 0
+; OPT-FTZ-NEXT:    [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float 
[[TMP18]], i64 1
+; OPT-FTZ-NEXT:    store <2 x float> [[TMP20]], ptr addrspace(1) [[OUT]], 
align 8
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv <2 x float> <float -1.0, float -1.0>, %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; f64 should not be transformed (only f32).
+define amdgpu_kernel void @test_fdiv_recip_f64(ptr addrspace(1) %out, double 
%x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f64(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], double [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[FDIV:%.*]] = fdiv double 1.000000e+00, [[X]]
+; OPT-NEXT:    store double [[FDIV]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f64(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], double [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[FDIV:%.*]] = fdiv double 1.000000e+00, [[X]]
+; OPT-FTZ-NEXT:    store double [[FDIV]], ptr addrspace(1) [[OUT]], align 8
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv double 1.0, %x
+  store double %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; Verify that fast-math flags on the original fdiv (nnan ninf here) are
+; preserved on the instructions emitted by the NR expansion. In the checks
+; below every emitted FP instruction (fabs, fcmp, select, fmul, rcp, fma,
+; fneg, div.fixup) carries `nnan ninf`.
+; The OPT-FTZ checks expect the shorter sequence: no fabs/fcmp/select/fmul
+; scaling of the operand, only rcp, fma, fneg, fma and div.fixup.
+define amdgpu_kernel void @test_fdiv_recip_f32_fmf(ptr addrspace(1) %out, 
float %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_fmf(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP2:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT:    [[TMP10:%.*]] = fcmp nnan ninf ogt float [[TMP2]], f0x7E800000
+; OPT-NEXT:    [[TMP3:%.*]] = select nnan ninf i1 [[TMP10]], float 
f0x2F800000, float 1.000000e+00
+; OPT-NEXT:    [[TMP4:%.*]] = fcmp nnan ninf olt float [[TMP2]], f0x00800000
+; OPT-NEXT:    [[TMP11:%.*]] = select nnan ninf i1 [[TMP4]], float 
f0x4F800000, float [[TMP3]]
+; OPT-NEXT:    [[TMP5:%.*]] = fmul nnan ninf float [[X]], [[TMP11]]
+; OPT-NEXT:    [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float 
[[TMP5]])
+; OPT-NEXT:    [[TMP7:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP5]], float [[TMP6]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP8:%.*]] = fneg nnan ninf float [[TMP7]]
+; OPT-NEXT:    [[TMP9:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP6]], float [[TMP8]], float [[TMP6]])
+; OPT-NEXT:    [[TMP14:%.*]] = fmul nnan ninf float [[TMP9]], [[TMP11]]
+; OPT-NEXT:    [[TMP12:%.*]] = call nnan ninf float 
@llvm.amdgcn.div.fixup.f32(float [[TMP14]], float [[X]], float 1.000000e+00)
+; OPT-NEXT:    store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_fmf(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = call nnan ninf float 
@llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT:    [[TMP3:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[X]], float [[TMP2]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = fneg nnan ninf float [[TMP3]]
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP2]], float [[TMP4]], float [[TMP2]])
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = call nnan ninf float 
@llvm.amdgcn.div.fixup.f32(float [[TMP5]], float [[X]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    store float [[TMP7]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv nnan ninf float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Verify that fast-math flags are preserved through vector scalarization.
+; The checks expect the <2 x float> reciprocal to be split with
+; extractelement, each lane to get its own scalar NR expansion carrying
+; `nnan ninf`, and the two results to be reassembled with insertelement
+; before a single vector store.
+define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(ptr addrspace(1) %out, <2 
x float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT:    [[TMP4:%.*]] = call nnan ninf float @llvm.fabs.f32(float 
[[TMP1]])
+; OPT-NEXT:    [[TMP12:%.*]] = fcmp nnan ninf ogt float [[TMP4]], f0x7E800000
+; OPT-NEXT:    [[TMP5:%.*]] = select nnan ninf i1 [[TMP12]], float 
f0x2F800000, float 1.000000e+00
+; OPT-NEXT:    [[TMP6:%.*]] = fcmp nnan ninf olt float [[TMP4]], f0x00800000
+; OPT-NEXT:    [[TMP13:%.*]] = select nnan ninf i1 [[TMP6]], float 
f0x4F800000, float [[TMP5]]
+; OPT-NEXT:    [[TMP7:%.*]] = fmul nnan ninf float [[TMP1]], [[TMP13]]
+; OPT-NEXT:    [[TMP8:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float 
[[TMP7]])
+; OPT-NEXT:    [[TMP9:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP7]], float [[TMP8]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP10:%.*]] = fneg nnan ninf float [[TMP9]]
+; OPT-NEXT:    [[TMP11:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP8]], float [[TMP10]], float [[TMP8]])
+; OPT-NEXT:    [[TMP24:%.*]] = fmul nnan ninf float [[TMP11]], [[TMP13]]
+; OPT-NEXT:    [[TMP14:%.*]] = call nnan ninf float 
@llvm.amdgcn.div.fixup.f32(float [[TMP24]], float [[TMP1]], float 1.000000e+00)
+; OPT-NEXT:    [[TMP16:%.*]] = call nnan ninf float @llvm.fabs.f32(float 
[[TMP2]])
+; OPT-NEXT:    [[TMP17:%.*]] = fcmp nnan ninf ogt float [[TMP16]], f0x7E800000
+; OPT-NEXT:    [[TMP18:%.*]] = select nnan ninf i1 [[TMP17]], float 
f0x2F800000, float 1.000000e+00
+; OPT-NEXT:    [[TMP25:%.*]] = fcmp nnan ninf olt float [[TMP16]], f0x00800000
+; OPT-NEXT:    [[TMP31:%.*]] = select nnan ninf i1 [[TMP25]], float 
f0x4F800000, float [[TMP18]]
+; OPT-NEXT:    [[TMP19:%.*]] = fmul nnan ninf float [[TMP2]], [[TMP31]]
+; OPT-NEXT:    [[TMP20:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float 
[[TMP19]])
+; OPT-NEXT:    [[TMP21:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP19]], float [[TMP20]], float -1.000000e+00)
+; OPT-NEXT:    [[TMP22:%.*]] = fneg nnan ninf float [[TMP21]]
+; OPT-NEXT:    [[TMP23:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP20]], float [[TMP22]], float [[TMP20]])
+; OPT-NEXT:    [[TMP32:%.*]] = fmul nnan ninf float [[TMP23]], [[TMP31]]
+; OPT-NEXT:    [[TMP26:%.*]] = call nnan ninf float 
@llvm.amdgcn.div.fixup.f32(float [[TMP32]], float [[TMP2]], float 1.000000e+00)
+; OPT-NEXT:    [[TMP27:%.*]] = insertelement <2 x float> poison, float 
[[TMP14]], i64 0
+; OPT-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float 
[[TMP26]], i64 1
+; OPT-NEXT:    store <2 x float> [[TMP28]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT:    ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) 
#[[ATTR0]] {
+; OPT-FTZ-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT:    [[TMP4:%.*]] = call nnan ninf float 
@llvm.amdgcn.rcp.f32(float [[TMP1]])
+; OPT-FTZ-NEXT:    [[TMP5:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP1]], float [[TMP4]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP6:%.*]] = fneg nnan ninf float [[TMP5]]
+; OPT-FTZ-NEXT:    [[TMP7:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP4]], float [[TMP6]], float [[TMP4]])
+; OPT-FTZ-NEXT:    [[TMP9:%.*]] = call nnan ninf float 
@llvm.amdgcn.div.fixup.f32(float [[TMP7]], float [[TMP1]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP11:%.*]] = call nnan ninf float 
@llvm.amdgcn.rcp.f32(float [[TMP2]])
+; OPT-FTZ-NEXT:    [[TMP12:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP2]], float [[TMP11]], float -1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP13:%.*]] = fneg nnan ninf float [[TMP12]]
+; OPT-FTZ-NEXT:    [[TMP14:%.*]] = call nnan ninf float @llvm.fma.f32(float 
[[TMP11]], float [[TMP13]], float [[TMP11]])
+; OPT-FTZ-NEXT:    [[TMP16:%.*]] = call nnan ninf float 
@llvm.amdgcn.div.fixup.f32(float [[TMP14]], float [[TMP2]], float 1.000000e+00)
+; OPT-FTZ-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> poison, float 
[[TMP9]], i64 0
+; OPT-FTZ-NEXT:    [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float 
[[TMP16]], i64 1
+; OPT-FTZ-NEXT:    store <2 x float> [[TMP18]], ptr addrspace(1) [[OUT]], 
align 8
+; OPT-FTZ-NEXT:    ret void
+;
+  %fdiv = fdiv nnan ninf <2 x float> splat (float 1.0), %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; Attribute group referenced as #0 by the kernels above; it only marks them
+; nounwind.
+attributes #0 = { nounwind }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to