https://github.com/carlobertolli updated https://github.com/llvm/llvm-project/pull/194716
>From 63eb24ce7e4ba44c659e47c5b0bab6d40da8e751 Mon Sep 17 00:00:00 2001 From: Carlo Bertolli <[email protected]> Date: Wed, 15 Apr 2026 23:39:37 +0000 Subject: [PATCH] [AMDGPU] Use 1-iteration Newton-Raphson refinement for FP32 reciprocal. The AMDGPU backend lowers the following code r = 1/x; to an FP32 division, i.e. a 12-instruction code sequence that handles all denominator cases (normal, denormal, inf, NaN). This patch instead lowers reciprocals to v_rcp_f32 followed by a single-iteration Newton-Raphson refinement. All FP32 cases are supported: normal, NaN, subnormal, infinity, and 0. Large normals, whose reciprocal is subnormal and would be flushed to 0 by v_rcp_f32, are scaled down before using v_rcp_f32, and the result is rescaled afterwards. Subnormals are scaled before and after v_rcp_f32, except when denormals are flushed to zero. The maximum error is 1 ULP, and it occurs for only 0.15% of all possible FP32 values. The alternative solution in emitRcpIEEE1ULP also has a maximum error of 1 ULP, but reaches it for 10.76% of all FP32 values, making this implementation more accurate. When using this algorithm, we see a ~7% performance improvement over full division for a kernel that only executes reciprocals. 
This behavior is hidden behind a flag (turned off by default) -amdgpu-enable-fp32-recip-newton-raphson Assisted-by: Cursor (Claude) --- clang/include/clang/Options/Options.td | 9 + clang/lib/Driver/ToolChains/AMDGPU.cpp | 5 + clang/lib/Driver/ToolChains/HIPAMD.cpp | 8 + .../amdgpu-fp32-recip-newton-raphson.hip | 30 ++ .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 144 +++++++ .../llvm.amdgcn.rcp.newtonraphson.f32.ll | 370 ++++++++++++++++++ 6 files changed, 566 insertions(+) create mode 100644 clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 6fc8806ba683c..5f6b7885d55b6 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5831,6 +5831,15 @@ defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profil "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">, NegFlag<SetFalse, [], [ClangOption]>>; +def mamdgpu_fp32_recip_newton_raphson : Flag<["-"], "mamdgpu-fp32-recip-newton-raphson">, + Group<m_Group>, + HelpText<"Use Newton-Raphson refinement for FP32 1.0f/x reciprocal when the denominator " + "is a normal float or NaN, falling back to the full division sequence for " + "denormals/inf/zero. (AMDGPU only)">; +def mno_amdgpu_fp32_recip_newton_raphson : Flag<["-"], "mno-amdgpu-fp32-recip-newton-raphson">, + Group<m_Group>, + HelpText<"Disable Newton-Raphson refinement for FP32 reciprocal (AMDGPU only)">; + def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>, HelpText<"Specify code object ABI version. Defaults to 6. 
(AMDGPU only)">, Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 631eb0b98df7e..923fa38d4edeb 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -865,6 +865,11 @@ void AMDGPUToolChain::addClangTargetOptions( !DriverArgs.hasArg(options::OPT_disable_llvm_optzns)) CC1Args.push_back("-disable-llvm-optzns"); + if (DriverArgs.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson, + options::OPT_mno_amdgpu_fp32_recip_newton_raphson, + false)) + CC1Args.append({"-mllvm", "-amdgpu-enable-fp32-recip-newton-raphson"}); + if (DeviceOffloadingKind == Action::OFK_None) addOpenCLBuiltinsLib(getDriver(), getTriple(), DriverArgs, CC1Args); } diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 9211803aa6a2f..4bb11021a1fa7 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -67,6 +67,9 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, "-plugin-opt=-amdgpu-internalize-symbols"}; if (Args.hasArg(options::OPT_hipstdpar)) LldArgs.push_back("-plugin-opt=-amdgpu-enable-hipstdpar"); + if (Args.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson, + options::OPT_mno_amdgpu_fp32_recip_newton_raphson, false)) + LldArgs.push_back("-plugin-opt=-amdgpu-enable-fp32-recip-newton-raphson"); auto &TC = getToolChain(); auto &D = TC.getDriver(); @@ -249,6 +252,11 @@ void HIPAMDToolChain::addClangTargetOptions( CC1Args.append({"-mllvm", "-amdgpu-enable-hipstdpar"}); } + if (DriverArgs.hasFlag(options::OPT_mamdgpu_fp32_recip_newton_raphson, + options::OPT_mno_amdgpu_fp32_recip_newton_raphson, + false)) + CC1Args.append({"-mllvm", "-amdgpu-enable-fp32-recip-newton-raphson"}); + StringRef MaxThreadsPerBlock = DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); if (!MaxThreadsPerBlock.empty()) { diff 
--git a/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip b/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip new file mode 100644 index 0000000000000..d2af25ae760fe --- /dev/null +++ b/clang/test/Driver/amdgpu-fp32-recip-newton-raphson.hip @@ -0,0 +1,30 @@ +// REQUIRES: amdgpu-registered-target + +// Check that -mamdgpu-fp32-recip-newton-raphson passes -mllvm flag to device cc1. + +// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 -mamdgpu-fp32-recip-newton-raphson %s 2>&1 \ +// RUN: | FileCheck -check-prefix=ON %s + +// ON: "-cc1" {{.*}} "-fcuda-is-device" {{.*}} "-mllvm" "-amdgpu-enable-fp32-recip-newton-raphson" + +// Check that -mno-amdgpu-fp32-recip-newton-raphson does not pass the flag. + +// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 -mno-amdgpu-fp32-recip-newton-raphson %s 2>&1 \ +// RUN: | FileCheck -check-prefix=OFF %s + +// OFF-NOT: "-amdgpu-enable-fp32-recip-newton-raphson" + +// Check that -mno overrides -m (last flag wins). + +// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 -mamdgpu-fp32-recip-newton-raphson \ +// RUN: -mno-amdgpu-fp32-recip-newton-raphson %s 2>&1 \ +// RUN: | FileCheck -check-prefix=OFF %s + +// Check default (off). 
+ +// RUN: %clang -### -x hip -nogpulib -nogpuinc --target=x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=gfx900 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=OFF %s diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index ffc4b484de072..ae046f77cf169 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -94,6 +94,12 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); +static cl::opt<bool> EnableFP32ReciprocalNewtonRaphson( + "amdgpu-enable-fp32-recip-newton-raphson", cl::Hidden, cl::init(false), + cl::desc("Use Newton-Raphson refinement for 1.0f/x when the denominator " + "is a normal float, falling back to the full division sequence " + "for denormals/inf/nan/zero.")); + class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -248,6 +254,16 @@ class AMDGPUCodeGenPrepareImpl bool tryNarrowMathIfNoOverflow(Instruction *I); + // When an FP32 division has 1.0f or -1.0f as its numerator, expand it + // to v_rcp_f32 followed by a single-iteration Newton-Raphson (NR) + // refinement instead of a full division. The emitted sequence is + // branchless: the NR result is passed through div_fixup, which + // supplies the correct value for zero, infinity and NaN denominators. + // Unless FP32 denormals are flushed, the denominator and the result + // are additionally scaled to handle subnormals and large normals + // whose reciprocal would be subnormal. 
+ bool expandReciprocalNewtonRaphson(BinaryOperator &FDiv); + public: bool visitFDiv(BinaryOperator &I); @@ -855,6 +871,131 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den}); } +bool AMDGPUCodeGenPrepareImpl::expandReciprocalNewtonRaphson( + BinaryOperator &FDiv) { + if (!EnableFP32ReciprocalNewtonRaphson) + return false; + + Type *Ty = FDiv.getType(); + if (Ty->getScalarType() != Type::getFloatTy(FDiv.getContext())) + return false; + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + const APFloat *NumVal; + if (!match(Num, m_APFloat(NumVal)) || + (!NumVal->isExactlyValue(1.0) && !NumVal->isExactlyValue(-1.0))) + return false; + + bool IsNegative = NumVal->isExactlyValue(-1.0); + + // Skip fdivs that were created in the slow path. + if (FDiv.getMetadata("amdgpu.no.rcp.transform")) + return false; + + const FastMathFlags FMF = FDiv.getFastMathFlags(); + const DebugLoc DL = FDiv.getDebugLoc(); + + // For vector types, scalarize into per-element scalar fdivs, then expand + // each one. The pass iterates in reverse so newly created instructions + // before the current one would be missed; expand them explicitly here. 
+ if (Ty->isVectorTy()) { + IRBuilder<> Builder(&FDiv); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(DL); + SmallVector<Value *, 4> NumVals, DenVals; + extractValues(Builder, NumVals, Num); + extractValues(Builder, DenVals, Den); + + SmallVector<Value *, 4> ResultVals(NumVals.size()); + SmallVector<BinaryOperator *, 4> ScalarDivs; + for (int I = 0, E = NumVals.size(); I != E; ++I) { + Value *EltDiv = Builder.CreateFDiv(NumVals[I], DenVals[I]); + ScalarDivs.push_back(cast<BinaryOperator>(EltDiv)); + ResultVals[I] = EltDiv; + } + + Value *Result = insertValues(Builder, Ty, ResultVals); + FDiv.replaceAllUsesWith(Result); + FDiv.eraseFromParent(); + for (BinaryOperator *SD : ScalarDivs) + expandReciprocalNewtonRaphson(*SD); + return true; + } + + // -1.0 / x -> 1.0 / (fneg x) + // Negate the denominator so the NR path computes rcp(-x) = -1/x directly. + IRBuilder<> Builder(&FDiv); + Builder.setFastMathFlags(FMF); + Builder.SetCurrentDebugLocation(DL); + if (IsNegative) + Den = Builder.CreateFNeg(Den); + + // 2^126: largest FP32 magnitude whose reciprocal (2^-126) is still normal. + constexpr float MaxNormalWithNormalRcp = 0x1.0p126f; + // Scale factor for large normals whose reciprocal would be subnormal. + constexpr float LargeNormalScaleFactor = 0x1.0p-32f; + // Smallest positive normal FP32 value (FLT_MIN). + constexpr float SmallestNormal = 0x1.0p-126f; + // Scale factor for subnormals to bring them into normal range before rcp. + constexpr float SubnormalScaleFactor = 0x1.0p32f; + + // Code generation scheme (branchless): + // y = rcp(x) + // [optional scaling for IEEE denormals] + // NR fixup: y = y - y * (x*y - 1) + // y = div_fixup(y, x, 1.0) + // + // The NR step improves accuracy for finite nonzero values. For special + // inputs (zero, inf, NaN) div_fixup replaces the NR result with the correct + // special value. 
+ // + // In IEEE denormal mode, we scale the denominator before rcp and scale the + // result after NR to handle subnormals and large normals whose reciprocal + // would be subnormal. In FTZ mode no scaling is needed. + + // Emit NR fixup: given the rcp approximation Y0 and the (possibly scaled) + // denominator DForRcp, compute one NR iteration. + auto EmitNRFixup = [&](IRBuilder<> &B, Value *Y0, Value *DForRcp) -> Value * { + Value *Err = B.CreateIntrinsic(Intrinsic::fma, {Ty}, + {DForRcp, Y0, ConstantFP::get(Ty, -1.0)}); + Value *NegErr = B.CreateFNeg(Err); + return B.CreateIntrinsic(Intrinsic::fma, {Ty}, {Y0, NegErr, Y0}); + }; + + Value *DForRcp = Den; + Value *Scale = nullptr; + if (!HasFP32DenormalFlush) { + Value *AbsD = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Den); + Value *IsLarge = + Builder.CreateFCmpOGT(AbsD, ConstantFP::get(Ty, MaxNormalWithNormalRcp)); + Scale = + Builder.CreateSelect(IsLarge, ConstantFP::get(Ty, LargeNormalScaleFactor), + ConstantFP::get(Ty, 1.0)); + Value *IsSubnormal = + Builder.CreateFCmpOLT(AbsD, ConstantFP::get(Ty, SmallestNormal)); + Scale = Builder.CreateSelect(IsSubnormal, + ConstantFP::get(Ty, SubnormalScaleFactor), Scale); + DForRcp = Builder.CreateFMul(Den, Scale); + } + + Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, DForRcp); + Value *Y1 = EmitNRFixup(Builder, Y0, DForRcp); + + Value *NRResult = Y1; + if (!HasFP32DenormalFlush) + NRResult = Builder.CreateFMul(Y1, Scale); + + Value *Result = Builder.CreateIntrinsic( + Intrinsic::amdgcn_div_fixup, {Ty}, + {NRResult, Den, ConstantFP::get(Ty, 1.0)}); + + FDiv.replaceAllUsesWith(Result); + FDiv.eraseFromParent(); + return true; +} + Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst, @@ -903,6 +1044,9 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (DisableFDivExpand) return false; + if 
(expandReciprocalNewtonRaphson(FDiv)) + return true; + Type *Ty = FDiv.getType()->getScalarType(); const bool IsFloat = Ty->isFloatTy(); if (!IsFloat && !Ty->isDoubleTy()) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll new file mode 100644 index 0000000000000..cab7b57538c01 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.newtonraphson.f32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare \ +; RUN: -amdgpu-enable-fp32-recip-newton-raphson %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare \ +; RUN: -amdgpu-enable-fp32-recip-newton-raphson -denormal-fp-math-f32=preserve-sign \ +; RUN: %s | FileCheck -check-prefix=OPT-FTZ %s + +; IEEE mode (default): branchless rcp + NR with subnormal/large-normal scaling. +; FTZ mode: branchless rcp + NR without scaling. 
+define amdgpu_kernel void @test_fdiv_recip_f32(ptr addrspace(1) %out, float %x) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[TMP2:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; OPT-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP2]], f0x7E800000 +; OPT-NEXT: [[TMP3:%.*]] = select i1 [[TMP10]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP4:%.*]] = fcmp olt float [[TMP2]], f0x00800000 +; OPT-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], float f0x4F800000, float [[TMP3]] +; OPT-NEXT: [[TMP5:%.*]] = fmul float [[X]], [[TMP11]] +; OPT-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP5]]) +; OPT-NEXT: [[TMP7:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float -1.000000e+00) +; OPT-NEXT: [[TMP8:%.*]] = fneg float [[TMP7]] +; OPT-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP8]], float [[TMP6]]) +; OPT-NEXT: [[TMP14:%.*]] = fmul float [[TMP9]], [[TMP11]] +; OPT-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP14]], float [[X]], float 1.000000e+00) +; OPT-NEXT: store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-FTZ-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[X]]) +; OPT-FTZ-NEXT: [[TMP3:%.*]] = call float @llvm.fma.f32(float [[X]], float [[TMP2]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] +; OPT-FTZ-NEXT: [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP4]], float [[TMP2]]) +; OPT-FTZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP5]], float [[X]], float 1.000000e+00) +; OPT-FTZ-NEXT: store float [[TMP7]], ptr addrspace(1) [[OUT]], align 4 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv float 1.0, %x + 
store float %fdiv, ptr addrspace(1) %out, align 4 + ret void +} + +; Negative reciprocal: -1.0/x should also be transformed (fneg + rcp). +define amdgpu_kernel void @test_fdiv_neg_recip_f32(ptr addrspace(1) %out, float %x) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[TMP1:%.*]] = fneg float [[X]] +; OPT-NEXT: [[TMP3:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) +; OPT-NEXT: [[TMP11:%.*]] = fcmp ogt float [[TMP3]], f0x7E800000 +; OPT-NEXT: [[TMP4:%.*]] = select i1 [[TMP11]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP5:%.*]] = fcmp olt float [[TMP3]], f0x00800000 +; OPT-NEXT: [[TMP12:%.*]] = select i1 [[TMP5]], float f0x4F800000, float [[TMP4]] +; OPT-NEXT: [[TMP6:%.*]] = fmul float [[TMP1]], [[TMP12]] +; OPT-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; OPT-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float -1.000000e+00) +; OPT-NEXT: [[TMP9:%.*]] = fneg float [[TMP8]] +; OPT-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP9]], float [[TMP7]]) +; OPT-NEXT: [[TMP15:%.*]] = fmul float [[TMP10]], [[TMP12]] +; OPT-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP15]], float [[TMP1]], float 1.000000e+00) +; OPT-NEXT: store float [[TMP13]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_f32( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] { +; OPT-FTZ-NEXT: [[TMP1:%.*]] = fneg float [[X]] +; OPT-FTZ-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; OPT-FTZ-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP3]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP5:%.*]] = fneg float [[TMP4]] +; OPT-FTZ-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], float [[TMP5]], float [[TMP3]]) +; 
OPT-FTZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP6]], float [[TMP1]], float 1.000000e+00) +; OPT-FTZ-NEXT: store float [[TMP8]], ptr addrspace(1) [[OUT]], align 4 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv float -1.0, %x + store float %fdiv, ptr addrspace(1) %out, align 4 + ret void +} + +; Non-reciprocal division should not be transformed. +define amdgpu_kernel void @test_fdiv_non_recip_f32(ptr addrspace(1) %out, float %x, float %y) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_non_recip_f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[FDIV:%.*]] = fdiv float [[Y]], [[X]] +; OPT-NEXT: store float [[FDIV]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_non_recip_f32( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0]] { +; OPT-FTZ-NEXT: [[FDIV:%.*]] = fdiv float [[Y]], [[X]] +; OPT-FTZ-NEXT: store float [[FDIV]], ptr addrspace(1) [[OUT]], align 4 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv float %y, %x + store float %fdiv, ptr addrspace(1) %out, align 4 + ret void +} + +; Known-normal denominator (IEEE mode): NR inline with scaling, no branch. +; Known-normal denominator (FTZ mode): NR inline, no scaling, no branch. 
+define amdgpu_kernel void @test_fdiv_recip_f32_known_normal(ptr addrspace(1) %out, float nofpclass(nan inf zero sub) %x) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_known_normal( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan inf zero sub) [[X:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[X]]) +; OPT-NEXT: [[TMP2:%.*]] = fcmp ogt float [[TMP1]], f0x7E800000 +; OPT-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP10:%.*]] = fcmp olt float [[TMP1]], f0x00800000 +; OPT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float f0x4F800000, float [[TMP3]] +; OPT-NEXT: [[TMP4:%.*]] = fmul float [[X]], [[TMP11]] +; OPT-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; OPT-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float [[TMP5]], float -1.000000e+00) +; OPT-NEXT: [[TMP7:%.*]] = fneg float [[TMP6]] +; OPT-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP7]], float [[TMP5]]) +; OPT-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[TMP11]] +; OPT-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP9]], float [[X]], float 1.000000e+00) +; OPT-NEXT: store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_known_normal( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float nofpclass(nan inf zero sub) [[X:%.*]]) #[[ATTR0]] { +; OPT-FTZ-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[X]]) +; OPT-FTZ-NEXT: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[X]], float [[TMP1]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP3:%.*]] = fneg float [[TMP2]] +; OPT-FTZ-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP3]], float [[TMP1]]) +; OPT-FTZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP4]], float [[X]], float 1.000000e+00) +; OPT-FTZ-NEXT: store 
float [[TMP5]], ptr addrspace(1) [[OUT]], align 4 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv float 1.0, %x + store float %fdiv, ptr addrspace(1) %out, align 4 + ret void +} + + +; Vector reciprocal: <2 x float> should be scalarized and each lane transformed. +define amdgpu_kernel void @test_fdiv_recip_v2f32(ptr addrspace(1) %out, <2 x float> %x) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; OPT-NEXT: [[TMP14:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) +; OPT-NEXT: [[TMP11:%.*]] = fcmp ogt float [[TMP14]], f0x7E800000 +; OPT-NEXT: [[TMP13:%.*]] = select i1 [[TMP11]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP15:%.*]] = fcmp olt float [[TMP14]], f0x00800000 +; OPT-NEXT: [[TMP12:%.*]] = select i1 [[TMP15]], float f0x4F800000, float [[TMP13]] +; OPT-NEXT: [[TMP28:%.*]] = fmul float [[TMP1]], [[TMP12]] +; OPT-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP28]]) +; OPT-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP28]], float [[TMP7]], float -1.000000e+00) +; OPT-NEXT: [[TMP9:%.*]] = fneg float [[TMP8]] +; OPT-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP9]], float [[TMP7]]) +; OPT-NEXT: [[TMP25:%.*]] = fmul float [[TMP10]], [[TMP12]] +; OPT-NEXT: [[TMP29:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP25]], float [[TMP1]], float 1.000000e+00) +; OPT-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP4]]) +; OPT-NEXT: [[TMP26:%.*]] = fcmp ogt float [[TMP17]], f0x7E800000 +; OPT-NEXT: [[TMP18:%.*]] = select i1 [[TMP26]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP19:%.*]] = fcmp olt float [[TMP17]], f0x00800000 +; OPT-NEXT: [[TMP31:%.*]] = select i1 [[TMP19]], float f0x4F800000, float [[TMP18]] +; OPT-NEXT: [[TMP20:%.*]] = fmul 
float [[TMP4]], [[TMP31]] +; OPT-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; OPT-NEXT: [[TMP22:%.*]] = call float @llvm.fma.f32(float [[TMP20]], float [[TMP21]], float -1.000000e+00) +; OPT-NEXT: [[TMP23:%.*]] = fneg float [[TMP22]] +; OPT-NEXT: [[TMP24:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float [[TMP23]], float [[TMP21]]) +; OPT-NEXT: [[TMP32:%.*]] = fmul float [[TMP24]], [[TMP31]] +; OPT-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP32]], float [[TMP4]], float 1.000000e+00) +; OPT-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP29]], i64 0 +; OPT-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; OPT-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[OUT]], align 8 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] { +; OPT-FTZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; OPT-FTZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; OPT-FTZ-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP1]]) +; OPT-FTZ-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP7]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP9:%.*]] = fneg float [[TMP8]] +; OPT-FTZ-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP9]], float [[TMP7]]) +; OPT-FTZ-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP10]], float [[TMP1]], float 1.000000e+00) +; OPT-FTZ-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; OPT-FTZ-NEXT: [[TMP22:%.*]] = call float @llvm.fma.f32(float [[TMP4]], float [[TMP21]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP23:%.*]] = fneg float [[TMP22]] +; OPT-FTZ-NEXT: [[TMP24:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float [[TMP23]], float [[TMP21]]) +; OPT-FTZ-NEXT: [[TMP5:%.*]] = call float 
@llvm.amdgcn.div.fixup.f32(float [[TMP24]], float [[TMP4]], float 1.000000e+00) +; OPT-FTZ-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 +; OPT-FTZ-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP5]], i64 1 +; OPT-FTZ-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[OUT]], align 8 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x + store <2 x float> %fdiv, ptr addrspace(1) %out, align 8 + ret void +} + + +; Negative vector reciprocal: <-1.0, -1.0>/x should be scalarized and each lane transformed. +define amdgpu_kernel void @test_fdiv_neg_recip_v2f32(ptr addrspace(1) %out, <2 x float> %x) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_v2f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = fneg float [[TMP1]] +; OPT-NEXT: [[TMP5:%.*]] = call float @llvm.fabs.f32(float [[TMP3]]) +; OPT-NEXT: [[TMP13:%.*]] = fcmp ogt float [[TMP5]], f0x7E800000 +; OPT-NEXT: [[TMP6:%.*]] = select i1 [[TMP13]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP5]], f0x00800000 +; OPT-NEXT: [[TMP14:%.*]] = select i1 [[TMP7]], float f0x4F800000, float [[TMP6]] +; OPT-NEXT: [[TMP8:%.*]] = fmul float [[TMP3]], [[TMP14]] +; OPT-NEXT: [[TMP9:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]]) +; OPT-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP8]], float [[TMP9]], float -1.000000e+00) +; OPT-NEXT: [[TMP11:%.*]] = fneg float [[TMP10]] +; OPT-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP11]], float [[TMP9]]) +; OPT-NEXT: [[TMP26:%.*]] = fmul float [[TMP12]], [[TMP14]] +; OPT-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP26]], float [[TMP3]], float 1.000000e+00) +; OPT-NEXT: [[TMP16:%.*]] = 
fneg float [[TMP2]] +; OPT-NEXT: [[TMP18:%.*]] = call float @llvm.fabs.f32(float [[TMP16]]) +; OPT-NEXT: [[TMP19:%.*]] = fcmp ogt float [[TMP18]], f0x7E800000 +; OPT-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float f0x2F800000, float 1.000000e+00 +; OPT-NEXT: [[TMP27:%.*]] = fcmp olt float [[TMP18]], f0x00800000 +; OPT-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], float f0x4F800000, float [[TMP20]] +; OPT-NEXT: [[TMP21:%.*]] = fmul float [[TMP16]], [[TMP33]] +; OPT-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP21]]) +; OPT-NEXT: [[TMP23:%.*]] = call float @llvm.fma.f32(float [[TMP21]], float [[TMP22]], float -1.000000e+00) +; OPT-NEXT: [[TMP24:%.*]] = fneg float [[TMP23]] +; OPT-NEXT: [[TMP25:%.*]] = call float @llvm.fma.f32(float [[TMP22]], float [[TMP24]], float [[TMP22]]) +; OPT-NEXT: [[TMP34:%.*]] = fmul float [[TMP25]], [[TMP33]] +; OPT-NEXT: [[TMP28:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP34]], float [[TMP16]], float 1.000000e+00) +; OPT-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 +; OPT-NEXT: [[TMP30:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i64 1 +; OPT-NEXT: store <2 x float> [[TMP30]], ptr addrspace(1) [[OUT]], align 8 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_neg_recip_v2f32( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] { +; OPT-FTZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0 +; OPT-FTZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1 +; OPT-FTZ-NEXT: [[TMP3:%.*]] = fneg float [[TMP1]] +; OPT-FTZ-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP3]]) +; OPT-FTZ-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP3]], float [[TMP5]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP7:%.*]] = fneg float [[TMP6]] +; OPT-FTZ-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP7]], float [[TMP5]]) +; OPT-FTZ-NEXT: [[TMP10:%.*]] = call 
float @llvm.amdgcn.div.fixup.f32(float [[TMP8]], float [[TMP3]], float 1.000000e+00) +; OPT-FTZ-NEXT: [[TMP11:%.*]] = fneg float [[TMP2]] +; OPT-FTZ-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; OPT-FTZ-NEXT: [[TMP14:%.*]] = call float @llvm.fma.f32(float [[TMP11]], float [[TMP13]], float -1.000000e+00) +; OPT-FTZ-NEXT: [[TMP15:%.*]] = fneg float [[TMP14]] +; OPT-FTZ-NEXT: [[TMP16:%.*]] = call float @llvm.fma.f32(float [[TMP13]], float [[TMP15]], float [[TMP13]]) +; OPT-FTZ-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.div.fixup.f32(float [[TMP16]], float [[TMP11]], float 1.000000e+00) +; OPT-FTZ-NEXT: [[TMP19:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; OPT-FTZ-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 +; OPT-FTZ-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[OUT]], align 8 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv <2 x float> <float -1.0, float -1.0>, %x + store <2 x float> %fdiv, ptr addrspace(1) %out, align 8 + ret void +} + +; f64 should not be transformed (only f32). +define amdgpu_kernel void @test_fdiv_recip_f64(ptr addrspace(1) %out, double %x) #0 { +; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f64( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], double [[X:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[FDIV:%.*]] = fdiv double 1.000000e+00, [[X]] +; OPT-NEXT: store double [[FDIV]], ptr addrspace(1) [[OUT]], align 8 +; OPT-NEXT: ret void +; +; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f64( +; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], double [[X:%.*]]) #[[ATTR0]] { +; OPT-FTZ-NEXT: [[FDIV:%.*]] = fdiv double 1.000000e+00, [[X]] +; OPT-FTZ-NEXT: store double [[FDIV]], ptr addrspace(1) [[OUT]], align 8 +; OPT-FTZ-NEXT: ret void +; + %fdiv = fdiv double 1.0, %x + store double %fdiv, ptr addrspace(1) %out, align 8 + ret void +} + +; Verify that fast-math flags on the original fdiv are preserved on +; the instructions emitted by the NR expansion. 
+; Scalar case: `fdiv nnan ninf float 1.0, %x` must be expanded into a
+; reciprocal sequence in which every generated instruction carries the
+; `nnan ninf` flags of the original fdiv.
+;
+; Sequence checked under the OPT prefix:
+;   1. |x| is compared against f0x7E800000 (2^126) and f0x00800000 (2^-126,
+;      the smallest normal float) to choose a scale factor: f0x2F800000
+;      (2^-32) when |x| > 2^126, f0x4F800000 (2^32) when |x| < 2^-126, and
+;      1.0 otherwise.
+;   2. xs = x * scale, r = llvm.amdgcn.rcp(xs).
+;   3. One refinement step: e = fma(xs, r, -1.0); result = fma(r, -e, r).
+;   4. The refined value is multiplied by the same scale factor and passed
+;      to llvm.amdgcn.div.fixup together with the unscaled x and 1.0.
+;
+; Under the OPT-FTZ prefix the scaling in steps 1, 2 and 4 is absent and rcp
+; is applied to x directly.
+; NOTE(review): the RUN lines are outside this excerpt; OPT-FTZ presumably
+; corresponds to a denormal-flushing configuration -- confirm against them.
+define amdgpu_kernel void @test_fdiv_recip_f32_fmf(ptr addrspace(1) %out, float %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_fmf(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[TMP2:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[X]])
+; OPT-NEXT: [[TMP10:%.*]] = fcmp nnan ninf ogt float [[TMP2]], f0x7E800000
+; OPT-NEXT: [[TMP3:%.*]] = select nnan ninf i1 [[TMP10]], float f0x2F800000, float 1.000000e+00
+; OPT-NEXT: [[TMP4:%.*]] = fcmp nnan ninf olt float [[TMP2]], f0x00800000
+; OPT-NEXT: [[TMP11:%.*]] = select nnan ninf i1 [[TMP4]], float f0x4F800000, float [[TMP3]]
+; OPT-NEXT: [[TMP5:%.*]] = fmul nnan ninf float [[X]], [[TMP11]]
+; OPT-NEXT: [[TMP6:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP5]])
+; OPT-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float -1.000000e+00)
+; OPT-NEXT: [[TMP8:%.*]] = fneg nnan ninf float [[TMP7]]
+; OPT-NEXT: [[TMP9:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP6]], float [[TMP8]], float [[TMP6]])
+; OPT-NEXT: [[TMP14:%.*]] = fmul nnan ninf float [[TMP9]], [[TMP11]]
+; OPT-NEXT: [[TMP12:%.*]] = call nnan ninf float @llvm.amdgcn.div.fixup.f32(float [[TMP14]], float [[X]], float 1.000000e+00)
+; OPT-NEXT: store float [[TMP12]], ptr addrspace(1) [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_f32_fmf(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT: [[TMP2:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[X]])
+; OPT-FTZ-NEXT: [[TMP3:%.*]] = call nnan ninf float @llvm.fma.f32(float [[X]], float [[TMP2]], float -1.000000e+00)
+; OPT-FTZ-NEXT: [[TMP4:%.*]] = fneg nnan ninf float [[TMP3]]
+; OPT-FTZ-NEXT: [[TMP5:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP2]], float [[TMP4]], float [[TMP2]])
+; OPT-FTZ-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.amdgcn.div.fixup.f32(float [[TMP5]], float [[X]], float 1.000000e+00)
+; OPT-FTZ-NEXT: store float [[TMP7]], ptr addrspace(1) [[OUT]], align 4
+; OPT-FTZ-NEXT: ret void
+;
+  %fdiv = fdiv nnan ninf float 1.0, %x
+  store float %fdiv, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Verify that fast-math flags are preserved through vector scalarization.
+; The <2 x float> reciprocal is expected to be split with extractelement into
+; two scalar lanes, each lane expanded with the same per-prefix sequence as
+; the scalar fdiv (scale selection against 2^126 / 2^-126, rcp, one fma-based
+; refinement step, rescale, div.fixup under OPT; no scaling under OPT-FTZ),
+; and the two results reassembled with insertelement starting from poison.
+; Every scalar instruction in both lanes must carry `nnan ninf`.
+define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(ptr addrspace(1) %out, <2 x float> %x) #0 {
+; OPT-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(
+; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-NEXT: [[TMP4:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[TMP1]])
+; OPT-NEXT: [[TMP12:%.*]] = fcmp nnan ninf ogt float [[TMP4]], f0x7E800000
+; OPT-NEXT: [[TMP5:%.*]] = select nnan ninf i1 [[TMP12]], float f0x2F800000, float 1.000000e+00
+; OPT-NEXT: [[TMP6:%.*]] = fcmp nnan ninf olt float [[TMP4]], f0x00800000
+; OPT-NEXT: [[TMP13:%.*]] = select nnan ninf i1 [[TMP6]], float f0x4F800000, float [[TMP5]]
+; OPT-NEXT: [[TMP7:%.*]] = fmul nnan ninf float [[TMP1]], [[TMP13]]
+; OPT-NEXT: [[TMP8:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP7]])
+; OPT-NEXT: [[TMP9:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float -1.000000e+00)
+; OPT-NEXT: [[TMP10:%.*]] = fneg nnan ninf float [[TMP9]]
+; OPT-NEXT: [[TMP11:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP8]], float [[TMP10]], float [[TMP8]])
+; OPT-NEXT: [[TMP24:%.*]] = fmul nnan ninf float [[TMP11]], [[TMP13]]
+; OPT-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.amdgcn.div.fixup.f32(float [[TMP24]], float [[TMP1]], float 1.000000e+00)
+; OPT-NEXT: [[TMP16:%.*]] = call nnan ninf float @llvm.fabs.f32(float [[TMP2]])
+; OPT-NEXT: [[TMP17:%.*]] = fcmp nnan ninf ogt float [[TMP16]], f0x7E800000
+; OPT-NEXT: [[TMP18:%.*]] = select nnan ninf i1 [[TMP17]], float f0x2F800000, float 1.000000e+00
+; OPT-NEXT: [[TMP25:%.*]] = fcmp nnan ninf olt float [[TMP16]], f0x00800000
+; OPT-NEXT: [[TMP31:%.*]] = select nnan ninf i1 [[TMP25]], float f0x4F800000, float [[TMP18]]
+; OPT-NEXT: [[TMP19:%.*]] = fmul nnan ninf float [[TMP2]], [[TMP31]]
+; OPT-NEXT: [[TMP20:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP19]])
+; OPT-NEXT: [[TMP21:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP19]], float [[TMP20]], float -1.000000e+00)
+; OPT-NEXT: [[TMP22:%.*]] = fneg nnan ninf float [[TMP21]]
+; OPT-NEXT: [[TMP23:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP20]], float [[TMP22]], float [[TMP20]])
+; OPT-NEXT: [[TMP32:%.*]] = fmul nnan ninf float [[TMP23]], [[TMP31]]
+; OPT-NEXT: [[TMP26:%.*]] = call nnan ninf float @llvm.amdgcn.div.fixup.f32(float [[TMP32]], float [[TMP2]], float 1.000000e+00)
+; OPT-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0
+; OPT-NEXT: [[TMP28:%.*]] = insertelement <2 x float> [[TMP27]], float [[TMP26]], i64 1
+; OPT-NEXT: store <2 x float> [[TMP28]], ptr addrspace(1) [[OUT]], align 8
+; OPT-NEXT: ret void
+;
+; OPT-FTZ-LABEL: define amdgpu_kernel void @test_fdiv_recip_v2f32_fmf(
+; OPT-FTZ-SAME: ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR0]] {
+; OPT-FTZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X]], i64 0
+; OPT-FTZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[X]], i64 1
+; OPT-FTZ-NEXT: [[TMP4:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP1]])
+; OPT-FTZ-NEXT: [[TMP5:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP1]], float [[TMP4]], float -1.000000e+00)
+; OPT-FTZ-NEXT: [[TMP6:%.*]] = fneg nnan ninf float [[TMP5]]
+; OPT-FTZ-NEXT: [[TMP7:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP4]], float [[TMP6]], float [[TMP4]])
+; OPT-FTZ-NEXT: [[TMP9:%.*]] = call nnan ninf float @llvm.amdgcn.div.fixup.f32(float [[TMP7]], float [[TMP1]], float 1.000000e+00)
+; OPT-FTZ-NEXT: [[TMP11:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]])
+; OPT-FTZ-NEXT: [[TMP12:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP2]], float [[TMP11]], float -1.000000e+00)
+; OPT-FTZ-NEXT: [[TMP13:%.*]] = fneg nnan ninf float [[TMP12]]
+; OPT-FTZ-NEXT: [[TMP14:%.*]] = call nnan ninf float @llvm.fma.f32(float [[TMP11]], float [[TMP13]], float [[TMP11]])
+; OPT-FTZ-NEXT: [[TMP16:%.*]] = call nnan ninf float @llvm.amdgcn.div.fixup.f32(float [[TMP14]], float [[TMP2]], float 1.000000e+00)
+; OPT-FTZ-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP9]], i64 0
+; OPT-FTZ-NEXT: [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1
+; OPT-FTZ-NEXT: store <2 x float> [[TMP18]], ptr addrspace(1) [[OUT]], align 8
+; OPT-FTZ-NEXT: ret void
+;
+  %fdiv = fdiv nnan ninf <2 x float> splat (float 1.0), %x
+  store <2 x float> %fdiv, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
