llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> We were checking for afn or !fpmath attached to the sqrt. We are not trying to replace a correctly rounded rsqrt; we're replacing the two correctly rounded operations with the contracted operation. The net result is better precision, so the contract flag on both instructions should be sufficient. Both the contracted and uncontracted sequences pass the OpenCL conformance test, with the contracted sequence having a lower maximum error. --- Patch is 123.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172082.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+6-25) - (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll (+52-45) - (modified) llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll (+507-1532) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index e45d0652a65ef..01acb60a68629 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -210,8 +210,7 @@ class AMDGPUCodeGenPrepareImpl Value *matchFractPat(IntrinsicInst &I); Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); - bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, - FastMathFlags SqrtFMF) const; + bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const; Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, FastMathFlags SqrtFMF, @@ -696,29 +695,11 @@ Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X, return Builder.CreateFMA(Y0E, EFMA, Y0); } -bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, - FastMathFlags DivFMF, +bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const { - // The rsqrt contraction 
increases accuracy from ~2ulp to ~1ulp. - if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) - return false; - - Type *EltTy = SqrtOp->getType()->getScalarType(); - switch (EltTy->getTypeID()) { - case Type::FloatTyID: - // v_rsq_f32 gives 1ulp - // Separate correctly rounded fdiv + sqrt give ~1.81 ulp. - - // FIXME: rsq formation should not depend on approx func or the fpmath - // accuracy. This strictly improves precision. - return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; - case Type::DoubleTyID: - return true; - default: - return false; - } - - llvm_unreachable("covered switch"); + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and + // f64. + return DivFMF.allowContract() && SqrtFMF.allowContract(); } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -927,7 +908,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { DenII->hasOneUse()) { const auto *SqrtOp = cast<FPMathOperator>(DenII); SqrtFMF = SqrtOp->getFastMathFlags(); - if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) + if (canOptimizeWithRsq(DivFMF, SqrtFMF)) RsqOp = SqrtOp->getOperand(0); } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index 7ff86ac152feb..cc0d279fe4ec8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -1563,13 +1563,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) { ; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00 ; IEEE-GOODFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]] ; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]] -; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]]) -; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]] -; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]]) -; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]] ; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) ; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]]) @@ -1644,13 +1643,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) { ; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00 ; IEEE-BADFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]] ; IEEE-BADFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]] -; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
[[SQRT_X_HALF_ULP]]) -; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 -; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_HALF_ULP]]) -; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]] -; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]]) -; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]] ; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) ; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]]) @@ -1701,8 +1699,7 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) { ; DAZ-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fneg contract float [[TMP1]] ; DAZ-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]] -; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_HALF_ULP]]) +; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float 
@llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD1]]) @@ -3490,19 +3487,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl ; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]] -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = 
extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]] +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]]) +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]] ; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) ; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 ; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 @@ -3536,19 +3536,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl ; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: 
[[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]] -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]] -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]]) -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 
+; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]] +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]]) +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]] ; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) ; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 ; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) @@ -3582,9 +3585,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl ; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) -; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP2]] -; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP31]]) +; DAZ-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP34]] ; DAZ-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) ; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 ; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { 
float, i32 } [[TMP8]], 1 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll index 7f822c135ffb4..d9fdfb38ef344 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll @@ -12,123 +12,48 @@ declare float @llvm.sqrt.f32(float) nounwind readnone declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { -; GCN-DAZ-SAFE-LABEL: rsq_f32: -; GCN-DAZ-SAFE: ; %bb.0: -; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7 -; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3 -; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000 -; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0 -; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1 -; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0 -; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3 -; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-DAZ-SAFE-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 -; GCN-DAZ-SAFE-NEXT: s_endpgm -; -; SI-IEEE-SAFE-LABEL: rsq_f32: -; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 -; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 -; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 -; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 -; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2 -; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0 -; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0 -; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2 -; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1 -; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 -; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; 
SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-IEEE-SAFE-NEXT: s_endpgm +; GCN-DAZ-LABEL: rsq_f32: +; GCN-DAZ: ; %bb.0: +; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000 +; GCN-DAZ-NEXT: s_mov_b32 s6, -1 +; GCN-DAZ-NEXT: s_mov_b32 s10, s6 +; GCN-DAZ-NEXT: s_mov_b32 s11, s7 +; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DAZ-NEXT: s_mov_b32 s8, s2 +; GCN-DAZ-NEXT: s_mov_b32 s9, s3 +; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/172082 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
