https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/164847
>From ddda6473ab7ae8485a906a749eebad0853b857ca Mon Sep 17 00:00:00 2001
From: Patrick Simmons <[email protected]>
Date: Thu, 23 Oct 2025 11:50:32 -0500
Subject: [PATCH 1/6] Initial work

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |  3 ++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td     |  8 ++++
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp   | 43 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td    |  8 +++-
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8428fa97fe445..f17156f8a24ab 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 
+BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+
 TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9e334d4316336..50b43a1c927ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem]>;
 
+def int_amdgcn_bcnt032_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_amdgcn_bcnt064_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
 // llvm.amdgcn.ds.swizzle src offset
 def int_amdgcn_ds_swizzle :
   ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..39b558694edf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -35,6 +36,7 @@
 #include "llvm/Support/KnownFPClass.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <cstdint>
 
 #define DEBUG_TYPE "amdgpu-codegenprepare"
 
@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool>
+    DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
+                 cl::desc("Prevent transforming bitsin(typeof(x)) - "
+                          "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
+                 cl::ReallyHidden, cl::init(false));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl
 
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
   bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitCtpop(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
   bool run();
@@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::ctpop:
+    return visitCtpop(I);
   default:
     return false;
   }
@@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
   return insertValues(Builder, FractArg->getType(), ResultVals);
 }
 
+bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
+  uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
+  if (!I.hasOneUse() ||
+      !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+    return false;
+
+  BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
+  if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+    return false;
+
+  ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
+  if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+    return false;
+
+  IRBuilder<> Builder(MustBeSub);
+  Instruction *TransformedIns =
+      Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
+                                            : Intrinsic::amdgcn_bcnt032_lo,
+                              {}, {I.getArgOperand(0)});
+
+  if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
+      (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
+    TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+        TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+
+  MustBeSub->replaceAllUsesWith(TransformedIns);
+  TransformedIns->takeName(MustBeSub);
+  MustBeSub->eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
   Value *FractArg = matchFractPat(I);
   if (!FractArg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..29104d33a8aa8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
 } // End isReMaterializable = 1, isAsCheapAsAMove = 1
 
 let Defs = [SCC] in {
-def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+>;
 def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
   [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
 >;

>From 249ee64fd6ec23cb65433a5dc56145f3effa158d Mon Sep 17 00:00:00 2001
From: Patrick Simmons <[email protected]>
Date: Thu, 23 Oct 2025 14:20:42 -0500
Subject: [PATCH 2/6] Update testcases

---
 llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 38 +++++++++++++----------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..db030d2b19d90 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
 define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
 ; CHECK-LABEL: bcnt032:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT:    s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT:    s_sub_i32 s0, 32, s0
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s0
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; CHECK-NEXT:    s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
   %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %result2 = sub i32 32, %result
   call void asm "; use $0", "s"(i32 %result2)
@@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
 define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
 ; CHECK-LABEL: bcnt064:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT:    s_sub_u32 s0, 64, s0
-; CHECK-NEXT:    s_subb_u32 s1, 0, 0
-; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:1]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
-; CHECK-NEXT:    ; return to shader part epilog
+; CHECK-NEXT:    s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:1]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
   %result2 = sub i64 64, %result
   call void asm "; use $0", "s"(i64 %result2)

>From 5bd7c7b2045c7669d8d326d8bc3ca4216dda6597 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <[email protected]>
Date: Thu, 23 Oct 2025 17:31:31 -0500
Subject: [PATCH 3/6] Don't perform optimization on vector types

---
 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 39b558694edf8..8f13fa79d3637 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1991,7 +1991,7 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
 
 bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
   uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
-  if (!I.hasOneUse() ||
+  if (!I.hasOneUse() || !I.getType()->isIntegerTy() ||
      !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
    return false;
 

>From 1030ef31f96040975f02191af0a5a57374c5e0e9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <[email protected]>
Date: Thu, 23 Oct 2025 18:29:17 -0500
Subject: [PATCH 4/6] Review changes

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def    | 4 ++--
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td        | 8 ++++----
 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 6 +++---
 llvm/lib/Target/AMDGPU/SOPInstructions.td       | 4 ++--
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f17156f8a24ab..f18d1f8df0b71 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,8 +63,8 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 
-BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
 
 TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 50b43a1c927ce..476f0bcb42b31 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,12 +2359,12 @@ def int_amdgcn_mbcnt_hi :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem]>;
 
-def int_amdgcn_bcnt032_lo :
-  ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+def int_amdgcn_bcnt32_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 
-def int_amdgcn_bcnt064_lo :
-  ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+def int_amdgcn_bcnt64_lo :
+  ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
 
 // llvm.amdgcn.ds.swizzle src offset
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8f13fa79d3637..169541d9d45f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -95,7 +95,7 @@ static cl::opt<bool> DisableFDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
-// Disable processing of fdiv so we can better test the backend implementations.
+// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation.
 static cl::opt<bool>
     DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
                  cl::desc("Prevent transforming bitsin(typeof(x)) - "
@@ -2005,8 +2005,8 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
 
   IRBuilder<> Builder(MustBeSub);
   Instruction *TransformedIns =
-      Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
-                                            : Intrinsic::amdgcn_bcnt032_lo,
+      Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo
+                                            : Intrinsic::amdgcn_bcnt32_lo,
                               {}, {I.getArgOperand(0)});
 
   if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 29104d33a8aa8..00d5cab2de479 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,10 +265,10 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
 
 let Defs = [SCC] in {
 def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
-  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt32_lo> i32:$src0))]
 >;
 def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
-  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt64_lo> i64:$src0))]
 >;
 def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
   [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]

>From 165f82de021625f430571cdeb6894fb3acf42cba Mon Sep 17 00:00:00 2001
From: Patrick Simmons <[email protected]>
Date: Fri, 24 Oct 2025 14:12:23 -0500
Subject: [PATCH 5/6] Review changes:

- Add tests
- Remove builtin (users will need inline assembly if pattern match fails)
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |   3 -
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td     |   2 -
 llvm/test/CodeGen/AMDGPU/s_cmp_0.ll          | 109 +++++++++++++++++++
 3 files changed, 109 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f18d1f8df0b71..8428fa97fe445 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,9 +63,6 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
 
-BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
-
 TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 476f0bcb42b31..ca4abe29dd96a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2360,11 +2360,9 @@ def int_amdgcn_mbcnt_hi :
   [IntrNoMem]>;
 
 def int_amdgcn_bcnt32_lo :
-  ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 
 def int_amdgcn_bcnt64_lo :
-  ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
 
 // llvm.amdgcn.ds.swizzle src offset
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index db030d2b19d90..a9516057be1ef 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -621,3 +621,112 @@ if:
 endif:
   ret i32 1
 }
+
+define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt032_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT:    global_load_dword v2, v[2:3], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT:    v_sub_u32_e32 v3, 32, v2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v3
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %val0 = load volatile i32, ptr addrspace(1) %gep
+  %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+  %result2 = sub i32 32, %result
+  call void asm "; use $0", "s"(i32 %result2)
+  %cmp = icmp ne i32 %result2, 0
+  %zext = zext i1 %cmp to i32
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt064_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v3, v3, v2
+; CHECK-NEXT:    v_sub_co_u32_e32 v5, vcc, 64, v3
+; CHECK-NEXT:    v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[5:6]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[3:4], off
+; CHECK-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %val0 = load volatile i64, ptr addrspace(1) %gep
+  %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+  %result2 = sub i64 64, %result
+  call void asm "; use $0", "s"(i64 %result2)
+  %cmp = icmp ne i64 %result2, 0
+  %zext = zext i1 %cmp to i32
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
+; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT:    s_sub_i32 s1, 32, s0
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
+  %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+  %result2 = sub i32 32, %result
+  call void asm "; use $0", "s"(i32 %result)
+  call void asm "; use $0", "s"(i32 %result2)
+  %cmp = icmp ne i32 %result2, 0
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
+; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT:    s_sub_u32 s2, 64, s0
+; CHECK-NEXT:    s_subb_u32 s3, 0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:1]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[2:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ; return to shader part epilog
+  %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+  %result2 = sub i64 64, %result
+  call void asm "; use $0", "s"(i64 %result)
+  call void asm "; use $0", "s"(i64 %result2)
+  %cmp = icmp ne i64 %result2, 0
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
\ No newline at end of file

>From 168a5e33042afbc49c7d7063248ea32e49c7e3b5 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <[email protected]>
Date: Mon, 27 Oct 2025 14:18:45 -0500
Subject: [PATCH 6/6] Reviewer-suggested refactoring

---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 169541d9d45f6..94dcba7aab3e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/KnownFPClass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cstdint>
@@ -1990,17 +1991,16 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
 }
 
 bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
-  uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
-  if (!I.hasOneUse() || !I.getType()->isIntegerTy() ||
-      !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+  uint32_t BitWidth, DestinationWidth;
+  if (!I.hasOneUse() || !I.getType()->isIntegerTy())
     return false;
 
-  BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
-  if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+  BitWidth = I.getType()->getIntegerBitWidth();
+  if(!ST.hasBCNT(BitWidth))
     return false;
 
-  ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
-  if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+  Instruction *MustBeSub = I.user_back();
+  if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I))))
     return false;
 
   IRBuilder<> Builder(MustBeSub);
@@ -2009,14 +2009,12 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
                                             : Intrinsic::amdgcn_bcnt32_lo,
                               {}, {I.getArgOperand(0)});
 
-  if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
-      (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
-    TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
-        TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+  DestinationWidth = MustBeSub->getType()->getIntegerBitWidth();
+  TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+      TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
 
-  MustBeSub->replaceAllUsesWith(TransformedIns);
-  TransformedIns->takeName(MustBeSub);
-  MustBeSub->eraseFromParent();
+  BasicBlock::iterator SubIt = MustBeSub->getIterator();
+  ReplaceInstWithValue(SubIt,TransformedIns);
   return true;
 }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
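
A minimal IR-level sketch of the pattern this series teaches AMDGPUCodeGenPrepare
to rewrite (function names below are illustrative and not taken from the patch;
the intrinsic and instruction names are the ones the series adds): a single-use
llvm.ctpop whose only user subtracts it from the type's bit width is replaced,
when ST.hasBCNT(BitWidth) holds, with llvm.amdgcn.bcnt32.lo / llvm.amdgcn.bcnt64.lo,
which the new SOPInstructions.td patterns select to s_bcnt0_i32_b32 /
s_bcnt0_i32_b64 for uniform operands.

; Assumes an S_BCNT0-capable subtarget; names are illustrative.
declare i32 @llvm.ctpop.i32(i32)
declare i32 @llvm.amdgcn.bcnt32.lo(i32)

; Before AMDGPUCodeGenPrepare: "bitsin(typeof(x)) - popcount(x)".
define amdgpu_ps i32 @count_zero_bits_before(i32 inreg %x) {
  %ones = call i32 @llvm.ctpop.i32(i32 %x)  ; single use
  %zeros = sub i32 32, %ones                ; 32 == bit width of i32
  ret i32 %zeros
}

; After the pass (conceptually): the ctpop/sub pair collapses into the new
; intrinsic, which the SOP1 pattern selects to s_bcnt0_i32_b32 when the
; operand is uniform.
define amdgpu_ps i32 @count_zero_bits_after(i32 inreg %x) {
  %zeros = call i32 @llvm.amdgcn.bcnt32.lo(i32 %x)
  ret i32 %zeros
}

In the emitted ISA this drops the separate s_sub after s_bcnt1, which is what
the updated s_cmp_0.ll checks demonstrate.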
