https://github.com/yoonseoch updated https://github.com/llvm/llvm-project/pull/177432
>From a7b28cf2424e788a3416ba7b411241fc4387d7d0 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi <[email protected]> Date: Thu, 22 Jan 2026 12:48:23 -0600 Subject: [PATCH 1/2] [AMDGPU] Move AMDGPUAttributor earlier with lowering kernel attributes --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 - llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 362 +++++++++++++- .../AMDGPU/AMDGPULowerKernelAttributes.cpp | 443 ------------------ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 - .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 29 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - ...amdgpu-max-num-workgroups-load-annotate.ll | 10 +- .../AMDGPU/implicit-arg-block-count.ll | 37 +- .../CodeGen/AMDGPU/implicit-arg-v5-opt.ll | 2 +- .../CodeGen/AMDGPU/reqd-work-group-size.ll | 4 +- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 2 +- .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 - 12 files changed, 402 insertions(+), 500 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 5df11a45b4889..de76dd6ab3bb5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -123,15 +123,6 @@ struct AMDGPUPromoteKernelArgumentsPass PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; -ModulePass *createAMDGPULowerKernelAttributesPass(); -void initializeAMDGPULowerKernelAttributesPass(PassRegistry &); -extern char &AMDGPULowerKernelAttributesID; - -struct AMDGPULowerKernelAttributesPass - : PassInfoMixin<AMDGPULowerKernelAttributesPass> { - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); -}; - void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &); extern char &AMDGPULowerModuleLDSLegacyPassID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 0b2ee6371da06..1f4229a2b15a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,8 +13,14 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/Attributor.h" @@ -50,6 +56,343 @@ static constexpr std::pair<ImplicitArgumentMask, StringLiteral> #include "AMDGPUAttributes.def" }; +// Field offsets in hsa_kernel_dispatch_packet_t. +enum DispatchPackedOffsets { + WORKGROUP_SIZE_X = 4, + WORKGROUP_SIZE_Y = 6, + WORKGROUP_SIZE_Z = 8, + + GRID_SIZE_X = 12, + GRID_SIZE_Y = 16, + GRID_SIZE_Z = 20 +}; + +// Field offsets to implicit kernel argument pointer. +enum ImplicitArgOffsets { + HIDDEN_BLOCK_COUNT_X = 0, + HIDDEN_BLOCK_COUNT_Y = 4, + HIDDEN_BLOCK_COUNT_Z = 8, + + HIDDEN_GROUP_SIZE_X = 12, + HIDDEN_GROUP_SIZE_Y = 14, + HIDDEN_GROUP_SIZE_Z = 16, + + HIDDEN_REMAINDER_X = 18, + HIDDEN_REMAINDER_Y = 20, + HIDDEN_REMAINDER_Z = 22, +}; + +static Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { + auto IntrinsicId = IsV5OrAbove ? 
Intrinsic::amdgcn_implicitarg_ptr + : Intrinsic::amdgcn_dispatch_ptr; + return Intrinsic::getDeclarationIfExists(&M, IntrinsicId); +} + +static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, + uint32_t MaxNumGroups) { + if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max()) + return; + + if (!Load->getType()->isIntegerTy(32)) + return; + + // TODO: If there is existing range metadata, preserve it if it is stricter. + MDBuilder MDB(Load->getContext()); + MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1)); + Load->setMetadata(LLVMContext::MD_range, Range); +} + +static bool processUse(CallInst *CI, bool IsV5OrAbove) { + Function *F = CI->getFunction(); + + auto *MD = F->getMetadata("reqd_work_group_size"); + const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; + + const bool HasUniformWorkGroupSize = + F->getFnAttribute("uniform-work-group-size").getValueAsBool(); + + SmallVector<unsigned> MaxNumWorkgroups = + AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", + /*Size=*/3, /*DefaultVal=*/0); + + if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && + !Intrinsic::getDeclarationIfExists(CI->getModule(), + Intrinsic::amdgcn_dispatch_ptr) && + none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) + return false; + + Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; + Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; + Value *Remainders[3] = {nullptr, nullptr, nullptr}; + Value *GridSizes[3] = {nullptr, nullptr, nullptr}; + + const DataLayout &DL = F->getDataLayout(); + + // We expect to see several GEP users, casted to the appropriate type and + // loaded. + for (User *U : CI->users()) { + if (!U->hasOneUse()) + continue; + + int64_t Offset = 0; + auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr? + auto *BCI = dyn_cast<BitCastInst>(U); + if (!Load && !BCI) { + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP? + BCI = dyn_cast<BitCastInst>(*U->user_begin()); + } + + if (BCI) { + if (!BCI->hasOneUse()) + continue; + Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI? + } + + if (!Load || !Load->isSimple()) + continue; + + unsigned LoadSize = DL.getTypeStoreSize(Load->getType()); + + // TODO: Handle merged loads. + if (IsV5OrAbove) { // Base is ImplicitArgPtr. + switch (Offset) { + case HIDDEN_BLOCK_COUNT_X: + if (LoadSize == 4) { + BlockCounts[0] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]); + } + break; + case HIDDEN_BLOCK_COUNT_Y: + if (LoadSize == 4) { + BlockCounts[1] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]); + } + break; + case HIDDEN_BLOCK_COUNT_Z: + if (LoadSize == 4) { + BlockCounts[2] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]); + } + break; + case HIDDEN_GROUP_SIZE_X: + if (LoadSize == 2) + GroupSizes[0] = Load; + break; + case HIDDEN_GROUP_SIZE_Y: + if (LoadSize == 2) + GroupSizes[1] = Load; + break; + case HIDDEN_GROUP_SIZE_Z: + if (LoadSize == 2) + GroupSizes[2] = Load; + break; + case HIDDEN_REMAINDER_X: + if (LoadSize == 2) + Remainders[0] = Load; + break; + case HIDDEN_REMAINDER_Y: + if (LoadSize == 2) + Remainders[1] = Load; + break; + case HIDDEN_REMAINDER_Z: + if (LoadSize == 2) + Remainders[2] = Load; + break; + default: + break; + } + } else { // Base is DispatchPtr. 
+ switch (Offset) { + case WORKGROUP_SIZE_X: + if (LoadSize == 2) + GroupSizes[0] = Load; + break; + case WORKGROUP_SIZE_Y: + if (LoadSize == 2) + GroupSizes[1] = Load; + break; + case WORKGROUP_SIZE_Z: + if (LoadSize == 2) + GroupSizes[2] = Load; + break; + case GRID_SIZE_X: + if (LoadSize == 4) + GridSizes[0] = Load; + break; + case GRID_SIZE_Y: + if (LoadSize == 4) + GridSizes[1] = Load; + break; + case GRID_SIZE_Z: + if (LoadSize == 4) + GridSizes[2] = Load; + break; + default: + break; + } + } + } + + bool MadeChange = false; + if (IsV5OrAbove && HasUniformWorkGroupSize) { + // Under v5 __ockl_get_local_size returns the value computed by the + // expression: + // + // workgroup_id < hidden_block_count ? hidden_group_size : + // hidden_remainder + // + // For functions with the attribute uniform-work-group-size=true. we can + // evaluate workgroup_id < hidden_block_count as true, and thus + // hidden_group_size is returned for __ockl_get_local_size. + for (int I = 0; I < 3; ++I) { + Value *BlockCount = BlockCounts[I]; + if (!BlockCount) + continue; + + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() + : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() + : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + + for (User *ICmp : BlockCount->users()) { + if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin, + m_Specific(BlockCount)))) { + ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType())); + MadeChange = true; + } + } + } + + // All remainders should be 0 with uniform work group size. + for (Value *Remainder : Remainders) { + if (!Remainder) + continue; + Remainder->replaceAllUsesWith( + Constant::getNullValue(Remainder->getType())); + MadeChange = true; + } + } else if (HasUniformWorkGroupSize) { // Pre-V5. + // Pattern match the code used to handle partial workgroup dispatches in the + // library implementation of get_local_size, so the entire function can be + // constant folded with a known group size. + // + // uint r = grid_size - group_id * group_size; + // get_local_size = (r < group_size) ? r : group_size; + // + // If we have uniform-work-group-size (which is the default in OpenCL 1.2), + // the grid_size is required to be a multiple of group_size). In this case: + // + // grid_size - (group_id * group_size) < group_size + // -> + // grid_size < group_size + (group_id * group_size) + // + // (grid_size / group_size) < 1 + group_id + // + // grid_size / group_size is at least 1, so we can conclude the select + // condition is false (except for group_id == 0, where the select result is + // the same). + for (int I = 0; I < 3; ++I) { + Value *GroupSize = GroupSizes[I]; + Value *GridSize = GridSizes[I]; + if (!GroupSize || !GridSize) + continue; + + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() + : (I == 1 ? 
m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() + : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + + for (User *U : GroupSize->users()) { + auto *ZextGroupSize = dyn_cast<ZExtInst>(U); + if (!ZextGroupSize) + continue; + + for (User *UMin : ZextGroupSize->users()) { + if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, + m_Specific(ZextGroupSize))), + m_Specific(ZextGroupSize)))) { + if (HasReqdWorkGroupSize) { + ConstantInt *KnownSize = + mdconst::extract<ConstantInt>(MD->getOperand(I)); + UMin->replaceAllUsesWith(ConstantFoldIntegerCast( + KnownSize, UMin->getType(), false, DL)); + } else { + UMin->replaceAllUsesWith(ZextGroupSize); + } + + MadeChange = true; + } + } + } + } + } + + // Upgrade the old method of calculating the block size using the grid size. + // We pattern match any case where the implicit argument group size is the + // divisor to a dispatch packet grid size read of the same dimension. + if (IsV5OrAbove) { + for (int I = 0; I < 3; I++) { + Value *GroupSize = GroupSizes[I]; + if (!GroupSize || !GroupSize->getType()->isIntegerTy(16)) + continue; + + for (User *U : GroupSize->users()) { + Instruction *Inst = cast<Instruction>(U); + if (isa<ZExtInst>(Inst) && !Inst->use_empty()) + Inst = cast<Instruction>(*Inst->user_begin()); + + using namespace llvm::PatternMatch; + if (!match( + Inst, + m_UDiv(m_ZExtOrSelf(m_Load(m_GEP( + m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(), + m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))), + m_Value()))) + continue; + + IRBuilder<> Builder(Inst); + + Value *GEP = Builder.CreateInBoundsGEP( + Builder.getInt8Ty(), CI, + {ConstantInt::get(Type::getInt64Ty(CI->getContext()), + HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))}); + Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP); + BlockCount->setMetadata(LLVMContext::MD_invariant_load, + MDNode::get(CI->getContext(), {})); + BlockCount->setMetadata(LLVMContext::MD_noundef, + MDNode::get(CI->getContext(), {})); + + Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType()); + Inst->replaceAllUsesWith(BlockCountExt); + Inst->eraseFromParent(); + MadeChange = true; + } + } + } + + // If reqd_work_group_size is set, we can replace work group size with it. + if (!HasReqdWorkGroupSize) + return MadeChange; + + for (int I = 0; I < 3; I++) { + Value *GroupSize = GroupSizes[I]; + if (!GroupSize) + continue; + + ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); + GroupSize->replaceAllUsesWith( + ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL)); + MadeChange = true; + } + + return MadeChange; +} + // We do not need to note the x workitem or workgroup id because they are always // initialized. 
// @@ -1660,7 +2003,24 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, } } - return A.run() == ChangeStatus::CHANGED; + bool Changed = A.run() == ChangeStatus::CHANGED; + + // Kernel attribute lowering (merged from AMDGPULowerKernelAttributesPass) + bool IsV5OrAbove = + AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; + Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove); + if (BasePtr) { + for (auto *F : Functions) { + for (Instruction &I : instructions(*F)) { + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + if (CI->getCalledFunction() == BasePtr) + Changed |= processUse(CI, IsV5OrAbove); + } + } + } + } + + return Changed; } } // namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp deleted file mode 100644 index fbfb71059b6b1..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ /dev/null @@ -1,443 +0,0 @@ -//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file This pass does attempts to make use of reqd_work_group_size metadata -/// to eliminate loads from the dispatch packet and to constant fold OpenCL -/// get_local_size-like functions. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/MDBuilder.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/Pass.h" - -#define DEBUG_TYPE "amdgpu-lower-kernel-attributes" - -using namespace llvm; - -namespace { - -// Field offsets in hsa_kernel_dispatch_packet_t. -enum DispatchPackedOffsets { - WORKGROUP_SIZE_X = 4, - WORKGROUP_SIZE_Y = 6, - WORKGROUP_SIZE_Z = 8, - - GRID_SIZE_X = 12, - GRID_SIZE_Y = 16, - GRID_SIZE_Z = 20 -}; - -// Field offsets to implicit kernel argument pointer. -enum ImplicitArgOffsets { - HIDDEN_BLOCK_COUNT_X = 0, - HIDDEN_BLOCK_COUNT_Y = 4, - HIDDEN_BLOCK_COUNT_Z = 8, - - HIDDEN_GROUP_SIZE_X = 12, - HIDDEN_GROUP_SIZE_Y = 14, - HIDDEN_GROUP_SIZE_Z = 16, - - HIDDEN_REMAINDER_X = 18, - HIDDEN_REMAINDER_Y = 20, - HIDDEN_REMAINDER_Z = 22, -}; - -class AMDGPULowerKernelAttributes : public ModulePass { -public: - static char ID; - - AMDGPULowerKernelAttributes() : ModulePass(ID) {} - - bool runOnModule(Module &M) override; - - StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - } -}; - -Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { - auto IntrinsicId = IsV5OrAbove ? 
Intrinsic::amdgcn_implicitarg_ptr - : Intrinsic::amdgcn_dispatch_ptr; - return Intrinsic::getDeclarationIfExists(&M, IntrinsicId); -} - -} // end anonymous namespace - -static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, - uint32_t MaxNumGroups) { - if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max()) - return; - - if (!Load->getType()->isIntegerTy(32)) - return; - - // TODO: If there is existing range metadata, preserve it if it is stricter. - MDBuilder MDB(Load->getContext()); - MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1)); - Load->setMetadata(LLVMContext::MD_range, Range); -} - -static bool processUse(CallInst *CI, bool IsV5OrAbove) { - Function *F = CI->getFunction(); - - auto *MD = F->getMetadata("reqd_work_group_size"); - const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; - - const bool HasUniformWorkGroupSize = - F->getFnAttribute("uniform-work-group-size").getValueAsBool(); - - SmallVector<unsigned> MaxNumWorkgroups = - AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", - /*Size=*/3, /*DefaultVal=*/0); - - if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && - !Intrinsic::getDeclarationIfExists(CI->getModule(), - Intrinsic::amdgcn_dispatch_ptr) && - none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) - return false; - - Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; - Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; - Value *Remainders[3] = {nullptr, nullptr, nullptr}; - Value *GridSizes[3] = {nullptr, nullptr, nullptr}; - - const DataLayout &DL = F->getDataLayout(); - - // We expect to see several GEP users, casted to the appropriate type and - // loaded. - for (User *U : CI->users()) { - if (!U->hasOneUse()) - continue; - - int64_t Offset = 0; - auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr? - auto *BCI = dyn_cast<BitCastInst>(U); - if (!Load && !BCI) { - if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) - continue; - Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP? - BCI = dyn_cast<BitCastInst>(*U->user_begin()); - } - - if (BCI) { - if (!BCI->hasOneUse()) - continue; - Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI? - } - - if (!Load || !Load->isSimple()) - continue; - - unsigned LoadSize = DL.getTypeStoreSize(Load->getType()); - - // TODO: Handle merged loads. - if (IsV5OrAbove) { // Base is ImplicitArgPtr. - switch (Offset) { - case HIDDEN_BLOCK_COUNT_X: - if (LoadSize == 4) { - BlockCounts[0] = Load; - annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]); - } - break; - case HIDDEN_BLOCK_COUNT_Y: - if (LoadSize == 4) { - BlockCounts[1] = Load; - annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]); - } - break; - case HIDDEN_BLOCK_COUNT_Z: - if (LoadSize == 4) { - BlockCounts[2] = Load; - annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]); - } - break; - case HIDDEN_GROUP_SIZE_X: - if (LoadSize == 2) - GroupSizes[0] = Load; - break; - case HIDDEN_GROUP_SIZE_Y: - if (LoadSize == 2) - GroupSizes[1] = Load; - break; - case HIDDEN_GROUP_SIZE_Z: - if (LoadSize == 2) - GroupSizes[2] = Load; - break; - case HIDDEN_REMAINDER_X: - if (LoadSize == 2) - Remainders[0] = Load; - break; - case HIDDEN_REMAINDER_Y: - if (LoadSize == 2) - Remainders[1] = Load; - break; - case HIDDEN_REMAINDER_Z: - if (LoadSize == 2) - Remainders[2] = Load; - break; - default: - break; - } - } else { // Base is DispatchPtr. 
- switch (Offset) { - case WORKGROUP_SIZE_X: - if (LoadSize == 2) - GroupSizes[0] = Load; - break; - case WORKGROUP_SIZE_Y: - if (LoadSize == 2) - GroupSizes[1] = Load; - break; - case WORKGROUP_SIZE_Z: - if (LoadSize == 2) - GroupSizes[2] = Load; - break; - case GRID_SIZE_X: - if (LoadSize == 4) - GridSizes[0] = Load; - break; - case GRID_SIZE_Y: - if (LoadSize == 4) - GridSizes[1] = Load; - break; - case GRID_SIZE_Z: - if (LoadSize == 4) - GridSizes[2] = Load; - break; - default: - break; - } - } - } - - bool MadeChange = false; - if (IsV5OrAbove && HasUniformWorkGroupSize) { - // Under v5 __ockl_get_local_size returns the value computed by the - // expression: - // - // workgroup_id < hidden_block_count ? hidden_group_size : - // hidden_remainder - // - // For functions with the attribute uniform-work-group-size=true. we can - // evaluate workgroup_id < hidden_block_count as true, and thus - // hidden_group_size is returned for __ockl_get_local_size. - for (int I = 0; I < 3; ++I) { - Value *BlockCount = BlockCounts[I]; - if (!BlockCount) - continue; - - using namespace llvm::PatternMatch; - auto GroupIDIntrin = - I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() - : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() - : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); - - for (User *ICmp : BlockCount->users()) { - if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin, - m_Specific(BlockCount)))) { - ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType())); - MadeChange = true; - } - } - } - - // All remainders should be 0 with uniform work group size. - for (Value *Remainder : Remainders) { - if (!Remainder) - continue; - Remainder->replaceAllUsesWith( - Constant::getNullValue(Remainder->getType())); - MadeChange = true; - } - } else if (HasUniformWorkGroupSize) { // Pre-V5. - // Pattern match the code used to handle partial workgroup dispatches in the - // library implementation of get_local_size, so the entire function can be - // constant folded with a known group size. - // - // uint r = grid_size - group_id * group_size; - // get_local_size = (r < group_size) ? r : group_size; - // - // If we have uniform-work-group-size (which is the default in OpenCL 1.2), - // the grid_size is required to be a multiple of group_size). In this case: - // - // grid_size - (group_id * group_size) < group_size - // -> - // grid_size < group_size + (group_id * group_size) - // - // (grid_size / group_size) < 1 + group_id - // - // grid_size / group_size is at least 1, so we can conclude the select - // condition is false (except for group_id == 0, where the select result is - // the same). - for (int I = 0; I < 3; ++I) { - Value *GroupSize = GroupSizes[I]; - Value *GridSize = GridSizes[I]; - if (!GroupSize || !GridSize) - continue; - - using namespace llvm::PatternMatch; - auto GroupIDIntrin = - I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() - : (I == 1 ? 
m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() - : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); - - for (User *U : GroupSize->users()) { - auto *ZextGroupSize = dyn_cast<ZExtInst>(U); - if (!ZextGroupSize) - continue; - - for (User *UMin : ZextGroupSize->users()) { - if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize), - m_Mul(GroupIDIntrin, - m_Specific(ZextGroupSize))), - m_Specific(ZextGroupSize)))) { - if (HasReqdWorkGroupSize) { - ConstantInt *KnownSize = - mdconst::extract<ConstantInt>(MD->getOperand(I)); - UMin->replaceAllUsesWith(ConstantFoldIntegerCast( - KnownSize, UMin->getType(), false, DL)); - } else { - UMin->replaceAllUsesWith(ZextGroupSize); - } - - MadeChange = true; - } - } - } - } - } - - // Upgrade the old method of calculating the block size using the grid size. - // We pattern match any case where the implicit argument group size is the - // divisor to a dispatch packet grid size read of the same dimension. - if (IsV5OrAbove) { - for (int I = 0; I < 3; I++) { - Value *GroupSize = GroupSizes[I]; - if (!GroupSize || !GroupSize->getType()->isIntegerTy(16)) - continue; - - for (User *U : GroupSize->users()) { - Instruction *Inst = cast<Instruction>(U); - if (isa<ZExtInst>(Inst) && !Inst->use_empty()) - Inst = cast<Instruction>(*Inst->user_begin()); - - using namespace llvm::PatternMatch; - if (!match( - Inst, - m_UDiv(m_ZExtOrSelf(m_Load(m_GEP( - m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(), - m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))), - m_Value()))) - continue; - - IRBuilder<> Builder(Inst); - - Value *GEP = Builder.CreateInBoundsGEP( - Builder.getInt8Ty(), CI, - {ConstantInt::get(Type::getInt64Ty(CI->getContext()), - HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))}); - Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP); - BlockCount->setMetadata(LLVMContext::MD_invariant_load, - MDNode::get(CI->getContext(), {})); - BlockCount->setMetadata(LLVMContext::MD_noundef, - MDNode::get(CI->getContext(), {})); - - Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType()); - Inst->replaceAllUsesWith(BlockCountExt); - Inst->eraseFromParent(); - MadeChange = true; - } - } - } - - // If reqd_work_group_size is set, we can replace work group size with it. - if (!HasReqdWorkGroupSize) - return MadeChange; - - for (int I = 0; I < 3; I++) { - Value *GroupSize = GroupSizes[I]; - if (!GroupSize) - continue; - - ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); - GroupSize->replaceAllUsesWith( - ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL)); - MadeChange = true; - } - - return MadeChange; -} - -// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get -// TargetPassConfig for subtarget. -bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { - bool MadeChange = false; - bool IsV5OrAbove = - AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; - Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove); - - if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. 
- return false; - - SmallPtrSet<Instruction *, 4> HandledUses; - for (auto *U : BasePtr->users()) { - CallInst *CI = cast<CallInst>(U); - if (HandledUses.insert(CI).second) { - if (processUse(CI, IsV5OrAbove)) - MadeChange = true; - } - } - - return MadeChange; -} - -INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, - "AMDGPU Kernel Attributes", false, false) -INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, - "AMDGPU Kernel Attributes", false, false) - -char AMDGPULowerKernelAttributes::ID = 0; - -ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { - return new AMDGPULowerKernelAttributes(); -} - -PreservedAnalyses -AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { - bool IsV5OrAbove = - AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5; - Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove); - - if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. - return PreservedAnalyses::all(); - - bool Changed = false; - for (Instruction &I : instructions(F)) { - if (CallInst *CI = dyn_cast<CallInst>(&I)) { - if (CI->getCalledFunction() == BasePtr) - Changed |= processUse(CI, IsV5OrAbove); - } - } - - return !Changed ? PreservedAnalyses::all() - : PreservedAnalyses::none().preserveSet<CFGAnalyses>(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index f464fbf31c754..40d12e6c10b80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -62,8 +62,6 @@ FUNCTION_PASS("amdgpu-late-codegenprepare", *static_cast<const GCNTargetMachine *>(this))) FUNCTION_PASS("amdgpu-lower-kernel-arguments", AMDGPULowerKernelArgumentsPass(*this)) -FUNCTION_PASS("amdgpu-lower-kernel-attributes", - AMDGPULowerKernelAttributesPass()) FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this)) FUNCTION_PASS("amdgpu-promote-alloca-to-vector", AMDGPUPromoteAllocaToVectorPass(*this)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d25b22b2b96dc..86b6e8b878ba1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -582,7 +582,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); - initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); @@ -874,8 +873,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { }); PB.registerPipelineEarlySimplificationEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level, - ThinOrFullLTOPhase Phase) { + [this](ModulePassManager &PM, OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { if (!isLTOPreLink(Phase)) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent @@ -898,6 +897,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); + + if (!isLTOPreLink(Phase)) + if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { + AMDGPUAttributorOptions Opts; + PM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); + } }); 
PB.registerPeepholeEPCallback( @@ -931,10 +936,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // but before SROA to increase SROA opportunities. FPM.addPass(InferAddressSpacesPass()); - // This should run after inlining to have any chance of doing - // anything, and before other cleanup optimizations. - FPM.addPass(AMDGPULowerKernelAttributesPass()); - if (Level != OptimizationLevel::O0) { // Promote alloca to vector before SROA and loop unroll. If we // manage to eliminate allocas before unroll we may choose to unroll @@ -945,20 +946,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); }); - // FIXME: Why is AMDGPUAttributor not in CGSCC? - PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase) { - if (Level != OptimizationLevel::O0) { - if (!isLTOPreLink(Phase)) { - if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) { - AMDGPUAttributorOptions Opts; - MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase)); - } - } - } - }); - PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { // When we are using -fgpu-rdc, we can only run accelerator code diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 782cbfa76e6e9..d85852beb803f 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -75,7 +75,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerBufferFatPointers.cpp AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp - AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp AMDGPUPrepareAGPRAlloc.cpp AMDGPULowerExecSync.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll index 9064292129928..d8b80626f1974 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s define i32 @use_grid_size_x_max_num_workgroups() #0 { ; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups( @@ -111,10 +111,10 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" } !0 = !{i32 0, i32 -1} ;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
; CHECK: [[RNG0]] = !{i32 1, i32 37} diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll index 25e43a0f332c6..914658031f12e 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine %s | FileCheck %s define i32 @num_blocks_x() { -; CHECK-LABEL: define i32 @num_blocks_x() { +; CHECK-LABEL: define i32 @num_blocks_x( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0:![0-9]+]], !noundef [[META0]] @@ -21,7 +22,8 @@ entry: } define i32 @num_blocks_y() { -; CHECK-LABEL: define i32 @num_blocks_y() { +; CHECK-LABEL: define i32 @num_blocks_y( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 4 @@ -41,7 +43,8 @@ entry: } define i32 @num_blocks_z() { -; CHECK-LABEL: define i32 @num_blocks_z() { +; CHECK-LABEL: define i32 @num_blocks_z( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 8 @@ -62,7 +65,7 @@ entry: define i32 @num_blocks(i32 %dim) { ; CHECK-LABEL: define i32 @num_blocks( -; CHECK-SAME: i32 [[DIM:%.*]]) { +; CHECK-SAME: i32 [[DIM:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: switch i32 [[DIM]], label %[[DEFAULT:.*]] [ @@ -131,7 +134,8 @@ exit: } define i64 @larger() { -; CHECK-LABEL: define i64 @larger() { +; CHECK-LABEL: define i64 @larger( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]] @@ -152,7 +156,8 @@ entry: } define i32 @bad_offset() { -; CHECK-LABEL: define i32 @bad_offset() { +; CHECK-LABEL: define i32 @bad_offset( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK-NEXT: [[D_GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16 @@ -177,7 +182,8 @@ entry: } define i32 @dangling() { -; CHECK-LABEL: define i32 @dangling() { +; CHECK-LABEL: define i32 @dangling( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12 @@ -196,7 +202,8 @@ entry: } define i32 @wrong_cast() { -; CHECK-LABEL: define i32 @wrong_cast() { +; CHECK-LABEL: define i32 @wrong_cast( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr 
addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12 @@ -221,7 +228,8 @@ entry: } define i32 @wrong_size() { -; CHECK-LABEL: define i32 @wrong_size() { +; CHECK-LABEL: define i32 @wrong_size( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12 @@ -246,7 +254,8 @@ entry: } define i32 @wrong_intrinsic() { -; CHECK-LABEL: define i32 @wrong_intrinsic() { +; CHECK-LABEL: define i32 @wrong_intrinsic( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16 @@ -271,7 +280,8 @@ entry: } define i16 @empty_use() { -; CHECK-LABEL: define i16 @empty_use() { +; CHECK-LABEL: define i16 @empty_use( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12 @@ -296,7 +306,8 @@ entry: } define i32 @multiple_use() { -; CHECK-LABEL: define i32 @multiple_use() { +; CHECK-LABEL: define i32 @multiple_use( +; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]] diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll index 3563e737f5520..1fa939977fc7e 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll index 8c4bd4e882ac6..aa8feb59bbbda 100644 --- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll @@ -1,5 +1,5 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s ; CHECK-LABEL: @invalid_reqd_work_group_size( ; CHECK: load i16, diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index 6a88be6e55859..d655306dee19d 100644 --- 
a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -826,5 +826,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "uniform-work-group-size"="false" } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) } attributes #0 = { nounwind } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index d078403135963..d5cc5be2b6aa9 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -165,7 +165,6 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPULowerExecSync.cpp", "AMDGPULowerIntrinsics.cpp", "AMDGPULowerKernelArguments.cpp", - "AMDGPULowerKernelAttributes.cpp", "AMDGPULowerModuleLDSPass.cpp", "AMDGPULowerVGPREncoding.cpp", "AMDGPUMCInstLower.cpp", >From c61763845288ad74246466f167dff32154057c75 Mon Sep 17 00:00:00 2001 From: Yoonseo Choi <[email protected]> Date: Thu, 22 Jan 2026 17:46:00 -0600 Subject: [PATCH 2/2] Add missed change on a test --- .../amdgcnspirv-uses-amdgpu-abi.cpp | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp index 8f92d1fed1f9f..b6645409722aa 100644 --- a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp +++ b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp @@ -81,7 +81,7 @@ __global__ void k4(SingleElement) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local amdgpu_kernel void @_Z2k55ByRef( -// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -117,7 +117,7 @@ __global__ void k7(unsigned*) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f0s( -// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -129,7 +129,7 @@ __device__ void f0(short) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f1j( -// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -141,7 +141,7 @@ __device__ void f1(unsigned) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f2d( -// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -153,7 +153,7 @@ __device__ void f2(double) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f311Transparent( -// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { // 
AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -165,7 +165,7 @@ __device__ void f3(Transparent) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f413SingleElement( -// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -177,7 +177,7 @@ __device__ void f4(SingleElement) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f55ByRef( -// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -189,7 +189,7 @@ __device__ void f5(ByRef) { } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z2f6Dv1_jDv2_jDv3_jDv4_j( -// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret void // @@ -201,7 +201,7 @@ __device__ void f6(V1, V2, V3, V4) { } // AMDGCNSPIRV-NEXT: ret i16 0 // // AMDGPU-LABEL: define dso_local noundef signext i16 @_Z2f7v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret i16 0 // @@ -213,7 +213,7 @@ __device__ short f7() { return 0; } // AMDGCNSPIRV-NEXT: ret i32 0 // // AMDGPU-LABEL: define dso_local noundef i32 @_Z2f8v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret i32 0 // @@ -225,7 +225,7 @@ __device__ unsigned f8() { return 0; } // AMDGCNSPIRV-NEXT: ret double 0.000000e+00 // // AMDGPU-LABEL: define dso_local noundef double @_Z2f9v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret double 0.000000e+00 // @@ -237,7 +237,7 @@ __device__ double f9() { return 0.; } // AMDGCNSPIRV-NEXT: ret i32 0 // // AMDGPU-LABEL: define dso_local noundef i32 @_Z3f10v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret i32 0 // @@ -249,7 +249,7 @@ __device__ Transparent f10() { return {}; } // AMDGCNSPIRV-NEXT: ret i32 0 // // AMDGPU-LABEL: define dso_local noundef i32 @_Z3f11v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret i32 0 // @@ -262,7 +262,7 @@ __device__ SingleElement f11() { return {}; } // AMDGCNSPIRV-NEXT: ret void // // AMDGPU-LABEL: define dso_local void @_Z3f12v( -// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) 
initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 4 dereferenceable(68) [[AGG_RESULT]], i8 0, i64 68, i1 false) // AMDGPU-NEXT: ret void @@ -275,7 +275,7 @@ __device__ ByRef f12() { return {}; } // AMDGCNSPIRV-NEXT: ret <1 x i32> zeroinitializer // // AMDGPU-LABEL: define dso_local noundef <1 x i32> @_Z3f13v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret <1 x i32> zeroinitializer // @@ -287,7 +287,7 @@ __device__ V1 f13() { return {}; } // AMDGCNSPIRV-NEXT: ret <2 x i32> zeroinitializer // // AMDGPU-LABEL: define dso_local noundef <2 x i32> @_Z3f14v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret <2 x i32> zeroinitializer // @@ -299,7 +299,7 @@ __device__ V2 f14() { return {}; } // AMDGCNSPIRV-NEXT: ret <3 x i32> zeroinitializer // // AMDGPU-LABEL: define dso_local noundef <3 x i32> @_Z3f15v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret <3 x i32> zeroinitializer // @@ -311,7 +311,7 @@ __device__ V3 f15() { return {}; } // AMDGCNSPIRV-NEXT: ret <4 x i32> zeroinitializer // // AMDGPU-LABEL: define dso_local noundef <4 x i32> @_Z3f16v( -// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] { +// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] { // AMDGPU-NEXT: [[ENTRY:.*:]] // AMDGPU-NEXT: ret <4 x i32> zeroinitializer // @@ -319,3 +319,11 @@ __device__ V4 f16() { return {}; } //. // AMDGCNSPIRV: [[META9]] = !{i32 1024, i32 1, i32 1} //. 
+ +// For recording purpose of AMDGPU +// attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" } +// attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" } +// attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" } +// attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" } +// attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" } +// attributes #5 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
