https://github.com/yoonseoch updated 
https://github.com/llvm/llvm-project/pull/177432

>From a7b28cf2424e788a3416ba7b411241fc4387d7d0 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <[email protected]>
Date: Thu, 22 Jan 2026 12:48:23 -0600
Subject: [PATCH 1/3] [AMDGPU] Move AMDGPUAttributor earlier with lowering
 kernel attributes

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   9 -
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 362 +++++++++++++-
 .../AMDGPU/AMDGPULowerKernelAttributes.cpp    | 443 ------------------
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 -
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  29 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 -
 ...amdgpu-max-num-workgroups-load-annotate.ll |  10 +-
 .../AMDGPU/implicit-arg-block-count.ll        |  37 +-
 .../CodeGen/AMDGPU/implicit-arg-v5-opt.ll     |   2 +-
 .../CodeGen/AMDGPU/reqd-work-group-size.ll    |   4 +-
 llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll |   2 +-
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 -
 12 files changed, 402 insertions(+), 500 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..de76dd6ab3bb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -123,15 +123,6 @@ struct AMDGPUPromoteKernelArgumentsPass
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
-ModulePass *createAMDGPULowerKernelAttributesPass();
-void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
-extern char &AMDGPULowerKernelAttributesID;
-
-struct AMDGPULowerKernelAttributesPass
-    : PassInfoMixin<AMDGPULowerKernelAttributesPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-};
-
 void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
 extern char &AMDGPULowerModuleLDSLegacyPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0b2ee6371da06..1f4229a2b15a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,8 +13,14 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO/Attributor.h"
 
@@ -50,6 +56,343 @@ static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
 #include "AMDGPUAttributes.def"
 };
 
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+  WORKGROUP_SIZE_X = 4,
+  WORKGROUP_SIZE_Y = 6,
+  WORKGROUP_SIZE_Z = 8,
+
+  GRID_SIZE_X = 12,
+  GRID_SIZE_Y = 16,
+  GRID_SIZE_Z = 20
+};
+
+// Field offsets to implicit kernel argument pointer.
+enum ImplicitArgOffsets {
+  HIDDEN_BLOCK_COUNT_X = 0,
+  HIDDEN_BLOCK_COUNT_Y = 4,
+  HIDDEN_BLOCK_COUNT_Z = 8,
+
+  HIDDEN_GROUP_SIZE_X = 12,
+  HIDDEN_GROUP_SIZE_Y = 14,
+  HIDDEN_GROUP_SIZE_Z = 16,
+
+  HIDDEN_REMAINDER_X = 18,
+  HIDDEN_REMAINDER_Y = 20,
+  HIDDEN_REMAINDER_Z = 22,
+};
+
+static Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
+  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
+                                 : Intrinsic::amdgcn_dispatch_ptr;
+  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
+}
+
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+                                            uint32_t MaxNumGroups) {
+  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+    return;
+
+  if (!Load->getType()->isIntegerTy(32))
+    return;
+
+  // TODO: If there is existing range metadata, preserve it if it is stricter.
+  MDBuilder MDB(Load->getContext());
+  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+  Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
+static bool processUse(CallInst *CI, bool IsV5OrAbove) {
+  Function *F = CI->getFunction();
+
+  auto *MD = F->getMetadata("reqd_work_group_size");
+  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+  const bool HasUniformWorkGroupSize =
+      F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+
+  SmallVector<unsigned> MaxNumWorkgroups =
+      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
+                                     /*Size=*/3, /*DefaultVal=*/0);
+
+  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+      !Intrinsic::getDeclarationIfExists(CI->getModule(),
+                                         Intrinsic::amdgcn_dispatch_ptr) &&
+      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
+    return false;
+
+  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
+  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+  Value *Remainders[3] = {nullptr, nullptr, nullptr};
+  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+
+  const DataLayout &DL = F->getDataLayout();
+
+  // We expect to see several GEP users, casted to the appropriate type and
+  // loaded.
+  for (User *U : CI->users()) {
+    if (!U->hasOneUse())
+      continue;
+
+    int64_t Offset = 0;
+    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+    auto *BCI = dyn_cast<BitCastInst>(U);
+    if (!Load && !BCI) {
+      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+        continue;
+      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      BCI = dyn_cast<BitCastInst>(*U->user_begin());
+    }
+
+    if (BCI) {
+      if (!BCI->hasOneUse())
+        continue;
+      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+    }
+
+    if (!Load || !Load->isSimple())
+      continue;
+
+    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+    // TODO: Handle merged loads.
+    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
+      switch (Offset) {
+      case HIDDEN_BLOCK_COUNT_X:
+        if (LoadSize == 4) {
+          BlockCounts[0] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+        }
+        break;
+      case HIDDEN_BLOCK_COUNT_Y:
+        if (LoadSize == 4) {
+          BlockCounts[1] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+        }
+        break;
+      case HIDDEN_BLOCK_COUNT_Z:
+        if (LoadSize == 4) {
+          BlockCounts[2] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+        }
+        break;
+      case HIDDEN_GROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case HIDDEN_REMAINDER_X:
+        if (LoadSize == 2)
+          Remainders[0] = Load;
+        break;
+      case HIDDEN_REMAINDER_Y:
+        if (LoadSize == 2)
+          Remainders[1] = Load;
+        break;
+      case HIDDEN_REMAINDER_Z:
+        if (LoadSize == 2)
+          Remainders[2] = Load;
+        break;
+      default:
+        break;
+      }
+    } else { // Base is DispatchPtr.
+      switch (Offset) {
+      case WORKGROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case WORKGROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case WORKGROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case GRID_SIZE_X:
+        if (LoadSize == 4)
+          GridSizes[0] = Load;
+        break;
+      case GRID_SIZE_Y:
+        if (LoadSize == 4)
+          GridSizes[1] = Load;
+        break;
+      case GRID_SIZE_Z:
+        if (LoadSize == 4)
+          GridSizes[2] = Load;
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  bool MadeChange = false;
+  if (IsV5OrAbove && HasUniformWorkGroupSize) {
+    // Under V5, __ockl_get_local_size returns the value computed by the
+    // expression:
+    //
+    //   workgroup_id < hidden_block_count ? hidden_group_size :
+    //                                       hidden_remainder
+    //
+    // For functions with the attribute uniform-work-group-size=true, we can
+    // evaluate workgroup_id < hidden_block_count as true, and thus
+    // hidden_group_size is returned for __ockl_get_local_size.
+    for (int I = 0; I < 3; ++I) {
+      Value *BlockCount = BlockCounts[I];
+      if (!BlockCount)
+        continue;
+
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *ICmp : BlockCount->users()) {
+        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
+                                       m_Specific(BlockCount)))) {
+          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
+          MadeChange = true;
+        }
+      }
+    }
+
+    // All remainders should be 0 with uniform work group size.
+    for (Value *Remainder : Remainders) {
+      if (!Remainder)
+        continue;
+      Remainder->replaceAllUsesWith(
+          Constant::getNullValue(Remainder->getType()));
+      MadeChange = true;
+    }
+  } else if (HasUniformWorkGroupSize) { // Pre-V5.
+    // Pattern match the code used to handle partial workgroup dispatches in the
+    // library implementation of get_local_size, so the entire function can be
+    // constant folded with a known group size.
+    //
+    // uint r = grid_size - group_id * group_size;
+    // get_local_size = (r < group_size) ? r : group_size;
+    //
+    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+    // the grid_size is required to be a multiple of group_size. In this case:
+    //
+    // grid_size - (group_id * group_size) < group_size
+    // ->
+    // grid_size < group_size + (group_id * group_size)
+    //
+    // (grid_size / group_size) < 1 + group_id
+    //
+    // grid_size / group_size is at least 1, so we can conclude the select
+    // condition is false (except for group_id == 0, where the select result is
+    // the same).
+    for (int I = 0; I < 3; ++I) {
+      Value *GroupSize = GroupSizes[I];
+      Value *GridSize = GridSizes[I];
+      if (!GroupSize || !GridSize)
+        continue;
+
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *U : GroupSize->users()) {
+        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+        if (!ZextGroupSize)
+          continue;
+
+        for (User *UMin : ZextGroupSize->users()) {
+          if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
+                                       m_Mul(GroupIDIntrin,
+                                             m_Specific(ZextGroupSize))),
+                                 m_Specific(ZextGroupSize)))) {
+            if (HasReqdWorkGroupSize) {
+              ConstantInt *KnownSize =
+                  mdconst::extract<ConstantInt>(MD->getOperand(I));
+              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
+                  KnownSize, UMin->getType(), false, DL));
+            } else {
+              UMin->replaceAllUsesWith(ZextGroupSize);
+            }
+
+            MadeChange = true;
+          }
+        }
+      }
+    }
+  }
+
+  // Upgrade the old method of calculating the block size using the grid size.
+  // We pattern match any case where the implicit argument group size is the
+  // divisor to a dispatch packet grid size read of the same dimension.
+  if (IsV5OrAbove) {
+    for (int I = 0; I < 3; I++) {
+      Value *GroupSize = GroupSizes[I];
+      if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
+        continue;
+
+      for (User *U : GroupSize->users()) {
+        Instruction *Inst = cast<Instruction>(U);
+        if (isa<ZExtInst>(Inst) && !Inst->use_empty())
+          Inst = cast<Instruction>(*Inst->user_begin());
+
+        using namespace llvm::PatternMatch;
+        if (!match(
+                Inst,
+                m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
+                           m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+                           m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
+                       m_Value())))
+          continue;
+
+        IRBuilder<> Builder(Inst);
+
+        Value *GEP = Builder.CreateInBoundsGEP(
+            Builder.getInt8Ty(), CI,
+            {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
+                              HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
+        Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
+        BlockCount->setMetadata(LLVMContext::MD_invariant_load,
+                                MDNode::get(CI->getContext(), {}));
+        BlockCount->setMetadata(LLVMContext::MD_noundef,
+                                MDNode::get(CI->getContext(), {}));
+
+        Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
+        Inst->replaceAllUsesWith(BlockCountExt);
+        Inst->eraseFromParent();
+        MadeChange = true;
+      }
+    }
+  }
+
+  // If reqd_work_group_size is set, we can replace work group size with it.
+  if (!HasReqdWorkGroupSize)
+    return MadeChange;
+
+  for (int I = 0; I < 3; I++) {
+    Value *GroupSize = GroupSizes[I];
+    if (!GroupSize)
+      continue;
+
+    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+    GroupSize->replaceAllUsesWith(
+        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
 // We do not need to note the x workitem or workgroup id because they are always
 // initialized.
 //
@@ -1660,7 +2003,24 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
     }
   }
 
-  return A.run() == ChangeStatus::CHANGED;
+  bool Changed = A.run() == ChangeStatus::CHANGED;
+
+  // Kernel attribute lowering (merged from AMDGPULowerKernelAttributesPass)
+  bool IsV5OrAbove =
+      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
+  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
+  if (BasePtr) {
+    for (auto *F : Functions) {
+      for (Instruction &I : instructions(*F)) {
+        if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+          if (CI->getCalledFunction() == BasePtr)
+            Changed |= processUse(CI, IsV5OrAbove);
+        }
+      }
+    }
+  }
+
+  return Changed;
 }
 } // namespace
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
deleted file mode 100644
index fbfb71059b6b1..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-//===-- 
AMDGPULowerKernelAttributes.cpp------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This pass does attempts to make use of reqd_work_group_size metadata
-/// to eliminate loads from the dispatch packet and to constant fold OpenCL
-/// get_local_size-like functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Pass.h"
-
-#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
-
-using namespace llvm;
-
-namespace {
-
-// Field offsets in hsa_kernel_dispatch_packet_t.
-enum DispatchPackedOffsets {
-  WORKGROUP_SIZE_X = 4,
-  WORKGROUP_SIZE_Y = 6,
-  WORKGROUP_SIZE_Z = 8,
-
-  GRID_SIZE_X = 12,
-  GRID_SIZE_Y = 16,
-  GRID_SIZE_Z = 20
-};
-
-// Field offsets to implicit kernel argument pointer.
-enum ImplicitArgOffsets {
-  HIDDEN_BLOCK_COUNT_X = 0,
-  HIDDEN_BLOCK_COUNT_Y = 4,
-  HIDDEN_BLOCK_COUNT_Z = 8,
-
-  HIDDEN_GROUP_SIZE_X = 12,
-  HIDDEN_GROUP_SIZE_Y = 14,
-  HIDDEN_GROUP_SIZE_Z = 16,
-
-  HIDDEN_REMAINDER_X = 18,
-  HIDDEN_REMAINDER_Y = 20,
-  HIDDEN_REMAINDER_Z = 22,
-};
-
-class AMDGPULowerKernelAttributes : public ModulePass {
-public:
-  static char ID;
-
-  AMDGPULowerKernelAttributes() : ModulePass(ID) {}
-
-  bool runOnModule(Module &M) override;
-
-  StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-};
-
-Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
-  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
-                                 : Intrinsic::amdgcn_dispatch_ptr;
-  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
-}
-
-} // end anonymous namespace
-
-static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
-                                            uint32_t MaxNumGroups) {
-  if (MaxNumGroups == 0 || MaxNumGroups == 
std::numeric_limits<uint32_t>::max())
-    return;
-
-  if (!Load->getType()->isIntegerTy(32))
-    return;
-
-  // TODO: If there is existing range metadata, preserve it if it is stricter.
-  MDBuilder MDB(Load->getContext());
-  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
-  Load->setMetadata(LLVMContext::MD_range, Range);
-}
-
-static bool processUse(CallInst *CI, bool IsV5OrAbove) {
-  Function *F = CI->getFunction();
-
-  auto *MD = F->getMetadata("reqd_work_group_size");
-  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
-
-  const bool HasUniformWorkGroupSize =
-      F->getFnAttribute("uniform-work-group-size").getValueAsBool();
-
-  SmallVector<unsigned> MaxNumWorkgroups =
-      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
-                                     /*Size=*/3, /*DefaultVal=*/0);
-
-  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
-      !Intrinsic::getDeclarationIfExists(CI->getModule(),
-                                         Intrinsic::amdgcn_dispatch_ptr) &&
-      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
-    return false;
-
-  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
-  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
-  Value *Remainders[3] = {nullptr, nullptr, nullptr};
-  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
-
-  const DataLayout &DL = F->getDataLayout();
-
-  // We expect to see several GEP users, casted to the appropriate type and
-  // loaded.
-  for (User *U : CI->users()) {
-    if (!U->hasOneUse())
-      continue;
-
-    int64_t Offset = 0;
-    auto *Load = dyn_cast<LoadInst>(U); // Load from 
ImplicitArgPtr/DispatchPtr?
-    auto *BCI = dyn_cast<BitCastInst>(U);
-    if (!Load && !BCI) {
-      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-        continue;
-      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
-      BCI = dyn_cast<BitCastInst>(*U->user_begin());
-    }
-
-    if (BCI) {
-      if (!BCI->hasOneUse())
-        continue;
-      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
-    }
-
-    if (!Load || !Load->isSimple())
-      continue;
-
-    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
-
-    // TODO: Handle merged loads.
-    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
-      switch (Offset) {
-      case HIDDEN_BLOCK_COUNT_X:
-        if (LoadSize == 4) {
-          BlockCounts[0] = Load;
-          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
-        }
-        break;
-      case HIDDEN_BLOCK_COUNT_Y:
-        if (LoadSize == 4) {
-          BlockCounts[1] = Load;
-          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
-        }
-        break;
-      case HIDDEN_BLOCK_COUNT_Z:
-        if (LoadSize == 4) {
-          BlockCounts[2] = Load;
-          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
-        }
-        break;
-      case HIDDEN_GROUP_SIZE_X:
-        if (LoadSize == 2)
-          GroupSizes[0] = Load;
-        break;
-      case HIDDEN_GROUP_SIZE_Y:
-        if (LoadSize == 2)
-          GroupSizes[1] = Load;
-        break;
-      case HIDDEN_GROUP_SIZE_Z:
-        if (LoadSize == 2)
-          GroupSizes[2] = Load;
-        break;
-      case HIDDEN_REMAINDER_X:
-        if (LoadSize == 2)
-          Remainders[0] = Load;
-        break;
-      case HIDDEN_REMAINDER_Y:
-        if (LoadSize == 2)
-          Remainders[1] = Load;
-        break;
-      case HIDDEN_REMAINDER_Z:
-        if (LoadSize == 2)
-          Remainders[2] = Load;
-        break;
-      default:
-        break;
-      }
-    } else { // Base is DispatchPtr.
-      switch (Offset) {
-      case WORKGROUP_SIZE_X:
-        if (LoadSize == 2)
-          GroupSizes[0] = Load;
-        break;
-      case WORKGROUP_SIZE_Y:
-        if (LoadSize == 2)
-          GroupSizes[1] = Load;
-        break;
-      case WORKGROUP_SIZE_Z:
-        if (LoadSize == 2)
-          GroupSizes[2] = Load;
-        break;
-      case GRID_SIZE_X:
-        if (LoadSize == 4)
-          GridSizes[0] = Load;
-        break;
-      case GRID_SIZE_Y:
-        if (LoadSize == 4)
-          GridSizes[1] = Load;
-        break;
-      case GRID_SIZE_Z:
-        if (LoadSize == 4)
-          GridSizes[2] = Load;
-        break;
-      default:
-        break;
-      }
-    }
-  }
-
-  bool MadeChange = false;
-  if (IsV5OrAbove && HasUniformWorkGroupSize) {
-    // Under v5  __ockl_get_local_size returns the value computed by the
-    // expression:
-    //
-    //   workgroup_id < hidden_block_count ? hidden_group_size :
-    //                                       hidden_remainder
-    //
-    // For functions with the attribute uniform-work-group-size=true. we can
-    // evaluate workgroup_id < hidden_block_count as true, and thus
-    // hidden_group_size is returned for __ockl_get_local_size.
-    for (int I = 0; I < 3; ++I) {
-      Value *BlockCount = BlockCounts[I];
-      if (!BlockCount)
-        continue;
-
-      using namespace llvm::PatternMatch;
-      auto GroupIDIntrin =
-          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
-                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
-                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
-      for (User *ICmp : BlockCount->users()) {
-        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
-                                       m_Specific(BlockCount)))) {
-          
ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
-          MadeChange = true;
-        }
-      }
-    }
-
-    // All remainders should be 0 with uniform work group size.
-    for (Value *Remainder : Remainders) {
-      if (!Remainder)
-        continue;
-      Remainder->replaceAllUsesWith(
-          Constant::getNullValue(Remainder->getType()));
-      MadeChange = true;
-    }
-  } else if (HasUniformWorkGroupSize) { // Pre-V5.
-    // Pattern match the code used to handle partial workgroup dispatches in 
the
-    // library implementation of get_local_size, so the entire function can be
-    // constant folded with a known group size.
-    //
-    // uint r = grid_size - group_id * group_size;
-    // get_local_size = (r < group_size) ? r : group_size;
-    //
-    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
-    // the grid_size is required to be a multiple of group_size). In this case:
-    //
-    // grid_size - (group_id * group_size) < group_size
-    // ->
-    // grid_size < group_size + (group_id * group_size)
-    //
-    // (grid_size / group_size) < 1 + group_id
-    //
-    // grid_size / group_size is at least 1, so we can conclude the select
-    // condition is false (except for group_id == 0, where the select result is
-    // the same).
-    for (int I = 0; I < 3; ++I) {
-      Value *GroupSize = GroupSizes[I];
-      Value *GridSize = GridSizes[I];
-      if (!GroupSize || !GridSize)
-        continue;
-
-      using namespace llvm::PatternMatch;
-      auto GroupIDIntrin =
-          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
-                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
-                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
-      for (User *U : GroupSize->users()) {
-        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
-        if (!ZextGroupSize)
-          continue;
-
-        for (User *UMin : ZextGroupSize->users()) {
-          if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
-                                       m_Mul(GroupIDIntrin,
-                                             m_Specific(ZextGroupSize))),
-                                 m_Specific(ZextGroupSize)))) {
-            if (HasReqdWorkGroupSize) {
-              ConstantInt *KnownSize =
-                  mdconst::extract<ConstantInt>(MD->getOperand(I));
-              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
-                  KnownSize, UMin->getType(), false, DL));
-            } else {
-              UMin->replaceAllUsesWith(ZextGroupSize);
-            }
-
-            MadeChange = true;
-          }
-        }
-      }
-    }
-  }
-
-  // Upgrade the old method of calculating the block size using the grid size.
-  // We pattern match any case where the implicit argument group size is the
-  // divisor to a dispatch packet grid size read of the same dimension.
-  if (IsV5OrAbove) {
-    for (int I = 0; I < 3; I++) {
-      Value *GroupSize = GroupSizes[I];
-      if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
-        continue;
-
-      for (User *U : GroupSize->users()) {
-        Instruction *Inst = cast<Instruction>(U);
-        if (isa<ZExtInst>(Inst) && !Inst->use_empty())
-          Inst = cast<Instruction>(*Inst->user_begin());
-
-        using namespace llvm::PatternMatch;
-        if (!match(
-                Inst,
-                m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
-                           m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
-                           m_SpecificInt(GRID_SIZE_X + I * 
sizeof(uint32_t))))),
-                       m_Value())))
-          continue;
-
-        IRBuilder<> Builder(Inst);
-
-        Value *GEP = Builder.CreateInBoundsGEP(
-            Builder.getInt8Ty(), CI,
-            {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
-                              HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
-        Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), 
GEP);
-        BlockCount->setMetadata(LLVMContext::MD_invariant_load,
-                                MDNode::get(CI->getContext(), {}));
-        BlockCount->setMetadata(LLVMContext::MD_noundef,
-                                MDNode::get(CI->getContext(), {}));
-
-        Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
-        Inst->replaceAllUsesWith(BlockCountExt);
-        Inst->eraseFromParent();
-        MadeChange = true;
-      }
-    }
-  }
-
-  // If reqd_work_group_size is set, we can replace work group size with it.
-  if (!HasReqdWorkGroupSize)
-    return MadeChange;
-
-  for (int I = 0; I < 3; I++) {
-    Value *GroupSize = GroupSizes[I];
-    if (!GroupSize)
-      continue;
-
-    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
-    GroupSize->replaceAllUsesWith(
-        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
-    MadeChange = true;
-  }
-
-  return MadeChange;
-}
-
-// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
-// TargetPassConfig for subtarget.
-bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
-  bool MadeChange = false;
-  bool IsV5OrAbove =
-      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
-  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
-
-  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
-    return false;
-
-  SmallPtrSet<Instruction *, 4> HandledUses;
-  for (auto *U : BasePtr->users()) {
-    CallInst *CI = cast<CallInst>(U);
-    if (HandledUses.insert(CI).second) {
-      if (processUse(CI, IsV5OrAbove))
-        MadeChange = true;
-    }
-  }
-
-  return MadeChange;
-}
-
-INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
-                      "AMDGPU Kernel Attributes", false, false)
-INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
-                    "AMDGPU Kernel Attributes", false, false)
-
-char AMDGPULowerKernelAttributes::ID = 0;
-
-ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
-  return new AMDGPULowerKernelAttributes();
-}
-
-PreservedAnalyses
-AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) 
{
-  bool IsV5OrAbove =
-      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= 
AMDGPU::AMDHSA_COV5;
-  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
-
-  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
-    return PreservedAnalyses::all();
-
-  bool Changed = false;
-  for (Instruction &I : instructions(F)) {
-    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-      if (CI->getCalledFunction() == BasePtr)
-        Changed |= processUse(CI, IsV5OrAbove);
-    }
-  }
-
-  return !Changed ? PreservedAnalyses::all()
-                  : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def 
b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index f464fbf31c754..40d12e6c10b80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -62,8 +62,6 @@ FUNCTION_PASS("amdgpu-late-codegenprepare",
                 *static_cast<const GCNTargetMachine *>(this)))
 FUNCTION_PASS("amdgpu-lower-kernel-arguments",
               AMDGPULowerKernelArgumentsPass(*this))
-FUNCTION_PASS("amdgpu-lower-kernel-attributes",
-              AMDGPULowerKernelAttributesPass())
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d25b22b2b96dc..86b6e8b878ba1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -582,7 +582,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
-  initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -874,8 +873,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
       });
 
   PB.registerPipelineEarlySimplificationEPCallback(
-      [](ModulePassManager &PM, OptimizationLevel Level,
-         ThinOrFullLTOPhase Phase) {
+      [this](ModulePassManager &PM, OptimizationLevel Level,
+             ThinOrFullLTOPhase Phase) {
         if (!isLTOPreLink(Phase)) {
           // When we are not using -fgpu-rdc, we can run accelerator code
           // selection relatively early, but still after linking to prevent
@@ -898,6 +897,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
         if (EarlyInlineAll && !EnableFunctionCalls)
           PM.addPass(AMDGPUAlwaysInlinePass());
+
+        if (!isLTOPreLink(Phase))
+          if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
+            AMDGPUAttributorOptions Opts;
+            PM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+          }
       });
 
   PB.registerPeepholeEPCallback(
@@ -931,10 +936,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         // but before SROA to increase SROA opportunities.
         FPM.addPass(InferAddressSpacesPass());
 
-        // This should run after inlining to have any chance of doing
-        // anything, and before other cleanup optimizations.
-        FPM.addPass(AMDGPULowerKernelAttributesPass());
-
         if (Level != OptimizationLevel::O0) {
           // Promote alloca to vector before SROA and loop unroll. If we
           // manage to eliminate allocas before unroll we may choose to unroll
@@ -945,20 +946,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
       });
 
-  // FIXME: Why is AMDGPUAttributor not in CGSCC?
-  PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
-                                            OptimizationLevel Level,
-                                            ThinOrFullLTOPhase Phase) {
-    if (Level != OptimizationLevel::O0) {
-      if (!isLTOPreLink(Phase)) {
-        if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
-          AMDGPUAttributorOptions Opts;
-          MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
-        }
-      }
-    }
-  });
-
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         // When we are using -fgpu-rdc, we can only run accelerator code
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt 
b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 782cbfa76e6e9..d85852beb803f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,7 +75,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerBufferFatPointers.cpp
   AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
-  AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
   AMDGPUPrepareAGPRAlloc.cpp
   AMDGPULowerExecSync.cpp
diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
index 9064292129928..d8b80626f1974 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --check-globals all --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
 define i32 @use_grid_size_x_max_num_workgroups() #0 {
 ; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
@@ -111,10 +111,10 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
 !0 = !{i32 0, i32 -1}
 
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
-; CHECK: attributes #[[ATTR1]] = { 
"amdgpu-max-num-workgroups"="4294967294,42,89" }
-; CHECK: attributes #[[ATTR2]] = { 
"amdgpu-max-num-workgroups"="4294967295,42,89" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" 
"amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { 
"amdgpu-max-num-workgroups"="4294967294,42,89" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { 
"amdgpu-max-num-workgroups"="4294967295,42,89" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" 
"amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" 
"amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" 
"amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" 
"amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" 
"uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind 
speculatable willreturn memory(none) }
 ;.
 ; CHECK: [[RNG0]] = !{i32 1, i32 37}
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll 
b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
index 25e43a0f332c6..914658031f12e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine %s | FileCheck %s
 
 define i32 @num_blocks_x() {
-; CHECK-LABEL: define i32 @num_blocks_x() {
+; CHECK-LABEL: define i32 @num_blocks_x(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], 
align 4, !invariant.load [[META0:![0-9]+]], !noundef [[META0]]
@@ -21,7 +22,8 @@ entry:
 }
 
 define i32 @num_blocks_y() {
-; CHECK-LABEL: define i32 @num_blocks_y() {
+; CHECK-LABEL: define i32 @num_blocks_y(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[IMPLICITARG]], i64 4
@@ -41,7 +43,8 @@ entry:
 }
 
 define i32 @num_blocks_z() {
-; CHECK-LABEL: define i32 @num_blocks_z() {
+; CHECK-LABEL: define i32 @num_blocks_z(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[IMPLICITARG]], i64 8
@@ -62,7 +65,7 @@ entry:
 
 define i32 @num_blocks(i32 %dim) {
 ; CHECK-LABEL: define i32 @num_blocks(
-; CHECK-SAME: i32 [[DIM:%.*]]) {
+; CHECK-SAME: i32 [[DIM:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    switch i32 [[DIM]], label %[[DEFAULT:.*]] [
@@ -131,7 +134,8 @@ exit:
 }
 
 define i64 @larger() {
-; CHECK-LABEL: define i64 @larger() {
+; CHECK-LABEL: define i64 @larger(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) 
[[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
@@ -152,7 +156,8 @@ entry:
 }
 
 define i32 @bad_offset() {
-; CHECK-LABEL: define i32 @bad_offset() {
+; CHECK-LABEL: define i32 @bad_offset(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
 ; CHECK-NEXT:    [[D_GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) 
[[DISPATCH]], i64 16
@@ -177,7 +182,8 @@ entry:
 }
 
 define i32 @dangling() {
-; CHECK-LABEL: define i32 @dangling() {
+; CHECK-LABEL: define i32 @dangling(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
 ; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) 
[[DISPATCH]], i64 12
@@ -196,7 +202,8 @@ entry:
 }
 
 define i32 @wrong_cast() {
-; CHECK-LABEL: define i32 @wrong_cast() {
+; CHECK-LABEL: define i32 @wrong_cast(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
 ; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) 
[[DISPATCH]], i64 12
@@ -221,7 +228,8 @@ entry:
 }
 
 define i32 @wrong_size() {
-; CHECK-LABEL: define i32 @wrong_size() {
+; CHECK-LABEL: define i32 @wrong_size(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
 ; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) 
[[DISPATCH]], i64 12
@@ -246,7 +254,8 @@ entry:
 }
 
 define i32 @wrong_intrinsic() {
-; CHECK-LABEL: define i32 @wrong_intrinsic() {
+; CHECK-LABEL: define i32 @wrong_intrinsic(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) 
[[DISPATCH]], i64 16
@@ -271,7 +280,8 @@ entry:
 }
 
 define i16 @empty_use() {
-; CHECK-LABEL: define i16 @empty_use() {
+; CHECK-LABEL: define i16 @empty_use(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
 ; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) 
[[DISPATCH]], i64 12
@@ -296,7 +306,8 @@ entry:
 }
 
 define i32 @multiple_use() {
-; CHECK-LABEL: define i32 @multiple_use() {
+; CHECK-LABEL: define i32 @multiple_use(
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], 
align 4, !invariant.load [[META0]], !noundef [[META0]]
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll 
b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
index 3563e737f5520..1fa939977fc7e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone 
willreturn
 define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll 
b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
index 8c4bd4e882ac6..aa8feb59bbbda 100644
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
 
 ; CHECK-LABEL: @invalid_reqd_work_group_size(
 ; CHECK: load i16,
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll 
b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 6a88be6e55859..d655306dee19d 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -826,5 +826,5 @@ entry:
 ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr 
#[[$NOUNWIND_READONLY:[0-9]+]]
 
 ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind }
-; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "uniform-work-group-size"="false" }
+; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) }
 attributes #0 = { nounwind }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn 
b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index d078403135963..d5cc5be2b6aa9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -165,7 +165,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPULowerExecSync.cpp",
     "AMDGPULowerIntrinsics.cpp",
     "AMDGPULowerKernelArguments.cpp",
-    "AMDGPULowerKernelAttributes.cpp",
     "AMDGPULowerModuleLDSPass.cpp",
     "AMDGPULowerVGPREncoding.cpp",
     "AMDGPUMCInstLower.cpp",

>From c61763845288ad74246466f167dff32154057c75 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <[email protected]>
Date: Thu, 22 Jan 2026 17:46:00 -0600
Subject: [PATCH 2/3] Add missed change on a test

---
 .../amdgcnspirv-uses-amdgpu-abi.cpp           | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp 
b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
index 8f92d1fed1f9f..b6645409722aa 100644
--- a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
+++ b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
@@ -81,7 +81,7 @@ __global__ void k4(SingleElement) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local amdgpu_kernel void @_Z2k55ByRef(
-// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) 
align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) 
align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -117,7 +117,7 @@ __global__ void k7(unsigned*) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f0s(
-// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR1:[0-9]+]] {
+// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR2:[0-9]+]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -129,7 +129,7 @@ __device__ void f0(short) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f1j(
-// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -141,7 +141,7 @@ __device__ void f1(unsigned) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f2d(
-// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -153,7 +153,7 @@ __device__ void f2(double) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f311Transparent(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -165,7 +165,7 @@ __device__ void f3(Transparent) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f413SingleElement(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -177,7 +177,7 @@ __device__ void f4(SingleElement) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f55ByRef(
-// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) 
align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) 
align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -189,7 +189,7 @@ __device__ void f5(ByRef) { }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z2f6Dv1_jDv2_jDv3_jDv4_j(
-// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef 
[[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) 
local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef 
[[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) 
local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret void
 //
@@ -201,7 +201,7 @@ __device__ void f6(V1, V2, V3, V4) { }
 // AMDGCNSPIRV-NEXT:    ret i16 0
 //
 // AMDGPU-LABEL: define dso_local noundef signext i16 @_Z2f7v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret i16 0
 //
@@ -213,7 +213,7 @@ __device__ short f7() { return 0; }
 // AMDGCNSPIRV-NEXT:    ret i32 0
 //
 // AMDGPU-LABEL: define dso_local noundef i32 @_Z2f8v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret i32 0
 //
@@ -225,7 +225,7 @@ __device__ unsigned f8() { return 0; }
 // AMDGCNSPIRV-NEXT:    ret double 0.000000e+00
 //
 // AMDGPU-LABEL: define dso_local noundef double @_Z2f9v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret double 0.000000e+00
 //
@@ -237,7 +237,7 @@ __device__ double f9() { return 0.; }
 // AMDGCNSPIRV-NEXT:    ret i32 0
 //
 // AMDGPU-LABEL: define dso_local noundef i32 @_Z3f10v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret i32 0
 //
@@ -249,7 +249,7 @@ __device__ Transparent f10() { return {}; }
 // AMDGCNSPIRV-NEXT:    ret i32 0
 //
 // AMDGPU-LABEL: define dso_local noundef i32 @_Z3f11v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret i32 0
 //
@@ -262,7 +262,7 @@ __device__ SingleElement f11() { return {}; }
 // AMDGCNSPIRV-NEXT:    ret void
 //
 // AMDGPU-LABEL: define dso_local void @_Z3f12v(
-// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly 
sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) 
[[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly 
sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) 
[[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef 
align 4 dereferenceable(68) [[AGG_RESULT]], i8 0, i64 68, i1 false)
 // AMDGPU-NEXT:    ret void
@@ -275,7 +275,7 @@ __device__ ByRef f12() { return {}; }
 // AMDGCNSPIRV-NEXT:    ret <1 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <1 x i32> @_Z3f13v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret <1 x i32> zeroinitializer
 //
@@ -287,7 +287,7 @@ __device__ V1 f13() { return {}; }
 // AMDGCNSPIRV-NEXT:    ret <2 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <2 x i32> @_Z3f14v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret <2 x i32> zeroinitializer
 //
@@ -299,7 +299,7 @@ __device__ V2 f14() { return {}; }
 // AMDGCNSPIRV-NEXT:    ret <3 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <3 x i32> @_Z3f15v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret <3 x i32> zeroinitializer
 //
@@ -311,7 +311,7 @@ __device__ V3 f15() { return {}; }
 // AMDGCNSPIRV-NEXT:    ret <4 x i32> zeroinitializer
 //
 // AMDGPU-LABEL: define dso_local noundef <4 x i32> @_Z3f16v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
 // AMDGPU-NEXT:    ret <4 x i32> zeroinitializer
 //
@@ -319,3 +319,11 @@ __device__ V4 f16() { return {}; }
 //.
 // AMDGCNSPIRV: [[META9]] = !{i32 1024, i32 1, i32 1}
 //.
+
+// For reference, the full AMDGPU attribute groups for this test are recorded below:
+// attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"uniform-work-group-size"="true" }
+// attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" 
"amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"uniform-work-group-size"="true" }
+// attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" 
"amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"uniform-work-group-size"="false" }
+// attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" 
"amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" 
"amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" 
"amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" 
"amdgpu-no-workitem-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"uniform-work-group-size"="false" }
+// attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn 
memory(argmem: write) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" 
"amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"uniform-work-group-size"="false" }
+// attributes #5 = { mustprogress nocallback nofree nounwind willreturn 
memory(argmem: write) }

>From 6f41cfb6b30057e5ad271708f09607d0ce562779 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <[email protected]>
Date: Fri, 23 Jan 2026 22:14:11 -0600
Subject: [PATCH 3/3] Add a new lit-test

---
 llvm/test/CodeGen/AMDGPU/early-attributor.ll | 504 +++++++++++++++++++
 1 file changed, 504 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/early-attributor.ll

diff --git a/llvm/test/CodeGen/AMDGPU/early-attributor.ll 
b/llvm/test/CodeGen/AMDGPU/early-attributor.ll
new file mode 100644
index 0000000000000..a01fa93ce928e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/early-attributor.ll
@@ -0,0 +1,504 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 6
+; RUN: opt < %s --amdgpu-internalize-symbols --passes='default<O2>' 
-mtriple=amdgcn-amd-amdhsa -S | FileCheck %s
+
+; By running AMDGPUAttributor early, "uniform-work-group-size"="true" can be applied to non-kernel functions.
+; Make sure the following pattern of reading blockDim.y from the implicitarg pointer is cleaned up.
+; A load from the implicitarg pointer, a compare and select to drive an offset, and a final load using
+; that offset are simplified into one direct load with a constant offset.
+;
+;  %13 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+;  %14 = getelementptr inbounds nuw i8, ptr addrspace(4) %13, i64 4
+;  %15 = tail call i32 @llvm.amdgcn.workgroup.id.y()
+;  %16 = load i32, ptr addrspace(4) %14, align 4
+;  %17 = icmp ult i32 %15, %16
+;  %18 = select i1 %17, i64 14, i64 20
+;  %19 = getelementptr inbounds nuw i8, ptr addrspace(4) %13, i64 %18
+;  %20 = load i16, ptr addrspace(4) %19, align 2
+;
+; -->
+;
+;  %13 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+;  %14 = getelementptr inbounds nuw i8, ptr addrspace(4) %13, i64 14
+;  %15 = load i16, ptr addrspace(4) %14, align 2
+;
+; If AMDGPUAttributor is not run early, the target pattern for the optimization enabled by
+; "uniform-work-group-size"="true" disappears once the non-kernel callee is transformed (e.g. unrolled).
+; After inlining into the kernel, the kernel still carries "uniform-work-group-size"="true", but by then
+; the attribute is useless.
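+;
+; For context, a minimal HIP-style kernel of roughly this shape (illustrative only; the exact source
+; that generated this IR is not part of the test, and the names below are assumptions) produces the
+; pattern, since blockDim.y lowers through __ockl_get_local_size(1), which reads the hidden
+; group-size/remainder implicit-argument fields:
+;
+;   __global__ void add4(const double *a, const double *b, double *c) {
+;     for (int i = threadIdx.y; i < 4; i += blockDim.y)
+;       c[i] = a[i] + b[i];
+;   }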
+
+target datalayout = 
"m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+%class.anon = type { ptr, ptr, ptr }
+
+$_Z11MassVec3DPAILm192EEvPKdS1_Pdi = comdat any
+
+$_ZZ11MassVec3DPAILm192EEvPKdS1_PdiENKUliE_clEi = comdat any
+
+$_ZN25__hip_builtin_threadIdx_t7__get_zEv = comdat any
+
+$_ZN25__hip_builtin_threadIdx_t7__get_yEv = comdat any
+
+$_ZN24__hip_builtin_blockDim_t7__get_yEv = comdat any
+
+@__const.__assert_fail.fmt = hidden unnamed_addr addrspace(4) constant [47 x 
i8] c"%s:%u: %s: Device-side assertion `%s' failed.\0A\00", align 16
+@__oclc_ISA_version = hidden local_unnamed_addr addrspace(4) constant i32 
9402, align 4
+@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant 
i32 600
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define protected amdgpu_kernel void @_Z11MassVec3DPAILm192EEvPKdS1_Pdi(ptr 
addrspace(1) noundef %0, ptr addrspace(1) noundef %1, ptr addrspace(1) noundef 
%2, i32 noundef %3) #0 comdat {
+; CHECK-LABEL: define protected amdgpu_kernel void 
@_Z11MassVec3DPAILm192EEvPKdS1_Pdi(
+; CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[TMP0:%.*]], 
ptr addrspace(1) noundef readonly captures(none) [[TMP1:%.*]], ptr addrspace(1) 
noundef writeonly captures(none) [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) 
local_unnamed_addr #[[ATTR0:[0-9]+]] comdat {
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call range(i32 0, 1024) i32 
@llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp samesign ult i32 [[TMP5]], 4
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[DOTLR_PH_I:.*]], label 
%[[_ZZ11MASSVEC3DPAILM192EEVPKDS1_PDIENKULIE_CLEI_EXIT:.*]]
+; CHECK:       [[_LR_PH_I:.*:]]
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr 
addrspace(4) [[TMP7]], i64 14
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr addrspace(4) [[TMP8]], align 2, 
!tbaa [[SHORT_TBAA1:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    br label %[[BB21:.*]]
+; CHECK:       [[__CRIT_EDGE_I:.*:]]
+; CHECK-NEXT:    [[DOT0910_1_I:%.*]] = phi i32 [ [[TMP19:%.*]], 
%[[DOT_CRIT_EDGE_I:.*]] ], [ [[TMP5]], %[[BB21]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw i32 [[DOT0910_1_I]], 4
+; CHECK-NEXT:    [[TMP12:%.*]] = zext nneg i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw double, ptr 
addrspace(1) [[TMP0]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr addrspace(1) [[TMP13]], 
align 8, !tbaa [[DOUBLE_TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw double, ptr 
addrspace(1) [[TMP1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load double, ptr addrspace(1) [[TMP15]], 
align 8, !tbaa [[DOUBLE_TBAA5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd contract double [[TMP14]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw double, ptr 
addrspace(1) [[TMP2]], i64 [[TMP12]]
+; CHECK-NEXT:    store double [[TMP17]], ptr addrspace(1) [[TMP18]], align 8, 
!tbaa [[DOUBLE_TBAA5]]
+; CHECK-NEXT:    [[TMP19]] = add nuw nsw i32 [[DOT0910_1_I]], [[TMP10]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp samesign ult i32 [[TMP19]], 4
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[DOT_CRIT_EDGE_I]], label 
%[[_ZZ11MASSVEC3DPAILM192EEVPKDS1_PDIENKULIE_CLEI_EXIT]], !llvm.loop 
[[LOOP9:![0-9]+]]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    [[DOT0910_I:%.*]] = phi i32 [ [[TMP5]], %[[DOTLR_PH_I]] ], [ 
[[TMP29:%.*]], %[[BB21]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext nneg i32 [[DOT0910_I]] to i64
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw double, ptr 
addrspace(1) [[TMP0]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load double, ptr addrspace(1) [[TMP23]], 
align 8, !tbaa [[DOUBLE_TBAA5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw double, ptr 
addrspace(1) [[TMP1]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load double, ptr addrspace(1) [[TMP25]], 
align 8, !tbaa [[DOUBLE_TBAA5]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd contract double [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw double, ptr 
addrspace(1) [[TMP2]], i64 [[TMP22]]
+; CHECK-NEXT:    store double [[TMP27]], ptr addrspace(1) [[TMP28]], align 8, 
!tbaa [[DOUBLE_TBAA5]]
+; CHECK-NEXT:    [[TMP29]] = add nuw nsw i32 [[DOT0910_I]], [[TMP10]]
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp samesign ult i32 [[TMP29]], 4
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[BB21]], label %[[DOT_CRIT_EDGE_I]], 
!llvm.loop [[LOOP9]]
+; CHECK:       [[_ZZ11MASSVEC3DPAILM192EEVPKDS1_PDIENKULIE_CLEI_EXIT]]:
+; CHECK-NEXT:    ret void
+;
+  %5 = alloca ptr, align 8, addrspace(5)
+  %6 = alloca ptr, align 8, addrspace(5)
+  %7 = alloca ptr, align 8, addrspace(5)
+  %8 = alloca ptr, align 8, addrspace(5)
+  %9 = alloca ptr, align 8, addrspace(5)
+  %10 = alloca ptr, align 8, addrspace(5)
+  %11 = alloca i32, align 4, addrspace(5)
+  %12 = alloca %class.anon, align 8, addrspace(5)
+  %13 = addrspacecast ptr addrspace(5) %5 to ptr
+  %14 = addrspacecast ptr addrspace(5) %6 to ptr
+  %15 = addrspacecast ptr addrspace(5) %7 to ptr
+  %16 = addrspacecast ptr addrspace(5) %8 to ptr
+  %17 = addrspacecast ptr addrspace(5) %9 to ptr
+  %18 = addrspacecast ptr addrspace(5) %10 to ptr
+  %19 = addrspacecast ptr addrspace(5) %11 to ptr
+  %20 = addrspacecast ptr addrspace(5) %12 to ptr
+  store ptr addrspace(1) %0, ptr %13, align 8
+  %21 = load ptr, ptr %13, align 8, !tbaa !7
+  store ptr addrspace(1) %1, ptr %14, align 8
+  %22 = load ptr, ptr %14, align 8, !tbaa !7
+  store ptr addrspace(1) %2, ptr %15, align 8
+  %23 = load ptr, ptr %15, align 8, !tbaa !7
+  store ptr %21, ptr %16, align 8, !tbaa !7
+  store ptr %22, ptr %17, align 8, !tbaa !7
+  store ptr %23, ptr %18, align 8, !tbaa !7
+  store i32 %3, ptr %19, align 4, !tbaa !12
+  call void @llvm.lifetime.start.p5(ptr addrspace(5) %12)
+  %24 = getelementptr inbounds nuw %class.anon, ptr %20, i32 0, i32 0
+  store ptr %18, ptr %24, align 8, !tbaa !14
+  %25 = getelementptr inbounds nuw %class.anon, ptr %20, i32 0, i32 1
+  store ptr %16, ptr %25, align 8, !tbaa !14
+  %26 = getelementptr inbounds nuw %class.anon, ptr %20, i32 0, i32 2
+  store ptr %17, ptr %26, align 8, !tbaa !14
+  %27 = call noundef i32 @_ZN25__hip_builtin_threadIdx_t7__get_zEv() #8
+  call void @_ZZ11MassVec3DPAILm192EEvPKdS1_PdiENKUliE_clEi(ptr noundef 
nonnull align 8 dereferenceable(24) %20, i32 noundef %27) #8
+  call void @llvm.lifetime.end.p5(ptr addrspace(5) %12)
+  ret void
+}
+
+; Function Attrs: convergent inlinehint mustprogress nounwind
+define weak_odr hidden void 
@_ZZ11MassVec3DPAILm192EEvPKdS1_PdiENKUliE_clEi(ptr noundef nonnull align 8 
dereferenceable(24) %0, i32 noundef %1) #1 comdat align 2 {
+  %3 = alloca ptr, align 8, addrspace(5)
+  %4 = alloca i32, align 4, addrspace(5)
+  %5 = alloca i32, align 4, addrspace(5)
+  %6 = alloca i32, align 4, addrspace(5)
+  %7 = alloca i32, align 4, addrspace(5)
+  %8 = alloca i32, align 4, addrspace(5)
+  %9 = addrspacecast ptr addrspace(5) %3 to ptr
+  %10 = addrspacecast ptr addrspace(5) %4 to ptr
+  %11 = addrspacecast ptr addrspace(5) %5 to ptr
+  %12 = addrspacecast ptr addrspace(5) %6 to ptr
+  %13 = addrspacecast ptr addrspace(5) %7 to ptr
+  %14 = addrspacecast ptr addrspace(5) %8 to ptr
+  store ptr %0, ptr %9, align 8, !tbaa !16
+  store i32 %1, ptr %10, align 4, !tbaa !12
+  %15 = load ptr, ptr %9, align 8
+  call void @llvm.lifetime.start.p5(ptr addrspace(5) %5)
+  store i32 0, ptr %11, align 4, !tbaa !12
+  br label %16
+
+16:                                               ; preds = %57, %2
+  %17 = load i32, ptr %11, align 4, !tbaa !12
+  %18 = icmp slt i32 %17, 2
+  br i1 %18, label %20, label %19
+
+19:                                               ; preds = %16
+  store i32 2, ptr %12, align 4
+  call void @llvm.lifetime.end.p5(ptr addrspace(5) %5)
+  br label %60
+
+20:                                               ; preds = %16
+  call void @llvm.lifetime.start.p5(ptr addrspace(5) %7)
+  %21 = call noundef i32 @_ZN25__hip_builtin_threadIdx_t7__get_yEv() #8
+  store i32 %21, ptr %13, align 4, !tbaa !12
+  br label %22
+
+22:                                               ; preds = %52, %20
+  %23 = load i32, ptr %13, align 4, !tbaa !12
+  %24 = icmp slt i32 %23, 4
+  br i1 %24, label %26, label %25
+
+25:                                               ; preds = %22
+  store i32 5, ptr %12, align 4
+  call void @llvm.lifetime.end.p5(ptr addrspace(5) %7)
+  br label %56
+
+26:                                               ; preds = %22
+  call void @llvm.lifetime.start.p5(ptr addrspace(5) %8)
+  %27 = load i32, ptr %11, align 4, !tbaa !12
+  %28 = mul nsw i32 4, %27
+  %29 = load i32, ptr %13, align 4, !tbaa !12
+  %30 = add nsw i32 %28, %29
+  store i32 %30, ptr %14, align 4, !tbaa !12
+  %31 = getelementptr inbounds nuw %class.anon, ptr %15, i32 0, i32 1
+  %32 = load ptr, ptr %31, align 8, !tbaa !17
+  %33 = load ptr, ptr %32, align 8, !tbaa !7
+  %34 = load i32, ptr %14, align 4, !tbaa !12
+  %35 = sext i32 %34 to i64
+  %36 = getelementptr inbounds double, ptr %33, i64 %35
+  %37 = load double, ptr %36, align 8, !tbaa !19
+  %38 = getelementptr inbounds nuw %class.anon, ptr %15, i32 0, i32 2
+  %39 = load ptr, ptr %38, align 8, !tbaa !21
+  %40 = load ptr, ptr %39, align 8, !tbaa !7
+  %41 = load i32, ptr %14, align 4, !tbaa !12
+  %42 = sext i32 %41 to i64
+  %43 = getelementptr inbounds double, ptr %40, i64 %42
+  %44 = load double, ptr %43, align 8, !tbaa !19
+  %45 = fadd contract double %37, %44
+  %46 = getelementptr inbounds nuw %class.anon, ptr %15, i32 0, i32 0
+  %47 = load ptr, ptr %46, align 8, !tbaa !22
+  %48 = load ptr, ptr %47, align 8, !tbaa !7
+  %49 = load i32, ptr %14, align 4, !tbaa !12
+  %50 = sext i32 %49 to i64
+  %51 = getelementptr inbounds double, ptr %48, i64 %50
+  store double %45, ptr %51, align 8, !tbaa !19
+  call void @llvm.lifetime.end.p5(ptr addrspace(5) %8)
+  br label %52
+
+52:                                               ; preds = %26
+  %53 = call noundef i32 @_ZN24__hip_builtin_blockDim_t7__get_yEv() #8
+  %54 = load i32, ptr %13, align 4, !tbaa !12
+  %55 = add i32 %54, %53
+  store i32 %55, ptr %13, align 4, !tbaa !12
+  br label %22, !llvm.loop !23
+
+56:                                               ; preds = %25
+  br label %57
+
+57:                                               ; preds = %56
+  %58 = load i32, ptr %11, align 4, !tbaa !12
+  %59 = add nsw i32 %58, 1
+  store i32 %59, ptr %11, align 4, !tbaa !12
+  br label %16, !llvm.loop !25
+
+60:                                               ; preds = %19
+  ret void
+}
+
+; Function Attrs: alwaysinline convergent mustprogress nounwind
+define weak_odr hidden noundef i32 @_ZN25__hip_builtin_threadIdx_t7__get_zEv() 
#2 comdat align 2 {
+  %1 = alloca i32, align 4, addrspace(5)
+  %2 = addrspacecast ptr addrspace(5) %1 to ptr
+  %3 = call noundef i32 @_ZL22__hip_get_thread_idx_zv() #8
+  ret i32 %3
+}
+
+; Function Attrs: alwaysinline convergent mustprogress nounwind
+define weak_odr hidden noundef i32 @_ZN25__hip_builtin_threadIdx_t7__get_yEv() 
#2 comdat align 2 {
+  %1 = alloca i32, align 4, addrspace(5)
+  %2 = addrspacecast ptr addrspace(5) %1 to ptr
+  %3 = call noundef i32 @_ZL22__hip_get_thread_idx_yv() #8
+  ret i32 %3
+}
+
+; Function Attrs: alwaysinline convergent mustprogress nounwind
+define weak_odr hidden noundef i32 @_ZN24__hip_builtin_blockDim_t7__get_yEv() 
#2 comdat align 2 {
+  %1 = alloca i32, align 4, addrspace(5)
+  %2 = addrspacecast ptr addrspace(5) %1 to ptr
+  %3 = call noundef i32 @_ZL21__hip_get_block_dim_yv() #8
+  ret i32 %3
+}
+
+; Function Attrs: alwaysinline convergent mustprogress nounwind
+define hidden noundef i32 @_ZL22__hip_get_thread_idx_yv() #2 {
+  %1 = alloca i32, align 4, addrspace(5)
+  %2 = addrspacecast ptr addrspace(5) %1 to ptr
+  %3 = call i64 @__ockl_get_local_id(i32 noundef 1) #9
+  %4 = trunc i64 %3 to i32
+  ret i32 %4
+}
+
+; Function Attrs: alwaysinline convergent mustprogress nounwind
+define hidden noundef i32 @_ZL21__hip_get_block_dim_yv() #2 {
+  %1 = alloca i32, align 4, addrspace(5)
+  %2 = addrspacecast ptr addrspace(5) %1 to ptr
+  %3 = call i64 @__ockl_get_local_size(i32 noundef 1) #9
+  %4 = trunc i64 %3 to i32
+  ret i32 %4
+}
+
+; Function Attrs: alwaysinline convergent mustprogress nounwind
+define hidden noundef i32 @_ZL22__hip_get_thread_idx_zv() #2 {
+  %1 = alloca i32, align 4, addrspace(5)
+  %2 = addrspacecast ptr addrspace(5) %1 to ptr
+  %3 = call i64 @__ockl_get_local_id(i32 noundef 2) #9
+  %4 = trunc i64 %3 to i32
+  ret i32 %4
+}
+
+; Function Attrs: convergent mustprogress nofree norecurse nosync nounwind 
willreturn memory(none)
+define hidden range(i64 0, 1024) i64 @__ockl_get_local_id(i32 noundef %0) #3 {
+  switch i32 %0, label %8 [
+  i32 0, label %2
+  i32 1, label %4
+  i32 2, label %6
+  ]
+
+2:                                                ; preds = %1
+  %3 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+  br label %8
+
+4:                                                ; preds = %1
+  %5 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+  br label %8
+
+6:                                                ; preds = %1
+  %7 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+  br label %8
+
+8:                                                ; preds = %6, %4, %2, %1
+  %9 = phi i32 [ %7, %6 ], [ %5, %4 ], [ %3, %2 ], [ 0, %1 ]
+  %10 = zext nneg i32 %9 to i64
+  ret i64 %10
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #4
+
+; Function Attrs: convergent mustprogress nofree norecurse nosync nounwind 
willreturn memory(none)
+define hidden range(i64 0, 65536) i64 @__ockl_get_local_size(i32 noundef %0) 
#5 {
+  switch i32 %0, label %76 [
+  i32 0, label %2
+  i32 1, label %26
+  i32 2, label %51
+  ]
+
+2:                                                ; preds = %1
+  %3 = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4, !tbaa !26
+  %4 = icmp slt i32 %3, 500
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = tail call align 4 dereferenceable(64) ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
+  %7 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %8 = getelementptr inbounds nuw i8, ptr addrspace(4) %6, i64 4
+  %9 = load i16, ptr addrspace(4) %8, align 4, !range !30, !invariant.load 
!31, !noundef !31
+  %10 = zext nneg i16 %9 to i32
+  %11 = getelementptr inbounds nuw i8, ptr addrspace(4) %6, i64 12
+  %12 = load i32, ptr addrspace(4) %11, align 4, !tbaa !32
+  %13 = mul i32 %7, %10
+  %14 = sub i32 %12, %13
+  %15 = tail call i32 @llvm.umin.i32(i32 %14, i32 %10)
+  %16 = zext nneg i32 %15 to i64
+  br label %76
+
+17:                                               ; preds = %2
+  %18 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %19 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %20 = load i32, ptr addrspace(4) %19, align 4, !tbaa !26
+  %21 = icmp ult i32 %18, %20
+  %22 = select i1 %21, i64 12, i64 18
+  %23 = getelementptr inbounds nuw i8, ptr addrspace(4) %19, i64 %22
+  %24 = load i16, ptr addrspace(4) %23, align 2, !tbaa !39
+  %25 = zext i16 %24 to i64
+  br label %76
+
+26:                                               ; preds = %1
+  %27 = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4, !tbaa !26
+  %28 = icmp slt i32 %27, 500
+  br i1 %28, label %29, label %41
+
+29:                                               ; preds = %26
+  %30 = tail call align 4 dereferenceable(64) ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
+  %31 = tail call i32 @llvm.amdgcn.workgroup.id.y()
+  %32 = getelementptr inbounds nuw i8, ptr addrspace(4) %30, i64 6
+  %33 = load i16, ptr addrspace(4) %32, align 2, !range !30, !invariant.load 
!31, !noundef !31
+  %34 = zext nneg i16 %33 to i32
+  %35 = getelementptr inbounds nuw i8, ptr addrspace(4) %30, i64 16
+  %36 = load i32, ptr addrspace(4) %35, align 8, !tbaa !40
+  %37 = mul i32 %31, %34
+  %38 = sub i32 %36, %37
+  %39 = tail call i32 @llvm.umin.i32(i32 %38, i32 %34)
+  %40 = zext nneg i32 %39 to i64
+  br label %76
+
+41:                                               ; preds = %26
+  %42 = tail call i32 @llvm.amdgcn.workgroup.id.y()
+  %43 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %44 = getelementptr inbounds nuw i8, ptr addrspace(4) %43, i64 4
+  %45 = load i32, ptr addrspace(4) %44, align 4, !tbaa !26
+  %46 = icmp ult i32 %42, %45
+  %47 = select i1 %46, i64 14, i64 20
+  %48 = getelementptr inbounds nuw i8, ptr addrspace(4) %43, i64 %47
+  %49 = load i16, ptr addrspace(4) %48, align 2, !tbaa !39
+  %50 = zext i16 %49 to i64
+  br label %76
+
+51:                                               ; preds = %1
+  %52 = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4, !tbaa !26
+  %53 = icmp slt i32 %52, 500
+  br i1 %53, label %54, label %66
+
+54:                                               ; preds = %51
+  %55 = tail call align 4 dereferenceable(64) ptr addrspace(4) 
@llvm.amdgcn.dispatch.ptr()
+  %56 = tail call i32 @llvm.amdgcn.workgroup.id.z()
+  %57 = getelementptr inbounds nuw i8, ptr addrspace(4) %55, i64 8
+  %58 = load i16, ptr addrspace(4) %57, align 4, !range !30, !invariant.load 
!31, !noundef !31
+  %59 = zext nneg i16 %58 to i32
+  %60 = getelementptr inbounds nuw i8, ptr addrspace(4) %55, i64 20
+  %61 = load i32, ptr addrspace(4) %60, align 4, !tbaa !41
+  %62 = mul i32 %56, %59
+  %63 = sub i32 %61, %62
+  %64 = tail call i32 @llvm.umin.i32(i32 %63, i32 %59)
+  %65 = zext nneg i32 %64 to i64
+  br label %76
+
+66:                                               ; preds = %51
+  %67 = tail call i32 @llvm.amdgcn.workgroup.id.z()
+  %68 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %69 = getelementptr inbounds nuw i8, ptr addrspace(4) %68, i64 8
+  %70 = load i32, ptr addrspace(4) %69, align 4, !tbaa !26
+  %71 = icmp ult i32 %67, %70
+  %72 = select i1 %71, i64 16, i64 22
+  %73 = getelementptr inbounds nuw i8, ptr addrspace(4) %68, i64 %72
+  %74 = load i16, ptr addrspace(4) %73, align 2, !tbaa !39
+  %75 = zext i16 %74 to i64
+  br label %76
+
+76:                                               ; preds = %66, %54, %41, 
%29, %17, %5, %1
+  %77 = phi i64 [ 1, %1 ], [ %16, %5 ], [ %25, %17 ], [ %40, %29 ], [ %50, %41 
], [ %65, %54 ], [ %75, %66 ]
+  ret i64 %77
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef nonnull align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.x() #4
+
+; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind 
speculatable willreturn memory(none)
+declare i32 @llvm.umin.i32(i32, i32) #6
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.y() #4
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn 
memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.z() #4
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: 
readwrite)
+declare void @llvm.lifetime.start.p5(ptr addrspace(5) captures(none)) #7
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: 
readwrite)
+declare void @llvm.lifetime.end.p5(ptr addrspace(5) captures(none)) #7
+
+attributes #0 = { convergent mustprogress norecurse nounwind 
"amdgpu-flat-work-group-size"="1,192" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx942" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 "uniform-work-group-size"="true" }
+attributes #1 = { convergent inlinehint mustprogress nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="gfx942" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 }
+attributes #2 = { alwaysinline convergent mustprogress nounwind 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-cpu"="gfx942" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 }
+attributes #3 = { convergent mustprogress nofree norecurse nosync nounwind 
willreturn memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx942" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64,+xf32-insts"
 "uniform-work-group-size"="false" }
+attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn 
memory(none) }
+attributes #5 = { convergent mustprogress nofree norecurse nosync nounwind 
willreturn memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" 
"amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" 
"amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workitem-id-x" 
"amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="gfx942" 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gws,+mai-insts,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize64,+xf32-insts"
 "uniform-work-group-size"="false" }
+attributes #6 = { nocallback nocreateundeforpoison nofree nosync nounwind 
speculatable willreturn memory(none) }
+attributes #7 = { nocallback nofree nosync nounwind willreturn memory(argmem: 
readwrite) }
+attributes #8 = { convergent nounwind }
+attributes #9 = { convergent nounwind willreturn memory(none) }
+
+!opencl.ocl.version = !{!6, !6, !6, !6, !6, !6, !6, !6, !6, !6}
+
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 600}
+!6 = !{i32 2, i32 0}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"p1 double", !9, i64 0}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!13, !13, i64 0}
+!13 = !{!"int", !10, i64 0}
+!14 = !{!15, !15, i64 0}
+!15 = !{!"p2 double", !9, i64 0}
+!16 = !{!9, !9, i64 0}
+!17 = !{!18, !15, i64 8}
+!18 = !{!"_ZTSZ11MassVec3DPAILm192EEvPKdS1_PdiEUliE_", !15, i64 0, !15, i64 8, 
!15, i64 16}
+!19 = !{!20, !20, i64 0}
+!20 = !{!"double", !10, i64 0}
+!21 = !{!18, !15, i64 16}
+!22 = !{!18, !15, i64 0}
+!23 = distinct !{!23, !24}
+!24 = !{!"llvm.loop.mustprogress"}
+!25 = distinct !{!25, !24}
+!26 = !{!27, !27, i64 0}
+!27 = !{!"int", !28, i64 0}
+!28 = !{!"omnipotent char", !29, i64 0}
+!29 = !{!"Simple C/C++ TBAA"}
+!30 = !{i16 1, i16 1025}
+!31 = !{}
+!32 = !{!33, !27, i64 12}
+!33 = !{!"hsa_kernel_dispatch_packet_s", !34, i64 0, !34, i64 2, !34, i64 4, 
!34, i64 6, !34, i64 8, !34, i64 10, !27, i64 12, !27, i64 16, !27, i64 20, 
!27, i64 24, !27, i64 28, !28, i64 32, !35, i64 40, !37, i64 48, !38, i64 56}
+!34 = !{!"short", !28, i64 0}
+!35 = !{!"p1 void", !36, i64 0}
+!36 = !{!"any pointer", !28, i64 0}
+!37 = !{!"long", !28, i64 0}
+!38 = !{!"hsa_signal_s", !37, i64 0}
+!39 = !{!34, !34, i64 0}
+!40 = !{!33, !27, i64 16}
+!41 = !{!33, !27, i64 20}
+;.
+; CHECK: [[SHORT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"short", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[DOUBLE_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"double", [[META7:![0-9]+]], i64 0}
+; CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0}
+; CHECK: [[META8]] = !{!"Simple C++ TBAA"}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]]}
+; CHECK: [[META10]] = !{!"llvm.loop.mustprogress"}
+;.

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
