https://github.com/nhaehnle created 
https://github.com/llvm/llvm-project/pull/170512

This change is motivated by the overall goal of finding alternative ways
to promote allocas to VGPRs. The current solution is effectively limited
to allocas whose size matches a register class, and we can't keep adding
more register classes. We have some downstream work in this direction,
and I'm currently looking at cleaning that up to bring it upstream.

This refactor paves the way for adding a third way of promoting allocas,
on top of the existing alloca-to-vector and alloca-to-LDS. Much of the
analysis can be shared between the different promotion techniques.

Additionally, the idea behind splitting the pass into an analysis
phase and a commit phase is that it ought to allow us to more easily make
better "big picture" decisions in the future about which allocas to promote
and how.
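
As a rough illustration of that split, here is a simplified, self-contained sketch. It is not the actual pass code: `AllocaPlan`, `commitAllocas`, and the budget handling are stand-ins that only mirror the shape of the `AllocaAnalysis` / `run()` logic in the patch below.

```cpp
// Phase 1 (analysis) builds one record per alloca describing which promotion
// strategies are viable and how profitable the alloca looks; no IR is touched,
// so a third strategy only needs another "viable" flag and analysis routine.
#include <algorithm>
#include <vector>

struct AllocaPlan {
  unsigned SizeInBits = 0;   // cost counted against the vectorization budget
  unsigned Score = 0;        // profitability estimate (users, loop depth)
  bool VectorViable = false; // stand-in for analyzePromoteToVector succeeding
  bool LDSViable = false;    // stand-in for analyzePromoteToLDS succeeding
};

// Phase 2 (commit) makes the "big picture" decisions using only the
// precomputed analyses, e.g. spending the vector budget on the
// highest-scoring allocas first.
unsigned commitAllocas(std::vector<AllocaPlan> Plans, unsigned VectorBudget) {
  std::stable_sort(Plans.begin(), Plans.end(),
                   [](const AllocaPlan &A, const AllocaPlan &B) {
                     return A.Score > B.Score;
                   });
  unsigned NumPromoted = 0;
  for (const AllocaPlan &P : Plans) {
    if (P.VectorViable && P.SizeInBits <= VectorBudget) {
      VectorBudget -= P.SizeInBits; // promoteAllocaToVector(...) in the patch
      ++NumPromoted;
    } else if (P.LDSViable) {
      ++NumPromoted;                // tryPromoteAllocaToLDS(...) in the patch
    }
  }
  return NumPromoted;
}
```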

---

**Stack**:
- [5/5] #170512 ⬅
- [4/5] #170511
- [3/5] #170510
- [2/5] #170509
- [1/5] #170508


⚠️ *Part of a stack created by [spr](https://github.com/nhaehnle/spr). Merging 
this PR using the GitHub UI may have unexpected results.*

From 269205deb1ffc85e8acfef7e1f8e3a628b528d17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <[email protected]>
Date: Fri, 7 Nov 2025 16:05:38 -0800
Subject: [PATCH] AMDGPU/PromoteAlloca: Refactor into analysis / commit phases

This change is motivated by the overall goal of finding alternative ways
to promote allocas to VGPRs. The current solution is effectively limited
to allocas whose size matches a register class, and we can't keep adding
more register classes. We have some downstream work in this direction,
and I'm currently looking at cleaning that up to bring it upstream.

This refactor paves the way for adding a third way of promoting allocas,
on top of the existing alloca-to-vector and alloca-to-LDS. Much of the
analysis can be shared between the different promotion techniques.

Additionally, the idea behind splitting the pass into an analysis
phase and a commit phase is that it ought to allow us to more easily make
better "big picture" decisions in the future about which allocas to promote
and how.

commit-id:138f5985
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 644 ++++++++++--------
 .../AMDGPU/promote-alloca-negative-index.ll   |   6 +-
 .../CodeGen/AMDGPU/promote-alloca-scoring.ll  |  64 +-
 3 files changed, 382 insertions(+), 332 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index efd3664266dee..f431535c722ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,42 @@ static cl::opt<unsigned>
                             "when sorting profitable allocas"),
                    cl::init(4));
 
+// We support vector indices of the form (A * stride) + B
+// All parts are optional.
+struct GEPToVectorIndex {
+  Value *VarIndex = nullptr;         // defaults to 0
+  ConstantInt *VarMul = nullptr;     // defaults to 1
+  ConstantInt *ConstIndex = nullptr; // defaults to 0
+  Value *Full = nullptr;
+};
+
+struct MemTransferInfo {
+  ConstantInt *SrcIndex = nullptr;
+  ConstantInt *DestIndex = nullptr;
+};
+
+// Analysis for planning the different strategies of alloca promotion.
+struct AllocaAnalysis {
+  AllocaInst *Alloca = nullptr;
+  SmallVector<Value *> Pointers;
+  SmallVector<Use *> Uses;
+  unsigned Score = 0;
+  bool HaveSelectOrPHI = false;
+  struct {
+    FixedVectorType *Ty = nullptr;
+    SmallVector<Instruction *> Worklist;
+    SmallVector<Instruction *> UsersToRemove;
+    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
+    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
+  } Vector;
+  struct {
+    bool Enable = false;
+    SmallVector<User *> Worklist;
+  } LDS;
+
+  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+};
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
@@ -106,10 +142,7 @@ class AMDGPUPromoteAllocaImpl {
   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
 
-  /// BaseAlloca is the alloca root the search started from.
-  /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
-                               std::vector<Value *> &WorkList) const;
+  bool collectAllocaUses(AllocaAnalysis &AA) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
@@ -123,10 +156,12 @@ class AMDGPUPromoteAllocaImpl {
   bool hasSufficientLocalMem(const Function &F);
 
   FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
-  bool tryPromoteAllocaToVector(AllocaInst &I);
-  bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+  void analyzePromoteToVector(AllocaAnalysis &AA) const;
+  void promoteAllocaToVector(AllocaAnalysis &AA);
+  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
 
-  void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+  void scoreAlloca(AllocaAnalysis &AA) const;
 
   void setFunctionLimits(const Function &F);
 
@@ -237,53 +272,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
 
-static void collectAllocaUses(AllocaInst &Alloca,
-                              SmallVectorImpl<Use *> &Uses) {
-  SmallVector<Instruction *, 4> WorkList({&Alloca});
+bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
+  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
+    LLVM_DEBUG(dbgs() << "  Cannot promote alloca: " << Msg << "\n"
+                      << "    " << *Inst << "\n");
+    return false;
+  };
+
+  SmallVector<Instruction *, 4> WorkList({AA.Alloca});
   while (!WorkList.empty()) {
     auto *Cur = WorkList.pop_back_val();
+    if (find(AA.Pointers, Cur) != AA.Pointers.end())
+      continue;
+    AA.Pointers.push_back(Cur);
     for (auto &U : Cur->uses()) {
-      Uses.push_back(&U);
+      auto *Inst = cast<Instruction>(U.getUser());
+      if (isa<StoreInst>(Inst)) {
+        if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
+          return RejectUser(Inst, "pointer escapes via store");
+        }
+      }
+      AA.Uses.push_back(&U);
 
-      if (isa<GetElementPtrInst>(U.getUser()))
-        WorkList.push_back(cast<Instruction>(U.getUser()));
+      if (isa<GetElementPtrInst>(U.getUser())) {
+        WorkList.push_back(Inst);
+      } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
+        // Only promote a select if we know that the other select operand is
+        // from another pointer that will also be promoted.
+        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
+          return RejectUser(Inst, "select from mixed objects");
+        WorkList.push_back(Inst);
+        AA.HaveSelectOrPHI = true;
+      } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+        // Repeat for phis.
+
+        // TODO: Handle more complex cases. We should be able to replace loops
+        // over arrays.
+        switch (Phi->getNumIncomingValues()) {
+        case 1:
+          break;
+        case 2:
+          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
+            return RejectUser(Inst, "phi from mixed objects");
+          break;
+        default:
+          return RejectUser(Inst, "phi with too many operands");
+        }
+
+        WorkList.push_back(Inst);
+        AA.HaveSelectOrPHI = true;
+      }
     }
   }
+  return true;
 }
 
-void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
-    SmallVectorImpl<AllocaInst *> &Allocas) {
-  DenseMap<AllocaInst *, unsigned> Scores;
-
-  for (auto *Alloca : Allocas) {
-    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
-    unsigned &Score = Scores[Alloca];
-    // Increment score by one for each user + a bonus for users within loops.
-    SmallVector<Use *, 8> Uses;
-    collectAllocaUses(*Alloca, Uses);
-    for (auto *U : Uses) {
-      Instruction *Inst = cast<Instruction>(U->getUser());
-      if (isa<GetElementPtrInst>(Inst))
-        continue;
-      unsigned UserScore =
-          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
-      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
-      Score += UserScore;
-    }
-    LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
+void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
+  LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
+  unsigned Score = 0;
+  // Increment score by one for each user + a bonus for users within loops.
+  for (auto *U : AA.Uses) {
+    Instruction *Inst = cast<Instruction>(U->getUser());
+    if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+        isa<PHINode>(Inst))
+      continue;
+    unsigned UserScore =
+        1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+    LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
+    Score += UserScore;
   }
-
-  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
-    return Scores.at(A) > Scores.at(B);
-  });
-
-  // clang-format off
-  LLVM_DEBUG(
-    dbgs() << "Sorted Worklist:\n";
-    for (auto *A: Allocas)
-      dbgs() << "  " << *A << "\n";
-  );
-  // clang-format on
+  LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
+  AA.Score = Score;
 }
 
 void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
@@ -320,27 +379,48 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
                                   : (MaxVGPRs * 32)) /
       VGPRBudgetRatio;
 
-  SmallVector<AllocaInst *, 16> Allocas;
+  std::vector<AllocaAnalysis> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
       // Array allocations are probably not worth handling, since an allocation
       // of the array type is the canonical form.
       if (!AI->isStaticAlloca() || AI->isArrayAllocation())
         continue;
-      Allocas.push_back(AI);
+
+      LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
+
+      AllocaAnalysis AA{AI};
+      if (collectAllocaUses(AA)) {
+        analyzePromoteToVector(AA);
+        if (PromoteToLDS)
+          analyzePromoteToLDS(AA);
+        if (AA.Vector.Ty || AA.LDS.Enable) {
+          scoreAlloca(AA);
+          Allocas.push_back(std::move(AA));
+        }
+      }
     }
   }
 
-  sortAllocasToPromote(Allocas);
+  stable_sort(Allocas,
+              [](const auto &A, const auto &B) { return A.Score > B.Score; });
+
+  // clang-format off
+  LLVM_DEBUG(
+    dbgs() << "Sorted Worklist:\n";
+    for (const auto &AA : Allocas)
+      dbgs() << "  " << *AA.Alloca << "\n";
+  );
+  // clang-format on
 
   bool Changed = false;
-  for (AllocaInst *AI : Allocas) {
-    const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
-    // First, check if we have enough budget to vectorize this alloca.
-    if (AllocaCost <= VectorizationBudget) {
-      // If we do, attempt vectorization, otherwise, fall through and try
-      // promoting to LDS instead.
-      if (tryPromoteAllocaToVector(*AI)) {
+  for (AllocaAnalysis &AA : Allocas) {
+    if (AA.Vector.Ty) {
+      const unsigned AllocaCost =
+          DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
+      // First, check if we have enough budget to vectorize this alloca.
+      if (AllocaCost <= VectorizationBudget) {
+        promoteAllocaToVector(AA);
         Changed = true;
         assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
                "Underflow!");
@@ -348,14 +428,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
         LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
                           << VectorizationBudget << "\n");
         continue;
+      } else {
+        LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
+                          << AllocaCost << ", budget:" << VectorizationBudget
+                          << "): " << *AA.Alloca << "\n");
       }
-    } else {
-      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
-                        << AllocaCost << ", budget:" << VectorizationBudget
-                        << "): " << *AI << "\n");
     }
 
-    if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+    if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS))
       Changed = true;
   }
 
@@ -366,11 +446,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   return Changed;
 }
 
-struct MemTransferInfo {
-  ConstantInt *SrcIndex = nullptr;
-  ConstantInt *DestIndex = nullptr;
-};
-
 // Checks if the instruction I is a memset user of the alloca AI that we can
 // deal with. Currently, only non-volatile memsets that affect the whole alloca
 // are handled.
@@ -388,23 +463,49 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
 }
 
-static Value *calculateVectorIndex(
-    Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
-  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
-  if (!GEP)
-    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
+  IRBuilder<> B(Ptr->getContext());
+
+  Ptr = Ptr->stripPointerCasts();
+  if (Ptr == AA.Alloca)
+    return B.getInt32(0);
+
+  auto *GEP = cast<GetElementPtrInst>(Ptr);
+  auto I = AA.Vector.GEPVectorIdx.find(GEP);
+  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
+
+  if (!I->second.Full) {
+    Value *Result = nullptr;
+    B.SetInsertPoint(GEP);
 
-  auto I = GEPIdx.find(GEP);
-  assert(I != GEPIdx.end() && "Must have entry for GEP!");
+    if (I->second.VarIndex) {
+      Result = I->second.VarIndex;
+      Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
 
-  Value *IndexValue = I->second;
-  assert(IndexValue && "index value missing from GEP index map");
-  return IndexValue;
+      if (I->second.VarMul)
+        Result = B.CreateMul(Result, I->second.VarMul);
+    }
+
+    if (I->second.ConstIndex) {
+      if (Result) {
+        Result = B.CreateAdd(Result, I->second.ConstIndex);
+      } else {
+        Result = I->second.ConstIndex;
+      }
+    }
+
+    if (!Result)
+      Result = B.getInt32(0);
+
+    I->second.Full = Result;
+  }
+
+  return I->second.Full;
 }
 
-static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL,
-                               SmallVector<Instruction *> &NewInsts) {
+static std::optional<GEPToVectorIndex>
+computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+                        Type *VecElemTy, const DataLayout &DL) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   LLVMContext &Ctx = GEP->getContext();
@@ -432,7 +533,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   Value *CurPtr = GEP;
   while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
     if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
-      return nullptr;
+      return {};
 
     // Move to the next outer pointer.
     CurPtr = CurGEP->getPointerOperand();
@@ -442,69 +543,58 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
 
   int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
   if (VarOffsets.size() > 1)
-    return nullptr;
+    return {};
 
   APInt IndexQuot;
   int64_t Rem;
   APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
   if (Rem != 0)
-    return nullptr;
-  if (VarOffsets.size() == 0)
-    return ConstantInt::get(Ctx, IndexQuot);
+    return {};
+
+  GEPToVectorIndex Result;
+
+  if (!ConstOffset.isZero())
+    Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
 
-  IRBuilder<> Builder(GEP);
+  if (VarOffsets.empty())
+    return Result;
 
   const auto &VarOffset = VarOffsets.front();
   APInt OffsetQuot;
   APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
   if (Rem != 0 || OffsetQuot.isZero())
-    return nullptr;
-
-  Value *Offset = VarOffset.first;
-  if (!isa<IntegerType>(Offset->getType()))
-    return nullptr;
+    return {};
 
-  Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
-  if (Offset != VarOffset.first)
-    NewInsts.push_back(cast<Instruction>(Offset));
+  Result.VarIndex = VarOffset.first;
+  auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
+  if (!OffsetType)
+    return {};
 
   if (!OffsetQuot.isOne()) {
-    ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
-    Offset = Builder.CreateMul(Offset, ConstMul);
-    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
-      NewInsts.push_back(NewInst);
+    Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
   }
-  if (ConstOffset.isZero())
-    return Offset;
-
-  ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
-  Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
-  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
-    NewInsts.push_back(NewInst);
-  return IndexAdd;
+
+  return Result;
 }
 
 /// Promotes a single user of the alloca to a vector form.
 ///
 /// \param Inst           Instruction to be promoted.
 /// \param DL             Module Data Layout.
-/// \param VectorTy       Vectorized Type.
+/// \param AA             Alloca Analysis.
 /// \param VecStoreSize   Size of \p VectorTy in bytes.
 /// \param ElementSize    Size of \p VectorTy element type in bytes.
-/// \param TransferInfo   MemTransferInst info map.
-/// \param GEPVectorIdx   GEP -> VectorIdx cache.
 /// \param CurVal         Current value of the vector (e.g. last stored value)
 /// \param[out]  DeferredLoads \p Inst is added to this vector if it can't
 ///              be promoted now. This happens when promoting requires \p
 ///              CurVal, but \p CurVal is nullptr.
 /// \return the stored value if \p Inst would have written to the alloca, or
 ///         nullptr otherwise.
-static Value *promoteAllocaUserToVector(
-    Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
-    unsigned VecStoreSize, unsigned ElementSize,
-    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
-    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
-    function_ref<Value *()> GetCurVal) {
+static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
+                                        AllocaAnalysis &AA,
+                                        unsigned VecStoreSize,
+                                        unsigned ElementSize,
+                                        function_ref<Value *()> GetCurVal) {
   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
   // to do more folding, especially in the case of vector splats.
   IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
@@ -526,13 +616,13 @@ static Value *promoteAllocaUserToVector(
         Val, FixedVectorType::get(EltTy, NumPtrElts));
   };
 
-  Type *VecEltTy = VectorTy->getElementType();
+  Type *VecEltTy = AA.Vector.Ty->getElementType();
 
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
     Value *CurVal = GetCurVal();
-    Value *Index = calculateVectorIndex(
-        cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+    Value *Index =
+        calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);
 
     // We're loading the full vector.
     Type *AccessTy = Inst->getType();
@@ -588,7 +678,7 @@ static Value *promoteAllocaUserToVector(
     // to know the current value. If this is a store of a single element, we
     // need to know the value.
     StoreInst *SI = cast<StoreInst>(Inst);
-    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
+    Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
     Value *Val = SI->getValueOperand();
 
     // We're storing the full vector, we can handle this without knowing CurVal.
@@ -598,9 +688,9 @@ static Value *promoteAllocaUserToVector(
       if (CI->isZeroValue() && AccessSize == VecStoreSize) {
         if (AccessTy->isPtrOrPtrVectorTy())
           Val = CreateTempPtrIntCast(Val, AccessTy);
-        else if (VectorTy->isPtrOrPtrVectorTy())
-          Val = CreateTempPtrIntCast(Val, VectorTy);
-        return Builder.CreateBitOrPointerCast(Val, VectorTy);
+        else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
+          Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
+        return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
       }
     }
 
@@ -609,7 +699,7 @@ static Value *promoteAllocaUserToVector(
       assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
       const unsigned NumWrittenElts =
           AccessSize / DL.getTypeStoreSize(VecEltTy);
-      const unsigned NumVecElts = VectorTy->getNumElements();
+      const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
 
@@ -640,14 +730,14 @@ static Value *promoteAllocaUserToVector(
       // For memcpy, we need to know curval.
       ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
       unsigned NumCopied = Length->getZExtValue() / ElementSize;
-      MemTransferInfo *TI = &TransferInfo[MTI];
+      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
       unsigned SrcBegin = TI->SrcIndex->getZExtValue();
       unsigned DestBegin = TI->DestIndex->getZExtValue();
 
       SmallVector<int> Mask;
-      for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
         if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
-          Mask.push_back(SrcBegin < VectorTy->getNumElements()
+          Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
                              ? SrcBegin++
                              : PoisonMaskElem);
         } else {
@@ -676,14 +766,14 @@ static Value *promoteAllocaUserToVector(
           Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
       }
 
-      return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+      return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
     }
 
     if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
       if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
         Intr->replaceAllUsesWith(
             Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
-                            DL.getTypeAllocSize(VectorTy)));
+                            DL.getTypeAllocSize(AA.Vector.Ty)));
         return nullptr;
       }
     }
@@ -838,46 +928,32 @@ AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
   return VectorTy;
 }
 
-// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
-  LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n');
-
-  Type *AllocaTy = Alloca.getAllocatedType();
-  FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy);
-  if (!VectorTy)
-    return false;
+void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
+  if (AA.HaveSelectOrPHI) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector due to select or phi\n");
+    return;
+  }
 
-  std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
-  SmallVector<Instruction *> WorkList;
-  SmallVector<Instruction *> UsersToRemove;
-  SmallVector<Instruction *> DeferredInsts;
-  SmallVector<Instruction *> NewGEPInsts;
-  DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
+  Type *AllocaTy = AA.Alloca->getAllocatedType();
+  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
+  if (!AA.Vector.Ty)
+    return;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
     LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                       << "    " << *Inst << "\n");
-    for (auto *Inst : reverse(NewGEPInsts))
-      Inst->eraseFromParent();
-    return false;
+    AA.Vector.Ty = nullptr;
   };
 
-  SmallVector<Use *, 8> Uses;
-  collectAllocaUses(Alloca, Uses);
-
-  LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
-
-  Type *VecEltTy = VectorTy->getElementType();
+  Type *VecEltTy = AA.Vector.Ty->getElementType();
   unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
   assert(ElementSize > 0);
-  for (auto *U : Uses) {
+  for (auto *U : AA.Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
     if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
-      // This is a store of the pointer, not to the pointer.
-      if (isa<StoreInst>(Inst) &&
-          U->getOperandNo() != StoreInst::getPointerOperandIndex())
-        return RejectUser(Inst, "pointer is being stored");
+      assert(!isa<StoreInst>(Inst) ||
+             U->getOperandNo() == StoreInst::getPointerOperandIndex());
 
       Type *AccessTy = getLoadStoreType(Inst);
       if (AccessTy->isAggregateType())
@@ -893,34 +969,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       Ptr = Ptr->stripPointerCasts();
 
       // Alloca already accessed as vector.
-      if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
-                                DL->getTypeStoreSize(AccessTy)) {
-        WorkList.push_back(Inst);
+      if (Ptr == AA.Alloca &&
+          DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
+              DL->getTypeStoreSize(AccessTy)) {
+        AA.Vector.Worklist.push_back(Inst);
         continue;
       }
 
-      if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
+      if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
         return RejectUser(Inst, "not a supported access type");
 
-      WorkList.push_back(Inst);
+      AA.Vector.Worklist.push_back(Inst);
       continue;
     }
 
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+      auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
-      GEPVectorIdx[GEP] = Index;
-      UsersToRemove.push_back(Inst);
+      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
+      AA.Vector.UsersToRemove.push_back(Inst);
       continue;
     }
 
     if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
-        MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
-      WorkList.push_back(Inst);
+        MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
+      AA.Vector.Worklist.push_back(Inst);
       continue;
     }
 
@@ -933,21 +1010,22 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                 "not a multiple of the vector element size");
 
-      if (TransferInfo.try_emplace(TransferInst).second) {
-        DeferredInsts.push_back(Inst);
-        WorkList.push_back(Inst);
-      }
-
       auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
-        GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
-        if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
-          return nullptr;
+        if (Ptr == AA.Alloca)
+          return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
 
-        return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
+        GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+        const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
+        if (GEPI.VarIndex)
+          return nullptr;
+        if (GEPI.ConstIndex)
+          return GEPI.ConstIndex;
+        return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
       };
 
+      MemTransferInfo *TI =
+          &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
       unsigned OpNum = U->getOperandNo();
-      MemTransferInfo *TI = &TransferInfo[TransferInst];
       if (OpNum == 0) {
         Value *Dest = TransferInst->getDest();
         ConstantInt *Index = getPointerIndexOfAlloca(Dest);
@@ -967,7 +1045,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
     if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
       if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
-        WorkList.push_back(Inst);
+        AA.Vector.Worklist.push_back(Inst);
         continue;
       }
     }
@@ -976,56 +1054,59 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (isAssumeLikeIntrinsic(Inst)) {
       if (!Inst->use_empty())
         return RejectUser(Inst, "assume-like intrinsic cannot have any users");
-      UsersToRemove.push_back(Inst);
+      AA.Vector.UsersToRemove.push_back(Inst);
       continue;
     }
 
     if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
           return isAssumeLikeIntrinsic(cast<Instruction>(U));
         })) {
-      UsersToRemove.push_back(Inst);
+      AA.Vector.UsersToRemove.push_back(Inst);
       continue;
     }
 
     return RejectUser(Inst, "unhandled alloca user");
   }
 
-  while (!DeferredInsts.empty()) {
-    Instruction *Inst = DeferredInsts.pop_back_val();
-    MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
-    // TODO: Support the case if the pointers are from different alloca or
-    // from different address spaces.
-    MemTransferInfo &Info = TransferInfo[TransferInst];
-    if (!Info.SrcIndex || !Info.DestIndex)
-      return RejectUser(
-          Inst, "mem transfer inst is missing constant src and/or dst index");
+  // Follow-up check to ensure we've seen both sides of all transfer insts.
+  for (const auto &Entry : AA.Vector.TransferInfo) {
+    const MemTransferInfo &TI = Entry.second;
+    if (!TI.SrcIndex || !TI.DestIndex)
+      return RejectUser(Entry.first,
+                        "mem transfer inst between different objects");
+    AA.Vector.Worklist.push_back(Entry.first);
   }
+}
 
-  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
-                    << *VectorTy << '\n');
-  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
+void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
+  LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
+  LLVM_DEBUG(dbgs() << "  type conversion: " << *AA.Alloca->getAllocatedType()
+                    << " -> " << *AA.Vector.Ty << '\n');
+  const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
+
+  Type *VecEltTy = AA.Vector.Ty->getElementType();
+  const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
 
   // Alloca is uninitialized memory. Imitate that by making the first value
   // undef.
   SSAUpdater Updater;
-  Updater.Initialize(VectorTy, "promotealloca");
+  Updater.Initialize(AA.Vector.Ty, "promotealloca");
 
-  BasicBlock *EntryBB = Alloca.getParent();
+  BasicBlock *EntryBB = AA.Alloca->getParent();
   BasicBlock::iterator InitInsertPos =
-      skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
-  // Alloca memory is undefined to begin, not poison.
-  Value *AllocaInitValue =
-      new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
-  AllocaInitValue->takeName(&Alloca);
+      skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
+  IRBuilder<> Builder(&*InitInsertPos);
+  Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
+  AllocaInitValue->takeName(AA.Alloca);
 
-  Updater.AddAvailableValue(EntryBB, AllocaInitValue);
+  Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
 
   // First handle the initial worklist, in basic block order.
   //
   // Insert a placeholder whenever we need the vector value at the top of a
   // basic block.
   SmallVector<Instruction *> Placeholders;
-  forEachWorkListItem(WorkList, [&](Instruction *I) {
+  forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
     BasicBlock *BB = I->getParent();
     auto GetCurVal = [&]() -> Value * {
       if (Value *CurVal = Updater.FindValueForBlock(BB))
@@ -1035,15 +1116,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       // placeholder that we will replace later.
       IRBuilder<> Builder(I);
       auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
-          PoisonValue::get(VectorTy), "promotealloca.placeholder"));
+          PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
       Placeholders.push_back(Placeholder);
       Updater.AddAvailableValue(BB, Placeholder);
       return Placeholder;
     };
 
-    Value *Result =
-        promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
-                                  TransferInfo, GEPVectorIdx, GetCurVal);
+    Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
+                                              ElementSize, GetCurVal);
     if (Result)
       Updater.AddAvailableValue(BB, Result);
   });
@@ -1057,23 +1137,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   // Delete all instructions. On the first pass, new dummy loads may have been
   // added so we need to collect them too.
-  DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
+  DenseSet<Instruction *> InstsToDelete(AA.Vector.Worklist.begin(),
+                                        AA.Vector.Worklist.end());
   for (Instruction *I : InstsToDelete) {
     assert(I->use_empty());
     I->eraseFromParent();
   }
 
   // Delete all the users that are known to be removeable.
-  for (Instruction *I : reverse(UsersToRemove)) {
+  for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
     I->dropDroppableUses();
     assert(I->use_empty());
     I->eraseFromParent();
   }
 
   // Alloca should now be dead too.
-  assert(Alloca.use_empty());
-  Alloca.eraseFromParent();
-  return true;
+  assert(AA.Alloca->use_empty());
+  AA.Alloca->eraseFromParent();
 }
 
 std::pair<Value *, Value *>
@@ -1247,61 +1327,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
   return true;
 }
 
-bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
-    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
+void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
+  if (DisablePromoteAllocaToLDS) {
+    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
+    return;
+  }
 
-  for (User *User : Val->users()) {
-    if (is_contained(WorkList, User))
-      continue;
+  // Don't promote the alloca to LDS for shader calling conventions as the work
+  // item ID intrinsics are not supported for these calling conventions.
+  // Furthermore not all LDS is available for some of the stages.
+  const Function &ContainingFunction = *AA.Alloca->getFunction();
+  CallingConv::ID CC = ContainingFunction.getCallingConv();
+
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    break;
+  default:
+    LLVM_DEBUG(
+        dbgs()
+        << "  promote alloca to LDS not supported with calling convention.\n");
+    return;
+  }
+
+  for (Use *Use : AA.Uses) {
+    auto *User = Use->getUser();
 
     if (CallInst *CI = dyn_cast<CallInst>(User)) {
       if (!isCallPromotable(CI))
-        return false;
+        return;
 
-      WorkList.push_back(User);
+      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
+        AA.LDS.Worklist.push_back(User);
       continue;
     }
 
     Instruction *UseInst = cast<Instruction>(User);
     if (UseInst->getOpcode() == Instruction::PtrToInt)
-      return false;
+      return;
 
     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
       if (LI->isVolatile())
-        return false;
+        return;
       continue;
     }
 
     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
       if (SI->isVolatile())
-        return false;
-
-      // Reject if the stored value is not the pointer operand.
-      if (SI->getPointerOperand() != Val)
-        return false;
+        return;
       continue;
     }
 
     if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
       if (RMW->isVolatile())
-        return false;
+        return;
       continue;
     }
 
     if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
       if (CAS->isVolatile())
-        return false;
+        return;
       continue;
     }
 
     // Only promote a select if we know that the other select operand
     // is from another pointer that will also be promoted.
     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
-      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
-        return false;
+      if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
+        return;
 
       // May need to rewrite constant operands.
-      WorkList.push_back(ICmp);
+      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
+        AA.LDS.Worklist.push_back(ICmp);
       continue;
     }
 
@@ -1309,28 +1406,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       // Be conservative if an address could be computed outside the bounds of
       // the alloca.
       if (!GEP->isInBounds())
-        return false;
-    } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
-      // Only promote a select if we know that the other select operand is from
-      // another pointer that will also be promoted.
-      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
-        return false;
-    } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
-      // Repeat for phis.
-
-      // TODO: Handle more complex cases. We should be able to replace loops
-      // over arrays.
-      switch (Phi->getNumIncomingValues()) {
-      case 1:
-        break;
-      case 2:
-        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
-          return false;
-        break;
-      default:
-        return false;
-      }
-    } else if (!isa<ExtractElementInst>(User)) {
+        return;
+    } else if (!isa<ExtractElementInst, SelectInst, PHINode>(User)) {
       // Do not promote vector/aggregate type instructions. It is hard to track
       // their users.
 
@@ -1338,15 +1415,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       //
       // TODO: If we know the address is only observed through flat pointers, we
       // could still promote.
-      return false;
+      return;
     }
 
-    WorkList.push_back(User);
-    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
-      return false;
+    if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
+      AA.LDS.Worklist.push_back(User);
   }
 
-  return true;
+  AA.LDS.Enable = true;
 }
 
 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
@@ -1477,44 +1553,23 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
 }
 
 // FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
                                                     bool SufficientLDS) {
-  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
-
-  if (DisablePromoteAllocaToLDS) {
-    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
-    return false;
-  }
-
-  const DataLayout &DL = Mod->getDataLayout();
-  IRBuilder<> Builder(&I);
-
-  const Function &ContainingFunction = *I.getFunction();
-  CallingConv::ID CC = ContainingFunction.getCallingConv();
-
-  // Don't promote the alloca to LDS for shader calling conventions as the work
-  // item ID intrinsics are not supported for these calling conventions.
-  // Furthermore not all LDS is available for some of the stages.
-  switch (CC) {
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    break;
-  default:
-    LLVM_DEBUG(
-        dbgs()
-        << " promote alloca to LDS not supported with calling convention.\n");
-    return false;
-  }
+  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
 
   // Not likely to have sufficient local memory for promotion.
   if (!SufficientLDS)
     return false;
 
+  const DataLayout &DL = Mod->getDataLayout();
+  IRBuilder<> Builder(AA.Alloca);
+
+  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 
-  Align Alignment =
-      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
+  Align Alignment = DL.getValueOrABITypeAlignment(
+      AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());
 
   // FIXME: This computed padding is likely wrong since it depends on inverse
   // usage order.
@@ -1524,7 +1579,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
   uint32_t AllocSize =
-      WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
+      WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
   NewSize += AllocSize;
 
   if (NewSize > LocalMemLimit) {
@@ -1535,24 +1590,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   CurrentLocalMemUsage = NewSize;
 
-  std::vector<Value *> WorkList;
-
-  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
-    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
-    return false;
-  }
-
   LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
 
-  Function *F = I.getFunction();
+  Function *F = AA.Alloca->getFunction();
 
-  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
+  Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
   GlobalVariable *GV = new GlobalVariable(
       *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
-      Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
+      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
       GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-  GV->setAlignment(I.getAlign());
+  GV->setAlignment(AA.Alloca->getAlign());
 
   Value *TCntY, *TCntZ;
 
@@ -1571,15 +1619,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
   Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
 
   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
-  I.mutateType(Offset->getType());
-  I.replaceAllUsesWith(Offset);
-  I.eraseFromParent();
+  AA.Alloca->mutateType(Offset->getType());
+  AA.Alloca->replaceAllUsesWith(Offset);
+  AA.Alloca->eraseFromParent();
 
   SmallVector<IntrinsicInst *> DeferredIntrs;
 
   PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
 
-  for (Value *V : WorkList) {
+  for (Value *V : AA.LDS.Worklist) {
     CallInst *Call = dyn_cast<CallInst>(V);
     if (!Call) {
       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
index 7da441f2e79d2..7ebb4ca262614 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -12,8 +12,7 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP5]], -1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
@@ -42,8 +41,7 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP5]], -1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
index ab03177d1edc5..ae6157af2cf4c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -1,14 +1,16 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK:      Scoring:   %simpleuser = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:   [+1]:   store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-LABEL: Analyzing:   %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: Scoring:   %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+1]:   store i64 42, ptr addrspace(5) %simpleuser, align 8
 ; CHECK-NEXT:   => Final Score:1
+; CHECK-LABEL: Analyzing:   %manyusers = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT: Scoring:   %manyusers = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:   [+1]:   store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
-; CHECK-NEXT:   [+1]:   %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
-; CHECK-NEXT:   [+1]:   store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
-; CHECK-NEXT:   [+1]:   %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT:   [+1]:   store i64 %v0.add, ptr addrspace(5) %manyusers.1, align 8
+; CHECK-NEXT:   [+1]:   %v0 = load i64, ptr addrspace(5) %manyusers.1, align 8
+; CHECK-NEXT:   [+1]:   store i64 %v1.add, ptr addrspace(5) %manyusers.2, align 8
+; CHECK-NEXT:   [+1]:   %v1 = load i64, ptr addrspace(5) %manyusers.2, align 8
 ; CHECK-NEXT:   => Final Score:4
 ; CHECK-NEXT: Sorted Worklist:
 ; CHECK-NEXT:     %manyusers = alloca [4 x i64], align 4, addrspace(5)
@@ -20,50 +22,52 @@ entry:
   ; should get a score of 4
   %manyusers = alloca [4 x i64], align 4, addrspace(5)
 
-  store i32 42, ptr addrspace(5) %simpleuser
+  store i64 42, ptr addrspace(5) %simpleuser
 
-  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
-  %v0 = load i8, ptr addrspace(5)  %manyusers.1
-  %v0.ext = zext i8 %v0 to i32
-  store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+  %manyusers.1 = getelementptr i64, ptr addrspace(5) %manyusers, i64 2
+  %v0 = load i64, ptr addrspace(5)  %manyusers.1
+  %v0.add = add i64 %v0, 1
+  store i64 %v0.add, ptr addrspace(5) %manyusers.1
 
-  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
-  %v1 = load i8, ptr addrspace(5)  %manyusers.2
-  %v1.ext = zext i8 %v0 to i32
-  store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+  %manyusers.2 = getelementptr i64, ptr addrspace(5) %manyusers, i64 1
+  %v1 = load i64, ptr addrspace(5)  %manyusers.2
+  %v1.add = add i64 %v0, 1
+  store i64 %v1.add, ptr addrspace(5) %manyusers.2
 
   ret void
 }
 
-; CHECK:      Scoring:   %stack = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:   [+5]:   store i32 32, ptr addrspace(5) %stack, align 4
-; CHECK-NEXT:   [+1]:   store i32 42, ptr addrspace(5) %stack, align 4
-; CHECK-NEXT:   [+9]:   store i32 32, ptr addrspace(5) %stack.1, align 4
-; CHECK-NEXT:   [+5]:   %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
-; CHECK-NEXT:   [+1]:   store i32 64, ptr addrspace(5) %stack.2, align 4
-; CHECK-NEXT:   [+9]:   %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-LABEL: Analyzing:   %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: Scoring:   %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+5]:   store i64 32, ptr addrspace(5) %stack, align 8
+; CHECK-NEXT:   [+1]:   store i64 42, ptr addrspace(5) %stack, align 8
+; CHECK-NEXT:   [+9]:   store i64 32, ptr addrspace(5) %stack.1, align 8
+; CHECK-NEXT:   [+5]:   %outer = load i64, ptr addrspace(5) %stack.1, align 8
+; CHECK-NEXT:   [+1]:   store i64 64, ptr addrspace(5) %stack.2, align 8
+; CHECK-NEXT:   [+9]:   %inner = load i64, ptr addrspace(5) %stack.2, align 8
 ; CHECK-NEXT:   => Final Score:30
 define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
 entry:
   ; should get a score of 1
   %stack = alloca [4 x i64], align 4, addrspace(5)
-  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
-  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 16
 
-  store i32 42, ptr addrspace(5) %stack
+  store i64 42, ptr addrspace(5) %stack
   br label %loop.outer
 
 loop.outer:
-  store i32 32, ptr addrspace(5) %stack
-  %outer.cmp = load i1, ptr addrspace(5) %stack.1
+  store i64 32, ptr addrspace(5) %stack
+  %outer = load i64, ptr addrspace(5) %stack.1
   br label %loop.inner
 
 loop.inner:
-  store i32 32, ptr addrspace(5) %stack.1
-  %inner.cmp = load i1, ptr addrspace(5) %stack.2
+  store i64 32, ptr addrspace(5) %stack.1
+  %inner = load i64, ptr addrspace(5) %stack.2
+  %inner.cmp = icmp sge i64 %inner, 0
   br i1 %inner.cmp, label %loop.inner, label %loop.outer
 
 exit:
-  store i32 64, ptr addrspace(5) %stack.2
+  store i64 64, ptr addrspace(5) %stack.2
   ret void
 }
