[llvm-branch-commits] [llvm] [LAA] Support monotonic pointers in LoopAccessAnalysis (PR #140721)

2025-05-27 Thread Sergey Kachkov via llvm-branch-commits

skachkov-sc wrote:

Gentle ping

https://github.com/llvm/llvm-project/pull/140721
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize][NFC] Refactor widening decision logic (PR #140722)

2025-11-10 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc edited 
https://github.com/llvm/llvm-project/pull/140722
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize][NFC] Refactor widening decision logic (PR #140722)

2025-11-10 Thread Sergey Kachkov via llvm-branch-commits

skachkov-sc wrote:

@david-arm addressed, but I've left memoryInstructionCanBeWidened as a separate 
function for now (it contains some early exits, so I think the code would be 
harder to read if it were inlined)

https://github.com/llvm/llvm-project/pull/140722
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize][NFC] Refactor widening decision logic (PR #140722)

2025-11-10 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/140722

>From b08f1f89c1e8b8dd2acb0662fa1da021f27d9ab9 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Mon, 10 Nov 2025 16:35:23 +0300
Subject: [PATCH 1/2] [IVDescriptors] Add unit tests for MonotonicDescriptor

---
 llvm/unittests/Analysis/IVDescriptorsTest.cpp | 131 ++
 1 file changed, 131 insertions(+)

diff --git a/llvm/unittests/Analysis/IVDescriptorsTest.cpp 
b/llvm/unittests/Analysis/IVDescriptorsTest.cpp
index 453800abf9cab..5e31a2bde7e7d 100644
--- a/llvm/unittests/Analysis/IVDescriptorsTest.cpp
+++ b/llvm/unittests/Analysis/IVDescriptorsTest.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Dominators.h"
@@ -259,3 +260,133 @@ for.end:
 EXPECT_EQ(Kind, RecurKind::FMax);
   });
 }
+
+TEST(IVDescriptorsTest, MonotonicIntVar) {
+  // Parse the module.
+  LLVMContext Context;
+
+  std::unique_ptr<Module> M =
+  parseIR(Context,
+  R"(define void @foo(i32 %start, i1 %cond, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %monotonic = phi i32 [ %start, %entry ], [ %monotonic.next, %for.inc ]
+  br i1 %cond, label %if.then, label %for.inc
+
+if.then:
+  %inc = add nsw i32 %monotonic, 1
+  br label %for.inc
+
+for.inc:
+  %monotonic.next = phi i32 [ %inc, %if.then ], [ %monotonic, %for.body ]
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %i.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+})");
+
+  runWithLoopInfoAndSE(
+  *M, "foo", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+Function::iterator FI = F.begin();
+// First basic block is entry - skip it.
+BasicBlock *Header = &*(++FI);
+assert(Header->getName() == "for.body");
+Loop *L = LI.getLoopFor(Header);
+EXPECT_NE(L, nullptr);
+BasicBlock::iterator BBI = Header->begin();
+assert((&*BBI)->getName() == "i");
+PHINode *Phi = dyn_cast<PHINode>(&*(++BBI));
+assert(Phi->getName() == "monotonic");
+BasicBlock *IfThen = &*(++FI);
+assert(IfThen->getName() == "if.then");
+Instruction *StepInst = &*(IfThen->begin());
+assert(StepInst->getName() == "inc");
+BasicBlock *IfEnd = &*(++FI);
+assert(IfEnd->getName() == "for.inc");
+auto *ChainPhi = dyn_cast<PHINode>(&*(IfEnd->begin()));
+assert(ChainPhi->getName() == "monotonic.next");
+MonotonicDescriptor Desc;
+bool IsMonotonicPhi =
+MonotonicDescriptor::isMonotonicPHI(Phi, L, Desc, SE);
+EXPECT_TRUE(IsMonotonicPhi);
+auto &Chain = Desc.getChain();
+EXPECT_TRUE(Chain.size() == 1 && Chain.contains(ChainPhi));
+EXPECT_EQ(Desc.getStepInst(), StepInst);
+EXPECT_EQ(Desc.getPredicateEdge(),
+  MonotonicDescriptor::Edge(IfThen, IfEnd));
+auto *StartSCEV = SE.getSCEV(F.getArg(0));
+auto *StepSCEV = SE.getConstant(StartSCEV->getType(), 1);
+EXPECT_EQ(Desc.getExpr(),
+  SE.getAddRecExpr(StartSCEV, StepSCEV, L, SCEV::FlagNW));
+  });
+}
+
+TEST(IVDescriptorsTest, MonotonicPtrVar) {
+  // Parse the module.
+  LLVMContext Context;
+
+  std::unique_ptr<Module> M =
+  parseIR(Context,
+  R"(define void @foo(ptr %start, i1 %cond, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %monotonic = phi ptr [ %start, %entry ], [ %monotonic.next, %for.inc ]
+  br i1 %cond, label %if.then, label %for.inc
+
+if.then:
+  %inc = getelementptr inbounds i8, ptr %monotonic, i64 4
+  br label %for.inc
+
+for.inc:
+  %monotonic.next = phi ptr [ %inc, %if.then ], [ %monotonic, %for.body ]
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %i.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+})");
+
+  runWithLoopInfoAndSE(
+  *M, "foo", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+Function::iterator FI = F.begin();
+// First basic block is entry - skip it.
+BasicBlock *Header = &*(++FI);
+assert(Header->getName() == "for.body");
+Loop *L = LI.getLoopFor(Header);
+EXPECT_NE(L, nullptr);
+BasicBlock::iterator BBI = Header->begin();
+assert((&*BBI)->getName() == "i");
+PHINode *Phi = dyn_cast<PHINode>(&*(++BBI));
+assert(Phi->getName() == "monotonic");
+BasicBlock *IfThen = &*(++FI);
+assert(IfThen->getName() == "if.then");
+Instruction *StepInst = &*(IfThen->begin());
+assert(StepInst->getName() == "inc");
+BasicBlock *IfEnd = &*(++FI);
+assert(If

[llvm-branch-commits] [llvm] [VPlan] Implement compressed widening of memory instructions (PR #166956)

2025-11-10 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/166956

>From 92342e03b192d37370c9160b13ce1048501eb079 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Fri, 7 Nov 2025 18:09:56 +0300
Subject: [PATCH] [VPlan] Implement compressed widening of memory instructions

---
 .../llvm/Analysis/TargetTransformInfo.h   |  1 +
 .../Transforms/Vectorize/LoopVectorize.cpp| 24 ++
 llvm/lib/Transforms/Vectorize/VPlan.h | 32 ---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 11 ---
 5 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..e8769f5860c77 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1442,6 +1442,7 @@ class TargetTransformInfo {
 Normal,///< The cast is used with a normal load/store.
 Masked,///< The cast is used with a masked load/store.
 GatherScatter, ///< The cast is used with a gather/scatter.
+Compressed,///< The cast is used with an expand load/compress store.
 Interleave,///< The cast is used with an interleaved load/store.
 Reversed,  ///< The cast is used with a reversed load/store.
   };
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e069b2e8103e0..6565c8c036ca0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel {
 CM_Widen_Reverse, // For consecutive accesses with stride -1.
 CM_Interleave,
 CM_GatherScatter,
+CM_Compressed,
 CM_Scalarize,
 CM_VectorCall,
 CM_IntrinsicCall
@@ -3109,9 +3110,9 @@ void 
LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 if (IsUniformMemOpUse(I))
   return true;
 
-return (WideningDecision == CM_Widen ||
-WideningDecision == CM_Widen_Reverse ||
-WideningDecision == CM_Interleave);
+return (
+WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+WideningDecision == CM_Interleave || WideningDecision == 
CM_Compressed);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5192,11 +5193,16 @@ InstructionCost 
LoopVectorizationCostModel::getConsecutiveMemOpCost(
 Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
 
+  if (Decision == CM_Compressed)
+return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+ /*VariableMask*/ true, Alignment,
+ CostKind, I);
+
   assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
  "Expected widen decision.");
-  const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6300,6 +6306,8 @@ 
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   switch (getWideningDecision(I, VF)) {
   case LoopVectorizationCostModel::CM_GatherScatter:
 return TTI::CastContextHint::GatherScatter;
+  case LoopVectorizationCostModel::CM_Compressed:
+return TTI::CastContextHint::Compressed;
   case LoopVectorizationCostModel::CM_Interleave:
 return TTI::CastContextHint::Interleave;
   case LoopVectorizationCostModel::CM_Scalarize:
@@ -7515,8 +7523,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, 
ArrayRef Operands,
   LoopVectorizationCostModel::InstWidening Decision =
   CM.getWideningDecision(I, Range.Start);
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+  bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
   bool Consecutive =
-  Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  Reverse || Compressed || Decision == 
LoopVectorizationCostModel::CM_Widen;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -7546,11 +7555,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, 
ArrayRef Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Compressed, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenStoreRecipe(*Store, Ptr, Operands[0]

[llvm-branch-commits] [llvm] [VPlan] Implement compressed widening of memory instructions (PR #166956)

2025-11-10 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc edited 
https://github.com/llvm/llvm-project/pull/166956
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-11 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc edited 
https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-11 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc edited 
https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-11 Thread Sergey Kachkov via llvm-branch-commits


@@ -8430,6 +8479,46 @@ VPlanPtr 
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // bring the VPlan to its final state.
   // 
---
 
+  // Adjust the recipes for any monotonic phis.
+  for (VPRecipeBase &R : HeaderVPBB->phis()) {
+auto *MonotonicPhi = dyn_cast(&R);

skachkov-sc wrote:

Do you mean the VPIRPhi recipe? I think it's possible, but I'm slightly concerned 
that semantically VPMonotonicPHIRecipe models a loop header phi, so it should be 
derived from VPHeaderPHIRecipe. But yes, we can distinguish monotonic phis 
using LoopVectorizationLegality.

https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits


@@ -4442,6 +4495,29 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const 
Twine &Indent,
 }
 #endif
 
+void VPMonotonicPHIRecipe::execute(VPTransformState &State) {
+  assert(getParent()->getPlan()->getUF() == 1 && "Expected unroll factor 1.");
+  Value *Start = getStartValue()->getLiveInIRValue();
+  BasicBlock *VectorPH =
+  State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
+  PHINode *MonotonicPHI =
+  State.Builder.CreatePHI(Start->getType(), 2, "monotonic.iv");
+  MonotonicPHI->addIncoming(Start, VectorPH);
+  MonotonicPHI->setDebugLoc(getDebugLoc());
+  State.set(this, MonotonicPHI, /*IsScalar=*/true);
+}

skachkov-sc wrote:

The only rationale for the new recipe was to simplify the "adjusting" of the VPlan: we 
need to insert VPInstruction::ComputeMonotonicRecipe at the backedge of such a phi 
(which will increment the phi value by ctpop(mask)). This looks similar to the handling 
of reductions: VPMonotonicPHIRecipe/VPInstruction::ComputeMonotonicResult is 
symmetric to VPReductionPHIRecipe/VPInstruction::ComputeReductionResult. 
Probably the VPWidenPHI recipe could be used there, but there are some places in 
the code where we want to distinguish "monotonic" header phis from the others.
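
For intuition, here is a minimal scalar sketch of what that backedge update amounts 
to after vectorization (illustrative names only, not the actual VPlan recipe code): 
each vector iteration advances the monotonic phi by the number of active mask lanes.

```
#include <bitset>
#include <cstddef>

// One vector iteration with vectorization factor VF: the monotonic phi is
// bumped by ctpop(mask), i.e. by as many lanes as the scalar loop would have
// executed the predicated increment for.
template <size_t VF>
size_t advanceMonotonic(size_t MonotonicIV, const std::bitset<VF> &Mask) {
  return MonotonicIV + Mask.count();
}
```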

https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits


@@ -3193,6 +3239,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public 
VPRecipeBase,
   /// Whether the consecutive accessed addresses are in reverse order.
   bool Reverse;
 
+  /// Whether the consecutive accessed addresses are compressed with mask 
value.
+  bool Compressed;
+

skachkov-sc wrote:

There is no intrinsic before loop vectorization here; we have a plain load/store 
instruction that is placed under some predicate in the original loop, so it 
becomes masked in the LoopVectorizer. The difference from ordinary masked 
loads/stores is that "compressed" loads/stores read or write memory 
consecutively (the number of elements == the number of set mask bits), and the 
elements are then expanded into the masked positions.
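
For readers less familiar with these semantics, here is a minimal scalar sketch of 
the two operations (the helper names are made up; in LLVM IR this behaviour 
corresponds to the llvm.masked.compressstore and llvm.masked.expandload intrinsics):

```
#include <cstddef>

// Compressed store: only the active lanes are written, and they land at
// consecutive addresses starting at Ptr. Returns the number of elements
// actually stored, which equals ctpop(Mask).
template <typename T, size_t VF>
size_t compressStore(T *Ptr, const T (&Vec)[VF], const bool (&Mask)[VF]) {
  size_t Out = 0;
  for (size_t Lane = 0; Lane < VF; ++Lane)
    if (Mask[Lane])
      Ptr[Out++] = Vec[Lane];
  return Out;
}

// Expand load: consecutive elements are read from Ptr and placed into the
// active lanes; inactive lanes are left unchanged (they would take the
// passthrough operand of the intrinsic).
template <typename T, size_t VF>
void expandLoad(const T *Ptr, const bool (&Mask)[VF], T (&Vec)[VF]) {
  size_t In = 0;
  for (size_t Lane = 0; Lane < VF; ++Lane)
    if (Mask[Lane])
      Vec[Lane] = Ptr[In++];
}
```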

https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

skachkov-sc wrote:

> I think you can probably make this independent of #140721 by first just 
> supporting cases where the compressed store does not alias any of the other 
> memory accesses?

Yes, the changes in LAA are fully independent, we can skip them for now.

> Curious if you already have any runtime performance numbers you could share?

We've benchmarked the following loop pattern:
```
// benchmark() is run 32 times

template <typename T>
void benchmark(T *dst, const T *src) {
  size_t idx = 0;
  for (size_t i = 0; i < 1024; ++i) {
    T cur = src[i];
    if (cur != static_cast<T>(0))
      dst[idx++] = cur;
  }
  dst[idx] = static_cast<T>(0);
}
```
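For intuition only, here is a hand-written sketch (assuming VF = 4; this is not the 
vectorizer's actual output) of how the loop above maps onto compressed stores: each 
chunk writes only the selected elements consecutively and advances the destination 
index by their count.

```
#include <cstddef>

template <typename T>
void benchmark_vectorized_sketch(T *dst, const T *src) {
  constexpr size_t VF = 4;
  size_t idx = 0;
  for (size_t i = 0; i < 1024; i += VF) {
    size_t stored = 0;                         // ctpop of this chunk's mask
    for (size_t lane = 0; lane < VF; ++lane) { // stands in for one masked op
      T cur = src[i + lane];
      if (cur != static_cast<T>(0))
        dst[idx + stored++] = cur;             // compress-store: consecutive writes
    }
    idx += stored;                             // monotonic index advances by ctpop
  }
  dst[idx] = static_cast<T>(0);
}
```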
On a SpacemiT-X60 core (a RISC-V CPU with VLEN=256), the results are as follows:

| Type    | cycles (scalar) | cycles (vector) | speedup |
|---------|-----------------|-----------------|---------|
| int16_t | 189151          | 56795           | 3.33x   |
| int32_t | 205712          | 87196           | 2.36x   |
| int64_t | 205757          | 150115          | 1.37x   |

There were no branch mispredicts for the `if (cur != static_cast<T>(0))` branch in 
the scalar case here (due to the specifics of the data in the src array), so I think 
the speedup can be even bigger for more random inputs. We haven't observed any 
significant changes on SPECs though.

https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize][NFC] Refactor widening decision logic (PR #140722)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/140722

>From 3bb26adeaf0ddd4b826aac751fdbb70d00da3089 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Wed, 22 Nov 2023 17:24:08 +0300
Subject: [PATCH] [LoopVectorize][NFC] Refactor widening decision logic

---
 .../Transforms/Vectorize/LoopVectorize.cpp| 51 +--
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 906fa2f857c21..914018591d832 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1240,9 +1240,10 @@ class LoopVectorizationCostModel {
   getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const;
 
-  /// Returns true if \p I is a memory instruction with consecutive memory
-  /// access that can be widened.
-  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
+  /// Returns widening decision (CM_Widen or CM_Widen_Reverse) if \p I is a
+  /// memory instruction with consecutive access that can be widened, or
+  /// CM_Unknown otherwise.
+  InstWidening memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
 
   /// Returns true if \p I is a memory instruction in an interleaved-group
   /// of memory accesses that can be vectorized with wide vector loads/stores
@@ -1509,7 +1510,8 @@ class LoopVectorizationCostModel {
 
   /// The cost computation for widening instruction \p I with consecutive
   /// memory access.
-  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
+  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF,
+  InstWidening Decision);
 
   /// The cost calculation for Load/Store instruction \p I with uniform 
pointer -
   /// Load: scalar load + broadcast.
@@ -2988,8 +2990,9 @@ bool 
LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   : TTI.isLegalMaskedStore(Ty, Alignment, AS);
 }
 
-bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
-Instruction *I, ElementCount VF) {
+LoopVectorizationCostModel::InstWidening
+LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+  ElementCount VF) {
   // Get and ensure we have a valid memory instruction.
   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
 
@@ -2997,21 +3000,22 @@ bool 
LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   auto *ScalarTy = getLoadStoreType(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
-  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
-return false;
+  auto Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
+  if (!Stride)
+return CM_Unknown;
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I, VF))
-return false;
+return CM_Unknown;
 
   // If the instruction's allocated size doesn't equal it's type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getDataLayout();
   if (hasIrregularType(ScalarTy, DL))
-return false;
+return CM_Unknown;
 
-  return true;
+  return Stride == 1 ? CM_Widen : CM_Widen_Reverse;
 }
 
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
@@ -5183,17 +5187,15 @@ 
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   return Cost;
 }
 
-InstructionCost
-LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
-ElementCount VF) {
+InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(
+Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  Value *Ptr = getLoadStorePointerOperand(I);
   unsigned AS = getLoadStoreAddressSpace(I);
-  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
+  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
-  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
- "Stride should be 1 or -1 for consecutive memory access");
+  assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
+ "Expected widen decision.");
   const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
@@ -5205,8 +5207,7 @@ 
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
 CostKind, OpInfo, I);
   }
 
-  bool Reverse = ConsecutiveStride < 0;
-  if (Reverse)
+  if (Decision == CM_Widen_Reverse)
 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, {}, CostKind, 0);
   return Cost;
@@ -5617,14 +5618,8 @@ void 
LoopVectorizationCostModel::setCo

[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/140723

>From 9697cd806947ab6ebd021cb7919acd62cc2e29a0 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Fri, 7 Nov 2025 18:09:56 +0300
Subject: [PATCH 1/3] [VPlan] Implement compressed widening of memory
 instructions

---
 .../llvm/Analysis/TargetTransformInfo.h   |  1 +
 .../Transforms/Vectorize/LoopVectorize.cpp| 24 ++
 llvm/lib/Transforms/Vectorize/VPlan.h | 32 ---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 11 ---
 5 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..e8769f5860c77 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1442,6 +1442,7 @@ class TargetTransformInfo {
 Normal,///< The cast is used with a normal load/store.
 Masked,///< The cast is used with a masked load/store.
 GatherScatter, ///< The cast is used with a gather/scatter.
+Compressed,///< The cast is used with an expand load/compress store.
 Interleave,///< The cast is used with an interleaved load/store.
 Reversed,  ///< The cast is used with a reversed load/store.
   };
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 914018591d832..25e8a63eae9cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel {
 CM_Widen_Reverse, // For consecutive accesses with stride -1.
 CM_Interleave,
 CM_GatherScatter,
+CM_Compressed,
 CM_Scalarize,
 CM_VectorCall,
 CM_IntrinsicCall
@@ -3108,9 +3109,9 @@ void 
LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 if (IsUniformMemOpUse(I))
   return true;
 
-return (WideningDecision == CM_Widen ||
-WideningDecision == CM_Widen_Reverse ||
-WideningDecision == CM_Interleave);
+return (
+WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+WideningDecision == CM_Interleave || WideningDecision == 
CM_Compressed);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5191,12 +5192,17 @@ InstructionCost 
LoopVectorizationCostModel::getConsecutiveMemOpCost(
 Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
+  if (Decision == CM_Compressed)
+return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+ /*VariableMask*/ true, Alignment,
+ CostKind, I);
+
   assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
  "Expected widen decision.");
-  const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6299,6 +6305,8 @@ 
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   switch (getWideningDecision(I, VF)) {
   case LoopVectorizationCostModel::CM_GatherScatter:
 return TTI::CastContextHint::GatherScatter;
+  case LoopVectorizationCostModel::CM_Compressed:
+return TTI::CastContextHint::Compressed;
   case LoopVectorizationCostModel::CM_Interleave:
 return TTI::CastContextHint::Interleave;
   case LoopVectorizationCostModel::CM_Scalarize:
@@ -7514,8 +7522,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, 
ArrayRef Operands,
   LoopVectorizationCostModel::InstWidening Decision =
   CM.getWideningDecision(I, Range.Start);
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+  bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
   bool Consecutive =
-  Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  Reverse || Compressed || Decision == 
LoopVectorizationCostModel::CM_Widen;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -7545,11 +7554,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, 
ArrayRef Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Compressed, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
 
   StoreInst *Stor

[llvm-branch-commits] [llvm] [VPlan] Implement compressed widening of memory instructions (PR #166956)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc created 
https://github.com/llvm/llvm-project/pull/166956

RFC link: 
https://discourse.llvm.org/t/rfc-loop-vectorization-of-compress-store-expand-load-patterns/86442

>From 9697cd806947ab6ebd021cb7919acd62cc2e29a0 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Fri, 7 Nov 2025 18:09:56 +0300
Subject: [PATCH] [VPlan] Implement compressed widening of memory instructions

---
 .../llvm/Analysis/TargetTransformInfo.h   |  1 +
 .../Transforms/Vectorize/LoopVectorize.cpp| 24 ++
 llvm/lib/Transforms/Vectorize/VPlan.h | 32 ---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 11 ---
 5 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..e8769f5860c77 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1442,6 +1442,7 @@ class TargetTransformInfo {
 Normal,///< The cast is used with a normal load/store.
 Masked,///< The cast is used with a masked load/store.
 GatherScatter, ///< The cast is used with a gather/scatter.
+Compressed,///< The cast is used with an expand load/compress store.
 Interleave,///< The cast is used with an interleaved load/store.
 Reversed,  ///< The cast is used with a reversed load/store.
   };
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 914018591d832..25e8a63eae9cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel {
 CM_Widen_Reverse, // For consecutive accesses with stride -1.
 CM_Interleave,
 CM_GatherScatter,
+CM_Compressed,
 CM_Scalarize,
 CM_VectorCall,
 CM_IntrinsicCall
@@ -3108,9 +3109,9 @@ void 
LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 if (IsUniformMemOpUse(I))
   return true;
 
-return (WideningDecision == CM_Widen ||
-WideningDecision == CM_Widen_Reverse ||
-WideningDecision == CM_Interleave);
+return (
+WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+WideningDecision == CM_Interleave || WideningDecision == 
CM_Compressed);
   };
 
   // Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5191,12 +5192,17 @@ InstructionCost 
LoopVectorizationCostModel::getConsecutiveMemOpCost(
 Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  const Align Alignment = getLoadStoreAlignment(I);
   unsigned AS = getLoadStoreAddressSpace(I);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
+  if (Decision == CM_Compressed)
+return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+ /*VariableMask*/ true, Alignment,
+ CostKind, I);
+
   assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
  "Expected widen decision.");
-  const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6299,6 +6305,8 @@ 
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   switch (getWideningDecision(I, VF)) {
   case LoopVectorizationCostModel::CM_GatherScatter:
 return TTI::CastContextHint::GatherScatter;
+  case LoopVectorizationCostModel::CM_Compressed:
+return TTI::CastContextHint::Compressed;
   case LoopVectorizationCostModel::CM_Interleave:
 return TTI::CastContextHint::Interleave;
   case LoopVectorizationCostModel::CM_Scalarize:
@@ -7514,8 +7522,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, 
ArrayRef Operands,
   LoopVectorizationCostModel::InstWidening Decision =
   CM.getWideningDecision(I, Range.Start);
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+  bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
   bool Consecutive =
-  Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+  Reverse || Compressed || Decision == 
LoopVectorizationCostModel::CM_Widen;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -7545,11 +7554,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, 
ArrayRef Operands,
   }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Compress

[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc edited 
https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits


@@ -3193,6 +3239,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public 
VPRecipeBase,
   /// Whether the consecutive accessed addresses are in reverse order.
   bool Reverse;
 
+  /// Whether the consecutive accessed addresses are compressed with mask 
value.
+  bool Compressed;
+

skachkov-sc wrote:

The changes related to the extension of VPWidenMemoryRecipe have been split into a 
separate PR: https://github.com/llvm/llvm-project/pull/166956 (hopefully it will 
be easier to review)

https://github.com/llvm/llvm-project/pull/140723
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [LoopVectorize] Support vectorization of compressing patterns in VPlan (PR #140723)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/140723

>From 2f9baaf83b414b8d2cad73a4ada7efe800a02809 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Wed, 15 Jan 2025 16:09:16 +0300
Subject: [PATCH 1/2] [LoopVectorize][NFC] Add pre-commit tests

---
 .../LoopVectorize/compress-idioms.ll  | 480 ++
 1 file changed, 480 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/compress-idioms.ll

diff --git a/llvm/test/Transforms/LoopVectorize/compress-idioms.ll 
b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll
new file mode 100644
index 0..1390092e40387
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/compress-idioms.ll
@@ -0,0 +1,480 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes=loop-vectorize 
-force-vector-interleave=1 -force-vector-width=4 -S 2>&1 | FileCheck %s
+
+define void @test_store_with_pointer(ptr writeonly %dst, ptr readonly %src, 
i32 %c, i32 %n) {
+; CHECK-LABEL: define void @test_store_with_pointer(
+; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 
[[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:[[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label 
%[[FOR_COND_CLEANUP:.*]]
+; CHECK:   [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:[[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:br label %[[FOR_BODY:.*]]
+; CHECK:   [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT:br label %[[FOR_COND_CLEANUP]]
+; CHECK:   [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:ret void
+; CHECK:   [[FOR_BODY]]:
+; CHECK-NEXT:[[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], 
[ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:[[DST_ADDR_09:%.*]] = phi ptr [ [[DST]], 
%[[FOR_BODY_PREHEADER]] ], [ [[DST_ADDR_1:%.*]], %[[FOR_INC]] ]
+; CHECK-NEXT:[[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], 
i64 [[INDVARS_IV]]
+; CHECK-NEXT:[[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:[[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[C]]
+; CHECK-NEXT:br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]]
+; CHECK:   [[IF_THEN]]:
+; CHECK-NEXT:[[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr 
[[DST_ADDR_09]], i64 4
+; CHECK-NEXT:store i32 [[TMP0]], ptr [[DST_ADDR_09]], align 4
+; CHECK-NEXT:br label %[[FOR_INC]]
+; CHECK:   [[FOR_INC]]:
+; CHECK-NEXT:[[DST_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], %[[IF_THEN]] ], [ 
[[DST_ADDR_09]], %[[FOR_BODY]] ]
+; CHECK-NEXT:[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 
[[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], 
label %[[FOR_BODY]]
+;
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, 
%for.inc ]
+  %dst.addr.09 = phi ptr [ %dst, %for.body.preheader ], [ %dst.addr.1, 
%for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp slt i32 %0, %c
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4
+  store i32 %0, ptr %dst.addr.09, align 4
+  br label %for.inc
+
+for.inc:
+  %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define void @test_store_with_index(ptr writeonly %dst, ptr readonly %src, i32 
%c, i32 %n) {
+; CHECK-LABEL: define void @test_store_with_index(
+; CHECK-SAME: ptr writeonly [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 
[[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:[[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:br i1 [[CMP11]], label %[[FOR_BODY_PREHEADER:.*]], label 
%[[FOR_COND_CLEANUP:.*]]
+; CHECK:   [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:[[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:br label %[[FOR_BODY:.*]]
+; CHECK:   [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT:br label %[[FOR_COND_CLEANUP]]
+; CHECK:   [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:ret void
+; CHECK:   [[FOR_BODY]]:
+; CHECK-NEXT:[[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], 
[ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]

[llvm-branch-commits] [llvm] [LoopVectorize][NFC] Refactor widening decision logic (PR #140722)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/140722

>From 9ae9f1d152b47f6fbf920a7e932b491521b5471d Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Wed, 22 Nov 2023 17:24:08 +0300
Subject: [PATCH] [LoopVectorize][NFC] Refactor widening decision logic

---
 .../Transforms/Vectorize/LoopVectorize.cpp| 51 +--
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 906fa2f857c21..914018591d832 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1240,9 +1240,10 @@ class LoopVectorizationCostModel {
   getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const;
 
-  /// Returns true if \p I is a memory instruction with consecutive memory
-  /// access that can be widened.
-  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
+  /// Returns widening decision (CM_Widen or CM_Widen_Reverse) if \p I is a
+  /// memory instruction with consecutive access that can be widened, or
+  /// CM_Unknown otherwise.
+  InstWidening memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
 
   /// Returns true if \p I is a memory instruction in an interleaved-group
   /// of memory accesses that can be vectorized with wide vector loads/stores
@@ -1509,7 +1510,8 @@ class LoopVectorizationCostModel {
 
   /// The cost computation for widening instruction \p I with consecutive
   /// memory access.
-  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
+  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF,
+  InstWidening Decision);
 
   /// The cost calculation for Load/Store instruction \p I with uniform 
pointer -
   /// Load: scalar load + broadcast.
@@ -2988,8 +2990,9 @@ bool 
LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   : TTI.isLegalMaskedStore(Ty, Alignment, AS);
 }
 
-bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
-Instruction *I, ElementCount VF) {
+LoopVectorizationCostModel::InstWidening
+LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+  ElementCount VF) {
   // Get and ensure we have a valid memory instruction.
   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
 
@@ -2997,21 +3000,22 @@ bool 
LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   auto *ScalarTy = getLoadStoreType(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
-  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
-return false;
+  auto Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
+  if (!Stride)
+return CM_Unknown;
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I, VF))
-return false;
+return CM_Unknown;
 
   // If the instruction's allocated size doesn't equal it's type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getDataLayout();
   if (hasIrregularType(ScalarTy, DL))
-return false;
+return CM_Unknown;
 
-  return true;
+  return Stride == 1 ? CM_Widen : CM_Widen_Reverse;
 }
 
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
@@ -5183,17 +5187,15 @@ 
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   return Cost;
 }
 
-InstructionCost
-LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
-ElementCount VF) {
+InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(
+Instruction *I, ElementCount VF, InstWidening Decision) {
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
-  Value *Ptr = getLoadStorePointerOperand(I);
   unsigned AS = getLoadStoreAddressSpace(I);
-  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
+  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
-  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
- "Stride should be 1 or -1 for consecutive memory access");
+  assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
+ "Expected widen decision.");
   const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
@@ -5205,8 +5207,7 @@ 
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
 CostKind, OpInfo, I);
   }
 
-  bool Reverse = ConsecutiveStride < 0;
-  if (Reverse)
+  if (Decision == CM_Widen_Reverse)
 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, {}, CostKind, 0);
   return Cost;
@@ -5617,14 +5618,8 @@ void 
LoopVectorizationCostModel::setCo

[llvm-branch-commits] [llvm] [LAA] Support monotonic pointers in LoopAccessAnalysis (PR #140721)

2025-11-07 Thread Sergey Kachkov via llvm-branch-commits

https://github.com/skachkov-sc updated 
https://github.com/llvm/llvm-project/pull/140721

>From 64a329ce099a2db1e5404a461069a7ce6827f57b Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Thu, 23 Jan 2025 16:16:40 +0300
Subject: [PATCH 1/2] [LAA] Add pre-commit test

---
 .../LoopAccessAnalysis/monotonic-pointers.ll  | 86 +++
 1 file changed, 86 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/monotonic-pointers.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/monotonic-pointers.ll 
b/llvm/test/Analysis/LoopAccessAnalysis/monotonic-pointers.ll
new file mode 100644
index 0..b13d79acc5ee4
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/monotonic-pointers.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by 
utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -disable-output -passes='print<access-info>' %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @monotonic_ptr_simple(ptr writeonly %dst, ptr readonly %src, i32 
%c, i32 %n) {
+; CHECK-LABEL: 'monotonic_ptr_simple'
+; CHECK-NEXT:for.body:
+; CHECK-NEXT:  Report: cannot identify array bounds
+; CHECK-NEXT:  Dependences:
+; CHECK-NEXT:  Run-time memory checks:
+; CHECK-NEXT:  Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:  Non vectorizable stores to invariant address were not found 
in loop.
+; CHECK-NEXT:  SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:  Expressions re-written:
+;
+entry:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %dst.addr.09 = phi ptr [ %dst, %entry ], [ %dst.addr.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp slt i32 %0, %c
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4
+  store i32 %0, ptr %dst.addr.09, align 4
+  br label %for.inc
+
+for.inc:
+  %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @monotonic_ptr_indexed(ptr writeonly %dst, ptr readonly %src, i32 
%c, i32 %n) {
+; CHECK-LABEL: 'monotonic_ptr_indexed'
+; CHECK-NEXT:for.body:
+; CHECK-NEXT:  Report: cannot identify array bounds
+; CHECK-NEXT:  Dependences:
+; CHECK-NEXT:  Run-time memory checks:
+; CHECK-NEXT:  Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:  Non vectorizable stores to invariant address were not found 
in loop.
+; CHECK-NEXT:  SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:  Expressions re-written:
+;
+entry:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %idx.012 = phi i32 [ 0, %entry ], [ %idx.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp slt i32 %0, %c
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %inc = add nsw i32 %idx.012, 1
+  %idxprom4 = sext i32 %idx.012 to i64
+  %arrayidx5 = getelementptr inbounds i32, ptr %dst, i64 %idxprom4
+  store i32 %0, ptr %arrayidx5, align 4
+  br label %for.inc
+
+for.inc:
+  %idx.1 = phi i32 [ %inc, %if.then ], [ %idx.012, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}

>From 18aa00f7770f0ad79faa649232af57a5c06e407d Mon Sep 17 00:00:00 2001
From: Sergey Kachkov 
Date: Wed, 29 Jan 2025 15:28:42 +0300
Subject: [PATCH 2/2] [LAA] Support monotonic pointers in LoopAccessAnalysis

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp  | 23 +---
 .../LoopAccessAnalysis/monotonic-pointers.ll  | 26 +--
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp 
b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 5d88e5f54e3d6..4484c03f3b7c7 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -1024,7 +1025,8 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, 
const SCEVAddRecExpr *AR,
   if (AR->getNoWrapFlags(SCEV::NoWrapMask))
 return true;
 
-  if (Ptr && PSE.hasNoOverflow(Ptr, SCE