Author: Florian Hahn
Date: 2026-05-08T21:52:53+02:00
New Revision: 9414ed711700de33ac728566c247186e4ff1790f

URL: 
https://github.com/llvm/llvm-project/commit/9414ed711700de33ac728566c247186e4ff1790f
DIFF: 
https://github.com/llvm/llvm-project/commit/9414ed711700de33ac728566c247186e4ff1790f.diff

LOG: Revert "[VPlan] Unify inner and outer loop paths (NFCI). (#192868)"

This reverts commit b84f58ee844ca929db2fff2e41e2195e255548b8.

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
    llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/lib/Transforms/Vectorize/VPlan.cpp
    llvm/lib/Transforms/Vectorize/VPlan.h
    llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
    llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
    llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
    llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index 0d45c159d315c..18906aa7eeae3 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -169,12 +169,9 @@ struct LoopVectorizePass : public OptionalPassInfoMixin<LoopVectorizePass> {
 /// purposes along with the corresponding optimization remark \p RemarkName.
 /// If \p I is passed, it is an instruction that prevents vectorization.
 /// Otherwise, the loop \p TheLoop is used for the location of the remark.
-LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg,
-                                         const StringRef OREMsg,
-                                         const StringRef ORETag,
-                                         OptimizationRemarkEmitter *ORE,
-                                         const Loop *TheLoop,
-                                         Instruction *I = nullptr);
+LLVM_ABI void reportVectorizationFailure(
+    const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
+    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr);
 
 /// Same as above, but the debug message and optimization remark are identical
 inline void reportVectorizationFailure(const StringRef DebugMsg,

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
index f29834d2f804e..91476cf232fe0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp
@@ -27,8 +27,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-vectorize"
 
-extern cl::opt<bool> VPlanBuildOuterloopStressTest;
-
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -620,49 +618,3 @@ void VFSelectionContext::collectInLoopReductions() {
                       << " reduction for phi: " << *Phi << "\n");
   }
 }
-
-// TODO: we could return a pair of values that specify the max VF and
-// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
-// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
-// doesn't have a cost model that can choose which plan to execute if
-// more than one is generated.
-FixedScalableVFPair
-VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) {
-  if (UserVF.isScalable() && !supportsScalableVectors()) {
-    reportVectorizationFailure(
-        "Scalable vectorization requested but not supported by the target",
-        "the scalable user-specified vectorization width for outer-loop "
-        "vectorization cannot be used because the target does not support "
-        "scalable vectors.",
-        "ScalableVFUnfeasible", ORE, TheLoop);
-    return FixedScalableVFPair::getNone();
-  }
-
-  ElementCount VF = UserVF;
-  if (VF.isZero()) {
-    auto [_, WidestType] = getSmallestAndWidestTypes();
-
-    auto RegKind = TTI.enableScalableVectorization()
-                       ? TargetTransformInfo::RGK_ScalableVector
-                       : TargetTransformInfo::RGK_FixedWidthVector;
-
-    TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
-    unsigned N = RegSize.getKnownMinValue() / WidestType;
-    VF = ElementCount::get(N, RegSize.isScalable());
-    LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
-
-    // Make sure we have a VF > 1 for stress testing.
-    if (VPlanBuildOuterloopStressTest && VF.isScalar()) {
-      LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
-                        << "overriding computed VF.\n");
-      VF = ElementCount::getFixed(4);
-    }
-  }
-  assert(isPowerOf2_32(VF.getKnownMinValue()) &&
-         "VF needs to be a power of two");
-  if (VF.isScalar())
-    return FixedScalableVFPair::getNone();
-  LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
-                    << "VF " << VF << " to build VPlans.\n");
-  return FixedScalableVFPair(VF);
-}

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 00b689326d770..a6789974e0bd6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -703,10 +703,6 @@ class VFSelectionContext {
   /// for size, returning true here aborts vectorization.
   bool runtimeChecksRequired();
 
-  /// Returns a scalable VF to use for outer-loop vectorization if the target
-  /// supports it and a fixed VF otherwise.
-  FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF);
-
   /// Compute smallest bitwidth each instruction can be represented with.
   /// The vector equivalents of these instructions should be truncated to this
   /// type.
@@ -793,6 +789,10 @@ class LoopVectorizationPlanner {
   /// interleaving should be avoided up-front, no plans are generated.
   void plan(ElementCount UserVF, unsigned UserIC);
 
+  /// Use the VPlan-native path to plan how to best vectorize, return the best
+  /// VF and its cost.
+  VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
+
   /// Return the VPlan for \p VF. At the moment, there is always a single VPlan
   /// for each VF.
   VPlan &getPlanFor(ElementCount VF) const;
@@ -881,22 +881,34 @@ class LoopVectorizationPlanner {
       unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF,
       bool DisableRuntimeUnroll);
 
-private:
-  /// Build a VPlan using VPRecipes according to the information gathered by
-  /// Legal and VPlan-based analysis. For outer loops, performs basic recipe
-  /// conversion only. For inner loops, \p Range's largest included VF is
-  /// restricted to the maximum VF the returned VPlan is valid for. If no VPlan
-  /// can be built for the input range, set the largest included VF to the
-  /// maximum VF for which no plan could be built. Each VPlan is built starting
-  /// from a copy of \p InitialPlan, which is a plain CFG VPlan wrapping the
-  /// original scalar loop.
-  VPlanPtr tryToBuildVPlan(VPlanPtr InitialPlan, VFRange &Range);
-
+protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop.
   void buildVPlans(ElementCount MinVF, ElementCount MaxVF);
 
+private:
+  /// Build a VPlan according to the information gathered by Legal. \return a
+  /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+  /// exclusive, possibly decreasing \p Range.End. If no VPlan can be built for
+  /// the input range, set the largest included VF to the maximum VF for which
+  /// no plan could be built.
+  VPlanPtr tryToBuildVPlan(VFRange &Range);
+
+  /// Build a VPlan using VPRecipes according to the information gather by
+  /// Legal. This method is only used for the legacy inner loop vectorizer.
+  /// \p Range's largest included VF is restricted to the maximum VF the
+  /// returned VPlan is valid for. If no VPlan can be built for the input range,
+  /// set the largest included VF to the maximum VF for which no plan could be
+  /// built. Each VPlan is built starting from a copy of \p InitialPlan, which
+  /// is a plain CFG VPlan wrapping the original scalar loop.
+  VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range);
+
+  /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+  /// according to the information gathered by Legal when it checked if it is
+  /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+  void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
+
   /// Add ComputeReductionResult recipes to the middle block to compute the
   /// final reduction results. Add Select recipes to the latch block when
   /// folding tail, to feed ComputeReductionResult with the last or penultimate

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ace2275e2b6d..ae1d6d83cccd4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -350,8 +350,8 @@ cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
 // VPlan-native vectorization path. It must be used in conjuction with
 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
 // verification of the H-CFGs built.
-cl::opt<bool> VPlanBuildOuterloopStressTest(
-    "vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden,
+static cl::opt<bool> VPlanBuildStressTest(
+    "vplan-build-stress-test", cl::init(false), cl::Hidden,
     cl::desc(
         "Build VPlan for every supported loop nest in the function and bail "
         "out right after the build (stress test the VPlan H-CFG construction "
@@ -745,8 +745,8 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
                                 const StringRef OREMsg, const StringRef ORETag,
-                                OptimizationRemarkEmitter *ORE,
-                                const Loop *TheLoop, Instruction *I) {
+                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+                                Instruction *I) {
   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
   LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE);
   ORE->emit(
@@ -1877,7 +1877,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
   // now, only collect outer loops that have explicit vectorization hints. If we
   // are stress testing the VPlan H-CFG construction, we collect the outermost
   // loop of every loop nest.
-  if (L.isInnermost() || VPlanBuildOuterloopStressTest ||
+  if (L.isInnermost() || VPlanBuildStressTest ||
       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
     LoopBlocksRPO RPOT(&L);
     RPOT.perform(LI);
@@ -2868,12 +2868,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
 FixedScalableVFPair
 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
-  // For outer loops, use simple type-based heuristic VF. No cost model or
-  // memory dependence analysis is available.
-  if (!TheLoop->isInnermost()) {
-    return Config.computeVPlanOuterloopVF(UserVF);
-  }
-
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
     // TODO: It may be useful to do since it's still likely to be dynamically
     // uniform if the target can skip.
@@ -5672,7 +5666,83 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
   }
 }
 
+// This function will select a scalable VF if the target supports scalable
+// vectors and a fixed one otherwise.
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
+                                     VFSelectionContext &Config) {
+  unsigned WidestType = Config.getSmallestAndWidestTypes().second;
+
+  TargetTransformInfo::RegisterKind RegKind =
+      TTI.enableScalableVectorization()
+          ? TargetTransformInfo::RGK_ScalableVector
+          : TargetTransformInfo::RGK_FixedWidthVector;
+
+  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
+  unsigned N = RegSize.getKnownMinValue() / WidestType;
+  return ElementCount::get(N, RegSize.isScalable());
+}
+
+VectorizationFactor
+LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
+  ElementCount VF = UserVF;
+  // Outer loop handling: They may require CFG and instruction level
+  // transformations before even evaluating whether vectorization is profitable.
+  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+  // the vectorization pipeline.
+  if (!OrigLoop->isInnermost()) {
+    // If the user doesn't provide a vectorization factor, determine a
+    // reasonable one.
+    if (UserVF.isZero()) {
+      VF = determineVPlanVF(TTI, Config);
+      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+      // Make sure we have a VF > 1 for stress testing.
+      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
+        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+                          << "overriding computed VF.\n");
+        VF = ElementCount::getFixed(4);
+      }
+    } else if (UserVF.isScalable() && !Config.supportsScalableVectors()) {
+      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
+                        << "not supported by the target.\n");
+      reportVectorizationFailure(
+          "Scalable vectorization requested but not supported by the target",
+          "the scalable user-specified vectorization width for outer-loop "
+          "vectorization cannot be used because the target does not support "
+          "scalable vectors.",
+          "ScalableVFUnfeasible", ORE, OrigLoop);
+      return VectorizationFactor::Disabled();
+    }
+    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+           "VF needs to be a power of two");
+    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
+                      << "VF " << VF << " to build VPlans.\n");
+    buildVPlans(VF, VF);
+
+    if (VPlans.empty())
+      return VectorizationFactor::Disabled();
+
+    // For VPlan build stress testing, we bail out after VPlan construction.
+    if (VPlanBuildStressTest)
+      return VectorizationFactor::Disabled();
+
+    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
+  }
+
+  LLVM_DEBUG(
+      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+                "VPlan-native path.\n");
+  return VectorizationFactor::Disabled();
+}
+
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+  assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   Config.collectElementTypesForWidening(&CM.ValuesToIgnore);
 
@@ -5680,16 +5750,6 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
 
-  if (!OrigLoop->isInnermost()) {
-    // For outer loops, computeMaxVF returns a single non-scalar VF; build a
-    // plan for only that VF.
-    ElementCount VF =
-        MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF;
-    buildVPlans(VF, VF);
-    LLVM_DEBUG(printPlans(dbgs()));
-    return;
-  }
-
   // Compute the minimal bitwidths required for integer operations in the loop
   // for later use by the cost model.
   Config.computeMinimalBitwidths();
@@ -5730,9 +5790,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       if (EpilogueUserVF.isVector() &&
           ElementCount::isKnownLT(EpilogueUserVF, UserVF)) {
         CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF);
-        buildVPlans(EpilogueUserVF, EpilogueUserVF);
+        buildVPlansWithVPRecipes(EpilogueUserVF, EpilogueUserVF);
       }
-      buildVPlans(UserVF, UserVF);
+      buildVPlansWithVPRecipes(UserVF, UserVF);
       if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) {
         // For scalar VF, skip VPlan cost check as VPlan cost is designed for
         // vector VFs only.
@@ -5764,8 +5824,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     CM.collectNonVectorizedAndSetWideningDecisions(VF);
   }
 
-  buildVPlans(ElementCount::getFixed(1), MaxFactors.FixedVF);
-  buildVPlans(ElementCount::getScalable(1), MaxFactors.ScalableVF);
+  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
+  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
 
   LLVM_DEBUG(printPlans(dbgs()));
 }
@@ -5983,25 +6043,22 @@ LoopVectorizationPlanner::computeBestVF() {
     return {VectorizationFactor::Disabled(), nullptr};
   // If there is a single VPlan with a single VF, return it directly.
   VPlan &FirstPlan = *VPlans[0];
-
   ElementCount UserVF = Hints.getWidth();
-  if (VPlans.size() == 1) {
-    // For outer loops, the plan has a single vector VF determined by the
-    // heuristic.
-    assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) ||
-            FirstPlan.isOuterLoop()) &&
-           "must have a single scalar VF, UserVF or an outer loop");
-    return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
-  }
-
-  if (hasPlanWithVF(UserVF) && EpilogueVectorizationForceVF > 1) {
-    assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
-    assert(VPlans[0]->getSingleVF() ==
-               ElementCount::getFixed(EpilogueVectorizationForceVF) &&
-           "expected first plan to be for the forced epilogue VF");
-    assert(VPlans[1]->getSingleVF() == UserVF &&
-           "expected second plan to be for the forced UserVF");
-    return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
+  if (hasPlanWithVF(UserVF)) {
+    if (VPlans.size() == 1) {
+      assert(FirstPlan.getSingleVF() == UserVF &&
+             "UserVF must match single VF");
+      return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
+    }
+    if (EpilogueVectorizationForceVF > 1) {
+      assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
+      assert(VPlans[0]->getSingleVF() ==
+                 ElementCount::getFixed(EpilogueVectorizationForceVF) &&
+             "expected first plan to be for the forced epilogue VF");
+      assert(VPlans[1]->getSingleVF() == UserVF &&
+             "expected second plan to be for the forced UserVF");
+      return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
+    }
   }
 
   LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
@@ -6747,38 +6804,30 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
 // optimizations.
 static void printOptimizedVPlan(VPlan &) {}
 
-void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
-                                           ElementCount MaxVF) {
+void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
+                                                        ElementCount MaxVF) {
   if (ElementCount::isKnownGT(MinVF, MaxVF))
     return;
 
-  bool IsInnerLoop = OrigLoop->isInnermost();
-
-  // Set up loop versioning for inner loops with memory runtime checks.
-  // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not
-  // called for them.
-  std::optional<LoopVersioning> LVer;
-  if (IsInnerLoop) {
-    const LoopAccessInfo *LAI = Legal->getLAI();
-    LVer.emplace(*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop,
-                 LI, DT, PSE.getSE());
-    if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
-        !LAI->getRuntimePointerChecking()->getDiffChecks()) {
-      // Only use noalias metadata when using memory checks guaranteeing no
-      // overlap across all iterations.
-      LVer->prepareNoAliasMetadata();
-    }
+  assert(OrigLoop->isInnermost() && "Inner loop expected.");
+
+  const LoopAccessInfo *LAI = Legal->getLAI();
+  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
+                      OrigLoop, LI, DT, PSE.getSE());
+  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
+      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
+    // Only use noalias metadata when using memory checks guaranteeing no
+    // overlap across all iterations.
+    LVer.prepareNoAliasMetadata();
   }
 
   // Create initial base VPlan0, to serve as common starting point for all
   // candidates built later for specific VF ranges.
   auto VPlan0 = VPlanTransforms::buildVPlan0(
       OrigLoop, *LI, Legal->getWidestInductionType(),
-      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE,
-      LVer ? &*LVer : nullptr);
+      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
 
-  // Create recipes for header phis. For outer loops, reductions, recurrences
-  // and in-loop reductions are empty since legality doesn't detect them.
+  // Create recipes for header phis.
   if (!RUN_VPLAN_PASS(VPlanTransforms::createHeaderPhiRecipes, *VPlan0, PSE,
                       *OrigLoop, Legal->getInductionVars(),
                       Legal->getReductionVars(),
@@ -6813,8 +6862,8 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    auto Plan =
-        tryToBuildVPlan(std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange);
+    auto Plan = tryToBuildVPlanWithVPRecipes(
+        std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange);
     VF = SubRange.End;
 
     if (!Plan)
@@ -6842,21 +6891,9 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
   }
 }
 
-VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
-                                                   VFRange &Range) {
-
-  // For outer loops, the plan only needs basic recipe conversion and induction
-  // live-out optimization; the full inner-loop recipe building below does not
-  // apply (no widening decisions, interleave groups, reductions, etc.).
-  if (Plan->isOuterLoop()) {
-    for (ElementCount VF : Range)
-      Plan->addVF(VF);
-    if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI))
-      return nullptr;
-    VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE,
-                                                   /*FoldTail=*/false);
-    return Plan;
-  }
+VPlanPtr
+LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VPlanPtr Plan,
+                                                       VFRange &Range) {
 
   using namespace llvm::VPlanPatternMatch;
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -7079,6 +7116,47 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
   return Plan;
 }
 
+VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
+  // Outer loop handling: They may require CFG and instruction level
+  // transformations before even evaluating whether vectorization is profitable.
+  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+  // the vectorization pipeline.
+  assert(!OrigLoop->isInnermost());
+  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+  auto Plan = VPlanTransforms::buildVPlan0(
+      OrigLoop, *LI, Legal->getWidestInductionType(),
+      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+
+  if (!VPlanTransforms::createHeaderPhiRecipes(
+          *Plan, PSE, *OrigLoop, Legal->getInductionVars(),
+          MapVector<PHINode *, RecurrenceDescriptor>(),
+          SmallPtrSet<const PHINode *, 1>(), SmallPtrSet<PHINode *, 1>(),
+          /*AllowReordering=*/false))
+    return nullptr;
+  [[maybe_unused]] bool CanHandleExits = VPlanTransforms::handleEarlyExits(
+      *Plan, UncountableExitStyle::NoUncountableExit, OrigLoop, PSE, *DT,
+      Legal->getAssumptionCache());
+  assert(CanHandleExits &&
+         "early-exits are not supported in VPlan-native path");
+  VPlanTransforms::addMiddleCheck(*Plan, /*TailFolded*/ false);
+
+  VPlanTransforms::createLoopRegions(*Plan);
+
+  for (ElementCount VF : Range)
+    Plan->addVF(VF);
+
+  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI))
+    return nullptr;
+
+  // Optimize induction live-out users to use precomputed end values.
+  VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE,
+                                                 /*FoldTail=*/false);
+
+  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
+  return Plan;
+}
+
 void LoopVectorizationPlanner::addReductionResultComputation(
     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
   using namespace VPlanPatternMatch;
@@ -7279,7 +7357,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
   if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
     // VPlan-native path does not do any analysis for runtime checks
     // currently.
-    assert((!EnableVPlanNativePath || !Plan.isOuterLoop()) &&
+    assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
            "Runtime checks are not supported for outer loops yet");
 
     if (Config.OptForSize) {
@@ -7360,6 +7438,75 @@ getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
   return CM_EpilogueAllowed;
 }
 
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows to apply
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+    OptimizationRemarkEmitter *ORE,
+    std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
+    LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {
+
+  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
+    return false;
+  }
+  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+  Function *F = L->getHeader()->getParent();
+  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+
+  EpilogueLowering SEL =
+      getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);
+
+  VFSelectionContext Config(*TTI, LVL, L, *F, PSE, DB, ORE, &Hints, OptForSize);
+  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, AC, ORE,
+                                GetBFI, F, &Hints, IAI, Config);
+  // Use the planner for outer loop vectorization.
+  // TODO: CM is not used at this point inside the planner. Turn CM into an
+  // optional argument if we don't need it in the future.
+  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, Config, IAI, PSE,
+                               Hints, ORE);
+
+  // Get user vectorization factor.
+  ElementCount UserVF = Hints.getWidth();
+
+  Config.collectElementTypesForWidening();
+
+  // Plan how to best vectorize, return the best VF and its cost.
+  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
+
+  // If we are stress testing VPlan builds, do not attempt to generate vector
+  // code. Masked vector code generation support will follow soon.
+  // Also, do not attempt to vectorize if no vector code will be produced.
+  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
+    return false;
+
+  VPlan &BestPlan = LVP.getPlanFor(VF.Width);
+
+  {
+    GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind);
+    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
+                           Checks, BestPlan);
+    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
+                      << "\"\n");
+    LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1,
+                                 VF.MinProfitableTripCount);
+    bool HasBranchWeights =
+        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
+    LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
+
+    reportVectorization(ORE, L, VF, 1);
+
+    LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT);
+  }
+
+  assert(!verifyFunction(*F, &dbgs()));
+  return true;
+}
+
 // Emit a remark if there are stores to floats that required a floating point
 // extension. If the vectorized loop was generated with floating point there
 // will be a performance penalty from the conversion overhead and the change in
@@ -8029,14 +8176,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  bool IsInnerLoop = L->isInnermost();
-
-  // Outer loops require a computable trip count.
-  if (!IsInnerLoop && isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
-    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
-    return false;
-  }
-
   if (LVL.hasUncountableEarlyExit()) {
     if (!EnableEarlyExitVectorization) {
       reportVectorizationFailure("Auto-vectorization of loops with uncountable "
@@ -8046,13 +8185,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     }
   }
 
+  // Entrance to the VPlan-native vectorization path. Outer loops are processed
+  // here. They may require CFG and instruction level transformations before
+  // even evaluating whether vectorization is profitable. Since we cannot modify
+  // the incoming IR, we need to build VPlan upfront in the vectorization
+  // pipeline.
+  if (!L->isInnermost())
+    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+                                        ORE, GetBFI, OptForSize, Hints,
+                                        Requirements);
+
+  assert(L->isInnermost() && "Inner loop expected.");
+
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-  bool UseInterleaved =
-      IsInnerLoop && TTI->enableInterleavedAccessVectorization();
+  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
 
   // If an override option has been passed in for interleaved accesses, use it.
   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
-    UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses;
+    UseInterleaved = EnableInterleavedMemAccesses;
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved)
@@ -8155,11 +8305,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
-  // Outer loops don't have LoopAccessInfo, so skip the safety check and reset
-  // UserIC (interleaving is not supported for outer loops).
-  if (!IsInnerLoop)
-    UserIC = 0;
-  else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
+  if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
     UserIC = 1;
 
   // Plan how to best vectorize.
@@ -8167,16 +8313,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   auto [VF, BestPlanPtr] = LVP.computeBestVF();
   unsigned IC = 1;
 
-  // For VPlan build stress testing of outer loops, bail after plan
-  // construction.
-  if (!IsInnerLoop && VPlanBuildOuterloopStressTest)
-    return false;
-
-  if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME))
+  if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
   GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind);
-  if (IsInnerLoop && LVP.hasPlanWithVF(VF.Width)) {
+  if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = LVP.selectInterleaveCount(*BestPlanPtr, VF.Width, VF.Cost);
 
@@ -8419,9 +8560,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                  VF.MinProfitableTripCount);
     LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights);
 
-    if (!IsInnerLoop)
-      LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
-                        << "\"\n");
     LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
     ++LoopsVectorized;
   }

diff  --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 3c8f3362ae93a..77cc6484e9c6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1078,14 +1078,6 @@ const VPRegionBlock *VPlan::getVectorLoopRegion() const {
   return nullptr;
 }
 
-bool VPlan::isOuterLoop() const {
-  const VPRegionBlock *LoopRegion = getVectorLoopRegion();
-  assert(LoopRegion && "expected a vector loop region");
-  return any_of(VPBlockUtils::blocksOnly<const VPRegionBlock>(
-                    vp_depth_first_shallow(LoopRegion->getEntry())),
-                [](const VPRegionBlock *R) { return !R->isReplicator(); });
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPlan::printLiveIns(raw_ostream &O) const {
   VPSlotTracker SlotTracker(this);
@@ -1675,6 +1667,27 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
   return PredicateAtRangeStart;
 }
 
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
+void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
+                                           ElementCount MaxVF) {
+  auto MaxVFTimes2 = MaxVF * 2;
+  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+    VFRange SubRange = {VF, MaxVFTimes2};
+    if (auto Plan = tryToBuildVPlan(SubRange)) {
+      VPlanTransforms::optimize(*Plan);
+      // Update the name of the latch of the top-level vector loop region region
+      // after optimizations which includes block folding.
+      Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
+      VPlans.push_back(std::move(Plan));
+    }
+    VF = SubRange.End;
+  }
+}
+
 VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==

diff  --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 51193964bdd83..6a1ea6b3439bf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4616,10 +4616,6 @@ class VPlan {
   LLVM_ABI_FOR_TEST VPRegionBlock *getVectorLoopRegion();
   LLVM_ABI_FOR_TEST const VPRegionBlock *getVectorLoopRegion() const;
 
-  /// Returns true if this VPlan is for an outer loop, i.e., its vector
-  /// loop region contains a nested loop region.
-  LLVM_ABI_FOR_TEST bool isOuterLoop() const;
-
   /// Returns the 'middle' block of the plan, that is the block that selects
   /// whether to execute the scalar tail loop or the exit block from the loop
   /// latch. If there is an early exit from the vector loop, the middle block

diff  --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 2717b80e2eeaa..9710767f905fe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -260,9 +260,6 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
 }
 
 void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
-  // Nested loop regions (outer-loop vectorization) are not supported yet.
-  if (Plan.isOuterLoop())
-    return;
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.

diff  --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
index 7f442f7d72e78..e03110fc3807a 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=loop-vectorize -disable-output 2>&1  | FileCheck %s
+; RUN: opt < %s  -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1  | FileCheck %s
 
 ; This test checks that, when stress testing VPlan, if the computed VF
 ; is 1, we override it to VF = 4.

diff  --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
index f60a620deecf9..f6b215f43d68e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s
 
 ; Verify that the stress testing flag for the VPlan H-CFG builder works as
 ; expected with and without enabling the VPlan H-CFG Verifier.

diff  --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
index a610f0669f483..71bcd90304e43 100644
--- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -73,7 +73,7 @@ for.end15:
 ; CHECK-LABEL: case2
 ; CHECK: LV: Loop hints: force=enabled width=0 interleave=0
 ; CHECK: LV: We can vectorize this outer loop!
-; CHECK: LV: VPlan computed VF 1.
+; CHECK: LV: Using VF 1 to build VPlans.
 
 define void @case2(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) {
 entry:


        
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to