Author: Florian Hahn Date: 2026-05-08T21:52:53+02:00 New Revision: 9414ed711700de33ac728566c247186e4ff1790f
URL: https://github.com/llvm/llvm-project/commit/9414ed711700de33ac728566c247186e4ff1790f DIFF: https://github.com/llvm/llvm-project/commit/9414ed711700de33ac728566c247186e4ff1790f.diff LOG: Revert "[VPlan] Unify inner and outer loop paths (NFCI). (#192868)" This reverts commit b84f58ee844ca929db2fff2e41e2195e255548b8. Added: Modified: llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/lib/Transforms/Vectorize/VPlan.cpp llvm/lib/Transforms/Vectorize/VPlan.h llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll Removed: ################################################################################ diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h index 0d45c159d315c..18906aa7eeae3 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -169,12 +169,9 @@ struct LoopVectorizePass : public OptionalPassInfoMixin<LoopVectorizePass> { /// purposes along with the corresponding optimization remark \p RemarkName. /// If \p I is passed, it is an instruction that prevents vectorization. /// Otherwise, the loop \p TheLoop is used for the location of the remark. -LLVM_ABI void reportVectorizationFailure(const StringRef DebugMsg, - const StringRef OREMsg, - const StringRef ORETag, - OptimizationRemarkEmitter *ORE, - const Loop *TheLoop, - Instruction *I = nullptr); +LLVM_ABI void reportVectorizationFailure( + const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr); /// Same as above, but the debug message and optimization remark are identical inline void reportVectorizationFailure(const StringRef DebugMsg, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp index f29834d2f804e..91476cf232fe0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.cpp @@ -27,8 +27,6 @@ using namespace llvm; #define DEBUG_TYPE "loop-vectorize" -extern cl::opt<bool> VPlanBuildOuterloopStressTest; - static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -620,49 +618,3 @@ void VFSelectionContext::collectInLoopReductions() { << " reduction for phi: " << *Phi << "\n"); } } - -// TODO: we could return a pair of values that specify the max VF and -// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of -// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment -// doesn't have a cost model that can choose which plan to execute if -// more than one is generated. -FixedScalableVFPair -VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) { - if (UserVF.isScalable() && !supportsScalableVectors()) { - reportVectorizationFailure( - "Scalable vectorization requested but not supported by the target", - "the scalable user-specified vectorization width for outer-loop " - "vectorization cannot be used because the target does not support " - "scalable vectors.", - "ScalableVFUnfeasible", ORE, TheLoop); - return FixedScalableVFPair::getNone(); - } - - ElementCount VF = UserVF; - if (VF.isZero()) { - auto [_, WidestType] = getSmallestAndWidestTypes(); - - auto RegKind = TTI.enableScalableVectorization() - ? TargetTransformInfo::RGK_ScalableVector - : TargetTransformInfo::RGK_FixedWidthVector; - - TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); - unsigned N = RegSize.getKnownMinValue() / WidestType; - VF = ElementCount::get(N, RegSize.isScalable()); - LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); - - // Make sure we have a VF > 1 for stress testing. - if (VPlanBuildOuterloopStressTest && VF.isScalar()) { - LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " - << "overriding computed VF.\n"); - VF = ElementCount::getFixed(4); - } - } - assert(isPowerOf2_32(VF.getKnownMinValue()) && - "VF needs to be a power of two"); - if (VF.isScalar()) - return FixedScalableVFPair::getNone(); - LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") - << "VF " << VF << " to build VPlans.\n"); - return FixedScalableVFPair(VF); -} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 00b689326d770..a6789974e0bd6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -703,10 +703,6 @@ class VFSelectionContext { /// for size, returning true here aborts vectorization. bool runtimeChecksRequired(); - /// Returns a scalable VF to use for outer-loop vectorization if the target - /// supports it and a fixed VF otherwise. - FixedScalableVFPair computeVPlanOuterloopVF(ElementCount UserVF); - /// Compute smallest bitwidth each instruction can be represented with. /// The vector equivalents of these instructions should be truncated to this /// type. @@ -793,6 +789,10 @@ class LoopVectorizationPlanner { /// interleaving should be avoided up-front, no plans are generated. void plan(ElementCount UserVF, unsigned UserIC); + /// Use the VPlan-native path to plan how to best vectorize, return the best + /// VF and its cost. + VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + /// Return the VPlan for \p VF. At the moment, there is always a single VPlan /// for each VF. VPlan &getPlanFor(ElementCount VF) const; @@ -881,22 +881,34 @@ class LoopVectorizationPlanner { unsigned OrigLoopInvocationWeight, unsigned EstimatedVFxUF, bool DisableRuntimeUnroll); -private: - /// Build a VPlan using VPRecipes according to the information gathered by - /// Legal and VPlan-based analysis. For outer loops, performs basic recipe - /// conversion only. For inner loops, \p Range's largest included VF is - /// restricted to the maximum VF the returned VPlan is valid for. If no VPlan - /// can be built for the input range, set the largest included VF to the - /// maximum VF for which no plan could be built. Each VPlan is built starting - /// from a copy of \p InitialPlan, which is a plain CFG VPlan wrapping the - /// original scalar loop. - VPlanPtr tryToBuildVPlan(VPlanPtr InitialPlan, VFRange &Range); - +protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. void buildVPlans(ElementCount MinVF, ElementCount MaxVF); +private: + /// Build a VPlan according to the information gathered by Legal. \return a + /// VPlan for vectorization factors \p Range.Start and up to \p Range.End + /// exclusive, possibly decreasing \p Range.End. If no VPlan can be built for + /// the input range, set the largest included VF to the maximum VF for which + /// no plan could be built. + VPlanPtr tryToBuildVPlan(VFRange &Range); + + /// Build a VPlan using VPRecipes according to the information gather by + /// Legal. This method is only used for the legacy inner loop vectorizer. + /// \p Range's largest included VF is restricted to the maximum VF the + /// returned VPlan is valid for. If no VPlan can be built for the input range, + /// set the largest included VF to the maximum VF for which no plan could be + /// built. Each VPlan is built starting from a copy of \p InitialPlan, which + /// is a plain CFG VPlan wrapping the original scalar loop. + VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range); + + /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, + /// according to the information gathered by Legal when it checked if it is + /// legal to vectorize the loop. This method creates VPlans using VPRecipes. + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + /// Add ComputeReductionResult recipes to the middle block to compute the /// final reduction results. Add Select recipes to the latch block when /// folding tail, to feed ComputeReductionResult with the last or penultimate diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1ace2275e2b6d..ae1d6d83cccd4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -350,8 +350,8 @@ cl::opt<bool> llvm::VPlanPrintVectorRegionScope( // VPlan-native vectorization path. It must be used in conjuction with // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the // verification of the H-CFGs built. -cl::opt<bool> VPlanBuildOuterloopStressTest( - "vplan-build-outerloop-stress-test", cl::init(false), cl::Hidden, +static cl::opt<bool> VPlanBuildStressTest( + "vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc( "Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " @@ -745,8 +745,8 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, - OptimizationRemarkEmitter *ORE, - const Loop *TheLoop, Instruction *I) { + OptimizationRemarkEmitter *ORE, Loop *TheLoop, + Instruction *I) { LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); LoopVectorizeHints Hints(TheLoop, false /* doesn't matter */, *ORE); ORE->emit( @@ -1877,7 +1877,7 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, // now, only collect outer loops that have explicit vectorization hints. If we // are stress testing the VPlan H-CFG construction, we collect the outermost // loop of every loop nest. - if (L.isInnermost() || VPlanBuildOuterloopStressTest || + if (L.isInnermost() || VPlanBuildStressTest || (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { LoopBlocksRPO RPOT(&L); RPOT.perform(LI); @@ -2868,12 +2868,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { - // For outer loops, use simple type-based heuristic VF. No cost model or - // memory dependence analysis is available. - if (!TheLoop->isInnermost()) { - return Config.computeVPlanOuterloopVF(UserVF); - } - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may be useful to do since it's still likely to be dynamically // uniform if the target can skip. @@ -5672,7 +5666,83 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } +// This function will select a scalable VF if the target supports scalable +// vectors and a fixed one otherwise. +// TODO: we could return a pair of values that specify the max VF and +// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of +// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment +// doesn't have a cost model that can choose which plan to execute if +// more than one is generated. +static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, + VFSelectionContext &Config) { + unsigned WidestType = Config.getSmallestAndWidestTypes().second; + + TargetTransformInfo::RegisterKind RegKind = + TTI.enableScalableVectorization() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; + + TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); + unsigned N = RegSize.getKnownMinValue() / WidestType; + return ElementCount::get(N, RegSize.isScalable()); +} + +VectorizationFactor +LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { + ElementCount VF = UserVF; + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + if (!OrigLoop->isInnermost()) { + // If the user doesn't provide a vectorization factor, determine a + // reasonable one. + if (UserVF.isZero()) { + VF = determineVPlanVF(TTI, Config); + LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); + + // Make sure we have a VF > 1 for stress testing. + if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { + LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " + << "overriding computed VF.\n"); + VF = ElementCount::getFixed(4); + } + } else if (UserVF.isScalable() && !Config.supportsScalableVectors()) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " + << "not supported by the target.\n"); + reportVectorizationFailure( + "Scalable vectorization requested but not supported by the target", + "the scalable user-specified vectorization width for outer-loop " + "vectorization cannot be used because the target does not support " + "scalable vectors.", + "ScalableVFUnfeasible", ORE, OrigLoop); + return VectorizationFactor::Disabled(); + } + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") + << "VF " << VF << " to build VPlans.\n"); + buildVPlans(VF, VF); + + if (VPlans.empty()) + return VectorizationFactor::Disabled(); + + // For VPlan build stress testing, we bail out after VPlan construction. + if (VPlanBuildStressTest) + return VectorizationFactor::Disabled(); + + return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + } + + LLVM_DEBUG( + dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " + "VPlan-native path.\n"); + return VectorizationFactor::Disabled(); +} + void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + assert(OrigLoop->isInnermost() && "Inner loop expected."); CM.collectValuesToIgnore(); Config.collectElementTypesForWidening(&CM.ValuesToIgnore); @@ -5680,16 +5750,6 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return; - if (!OrigLoop->isInnermost()) { - // For outer loops, computeMaxVF returns a single non-scalar VF; build a - // plan for only that VF. - ElementCount VF = - MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF; - buildVPlans(VF, VF); - LLVM_DEBUG(printPlans(dbgs())); - return; - } - // Compute the minimal bitwidths required for integer operations in the loop // for later use by the cost model. Config.computeMinimalBitwidths(); @@ -5730,9 +5790,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (EpilogueUserVF.isVector() && ElementCount::isKnownLT(EpilogueUserVF, UserVF)) { CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF); - buildVPlans(EpilogueUserVF, EpilogueUserVF); + buildVPlansWithVPRecipes(EpilogueUserVF, EpilogueUserVF); } - buildVPlans(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF, UserVF); if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) { // For scalar VF, skip VPlan cost check as VPlan cost is designed for // vector VFs only. @@ -5764,8 +5824,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.collectNonVectorizedAndSetWideningDecisions(VF); } - buildVPlans(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlans(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); LLVM_DEBUG(printPlans(dbgs())); } @@ -5983,25 +6043,22 @@ LoopVectorizationPlanner::computeBestVF() { return {VectorizationFactor::Disabled(), nullptr}; // If there is a single VPlan with a single VF, return it directly. VPlan &FirstPlan = *VPlans[0]; - ElementCount UserVF = Hints.getWidth(); - if (VPlans.size() == 1) { - // For outer loops, the plan has a single vector VF determined by the - // heuristic. - assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) || - FirstPlan.isOuterLoop()) && - "must have a single scalar VF, UserVF or an outer loop"); - return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan}; - } - - if (hasPlanWithVF(UserVF) && EpilogueVectorizationForceVF > 1) { - assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built"); - assert(VPlans[0]->getSingleVF() == - ElementCount::getFixed(EpilogueVectorizationForceVF) && - "expected first plan to be for the forced epilogue VF"); - assert(VPlans[1]->getSingleVF() == UserVF && - "expected second plan to be for the forced UserVF"); - return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()}; + if (hasPlanWithVF(UserVF)) { + if (VPlans.size() == 1) { + assert(FirstPlan.getSingleVF() == UserVF && + "UserVF must match single VF"); + return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan}; + } + if (EpilogueVectorizationForceVF > 1) { + assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built"); + assert(VPlans[0]->getSingleVF() == + ElementCount::getFixed(EpilogueVectorizationForceVF) && + "expected first plan to be for the forced epilogue VF"); + assert(VPlans[1]->getSingleVF() == UserVF && + "expected second plan to be for the forced UserVF"); + return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()}; + } } LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: " @@ -6747,38 +6804,30 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R, // optimizations. static void printOptimizedVPlan(VPlan &) {} -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, + ElementCount MaxVF) { if (ElementCount::isKnownGT(MinVF, MaxVF)) return; - bool IsInnerLoop = OrigLoop->isInnermost(); - - // Set up loop versioning for inner loops with memory runtime checks. - // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not - // called for them. - std::optional<LoopVersioning> LVer; - if (IsInnerLoop) { - const LoopAccessInfo *LAI = Legal->getLAI(); - LVer.emplace(*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, - LI, DT, PSE.getSE()); - if (!LAI->getRuntimePointerChecking()->getChecks().empty() && - !LAI->getRuntimePointerChecking()->getDiffChecks()) { - // Only use noalias metadata when using memory checks guaranteeing no - // overlap across all iterations. - LVer->prepareNoAliasMetadata(); - } + assert(OrigLoop->isInnermost() && "Inner loop expected."); + + const LoopAccessInfo *LAI = Legal->getLAI(); + LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(), + OrigLoop, LI, DT, PSE.getSE()); + if (!LAI->getRuntimePointerChecking()->getChecks().empty() && + !LAI->getRuntimePointerChecking()->getDiffChecks()) { + // Only use noalias metadata when using memory checks guaranteeing no + // overlap across all iterations. + LVer.prepareNoAliasMetadata(); } // Create initial base VPlan0, to serve as common starting point for all // candidates built later for specific VF ranges. auto VPlan0 = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, - LVer ? &*LVer : nullptr); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); - // Create recipes for header phis. For outer loops, reductions, recurrences - // and in-loop reductions are empty since legality doesn't detect them. + // Create recipes for header phis. if (!RUN_VPLAN_PASS(VPlanTransforms::createHeaderPhiRecipes, *VPlan0, PSE, *OrigLoop, Legal->getInductionVars(), Legal->getReductionVars(), @@ -6813,8 +6862,8 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - auto Plan = - tryToBuildVPlan(std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange); + auto Plan = tryToBuildVPlanWithVPRecipes( + std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange); VF = SubRange.End; if (!Plan) @@ -6842,21 +6891,9 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, } } -VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan, - VFRange &Range) { - - // For outer loops, the plan only needs basic recipe conversion and induction - // live-out optimization; the full inner-loop recipe building below does not - // apply (no widening decisions, interleave groups, reductions, etc.). - if (Plan->isOuterLoop()) { - for (ElementCount VF : Range) - Plan->addVF(VF); - if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI)) - return nullptr; - VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE, - /*FoldTail=*/false); - return Plan; - } +VPlanPtr +LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VPlanPtr Plan, + VFRange &Range) { using namespace llvm::VPlanPatternMatch; SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -7079,6 +7116,47 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan, return Plan; } +VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + assert(!OrigLoop->isInnermost()); + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + + auto Plan = VPlanTransforms::buildVPlan0( + OrigLoop, *LI, Legal->getWidestInductionType(), + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + + if (!VPlanTransforms::createHeaderPhiRecipes( + *Plan, PSE, *OrigLoop, Legal->getInductionVars(), + MapVector<PHINode *, RecurrenceDescriptor>(), + SmallPtrSet<const PHINode *, 1>(), SmallPtrSet<PHINode *, 1>(), + /*AllowReordering=*/false)) + return nullptr; + [[maybe_unused]] bool CanHandleExits = VPlanTransforms::handleEarlyExits( + *Plan, UncountableExitStyle::NoUncountableExit, OrigLoop, PSE, *DT, + Legal->getAssumptionCache()); + assert(CanHandleExits && + "early-exits are not supported in VPlan-native path"); + VPlanTransforms::addMiddleCheck(*Plan, /*TailFolded*/ false); + + VPlanTransforms::createLoopRegions(*Plan); + + for (ElementCount VF : Range) + Plan->addVF(VF); + + if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(*Plan, *TLI)) + return nullptr; + + // Optimize induction live-out users to use precomputed end values. + VPlanTransforms::optimizeInductionLiveOutUsers(*Plan, PSE, + /*FoldTail=*/false); + + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); + return Plan; +} + void LoopVectorizationPlanner::addReductionResultComputation( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { using namespace VPlanPatternMatch; @@ -7279,7 +7357,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { // VPlan-native path does not do any analysis for runtime checks // currently. - assert((!EnableVPlanNativePath || !Plan.isOuterLoop()) && + assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && "Runtime checks are not supported for outer loops yet"); if (Config.OptForSize) { @@ -7360,6 +7438,75 @@ getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, return CM_EpilogueAllowed; } +// Process the loop in the VPlan-native vectorization path. This path builds +// VPlan upfront in the vectorization pipeline, which allows to apply +// VPlan-to-VPlan transformations from the very beginning without modifying the +// input LLVM IR. +static bool processLoopInVPlanNativePath( + Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, + LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, + std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize, + LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) { + + if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { + LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); + return false; + } + assert(EnableVPlanNativePath && "VPlan-native path is disabled."); + Function *F = L->getHeader()->getParent(); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); + + EpilogueLowering SEL = + getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI); + + VFSelectionContext Config(*TTI, LVL, L, *F, PSE, DB, ORE, &Hints, OptForSize); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, AC, ORE, + GetBFI, F, &Hints, IAI, Config); + // Use the planner for outer loop vectorization. + // TODO: CM is not used at this point inside the planner. Turn CM into an + // optional argument if we don't need it in the future. + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, Config, IAI, PSE, + Hints, ORE); + + // Get user vectorization factor. + ElementCount UserVF = Hints.getWidth(); + + Config.collectElementTypesForWidening(); + + // Plan how to best vectorize, return the best VF and its cost. + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + + // If we are stress testing VPlan builds, do not attempt to generate vector + // code. Masked vector code generation support will follow soon. + // Also, do not attempt to vectorize if no vector code will be produced. + if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) + return false; + + VPlan &BestPlan = LVP.getPlanFor(VF.Width); + + { + GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind); + InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM, + Checks, BestPlan); + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName() + << "\"\n"); + LVP.addMinimumIterationCheck(BestPlan, VF.Width, /*UF=*/1, + VF.MinProfitableTripCount); + bool HasBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); + LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights); + + reportVectorization(ORE, L, VF, 1); + + LVP.executePlan(VF.Width, /*UF=*/1, BestPlan, LB, DT); + } + + assert(!verifyFunction(*F, &dbgs())); + return true; +} + // Emit a remark if there are stores to floats that required a floating point // extension. If the vectorized loop was generated with floating point there // will be a performance penalty from the conversion overhead and the change in @@ -8029,14 +8176,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - bool IsInnerLoop = L->isInnermost(); - - // Outer loops require a computable trip count. - if (!IsInnerLoop && isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { - LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); - return false; - } - if (LVL.hasUncountableEarlyExit()) { if (!EnableEarlyExitVectorization) { reportVectorizationFailure("Auto-vectorization of loops with uncountable " @@ -8046,13 +8185,24 @@ bool LoopVectorizePass::processLoop(Loop *L) { } } + // Entrance to the VPlan-native vectorization path. Outer loops are processed + // here. They may require CFG and instruction level transformations before + // even evaluating whether vectorization is profitable. Since we cannot modify + // the incoming IR, we need to build VPlan upfront in the vectorization + // pipeline. + if (!L->isInnermost()) + return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, + ORE, GetBFI, OptForSize, Hints, + Requirements); + + assert(L->isInnermost() && "Inner loop expected."); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); - bool UseInterleaved = - IsInnerLoop && TTI->enableInterleavedAccessVectorization(); + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); // If an override option has been passed in for interleaved accesses, use it. if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) - UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses; + UseInterleaved = EnableInterleavedMemAccesses; // Analyze interleaved memory accesses. if (UseInterleaved) @@ -8155,11 +8305,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - // Outer loops don't have LoopAccessInfo, so skip the safety check and reset - // UserIC (interleaving is not supported for outer loops). - if (!IsInnerLoop) - UserIC = 0; - else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth()) + if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth()) UserIC = 1; // Plan how to best vectorize. @@ -8167,16 +8313,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { auto [VF, BestPlanPtr] = LVP.computeBestVF(); unsigned IC = 1; - // For VPlan build stress testing of outer loops, bail after plan - // construction. - if (!IsInnerLoop && VPlanBuildOuterloopStressTest) - return false; - - if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME)) + if (ORE->allowExtraAnalysis(LV_NAME)) LVP.emitInvalidCostRemarks(ORE); GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind); - if (IsInnerLoop && LVP.hasPlanWithVF(VF.Width)) { + if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = LVP.selectInterleaveCount(*BestPlanPtr, VF.Width, VF.Cost); @@ -8419,9 +8560,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { VF.MinProfitableTripCount); LVP.attachRuntimeChecks(BestPlan, Checks, HasBranchWeights); - if (!IsInnerLoop) - LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName() - << "\"\n"); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); ++LoopsVectorized; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 3c8f3362ae93a..77cc6484e9c6c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1078,14 +1078,6 @@ const VPRegionBlock *VPlan::getVectorLoopRegion() const { return nullptr; } -bool VPlan::isOuterLoop() const { - const VPRegionBlock *LoopRegion = getVectorLoopRegion(); - assert(LoopRegion && "expected a vector loop region"); - return any_of(VPBlockUtils::blocksOnly<const VPRegionBlock>( - vp_depth_first_shallow(LoopRegion->getEntry())), - [](const VPRegionBlock *R) { return !R->isReplicator(); }); -} - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); @@ -1675,6 +1667,27 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( return PredicateAtRangeStart; } +/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, +/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range +/// of VF's starting at a given VF and extending it as much as possible. Each +/// vectorization decision can potentially shorten this sub-range during +/// buildVPlan(). +void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, + ElementCount MaxVF) { + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; + if (auto Plan = tryToBuildVPlan(SubRange)) { + VPlanTransforms::optimize(*Plan); + // Update the name of the latch of the top-level vector loop region region + // after optimizations which includes block folding. + Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch"); + VPlans.push_back(std::move(Plan)); + } + VF = SubRange.End; + } +} + VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const { assert(count_if(VPlans, [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 51193964bdd83..6a1ea6b3439bf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4616,10 +4616,6 @@ class VPlan { LLVM_ABI_FOR_TEST VPRegionBlock *getVectorLoopRegion(); LLVM_ABI_FOR_TEST const VPRegionBlock *getVectorLoopRegion() const; - /// Returns true if this VPlan is for an outer loop, i.e., its vector - /// loop region contains a nested loop region. - LLVM_ABI_FOR_TEST bool isOuterLoop() const; - /// Returns the 'middle' block of the plan, that is the block that selects /// whether to execute the scalar tail loop or the exit block from the loop /// latch. If there is an early exit from the vector loop, the middle block diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index 2717b80e2eeaa..9710767f905fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -260,9 +260,6 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) { } void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) { - // Nested loop regions (outer-loop vectorization) are not supported yet. - if (Plan.isOuterLoop()) - return; VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll index 7f442f7d72e78..e03110fc3807a 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-stress-test-no-explict-vf.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s ; This test checks that, when stress testing VPlan, if the computed VF ; is 1, we override it to VF = 4. diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll index f60a620deecf9..f6b215f43d68e 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan_hcfg_stress_test.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-outerloop-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=vplan -disable-output 2>&1 | FileCheck %s ; Verify that the stress testing flag for the VPlan H-CFG builder works as ; expected with and without enabling the VPlan H-CFG Verifier. diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll index a610f0669f483..71bcd90304e43 100644 --- a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll @@ -73,7 +73,7 @@ for.end15: ; CHECK-LABEL: case2 ; CHECK: LV: Loop hints: force=enabled width=0 interleave=0 ; CHECK: LV: We can vectorize this outer loop! -; CHECK: LV: VPlan computed VF 1. +; CHECK: LV: Using VF 1 to build VPlans. define void @case2(ptr nocapture %a, ptr nocapture readonly %b, i32 %N, i32 %M) { entry: _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
