Issue 156114
Summary Stall inside LoopLoadEliminationPass
Labels new issue
Assignees
Reporter abadams
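    We have a test case in Halide that times out inside LLVM when built against LLVM main. I have reduced it to the C++ below for ease of reproduction. Consider the following C++, where N is a constant supplied from outside the snippet that sets the trip count of the unrolled inner loop over k: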

```
typedef float vector_t __attribute__((ext_vector_type(8)));

// Loads 8 floats through `ptr`, advancing the pointer by 1000 elements after
// each load, and returns them as the 8 lanes of a vector_t (lane i holds the
// value read at ptr + 1000 * i).
vector_t gather(const float *ptr) {
 vector_t result;
// Request that the 8-iteration lane loop be unrolled.
#pragma unroll
    for (int i = 0; i < 8; i++) {
 result[i] = *ptr;
        // Step to the next strided element.
        ptr += 1000;
    }
    return result;
}

// For each x in [0, n): sums N strided gathers from `src` into a
// zero-initialized vector_t and copies that vector's bytes to dst + x * 8.
//
// src, dst: declared __restrict__.
// off:      added to x before the shift that forms the base index.
// n:        trip count of the outer loop.
//
// NOTE(review): N is not defined in this snippet; presumably it is supplied as
// a compile-time constant (e.g. a macro on the command line) -- confirm.
void blur(const float *__restrict__ src, float *__restrict__ dst, int off, int n) {
    for (int x = 0; x < n; x++) {
        vector_t result{};
// Request that the N-iteration accumulation loop be unrolled.
#pragma unroll
        for (int k = 0; k < N; k++) {
 // idx does not depend on k; only the +k offset below varies per iteration.
 int idx = (x + off) << 11;
            result += gather(src + (idx + k));
 }
        // Store the accumulated vector (sizeof(result) bytes) at dst + x * 8.
        __builtin_memcpy(dst + x * 8, &result, sizeof(result));
 }
}
```

Compile time seems to be both very slow and also superlinear in N, which is the size of that unrolled inner loop over k. Here's the timing I get for various values of N:

```
N=10

real	0m0.068s
user	0m0.040s
sys	0m0.028s

N=20

real	0m0.076s
user	0m0.058s
sys	0m0.018s

N=30

real	0m0.096s
user	0m0.072s
sys	0m0.023s

N=40

real	0m7.253s
user	0m7.222s
sys	0m0.029s

N=50

real	0m13.966s
user	0m13.934s
sys	0m0.030s

N=60

real	0m24.731s
user	0m24.700s
sys	0m0.026s

N=70

real	0m38.657s
user	0m38.607s
sys	0m0.036s

N=80

real	0m59.675s
user	0m59.630s
sys	0m0.034s

N=90

real	1m23.424s
user	1m23.367s
sys	0m0.042s

N=100

real	1m57.814s
user	1m57.755s
sys	0m0.036s

```

A sample stack trace:
```
#0  __memset_evex_unaligned_erms () at ../sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:301
#1 0x00005555583ca480 in llvm::SmallPtrSetImplBase::Grow(unsigned int) ()
#2 0x00005555583ca663 in llvm::SmallPtrSetImplBase::insert_imp_big(void const*) ()
#3  0x0000555557564580 in llvm::ScalarEvolution::getUsedLoops(llvm::SCEV const*, llvm::SmallPtrSetImpl<llvm::Loop const*>&) ()
#4  0x000055555759c184 in llvm::ScalarEvolution::isKnownViaInduction(llvm::CmpPredicate, llvm::SCEV const*, llvm::SCEV const*) ()
#5  0x000055555759c72f in llvm::ScalarEvolution::isKnownPredicate(llvm::CmpPredicate, llvm::SCEV const*, llvm::SCEV const*) ()
#6  0x00005555575a20ea in llvm::SCEVWrapPredicate::implies(llvm::SCEVPredicate const*, llvm::ScalarEvolution&) const ()
#7  0x0000555557568dc3 in llvm::SCEVUnionPredicate::add(llvm::SCEVPredicate const*, llvm::ScalarEvolution&) ()
#8  0x00005555575c4853 in llvm::PredicatedScalarEvolution::addPredicate(llvm::SCEVPredicate const&) [clone .part.0] ()
#9  0x00005555575cbe9b in llvm::PredicatedScalarEvolution::setNoOverflow(llvm::Value*, llvm::SCEVWrapPredicate::IncrementWrapFlags) ()
#10 0x00005555574d0ff4 in isNoWrap(llvm::PredicatedScalarEvolution&, llvm::SCEVAddRecExpr const*, llvm::Value*, llvm::Type*, llvm::Loop const*, bool, std::optional<long>) ()
#11 0x00005555574db1ea in (anonymous namespace)::AccessAnalysis::createCheckForAccess(llvm::RuntimePointerChecking&, llvm::PointerIntPair<llvm::Value*, 1u, bool, llvm::PointerLikeTypeTraits<llvm::Value*>, llvm::PointerIntPairInfo<llvm::Value*, 1u, llvm::PointerLikeTypeTraits<llvm::Value*> > >, llvm::Type*, llvm::DenseMap<llvm::Value*, llvm::SCEV const*, llvm::DenseMapInfo<llvm::Value*, void>, llvm::detail::DenseMapPair<llvm::Value*, llvm::SCEV const*> > const&, llvm::DenseMap<llvm::Value*, unsigned int, llvm::DenseMapInfo<llvm::Value*, void>, llvm::detail::DenseMapPair<llvm::Value*, unsigned int> >&, llvm::Loop*, unsigned int&, unsigned int, bool) ()
#12 0x00005555574dea37 in (anonymous namespace)::AccessAnalysis::canCheckPtrAtRT(llvm::RuntimePointerChecking&, llvm::Loop*, llvm::DenseMap<llvm::Value*, llvm::SCEV const*, llvm::DenseMapInfo<llvm::Value*, void>, llvm::detail::DenseMapPair<llvm::Value*, llvm::SCEV const*> > const&, llvm::Value*&, bool) [clone .part.0] ()
#13 0x00005555574e4e3f in llvm::LoopAccessInfo::analyzeLoop(llvm::AAResults*, llvm::LoopInfo const*, llvm::TargetLibraryInfo const*, llvm::DominatorTree*) ()
#14 0x00005555574e59cc in llvm::LoopAccessInfo::LoopAccessInfo(llvm::Loop*, llvm::ScalarEvolution*, llvm::TargetTransformInfo const*, llvm::TargetLibraryInfo const*, llvm::AAResults*, llvm::DominatorTree*, llvm::LoopInfo*, bool) ()
#15 0x00005555574e6242 in llvm::LoopAccessInfoManager::getInfo(llvm::Loop&, bool) ()
#16 0x000055555bbf0100 in llvm::LoopLoadEliminationPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) ()
#17 0x000055555948b9a6 in llvm::detail::PassModel<llvm::Function, llvm::LoopLoadEliminationPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) ()
#18 0x0000555557ef4279 in llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) ()
#19 0x00005555560b7856 in llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) ()
#20 0x0000555557ef4772 in llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) ()
#21 0x00005555560b8216 in llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) ()
#22 0x0000555557ef2331 in llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) ()
#23 0x00005555586d8f8f in (anonymous namespace)::EmitAssemblyHelper::RunOptimizationPipeline(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream> >&, std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile> >&, clang::BackendConsumer*) ()
...
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to