Previously, we kept the strict load/store order as it was before the transformation. But for loads/stores from/into different address spaces, the instruction order can be changed, and this gives us a chance to merge more load/store instructions.
Also enlarge the processing window a little, to (maxVecSize * 5). This change brings a significant performance benefit on the SKL platform. A typical case is: ./opencv_perf_core --gtest_filter=OCL_LUTFixture_LUT.LUT/15 v2: should not move BBI backward if it points to the first instruction in the basic block. Signed-off-by: Ruiling Song <ruiling.s...@intel.com> --- backend/src/llvm/llvm_loadstore_optimization.cpp | 89 +++++++++++++++++++----- 1 file changed, 72 insertions(+), 17 deletions(-) diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp index 121f53d..018a3ed 100644 --- a/backend/src/llvm/llvm_loadstore_optimization.cpp +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp @@ -70,11 +70,11 @@ namespace gbe { bool isLoadStoreCompatible(Value *A, Value *B); void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged); void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged); - BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB, - SmallVector<Instruction*, 16> &merged, - BasicBlock::iterator &start, - unsigned maxVecSize, - bool isLoad); + bool findConsecutiveAccess(BasicBlock &BB, + SmallVector<Instruction*, 16> &merged, + const BasicBlock::iterator &start, + unsigned maxVecSize, + bool isLoad); virtual const char *getPassName() const { return "Merge compatible Load/stores for Gen"; } @@ -159,38 +159,58 @@ namespace gbe { values[i]->replaceAllUsesWith(S); } } - - BasicBlock::iterator + // When searching for consecutive memory accesses, we do it in a small window; + // if the window is too large, it would take up too much compilation time. + // An important rule we have followed is: don't try to change load/store order. + // But an exception is a load & store that are from different address spaces. The + // return value will indicate whether such kind of reorder happens. 
+ bool GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB, SmallVector<Instruction*, 16> &merged, - BasicBlock::iterator &start, + const BasicBlock::iterator &start, unsigned maxVecSize, bool isLoad) { - BasicBlock::iterator stepForward = start; - if(!isSimpleLoadStore(&*start)) return stepForward; + if(!isSimpleLoadStore(&*start)) return false; merged.push_back(&*start); + unsigned targetAddrSpace = getAddressSpace(&*start); BasicBlock::iterator E = BB.end(); - BasicBlock::iterator J = ++start; + BasicBlock::iterator J = start; + ++J; - unsigned maxLimit = maxVecSize * 3; + unsigned maxLimit = maxVecSize * 5; + bool reordered = false; for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) { if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) { if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) { merged.push_back(&*J); - stepForward = ++J; } - } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) { + } else if((isLoad && isa<StoreInst>(*J))) { // simple stop to keep read/write order - break; + StoreInst *st = cast<StoreInst>(&*J); + unsigned addrSpace = st->getPointerAddressSpace(); + if (addrSpace != targetAddrSpace) { + reordered = true; + } else { + break; + } + } else if ((!isLoad && isa<LoadInst>(*J))) { + LoadInst *ld = cast<LoadInst>(&*J); + unsigned addrSpace = ld->getPointerAddressSpace(); + if (addrSpace != targetAddrSpace) { + reordered = true; + } else { + break; + } } if(merged.size() >= maxVecSize) break; } - return stepForward; + + return reordered; } void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) { @@ -220,6 +240,28 @@ namespace gbe { newST->setAlignment(align); } + // Find the safe iterator we can point to. If reorder happens, we need to + // point to the instruction after the first of toBeDeleted. 
If no reorder happens, + // we are safe to point to the instruction after the last of toBeDeleted + static BasicBlock::iterator + findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted, + const BasicBlock::iterator &current, + bool reorder) { + BasicBlock::iterator safe = current; + unsigned size = toBeDeleted.size(); + if (reorder) { + unsigned i = 0; + while (i < size && toBeDeleted[i] == &*safe) { + ++i; + ++safe; + } + } else { + safe = BasicBlock::iterator(toBeDeleted[size - 1]); + ++safe; + } + return safe; + } + bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) { bool changed = false; SmallVector<Instruction*, 16> merged; @@ -232,11 +274,18 @@ namespace gbe { if (!(ty->isFloatTy() || ty->isIntegerTy(32) || ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad))) continue; + unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 : (ty->isIntegerTy(16) ? 8 : 16); - BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad); + bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad); uint32_t size = merged.size(); uint32_t pos = 0; + bool doDeleting = size > 1; + if (doDeleting) { + // choose the next undeleted instruction + BBI = findSafeInstruction(merged, BBI, reorder); + } + while(size > 1) { unsigned vecSize = (size >= 16) ? 16 : (size >= 8 ? 8 : @@ -253,6 +302,12 @@ namespace gbe { pos += vecSize; size -= vecSize; } + if (doDeleting) { + // adjust the BBI back by one, as we would increment it in the for loop; + // don't do this if BBI points to the very first instruction. + if (BBI != BB.begin()) + --BBI; + } merged.clear(); } } -- 2.4.1 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet