Author: Yaxun (Sam) Liu
Date: 2020-03-19T00:15:47-04:00
New Revision: f528df8e269df8b10de4a2458e99b42d6240ea91
URL: https://github.com/llvm/llvm-project/commit/f528df8e269df8b10de4a2458e99b42d6240ea91
DIFF: https://github.com/llvm/llvm-project/commit/f528df8e269df8b10de4a2458e99b42d6240ea91.diff

LOG: Revert "Add a test for UsedDeclVisitor"

This reverts commit b58f6bb1207521cdf6a856004525d7bffa5f2153.

Added: 


Modified: 
    mlir/lib/Transforms/AffineDataCopyGeneration.cpp

Removed: 
    clang/test/CodeGenCXX/used-decl-visitor.cpp


################################################################################
diff --git a/clang/test/CodeGenCXX/used-decl-visitor.cpp b/clang/test/CodeGenCXX/used-decl-visitor.cpp
deleted file mode 100644
index 2b923ab562db..000000000000
--- a/clang/test/CodeGenCXX/used-decl-visitor.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: %clang_cc1 -triple x86_64 -emit-llvm -o %t %s
-
-// Make sure there is no assertion due to UsedDeclVisitor.
-
-struct A {
-  int a;
-};
-
-static A a;
-
-struct B {
-  B(int b = a.a) {}
-};
-
-
-void foo() {
-  B();
-}

diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
index a7f90b0c19b9..5409c557da83 100644
--- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
+++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp
@@ -1,14 +1,268 @@
-struct A {
-  int a;
-};
+//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to automatically promote accessed memref regions
+// to buffers in a faster memory space that is explicitly managed, with the
+// necessary data movement operations performed through either regular
+// point-wise load/store's or DMAs. Such explicit copying (also referred to as
+// array packing/unpacking in the literature), when done on arrays that exhibit
+// reuse, results in near elimination of conflict misses, TLB misses, reduced
+// use of hardware prefetch streams, and reduced false sharing. It is also
+// necessary for hardware that explicitly manages levels in the memory
+// hierarchy, and where DMAs may have to be used. This optimization is often
+// performed on already tiled code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Utils.h"
+#include "mlir/Dialect/AffineOps/AffineOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/Utils.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "affine-data-copy-generate"
+
+using namespace mlir;
+
+static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
+
+static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
+    "affine-data-copy-generate-fast-mem-capacity",
+    llvm::cl::desc(
+        "Set fast memory space capacity in KiB (default: unlimited)"),
+    llvm::cl::cat(clOptionsCategory));
+
+static llvm::cl::opt<bool>
+    clDma("affine-data-copy-generate-dma",
+          llvm::cl::desc("Generate DMA instead of point-wise copy"),
+          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));
+
+static llvm::cl::opt<unsigned> clFastMemorySpace(
+    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
+    llvm::cl::desc(
+        "Fast memory space identifier for copy generation (default: 1)"),
+    llvm::cl::cat(clOptionsCategory));
+
+static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
+    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
+    llvm::cl::init(false),
+    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
                   "for copy placement"),
+    llvm::cl::cat(clOptionsCategory));
 
-static A a;
+namespace {
 
-struct B {
-  B(int b = a.a) {}
+/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
+/// introducing copy operations to transfer data into `fastMemorySpace` and
+/// rewriting the original load's/store's to instead load/store from the
+/// allocated fast memory buffers. Additional options specify the identifier
+/// corresponding to the fast memory space and the amount of fast memory space
+/// available. The pass traverses through the nesting structure, recursing to
+/// inner levels if necessary to determine at what depth copies need to be
+/// placed so that the allocated buffers fit within the memory capacity
+/// provided.
+// TODO(bondhugula): We currently can't generate copies correctly when stores
+// are strided. Check for strided stores.
+struct AffineDataCopyGeneration
+    : public FunctionPass<AffineDataCopyGeneration> {
+  explicit AffineDataCopyGeneration(
+      unsigned slowMemorySpace = 0,
+      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
+      int minDmaTransferSize = 1024,
+      uint64_t fastMemCapacityBytes =
+          (clFastMemoryCapacity.getNumOccurrences() > 0
+               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
+               : std::numeric_limits<uint64_t>::max()),
+      bool generateDma = clDma,
+      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
+      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
+        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
+        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
+        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}
+
+  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
+      : slowMemorySpace(other.slowMemorySpace),
+        fastMemorySpace(other.fastMemorySpace),
+        tagMemorySpace(other.tagMemorySpace),
+        minDmaTransferSize(other.minDmaTransferSize),
+        fastMemCapacityBytes(other.fastMemCapacityBytes),
+        generateDma(other.generateDma),
+        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}
+
+  void runOnFunction() override;
+  LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);
+
+  // Slow memory space associated with copies.
+  const unsigned slowMemorySpace;
+  // Fast memory space associated with copies.
+  unsigned fastMemorySpace;
+  // Memory space associated with DMA tags.
+  unsigned tagMemorySpace;
+  // Minimum DMA transfer size supported by the target in bytes.
+  const int minDmaTransferSize;
+  // Capacity of the faster memory space.
+  uint64_t fastMemCapacityBytes;
+
+  // If set, generate DMA operations instead of read/write.
+  bool generateDma;
+
+  // If set, ignore loops with steps other than 1.
+  bool skipNonUnitStrideLoops;
+
+  // Constant zero index to avoid too many duplicates.
+  Value zeroIndex = nullptr;
 };
 
+} // end anonymous namespace
 
-void foo() {
-  B();
+/// Generates copies for memref's living in 'slowMemorySpace' into newly created
+/// buffers in 'fastMemorySpace', and replaces memory operations to the former
+/// by the latter. Only load op's handled for now.
+/// TODO(bondhugula): extend this to store op's.
+std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
+    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
+    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
+  return std::make_unique<AffineDataCopyGeneration>(
+      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
+      fastMemCapacityBytes);
 }
+
+/// Generate copies for this block. The block is partitioned into separate
+/// ranges: each range is either a sequence of one or more operations starting
+/// and ending with an affine load or store op, or just an affine.for op (which
+/// could have other affine.for op's nested within).
+LogicalResult
+AffineDataCopyGeneration::runOnBlock(Block *block,
+                                     DenseSet<Operation *> &copyNests) {
+  if (block->empty())
+    return success();
+
+  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
+                                   fastMemorySpace, tagMemorySpace,
+                                   fastMemCapacityBytes};
+
+  // Every affine.for op in the block starts and ends a block range for
+  // copying; in addition, a contiguous sequence of operations starting with a
+  // load/store op but not including any copy nests themselves is also
+  // identified as a copy block range. Straightline code (a contiguous chunk of
+  // operations excluding AffineForOp's) is always assumed to not exhaust
+  // memory. As a result, this approach is conservative in some cases at the
+  // moment; we do a check later and report an error with location info.
+  // TODO(bondhugula): An 'affine.if' operation is being treated similarly to an
+  // operation. 'affine.if''s could have 'affine.for's in them; treat them
+  // separately.
+
+  // Get to the first load, store, or for op (that is not a copy nest itself).
+  auto curBegin =
+      std::find_if(block->begin(), block->end(), [&](Operation &op) {
+        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
+                isa<AffineForOp>(op)) &&
+               copyNests.count(&op) == 0;
+      });
+
+  // Create [begin, end) ranges.
+  auto it = curBegin;
+  while (it != block->end()) {
+    AffineForOp forOp;
+    // If we hit a non-copy for loop, we will split there.
+    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
+      // Perform the copying up until this 'for' op first.
+      affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
+                             /*filterMemRef=*/llvm::None, copyNests);
+
+      // Returns true if the footprint is known to exceed capacity.
+      auto exceedsCapacity = [&](AffineForOp forOp) {
+        Optional<int64_t> footprint =
+            getMemoryFootprintBytes(forOp,
+                                    /*memorySpace=*/0);
+        return (footprint.hasValue() &&
+                static_cast<uint64_t>(footprint.getValue()) >
+                    fastMemCapacityBytes);
+      };
+
+      // If the memory footprint of the 'affine.for' loop is higher than fast
+      // memory capacity (when provided), we recurse to copy at an inner level
+      // until we find a depth at which footprint fits in fast mem capacity. If
+      // the footprint can't be calculated, we assume for now it fits. Recurse
+      // inside if footprint for 'forOp' exceeds capacity, or when
+      // skipNonUnitStrideLoops is set and the step size is not one.
+      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
+                                                 : exceedsCapacity(forOp);
+      if (recurseInner) {
+        // We'll recurse and do the copies at an inner level for 'forOp'.
+        // Recurse onto the body of this loop.
+        runOnBlock(forOp.getBody(), copyNests);
+      } else {
+        // We have enough capacity, i.e., copies will be computed for the
+        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
+        // that for the latter, the copies are placed just before this loop (for
+        // incoming copies) and right after (for outgoing ones).
+
+        // Inner loop copies have their own scope - we thus don't update
+        // consumed capacity. The footprint check above guarantees this inner
+        // loop's footprint fits.
+        affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
+                               /*filterMemRef=*/llvm::None, copyNests);
+      }
+      // Get to the next load or store op after 'forOp'.
+      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
+        return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
+                isa<AffineForOp>(op)) &&
+               copyNests.count(&op) == 0;
+      });
+      it = curBegin;
+    } else {
+      assert(copyNests.count(&*it) == 0 &&
+             "all copy nests generated should have been skipped above");
+      // We simply include this op in the current range and continue for more.
+      ++it;
+    }
+  }
+
+  // Generate the copy for the final block range.
+  if (curBegin != block->end()) {
+    // Can't be a terminator because it would have been skipped above.
+    assert(!curBegin->isKnownTerminator() && "can't be a terminator");
+    // Exclude the affine terminator - hence, the std::prev.
+    affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
+                           copyOptions, /*filterMemRef=*/llvm::None, copyNests);
+  }
+
+  return success();
+}
+
+void AffineDataCopyGeneration::runOnFunction() {
+  FuncOp f = getFunction();
+  OpBuilder topBuilder(f.getBody());
+  zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);
+
+  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
+  // nests are stored herein.
+  DenseSet<Operation *> copyNests;
+
+  // Clear recorded copy nests.
+  copyNests.clear();
+
+  for (auto &block : f)
+    runOnBlock(&block, copyNests);
+
+  // Promote any single iteration loops in the copy nests.
+  for (auto nest : copyNests) {
+    nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
+  }
+}
+
+static PassRegistration<AffineDataCopyGeneration>
+    pass("affine-data-copy-generate",
+         "Generate explicit copying for memory operations");

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits