https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/90933
>From 4344f540008d4fd079bb009318b5b0b070bec0f8 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Fri, 13 Sep 2024 08:51:00 -0700 Subject: [PATCH 1/6] [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds --- llvm/include/llvm/CGData/CodeGenData.h | 16 +++ llvm/lib/CGData/CodeGenData.cpp | 81 +++++++++++++- llvm/lib/LTO/CMakeLists.txt | 1 + llvm/lib/LTO/LTO.cpp | 103 +++++++++++++++++- llvm/lib/LTO/LTOBackend.cpp | 11 ++ .../test/ThinLTO/AArch64/cgdata-two-rounds.ll | 94 ++++++++++++++++ llvm/test/ThinLTO/AArch64/lit.local.cfg | 2 + 7 files changed, 302 insertions(+), 6 deletions(-) create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 84133a433170fe..1e1afe99327650 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } +/// Initialize the two-codegen rounds. +void initializeTwoCodegenRounds(); + +/// Save the current module before the first codegen round. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); + +/// Load the current module before the second codegen round. +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context); + +/// Merge the codegen data from the input files in scratch vector in ThinLTO +/// two-codegen rounds. 
+Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); + void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 55d2504231c744..ff8e5dd7c75790 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -17,6 +17,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include "llvm/Support/WithColor.h" #define DEBUG_TYPE "cg-data" @@ -30,6 +31,14 @@ cl::opt<bool> cl::opt<std::string> CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); +cl::opt<bool> CodeGenDataThinLTOTwoRounds( + "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, + cl::desc("Enable two-round ThinLTO code generation. The first round " + "emits codegen data, while the second round uses the emitted " + "codegen data for further optimizations.")); + +// Path to where the optimized bitcodes are saved and restored for ThinLTO. +static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { @@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() { std::call_once(CodeGenData::OnceFlag, []() { Instance = std::unique_ptr<CodeGenData>(new CodeGenData()); - if (CodeGenDataGenerate) + if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds) Instance->EmitCGData = true; else if (!CodeGenDataUsePath.empty()) { // Initialize the global CGData if the input file name is given. 
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) { } } +static std::string getPath(StringRef Dir, unsigned Task) { + return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); +} + +void initializeTwoCodegenRounds() { + assert(CodeGenDataThinLTOTwoRounds); + if (auto EC = llvm::sys::fs::createUniqueDirectory( + "cgdata", CodeGenDataThinLTOTwoRoundsPath)) + report_fatal_error(Twine("Failed to create directory: ") + EC.message()); +} + +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + std::error_code EC; + raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); + if (EC) + report_fatal_error(Twine("Failed to open ") + Path + + " to save optimized bitcode: " + EC.message()); + WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); +} + +std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, + unsigned Task, + LLVMContext &Context) { + assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); + std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); + auto FileOrError = MemoryBuffer::getFile(Path); + if (auto EC = FileOrError.getError()) + report_fatal_error(Twine("Failed to open ") + Path + + " to load optimized bitcode: " + EC.message()); + + std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + if (!RestoredModule) + report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + + Path + "\n"); + + // Restore the original module identifier. 
+ (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); + return std::move(*RestoredModule); +} + +Error mergeCodeGenData( + const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { + + OutlinedHashTreeRecord GlobalOutlineRecord; + for (auto &InputFile : *(InputFiles)) { + if (InputFile.empty()) + continue; + StringRef File = StringRef(InputFile.data(), InputFile.size()); + std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( + File, "in-memory object file", /*RequiresNullTerminator=*/false); + Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = + object::ObjectFile::createObjectFile(Buffer->getMemBufferRef()); + if (!BinOrErr) + return BinOrErr.takeError(); + + std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); + if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(), + GlobalOutlineRecord)) + return E; + } + + if (!GlobalOutlineRecord.empty()) + cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree)); + + return Error::success(); +} + } // end namespace cgdata } // end namespace llvm diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt index 69ff08e1f374c4..057d73b6349cf1 100644 --- a/llvm/lib/LTO/CMakeLists.txt +++ b/llvm/lib/LTO/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO BinaryFormat BitReader BitWriter + CGData CodeGen CodeGenTypes Core diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index f4c25f80811a85..945f8c859365ea 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/AutoUpgrade.h" @@ -70,6 +71,8 @@ static cl::opt<bool> DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { /// Enable global value internalization in LTO. cl::opt<bool> EnableLTOInternalization( @@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc { GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); } - Error runThinLTOBackendThread( + virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, @@ -1559,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc { return BackendThreadPool.getMaxConcurrency(); } }; + +/// This Backend will run ThinBackend process but throw away all the output from +/// the codegen. This class facilitates the first codegen round. +class NoOutputThinBackend : public InProcessThinBackend { +public: + NoOutputThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) + : InProcessThinBackend( + Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, + // Allocate a scratch buffer for each task to write output to. + [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>((*Allocation)[Task])); + }, + FileCache(), nullptr, false, false), + Scratch(std::move(Scratch)) {} + + /// Scratch space for writing output during the codegen. + std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; +}; + +/// This Backend performs codegen on bitcode that was previously saved after +/// going through optimization. This class facilitates the second codegen round. 
+class OptimizedBitcodeThinBackend : public InProcessThinBackend { +public: + OptimizedBitcodeThinBackend( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + ThreadPoolStrategy ThinLTOParallelism, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, AddStream, FileCache(), + nullptr, false, false) {} + + virtual Error runThinLTOBackendThread( + AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, + ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + } +}; } // end anonymous namespace ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, @@ -1879,10 +1936,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return BackendProcess->wait(); }; - std::unique_ptr<ThinBackendProc> BackendProc = - ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - AddStream, Cache); - return RunBackends(BackendProc.get()); + if (!CodeGenDataThinLTOTwoRounds) { + std::unique_ptr<ThinBackendProc> BackendProc = + ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, + AddStream, Cache); + return RunBackends(BackendProc.get()); + } + + // Perform two rounds of code generation for ThinLTO: + // 1. First round: Run optimization and code generation with a scratch output. + // 2. 
Merge codegen data extracted from the scratch output. + // 3. Second round: Run code generation again using the merged data. + LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); + + // Initialize a temporary path to store and retrieve optimized IRs for + // two-round code generation. + cgdata::initializeTwoCodegenRounds(); + + // Create a scratch output to hold intermediate results. + auto Outputs = + std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); + auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, std::move(Outputs)); + // First round: Run optimization and code generation with a scratch output. + // Before code generation, serialize modules. + if (Error E = RunBackends(FirstRoundLTO.get())) + return E; + + // Merge codegen data extracted from the scratch output. + if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + return E; + + // Second round: Run code generation by reading IRs. 
+ std::unique_ptr<ThinBackendProc> SecondRoundLTO = + std::make_unique<OptimizedBitcodeThinBackend>( + Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), + ModuleToDefinedGVSummaries, AddStream); + Error E = RunBackends(SecondRoundLTO.get()); + + return E; } Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks( diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 06eeed3e1bc41f..3e3b5b316d4125 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CGData/CodeGenData.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" @@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged( cl::desc("Assume the input has already undergone ThinLTO function " "importing and the other pre-optimization pipeline changes.")); +extern cl::opt<bool> CodeGenDataThinLTOTwoRounds; + namespace llvm { extern cl::opt<bool> NoPGOWarnMismatch; } @@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, auto OptimizeAndCodegen = [&](Module &Mod, TargetMachine *TM, std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) { + // Perform optimization and code generation for ThinLTO. if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, CmdArgs)) return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); + // Save the current module before the first codegen round. + // Note that the second codegen round runs only `codegen()` without + // running `opt()`. We're not reaching here as it's bailed out earlier + // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. 
+ if (CodeGenDataThinLTOTwoRounds) + cgdata::saveModuleForTwoRounds(Mod, Task); + codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); }; diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll new file mode 100644 index 00000000000000..0e082cf4e55e54 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll @@ -0,0 +1,94 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. + +; RUN: split-file %s %t + +; Verify each outlining instance is singleton with the global outlining for thinlto. +; They will be identical, which can be folded by the linker with ICF. +; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; Now add a lto module to the above thinlto modules. +; Verify the lto module is optimized independent of the global outlining for thinlto. 
+; RUN: opt %t/lto.ll -o %t/lto.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; lto.ll will have one outlining instance within the lto module itself (no global outlining). +; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0 +; LTO-0: _OUTLINED_FUNCTION{{.*}}>: +; LTO-0-NEXT: mov +; LTO-0-NEXT: b + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1 + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2 + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- lto.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f4() minsize { + %1 = call i32 @g(i32 10, i32 30, i32 2); + ret i32 %1 +} +define i32 @f5() minsize { + %1 = call i32 @g(i32 20, i32 40, i32 2); + ret i32 %1 +} +define i32 @f6() minsize { + %1 = call i32 @g(i32 50, i32 60, i32 2); + ret i32 %1 +} diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg 
b/llvm/test/ThinLTO/AArch64/lit.local.cfg new file mode 100644 index 00000000000000..10d4a0e953ed47 --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True >From 84f812bcbf35c7959a55180642a2ec57608371db Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 18:07:49 -0700 Subject: [PATCH 2/6] Address comments from ellishg --- llvm/include/llvm/CGData/CodeGenData.h | 7 ++++--- llvm/lib/CGData/CodeGenData.cpp | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 1e1afe99327650..72b52e6e9b8fd1 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -164,13 +164,14 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -/// Initialize the two-codegen rounds. void initializeTwoCodegenRounds(); -/// Save the current module before the first codegen round. +/// Save \p TheModule before the first codegen round. +/// \p Task represents the partition number in the parallel code generation +/// process. void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); -/// Load the current module before the second codegen round. +/// Load the optimized module before the second codegen round. 
std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, LLVMContext &Context); diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index ff8e5dd7c75790..58b92b7262957a 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -225,7 +225,9 @@ void warn(Error E, StringRef Whence) { } static std::string getPath(StringRef Dir, unsigned Task) { - return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str(); + llvm::SmallString<128> Path(Dir); + llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); + return std::string(Path); } void initializeTwoCodegenRounds() { >From 37eb2c50cd5b2d020598a7658291bf4789a1fd1d Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Tue, 17 Sep 2024 23:37:51 -0700 Subject: [PATCH 3/6] Address comments from NuriAmari --- llvm/lib/CGData/CodeGenData.cpp | 4 ++-- llvm/lib/LTO/LTO.cpp | 33 +++++++++++++++++++++------------ llvm/lib/LTO/LTOBackend.cpp | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 58b92b7262957a..4e21045a67cba6 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -245,7 +245,7 @@ void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { if (EC) report_fatal_error(Twine("Failed to open ") + Path + " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true); + WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, @@ -259,7 +259,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, " to load optimized bitcode: " + EC.message()); std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); - auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context); + auto RestoredModule = 
parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + Path + "\n"); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 945f8c859365ea..b51b908fb28760 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1563,11 +1563,14 @@ class InProcessThinBackend : public ThinBackendProc { } }; -/// This Backend will run ThinBackend process but throw away all the output from -/// the codegen. This class facilitates the first codegen round. -class NoOutputThinBackend : public InProcessThinBackend { +/// This backend is utilized in the first round of a two-codegen round process. +/// It first saves optimized bitcode files to disk before the codegen process +/// begins. After codegen, it stores the resulting object files in a scratch +/// buffer. Note the codegen data stored in the scratch buffer will be extracted +/// and merged in the subsequent step. +class FirstRoundThinBackend : public InProcessThinBackend { public: - NoOutputThinBackend( + FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, @@ -1579,25 +1582,31 @@ class NoOutputThinBackend : public InProcessThinBackend { return std::make_unique<CachedFileStream>( std::make_unique<raw_svector_ostream>((*Allocation)[Task])); }, - FileCache(), nullptr, false, false), + FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), Scratch(std::move(Scratch)) {} /// Scratch space for writing output during the codegen. std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; }; -/// This Backend performs codegen on bitcode that was previously saved after -/// going through optimization. This class facilitates the second codegen round. 
-class OptimizedBitcodeThinBackend : public InProcessThinBackend { +/// This backend operates in the second round of a two-codegen round process. +/// It starts by reading the optimized bitcode files that were saved during the +/// first round. The backend then executes the codegen only to further optimize +/// the code, utilizing the codegen data merged from the first round. Finally, +/// it writes the resulting object files as usual. +class SecondRoundThinBackend : public InProcessThinBackend { public: - OptimizedBitcodeThinBackend( + SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, AddStream, FileCache(), - nullptr, false, false) {} + /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1956,7 +1965,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Create a scratch output to hold intermediate results. auto Outputs = std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); - auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>( + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, std::move(Outputs)); // First round: Run optimization and code generation with a scratch output. @@ -1970,7 +1979,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // Second round: Run code generation by reading IRs. 
std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<OptimizedBitcodeThinBackend>( + std::make_unique<SecondRoundThinBackend>( Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), ModuleToDefinedGVSummaries, AddStream); Error E = RunBackends(SecondRoundLTO.get()); diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 3e3b5b316d4125..b66989fe520b42 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -611,7 +611,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Save the current module before the first codegen round. // Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier - // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`. + // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. if (CodeGenDataThinLTOTwoRounds) cgdata::saveModuleForTwoRounds(Mod, Task); >From 6778a73542b02f9319baf7790c207614abb37e52 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Sun, 29 Sep 2024 18:28:15 -0700 Subject: [PATCH 4/6] [NFC] Refactor ThinBackend - Change it to a type from a function. - Store the parallelism in the type for future use. --- llvm/include/llvm/LTO/LTO.h | 63 +++++++++++++++++++++-- llvm/lib/LTO/LTO.cpp | 100 +++++++++++++----------------------- 2 files changed, 94 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 214aa4e1c562dc..fde062ddbf7bc8 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -105,7 +105,41 @@ void updateMemProfAttributes(Module &Mod, const ModuleSummaryIndex &Index); class LTO; struct SymbolResolution; -class ThinBackendProc; + +using IndexWriteCallback = std::function<void(const std::string &)>; + +/// This class defines the interface to the ThinLTO backend. 
+class ThinBackendProc { +protected: + const Config &Conf; + ModuleSummaryIndex &CombinedIndex; + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries; + lto::IndexWriteCallback OnWrite; + bool ShouldEmitImportsFiles; + +public: + ThinBackendProc( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles) + : Conf(Conf), CombinedIndex(CombinedIndex), + ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries), + OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {} + + virtual ~ThinBackendProc() = default; + virtual Error start( + unsigned Task, BitcodeModule BM, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + MapVector<StringRef, BitcodeModule> &ModuleMap) = 0; + virtual Error wait() = 0; + virtual unsigned getThreadCount() = 0; + + // Write sharded indices and (optionally) imports to disk + Error emitFiles(const FunctionImporter::ImportMapTy &ImportList, + llvm::StringRef ModulePath, const std::string &NewModulePath); +}; /// An input file. This is a symbol table wrapper that only exposes the /// information that an LTO client should need in order to do symbol resolution. @@ -197,10 +231,30 @@ class InputFile { /// A ThinBackend defines what happens after the thin-link phase during ThinLTO. /// The details of this type definition aren't important; clients can only /// create a ThinBackend using one of the create*ThinBackend() functions below. 
-using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>( +using ThinBackendFunction = std::function<std::unique_ptr<ThinBackendProc>( const Config &C, ModuleSummaryIndex &CombinedIndex, - DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream, FileCache Cache)>; +struct ThinBackend { + ThinBackend(ThinBackendFunction Func, ThreadPoolStrategy Parallelism = {}) + : Func(std::move(Func)), Parallelism(std::move(Parallelism)) {} + ThinBackend() = default; + + std::unique_ptr<ThinBackendProc> operator()( + const Config &Conf, ModuleSummaryIndex &CombinedIndex, + const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, + AddStreamFn AddStream, FileCache Cache) { + assert(isValid() && "Invalid backend function"); + return Func(Conf, CombinedIndex, ModuleToDefinedGVSummaries, + std::move(AddStream), std::move(Cache)); + } + ThreadPoolStrategy getParallelism() const { return Parallelism; } + bool isValid() const { return static_cast<bool>(Func); } + +private: + ThinBackendFunction Func = nullptr; + ThreadPoolStrategy Parallelism; +}; /// This ThinBackend runs the individual backend jobs in-process. /// The default value means to use one job per hardware core (not hyper-thread). @@ -210,7 +264,6 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>( /// to the same path as the input module, with suffix ".thinlto.bc" /// ShouldEmitImportsFiles is true it also writes a list of imported files to a /// similar path with ".imports" appended instead. -using IndexWriteCallback = std::function<void(const std::string &)>; ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism, IndexWriteCallback OnWrite = nullptr, bool ShouldEmitIndexFiles = false, @@ -275,7 +328,7 @@ class LTO { /// this constructor. /// FIXME: We do currently require the DiagHandler field to be set in Conf. 
/// Until that is fixed, a Config argument is required. - LTO(Config Conf, ThinBackend Backend = nullptr, + LTO(Config Conf, ThinBackend Backend = {}, unsigned ParallelCodeGenParallelismLevel = 1, LTOKind LTOMode = LTOK_Default); ~LTO(); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b51b908fb28760..8266af5c1d4152 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -581,10 +581,10 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel, CombinedModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat; } -LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) - : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { - if (!Backend) - this->Backend = +LTO::ThinLTOState::ThinLTOState(ThinBackend BackendParam) + : Backend(std::move(BackendParam)), CombinedIndex(/*HaveGVs*/ false) { + if (!Backend.isValid()) + Backend = createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); } @@ -1371,64 +1371,6 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { return LibcallSymbols; } -/// This class defines the interface to the ThinLTO backend. 
-class lto::ThinBackendProc { -protected: - const Config &Conf; - ModuleSummaryIndex &CombinedIndex; - const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries; - lto::IndexWriteCallback OnWrite; - bool ShouldEmitImportsFiles; - -public: - ThinBackendProc( - const Config &Conf, ModuleSummaryIndex &CombinedIndex, - const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles) - : Conf(Conf), CombinedIndex(CombinedIndex), - ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries), - OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {} - - virtual ~ThinBackendProc() = default; - virtual Error start( - unsigned Task, BitcodeModule BM, - const FunctionImporter::ImportMapTy &ImportList, - const FunctionImporter::ExportSetTy &ExportList, - const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, - MapVector<StringRef, BitcodeModule> &ModuleMap) = 0; - virtual Error wait() = 0; - virtual unsigned getThreadCount() = 0; - - // Write sharded indices and (optionally) imports to disk - Error emitFiles(const FunctionImporter::ImportMapTy &ImportList, - llvm::StringRef ModulePath, - const std::string &NewModulePath) { - ModuleToSummariesForIndexTy ModuleToSummariesForIndex; - GVSummaryPtrSet DeclarationSummaries; - - std::error_code EC; - gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries, - ImportList, ModuleToSummariesForIndex, - DeclarationSummaries); - - raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC, - sys::fs::OpenFlags::OF_None); - if (EC) - return errorCodeToError(EC); - - writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex, - &DeclarationSummaries); - - if (ShouldEmitImportsFiles) { - EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports", - ModuleToSummariesForIndex); - if (EC) - return errorCodeToError(EC); - } - return Error::success(); - } -}; - namespace { class InProcessThinBackend : public ThinBackendProc { 
DefaultThreadPool BackendThreadPool; @@ -1631,7 +1573,7 @@ ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, lto::IndexWriteCallback OnWrite, bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles) { - return + auto Func = [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream, FileCache Cache) { @@ -1640,6 +1582,7 @@ ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, AddStream, Cache, OnWrite, ShouldEmitIndexFiles, ShouldEmitImportsFiles); }; + return ThinBackend(Func, Parallelism); } StringLiteral lto::getThinLTODefaultCPU(const Triple &TheTriple) { @@ -1732,7 +1675,7 @@ ThinBackend lto::createWriteIndexesThinBackend( std::string OldPrefix, std::string NewPrefix, std::string NativeObjectPrefix, bool ShouldEmitImportsFiles, raw_fd_ostream *LinkedObjectsFile, IndexWriteCallback OnWrite) { - return + auto Func = [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, AddStreamFn AddStream, FileCache Cache) { @@ -1741,6 +1684,7 @@ ThinBackend lto::createWriteIndexesThinBackend( NewPrefix, NativeObjectPrefix, ShouldEmitImportsFiles, LinkedObjectsFile, OnWrite); }; + return ThinBackend(Func); } Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, @@ -2041,3 +1985,31 @@ std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) { }); return ModulesOrdering; } + +Error ThinBackendProc::emitFiles( + const FunctionImporter::ImportMapTy &ImportList, llvm::StringRef ModulePath, + const std::string &NewModulePath) { + ModuleToSummariesForIndexTy ModuleToSummariesForIndex; + GVSummaryPtrSet DeclarationSummaries; + + std::error_code EC; + gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries, + ImportList, ModuleToSummariesForIndex, + DeclarationSummaries); + + raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC, + 
sys::fs::OpenFlags::OF_None); + if (EC) + return errorCodeToError(EC); + + writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex, + &DeclarationSummaries); + + if (ShouldEmitImportsFiles) { + EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports", + ModuleToSummariesForIndex); + if (EC) + return errorCodeToError(EC); + } + return Error::success(); +} >From c5d3379e03a9f894dcb7816ae9cf5995fd850687 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Sun, 29 Sep 2024 22:58:52 -0700 Subject: [PATCH 5/6] [NFC] Refactor FileCache - Turn it into a type from a function. - Store the cache directory for the future use. --- llvm/include/llvm/LTO/LTO.h | 2 +- llvm/include/llvm/Support/Caching.h | 22 +++++++++++++++++++++- llvm/lib/LTO/LTO.cpp | 2 +- llvm/lib/Support/Caching.cpp | 5 +++-- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index fde062ddbf7bc8..4b8c4f4fc23298 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -351,7 +351,7 @@ class LTO { /// /// The client will receive at most one callback (via either AddStream or /// Cache) for each task identifier. - Error run(AddStreamFn AddStream, FileCache Cache = nullptr); + Error run(AddStreamFn AddStream, FileCache Cache = {}); /// Static method that returns a list of libcall symbols that can be generated /// by LTO but might not be visible from bitcode symbol table. 
diff --git a/llvm/include/llvm/Support/Caching.h b/llvm/include/llvm/Support/Caching.h index 4fa57cc92e51f7..cc86d1583fd6e6 100644 --- a/llvm/include/llvm/Support/Caching.h +++ b/llvm/include/llvm/Support/Caching.h @@ -54,9 +54,29 @@ using AddStreamFn = std::function<Expected<std::unique_ptr<CachedFileStream>>( /// /// if (AddStreamFn AddStream = Cache(Task, Key, ModuleName)) /// ProduceContent(AddStream); -using FileCache = std::function<Expected<AddStreamFn>( +using FileCacheFunction = std::function<Expected<AddStreamFn>( unsigned Task, StringRef Key, const Twine &ModuleName)>; +struct FileCache { + FileCache(FileCacheFunction CacheFn, const std::string &DirectoryPath) + : CacheFunction(std::move(CacheFn)), CacheDirectoryPath(DirectoryPath) {} + FileCache() = default; + + Expected<AddStreamFn> operator()(unsigned Task, StringRef Key, + const Twine &ModuleName) { + assert(isValid() && "Invalid cache function"); + return CacheFunction(Task, Key, ModuleName); + } + const std::string &getCacheDirectoryPath() const { + return CacheDirectoryPath; + } + bool isValid() const { return static_cast<bool>(CacheFunction); } + +private: + FileCacheFunction CacheFunction = nullptr; + std::string CacheDirectoryPath; +}; + /// This type defines the callback to add a pre-existing file (e.g. in a cache). /// /// Buffer callbacks must be thread safe. 
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 8266af5c1d4152..c6369ab382eeb0 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1429,7 +1429,7 @@ class InProcessThinBackend : public ThinBackendProc { return E; } - if (!Cache || !CombinedIndex.modulePaths().count(ModuleID) || + if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || all_of(CombinedIndex.getModuleHash(ModuleID), [](uint32_t V) { return V == 0; })) // Cache disabled or no entry for this module in the combined index or diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp index 1ef51db218e89c..66e540efaca972 100644 --- a/llvm/lib/Support/Caching.cpp +++ b/llvm/lib/Support/Caching.cpp @@ -37,8 +37,8 @@ Expected<FileCache> llvm::localCache(const Twine &CacheNameRef, TempFilePrefixRef.toVector(TempFilePrefix); CacheDirectoryPathRef.toVector(CacheDirectoryPath); - return [=](unsigned Task, StringRef Key, - const Twine &ModuleName) -> Expected<AddStreamFn> { + auto Func = [=](unsigned Task, StringRef Key, + const Twine &ModuleName) -> Expected<AddStreamFn> { // This choice of file name allows the cache to be pruned (see pruneCache() // in include/llvm/Support/CachePruning.h). 
SmallString<64> EntryPath; @@ -167,4 +167,5 @@ Expected<FileCache> llvm::localCache(const Twine &CacheNameRef, Task); }; }; + return FileCache(Func, CacheDirectoryPathRef.str()); } >From fa3489a0735ecd207bc3f6c552ce18f776372ab8 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee <kyu...@meta.com> Date: Sun, 29 Sep 2024 10:38:46 -0700 Subject: [PATCH 6/6] Address comments from teresajohnson --- clang/lib/CodeGen/BackendUtil.cpp | 9 +- llvm/include/llvm/CGData/CodeGenData.h | 52 ++++- llvm/include/llvm/CGData/CodeGenDataReader.h | 5 +- llvm/include/llvm/LTO/LTO.h | 6 +- llvm/include/llvm/LTO/LTOBackend.h | 6 +- llvm/lib/CGData/CMakeLists.txt | 2 + llvm/lib/CGData/CodeGenData.cpp | 99 +++++---- llvm/lib/CGData/CodeGenDataReader.cpp | 7 +- llvm/lib/LTO/LTO.cpp | 188 ++++++++++++++---- llvm/lib/LTO/LTOBackend.cpp | 7 +- .../AArch64/cgdata-two-rounds-caching.ll | 173 ++++++++++++++++ 11 files changed, 452 insertions(+), 102 deletions(-) create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index abc936f2c686dd..f018130807519d 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1321,10 +1321,11 @@ static void runThinLTOBackend( Conf.CGFileType = getCodeGenFileType(Action); break; } - if (Error E = thinBackend( - Conf, -1, AddStream, *M, *CombinedIndex, ImportList, - ModuleToDefinedGVSummaries[M->getModuleIdentifier()], - /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) { + if (Error E = + thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList, + ModuleToDefinedGVSummaries[M->getModuleIdentifier()], + /*ModuleMap=*/nullptr, Conf.CodeGenOnly, + /*IRAddStream=*/nullptr, CGOpts.CmdArgs)) { handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) { errs() << "Error running ThinLTO backend: " << EIB.message() << '\n'; }); diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h index 
72b52e6e9b8fd1..e8e331f0189ac1 100644 --- a/llvm/include/llvm/CGData/CodeGenData.h +++ b/llvm/include/llvm/CGData/CodeGenData.h @@ -15,11 +15,13 @@ #define LLVM_CGDATA_CODEGENDATA_H #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/CGData/OutlinedHashTree.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/IR/Module.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include <mutex> @@ -164,22 +166,60 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) { CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree)); } -void initializeTwoCodegenRounds(); +struct StreamCacheData { + /// Backing buffer for serialized data stream. + SmallVector<SmallString<0>> Outputs; + /// Callback function to add serialized data to the stream. + AddStreamFn AddStream; + /// Backing buffer for cached data. + SmallVector<std::unique_ptr<MemoryBuffer>> Files; + /// Cache mechanism for storing and retrieving data. + FileCache Cache; + + StreamCacheData(unsigned Size) : Outputs(Size), Files(Size) {} + StreamCacheData() = delete; + + /// Retrieve results from either the cache or the stream. + SmallVector<StringRef> getResult() { + unsigned NumOutputs = Outputs.size(); + SmallVector<StringRef> Result(NumOutputs); + for (unsigned I = 0; I < NumOutputs; ++I) + if (Files[I]) + Result[I] = Files[I]->getBuffer(); + else + Result[I] = Outputs[I]; + return Result; + } +}; + +/// Establish additional streams and caches for accessing object and IR files. +/// \p OrigCache refers to the original cache used for accessing the final +/// object files, which has already been configured and provided by the linker, +/// if applicable. This cache will be utilized during the second round of the +/// run. 
Additionally, we add two more caches at the same location for the first +/// round of the run. +void initializeTwoCodegenRounds(StreamCacheData &CG, StreamCacheData &IR, + const FileCache &OrigCache); /// Save \p TheModule before the first codegen round. /// \p Task represents the partition number in the parallel code generation /// process. -void saveModuleForTwoRounds(const Module &TheModule, unsigned Task); +/// \p AddStream is the callback used to add the serialized module to the +/// stream. +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream); /// Load the optimized module before the second codegen round. std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, - LLVMContext &Context); + LLVMContext &Context, + ArrayRef<StringRef> IRFiles); /// Merge the codegen data from the input files in scratch vector in ThinLTO -/// two-codegen rounds. -Error mergeCodeGenData( - const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles); +/// two-codegen rounds. Optionally, \p CombinedHash can be used to compute +/// the combined hash of the merged data. +Error mergeCodeGenData(ArrayRef<StringRef> CGFiles, + stable_hash *CombinedHash = nullptr); void warn(Error E, StringRef Whence = ""); void warn(Twine Message, std::string Whence = "", std::string Hint = ""); diff --git a/llvm/include/llvm/CGData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h index 1ee4bfbe480233..7e4882df2116e2 100644 --- a/llvm/include/llvm/CGData/CodeGenDataReader.h +++ b/llvm/include/llvm/CGData/CodeGenDataReader.h @@ -54,8 +54,11 @@ class CodeGenDataReader { /// Extract the cgdata embedded in sections from the given object file and /// merge them into the GlobalOutlineRecord. This is a static helper that /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds. + /// Optionally, \p CombinedHash can be used to compute the combined hash of + /// the merged data. 
static Error mergeFromObjectFile(const object::ObjectFile *Obj, - OutlinedHashTreeRecord &GlobalOutlineRecord); + OutlinedHashTreeRecord &GlobalOutlineRecord, + stable_hash *CombinedHash = nullptr); protected: /// The outlined hash tree that has been read. When it's released by diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 4b8c4f4fc23298..7174118ed81fe1 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -64,7 +64,8 @@ void thinLTOInternalizeAndPromoteInIndex( isPrevailing); /// Computes a unique hash for the Module considering the current list of -/// export/import and other global analysis results. +/// export/import and other global analysis results. Optionally, \p ExtraID +/// can be used to add an extra identifier to the hash. std::string computeLTOCacheKey( const lto::Config &Conf, const ModuleSummaryIndex &Index, StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList, @@ -72,7 +73,8 @@ std::string computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {}, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}); + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {}, + StringRef ExtraID = {}); namespace lto { diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 098c0491dfe70a..2769e58f249053 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -51,13 +51,15 @@ Error backend(const Config &C, AddStreamFn AddStream, /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will /// be mapped to memory on demand and at any given time during importing, only /// one source module will be kept open at the most. If \p CodeGenOnly is true, -/// the backend will skip optimization and only perform code generation. 
+/// the backend will skip optimization and only perform code generation. If +/// \p IRAddStream is not nullptr, it will be called just before code generation +/// to serialize the optimized IR. Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, const ModuleSummaryIndex &CombinedIndex, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, + bool CodeGenOnly, AddStreamFn IRAddStream = nullptr, const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>()); Error finalizeOptimizationRemarks( diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt index ff1aab920e7a8c..157b0dfb7f9fcf 100644 --- a/llvm/lib/CGData/CMakeLists.txt +++ b/llvm/lib/CGData/CMakeLists.txt @@ -12,6 +12,8 @@ add_llvm_component_library(LLVMCGData intrinsics_gen LINK_COMPONENTS + BitReader + BitWriter Core Support Object diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index 4e21045a67cba6..460f01aa3b1e98 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -15,6 +15,7 @@ #include "llvm/CGData/CodeGenDataReader.h" #include "llvm/CGData/OutlinedHashTreeRecord.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -37,9 +38,6 @@ cl::opt<bool> CodeGenDataThinLTOTwoRounds( "emits codegen data, while the second round uses the emitted " "codegen data for further optimizations.")); -// Path to where the optimized bitcodes are saved and restored for ThinLTO. 
-static SmallString<128> CodeGenDataThinLTOTwoRoundsPath; - static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { std::string Msg; @@ -224,59 +222,78 @@ void warn(Error E, StringRef Whence) { } } -static std::string getPath(StringRef Dir, unsigned Task) { - llvm::SmallString<128> Path(Dir); - llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc"); - return std::string(Path); -} - -void initializeTwoCodegenRounds() { +void initializeTwoCodegenRounds(StreamCacheData &CG, StreamCacheData &IR, + const FileCache &OrigCache) { assert(CodeGenDataThinLTOTwoRounds); - if (auto EC = llvm::sys::fs::createUniqueDirectory( - "cgdata", CodeGenDataThinLTOTwoRoundsPath)) - report_fatal_error(Twine("Failed to create directory: ") + EC.message()); + CG.AddStream = [&](size_t Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>(CG.Outputs[Task])); + }; + IR.AddStream = [&](size_t Task, const Twine &ModuleName) { + return std::make_unique<CachedFileStream>( + std::make_unique<raw_svector_ostream>(IR.Outputs[Task])); + }; + + if (OrigCache.isValid()) { + auto CGCacheOrErr = + localCache("ThinLTO", "CG", OrigCache.getCacheDirectoryPath(), + [&](size_t Task, const Twine &ModuleName, + std::unique_ptr<MemoryBuffer> MB) { + CG.Files[Task] = std::move(MB); + }); + if (Error Err = CGCacheOrErr.takeError()) + report_fatal_error(std::move(Err)); + CG.Cache = std::move(*CGCacheOrErr); + auto IRCacheOrErr = + localCache("ThinLTO", "IR", OrigCache.getCacheDirectoryPath(), + [&](size_t Task, const Twine &NoduleName, + std::unique_ptr<MemoryBuffer> MB) { + IR.Files[Task] = std::move(MB); + }); + if (Error Err = IRCacheOrErr.takeError()) + report_fatal_error(std::move(Err)); + IR.Cache = std::move(*IRCacheOrErr); + } } -void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) { - assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); - std::string Path = 
getPath(CodeGenDataThinLTOTwoRoundsPath, Task); - std::error_code EC; - raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None); - if (EC) - report_fatal_error(Twine("Failed to open ") + Path + - " to save optimized bitcode: " + EC.message()); - WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true); +void saveModuleForTwoRounds(const Module &TheModule, unsigned Task, + AddStreamFn AddStream) { + LLVM_DEBUG(dbgs() << "Saving module: " << TheModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + Expected<std::unique_ptr<CachedFileStream>> StreamOrErr = + AddStream(Task, TheModule.getModuleIdentifier()); + if (Error Err = StreamOrErr.takeError()) + report_fatal_error(std::move(Err)); + std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr; + + WriteBitcodeToFile(TheModule, *Stream->OS, + /*ShouldPreserveUseListOrder=*/true); } std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule, unsigned Task, - LLVMContext &Context) { - assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath)); - std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task); - auto FileOrError = MemoryBuffer::getFile(Path); - if (auto EC = FileOrError.getError()) - report_fatal_error(Twine("Failed to open ") + Path + - " to load optimized bitcode: " + EC.message()); - - std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError); + LLVMContext &Context, + ArrayRef<StringRef> IRFiles) { + LLVM_DEBUG(dbgs() << "Loading module: " << OrigModule.getModuleIdentifier() + << " in Task " << Task << "\n"); + std::unique_ptr<MemoryBuffer> FileBuffer = MemoryBuffer::getMemBuffer( + IRFiles[Task], "in-memory IR file", /*RequiresNullTerminator=*/false); auto RestoredModule = parseBitcodeFile(*FileBuffer, Context); if (!RestoredModule) - report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") + - Path + "\n"); + report_fatal_error( + Twine("Failed to parse optimized bitcode loaded for Task: ") + + Twine(Task) + "\n"); // Restore 
the original module identifier. (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier()); return std::move(*RestoredModule); } -Error mergeCodeGenData( - const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) { - +Error mergeCodeGenData(ArrayRef<StringRef> CGFiles, stable_hash *CombinedHash) { OutlinedHashTreeRecord GlobalOutlineRecord; - for (auto &InputFile : *(InputFiles)) { - if (InputFile.empty()) + for (auto File : CGFiles) { + if (File.empty()) continue; - StringRef File = StringRef(InputFile.data(), InputFile.size()); std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer( File, "in-memory object file", /*RequiresNullTerminator=*/false); Expected<std::unique_ptr<object::ObjectFile>> BinOrErr = @@ -285,8 +302,8 @@ Error mergeCodeGenData( return BinOrErr.takeError(); std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get(); - if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(), - GlobalOutlineRecord)) + if (auto E = CodeGenDataReader::mergeFromObjectFile( + Obj.get(), GlobalOutlineRecord, CombinedHash)) return E; } diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index f7f3a8f42af7e1..2f2481ea60f822 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -31,8 +31,8 @@ setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { } Error CodeGenDataReader::mergeFromObjectFile( - const object::ObjectFile *Obj, - OutlinedHashTreeRecord &GlobalOutlineRecord) { + const object::ObjectFile *Obj, OutlinedHashTreeRecord &GlobalOutlineRecord, + stable_hash *CombinedHash) { Triple TT = Obj->makeTriple(); auto CGOutLineName = getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false); @@ -48,6 +48,9 @@ Error CodeGenDataReader::mergeFromObjectFile( auto *EndData = Data + ContentsOrErr->size(); if (*NameOrErr == CGOutLineName) { + if (CombinedHash) + *CombinedHash = + stable_hash_combine(*CombinedHash, 
xxh3_64bits(*ContentsOrErr)); // In case dealing with an executable that has concatenated cgdata, // we want to merge them into a single cgdata. // Although it's not a typical workflow, we support this scenario. diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index c6369ab382eeb0..493f0be5938658 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -13,6 +13,7 @@ #include "llvm/LTO/LTO.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StableHashing.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -36,6 +37,7 @@ #include "llvm/Linker/IRMover.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRObjectFile.h" +#include "llvm/Support/Caching.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -101,7 +103,7 @@ std::string llvm::computeLTOCacheKey( const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, const DenseSet<GlobalValue::GUID> &CfiFunctionDefs, - const DenseSet<GlobalValue::GUID> &CfiFunctionDecls) { + const DenseSet<GlobalValue::GUID> &CfiFunctionDecls, StringRef ExtraID) { // Compute the unique hash for this entry. // This is based on the current compiler version, the module itself, the // export list, the hash for every single module in the import list, the @@ -341,6 +343,9 @@ std::string llvm::computeLTOCacheKey( } } + if (!ExtraID.empty()) + AddString(ExtraID); + return toHex(Hasher.result()); } @@ -1373,6 +1378,7 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) { namespace { class InProcessThinBackend : public ThinBackendProc { +protected: DefaultThreadPool BackendThreadPool; AddStreamFn AddStream; FileCache Cache; @@ -1511,25 +1517,89 @@ class InProcessThinBackend : public ThinBackendProc { /// buffer. 
Note the codegen data stored in the scratch buffer will be extracted /// and merged in the subsequent step. class FirstRoundThinBackend : public InProcessThinBackend { + AddStreamFn IRAddStream; + FileCache IRCache; + public: FirstRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch) - : InProcessThinBackend( - Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries, - // Allocate a scratch buffer for each task to write output to. - [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) { - return std::make_unique<CachedFileStream>( - std::make_unique<raw_svector_ostream>((*Allocation)[Task])); - }, - FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, - /*ShouldEmitImportsFiles=*/false), - Scratch(std::move(Scratch)) {} - - /// Scratch space for writing output during the codegen. 
- std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch; + AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream, + FileCache IRCache) + : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, + ModuleToDefinedGVSummaries, std::move(CGAddStream), + std::move(CGCache), /*OnWrite=*/nullptr, + /*ShouldEmitIndexFiles=*/false, + /*ShouldEmitImportsFiles=*/false), + IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {} + + Error runThinLTOBackendThread( + AddStreamFn CGAddStream, FileCache CGCache, unsigned Task, + BitcodeModule BM, ModuleSummaryIndex &CombinedIndex, + const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, + const GVSummaryMapTy &DefinedGlobals, + MapVector<StringRef, BitcodeModule> &ModuleMap) override { + auto RunThinBackend = [&](AddStreamFn CGAddStream, + AddStreamFn IRAddStream) { + LTOLLVMContext BackendContext(Conf); + Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext); + if (!MOrErr) + return MOrErr.takeError(); + + return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + Conf.CodeGenOnly, IRAddStream); + }; + + auto ModuleID = BM.getModuleIdentifier(); + + if (ShouldEmitIndexFiles) { + if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str())) + return E; + } + + assert((CGCache.isValid() == IRCache.isValid()) && + "Both caches for CG and IR should have matching availability"); + if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + [](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(CGAddStream, IRAddStream); + + // Get CGKey for caching object in CGCache. 
+ std::string CGKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls); + Expected<AddStreamFn> CacheCGAddStreamOrErr = + CGCache(Task, CGKey, ModuleID); + if (Error Err = CacheCGAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr; + + // Get IRKey for caching (optimized) IR in IRCache. + std::string IRKey = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, /*ExtraID=*/"IR"); + Expected<AddStreamFn> CacheIRAddStreamOrErr = + IRCache(Task, IRKey, ModuleID); + if (Error Err = CacheIRAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr; + + assert((CacheCGAddStream == nullptr) == (CacheIRAddStream == nullptr) && + "Both CG and IR caching should be matched"); + if (CacheIRAddStream) { + LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheCGAddStream, CacheIRAddStream); + } + + return Error::success(); + } }; /// This backend operates in the second round of a two-codegen round process. @@ -1538,17 +1608,23 @@ class FirstRoundThinBackend : public InProcessThinBackend { /// the code, utilizing the codegen data merged from the first round. Finally, /// it writes the resulting object files as usual. 
class SecondRoundThinBackend : public InProcessThinBackend { + ArrayRef<StringRef> IRFiles; + stable_hash CombinedCGDataHash; + public: SecondRoundThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, ThreadPoolStrategy ThinLTOParallelism, const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries, - AddStreamFn AddStream) + AddStreamFn AddStream, FileCache CGCache, ArrayRef<StringRef> IRFiles, + stable_hash CombinedCGDataHash) : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism, - ModuleToDefinedGVSummaries, AddStream, FileCache(), + ModuleToDefinedGVSummaries, AddStream, + std::move(CGCache), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false, - /*ShouldEmitImportsFiles=*/false) {} + /*ShouldEmitImportsFiles=*/false), + IRFiles(IRFiles), CombinedCGDataHash(CombinedCGDataHash) {} virtual Error runThinLTOBackendThread( AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM, @@ -1558,13 +1634,42 @@ class SecondRoundThinBackend : public InProcessThinBackend { const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> &ModuleMap) override { - LTOLLVMContext BackendContext(Conf); - std::unique_ptr<Module> LoadedModule = - cgdata::loadModuleForTwoRounds(BM, Task, BackendContext); + auto RunThinBackend = [&](AddStreamFn AddStream) { + LTOLLVMContext BackendContext(Conf); + std::unique_ptr<Module> LoadedModule = + cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, IRFiles); - return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, - ImportList, DefinedGlobals, &ModuleMap, - /*CodeGenOnly=*/true); + return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex, + ImportList, DefinedGlobals, &ModuleMap, + /*CodeGenOnly=*/true); + }; + + auto ModuleID = BM.getModuleIdentifier(); + if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) || + all_of(CombinedIndex.getModuleHash(ModuleID), + 
[](uint32_t V) { return V == 0; })) + // Cache disabled or no entry for this module in the combined index or + // no module hash. + return RunThinBackend(AddStream); + + // Get Key for caching the final object file in Cache with the combined + // CGData hash. + std::string Key = computeLTOCacheKey( + Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR, + DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, + /*ExtraID=*/std::to_string(CombinedCGDataHash)); + Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID); + if (Error Err = CacheAddStreamOrErr.takeError()) + return Err; + AddStreamFn &CacheAddStream = *CacheAddStreamOrErr; + + if (CacheAddStream) { + LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for " + << BM.getModuleIdentifier() << "\n"); + return RunThinBackend(CacheAddStream); + } + + return Error::success(); } }; } // end anonymous namespace @@ -1900,32 +2005,33 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, // 1. First round: Run optimization and code generation with a scratch output. // 2. Merge codegen data extracted from the scratch output. // 3. Second round: Run code generation again using the merged data. - LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n"); + LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n"); - // Initialize a temporary path to store and retrieve optimized IRs for - // two-round code generation. - cgdata::initializeTwoCodegenRounds(); + unsigned MaxTasks = getMaxTasks(); + auto Parallelism = ThinLTO.Backend.getParallelism(); + cgdata::StreamCacheData CG(MaxTasks), IR(MaxTasks); + cgdata::initializeTwoCodegenRounds(CG, IR, Cache); - // Create a scratch output to hold intermediate results. 
- auto Outputs = - std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks()); - auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( - Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), - ModuleToDefinedGVSummaries, std::move(Outputs)); // First round: Run optimization and code generation with a scratch output. - // Before code generation, serialize modules. + // Before code generation, serialize the optimized IR modules. + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n"); + auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + CG.AddStream, CG.Cache, IR.AddStream, IR.Cache); if (Error E = RunBackends(FirstRoundLTO.get())) return E; - // Merge codegen data extracted from the scratch output. - if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch))) + LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n"); + stable_hash CombinedHash = 0; + if (Error E = cgdata::mergeCodeGenData(CG.getResult(), &CombinedHash)) return E; + LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n"); // Second round: Run code generation by reading IRs. 
- std::unique_ptr<ThinBackendProc> SecondRoundLTO = - std::make_unique<SecondRoundThinBackend>( - Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(), - ModuleToDefinedGVSummaries, AddStream); + LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n"); + auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>( + Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, + AddStream, Cache, IR.getResult(), CombinedHash); Error E = RunBackends(SecondRoundLTO.get()); return E; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index b66989fe520b42..fd2e9c9169514c 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -568,7 +568,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, const FunctionImporter::ImportMapTy &ImportList, const GVSummaryMapTy &DefinedGlobals, MapVector<StringRef, BitcodeModule> *ModuleMap, - bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) { + bool CodeGenOnly, AddStreamFn IRAddStream, + const std::vector<uint8_t> &CmdArgs) { Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod); if (!TOrErr) return TOrErr.takeError(); @@ -612,8 +613,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, // Note that the second codegen round runs only `codegen()` without // running `opt()`. We're not reaching here as it's bailed out earlier // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`. 
- if (CodeGenDataThinLTOTwoRounds) - cgdata::saveModuleForTwoRounds(Mod, Task); + if (IRAddStream) + cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream); codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex); return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll new file mode 100644 index 00000000000000..61131ad6d3887f --- /dev/null +++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll @@ -0,0 +1,173 @@ +; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat) +; by running two codegen rounds. +; This test also verifies that the caches for the two codegen rounds work correctly. + +; REQUIRES: asserts +; RUN: rm -rf %t +; RUN: split-file %s %t + +; 0. Base case without a cache. +; Verify that each outlining instance is a singleton with global outlining for ThinLTO. +; The outlined functions will be identical, so the linker can fold them with ICF. 
+; RUN: opt -module-hash -module-summary %t/thin-one.ll -o %t/thin-one.bc +; RUN: opt -module-hash -module-summary %t/thin-two.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds + +; thin-one.ll will have one outlining instance (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1 +; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: mov +; THINLTO-1-NEXT: b + +; thin-two.ll will have two outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2 +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b +; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: mov +; THINLTO-2-NEXT: b + +; 1. Run this with a cache for the first time. +; RUN: rm -rf %t.cache +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-cold \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-cold.txt 2>&1 +; RUN: cat %t.log-cold.txt | FileCheck %s --check-prefix=COLD +; diff %t/thinlto.1 %t/thinlto-cold.1 +; diff %t/thinlto.2 %t/thinlto-cold.2 + +; COLD: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; COLD: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; COLD: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; COLD: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; 2. Without any changes, simply re-running it will hit the cache. 
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm.txt 2>&1 +; RUN: cat %t.log-warm.txt | FileCheck %s --check-prefix=WARM +; diff %t/thinlto.1 %t/thinlto-warm.1 +; diff %t/thinlto.2 %t/thinlto-warm.2 + +; WARM-NOT: Cache Miss + +; 3. Assume thin-one.ll is modified to mimic thin-one-modified.ll +; The merged CG data remains unchanged as this modification does not affect the hash tree built from thin-two.bc. +; Therefore, both the first and second round runs update only this module. +; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified.txt 2>&1 +; RUN: cat %t.log-warm-modified.txt | FileCheck %s --check-prefix=WARM-MODIFIED +; diff %t/thinlto.1 %t/thinlto-warm-modified.1 +; diff %t/thinlto.2 %t/thinlto-warm-modified.2 + +; WARM-MODIFIED: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-NOT: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; WARM-MODIFIED: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-NOT: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; 4. Additionally, thin-two.ll is modified to mimic thin-two-modified.ll. +; In this case, the merged CG data, which is global, is updated. +; Although the first round run updates only the thin-two.ll module, the second round run +; will update all modules, resulting in different binaries. 
+; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc +; RUN: opt -module-hash -module-summary %t/thin-two-modified.ll -o %t/thin-two.bc +; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified-all \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified-all.txt 2>&1 +; RUN: cat %t.log-warm-modified-all.txt | FileCheck %s --check-prefix=WARM-MODIFIED-ALL +; RUN: not diff %t/thinlto.1 %t/thinlto-warm-modified-all.1 +; RUN: not diff %t/thinlto.2 %t/thinlto-warm-modified-all.2 + +; WARM-MODIFIED-ALL-NOT: [FirstRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-ALL: [FirstRound] Cache Miss for {{.*}}thin-two.bc +; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-one.bc +; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-two.bc + +; thin-one-modified.ll won't be outlined. +; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.1 | FileCheck %s --check-prefix=THINLTO-1-MODIFIED-ALL +; THINLTO-1-MODIFIED-ALL-NOT: _OUTLINED_FUNCTION{{.*}}>: + +; thin-two-modified.ll will have two (longer) outlining instances (matched in the global outlined hash tree) +; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.2| FileCheck %s --check-prefix=THINLTO-2-MODIFIED-ALL +; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: b +; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>: +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: mov +; THINLTO-2-MODIFIED-ALL: b + +; 5. Re-running it will hit the cache. 
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-again \ +; RUN: -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \ +; RUN: -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \ +; RUN: -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-again.txt 2>&1 +; RUN: cat %t.log-warm-again.txt | FileCheck %s --check-prefix=WARM-AGAIN +; RUN: diff %t/thinlto-warm-modified-all.1 %t/thinlto-warm-again.1 +; RUN: diff %t/thinlto-warm-modified-all.2 %t/thinlto-warm-again.2 + +; WARM-AGAIN-NOT: Cache Miss + +;--- thin-one.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 30, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-one-modified.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f3() minsize { + %1 = call i32 @g(i32 31, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 20, i32 1, i32 2); + ret i32 %1 +} + +;--- thin-two-modified.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-darwin" + +declare i32 @g(i32, i32, i32) +define i32 @f1() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} +define i32 @f2() minsize { + %1 = call i32 @g(i32 10, i32 1, i32 2); + ret i32 %1 +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits