Author: Joseph Huber Date: 2022-04-23T12:42:40-04:00 New Revision: 3530c35c660919b9367f1ac598abfb9a569e7606
URL: https://github.com/llvm/llvm-project/commit/3530c35c660919b9367f1ac598abfb9a569e7606 DIFF: https://github.com/llvm/llvm-project/commit/3530c35c660919b9367f1ac598abfb9a569e7606.diff LOG: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility When we do LTO we consider ourselves to have whole program visibility if every single input file we have contains LLVM bitcode. If we have whole program visibility then we can create a single image and utilize CUDA's non-RDC mode by not passing `-c` to `ptxas` and ignoring the `nvlink` job. This should be faster for some situations and also saves us the time executing `nvlink`. Reviewed By: tra Differential Revision: https://reviews.llvm.org/D124292 Added: Modified: clang/test/Driver/linker-wrapper.c clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp Removed: ################################################################################ diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 5ec99f5fe5b03..7920fe8c1a990 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -38,5 +38,5 @@ // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \ // RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO -// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s -// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin +// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s +// LTO-NOT: nvlink diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index b52dda13ac200..2c14c893c6424 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -595,7 +595,7 @@ extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer, // TODO: Move these to a separate file. 
namespace nvptx { Expected<std::string> assemble(StringRef InputFile, Triple TheTriple, - StringRef Arch) { + StringRef Arch, bool RDC = true) { // NVPTX uses the ptxas binary to create device object files. Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath}); if (!PtxasPath) @@ -626,7 +626,8 @@ Expected<std::string> assemble(StringRef InputFile, Triple TheTriple, CmdArgs.push_back(Opt); CmdArgs.push_back("--gpu-name"); CmdArgs.push_back(Arch); - CmdArgs.push_back("-c"); + if (RDC) + CmdArgs.push_back("-c"); CmdArgs.push_back(InputFile); @@ -933,7 +934,8 @@ bool isValidCIdentifier(StringRef S) { } Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles, - const Triple &TheTriple, StringRef Arch) { + const Triple &TheTriple, StringRef Arch, + bool &WholeProgram) { SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers; SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles; SmallVector<std::string, 4> NewInputFiles; @@ -1009,7 +1011,7 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles, }; // We assume visibility of the whole program if every input file was bitcode. - bool WholeProgram = BitcodeFiles.size() == InputFiles.size(); + WholeProgram = BitcodeFiles.size() == InputFiles.size(); auto LTOBackend = (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode) : createLTO(TheTriple, Arch, WholeProgram); @@ -1089,7 +1091,7 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles, // Is we are compiling for NVPTX we need to run the assembler first. 
if (TheTriple.isNVPTX() && !EmbedBitcode) { for (auto &File : Files) { - auto FileOrErr = nvptx::assemble(File, TheTriple, Arch); + auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram); if (!FileOrErr) return FileOrErr.takeError(); File = *FileOrErr; @@ -1117,10 +1119,11 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles, for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = - linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch)) + if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, + File.Arch, WholeProgram)) return Err; // If we are embedding bitcode for JIT, skip the final device linking. @@ -1130,6 +1133,14 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles, continue; } + // If we performed LTO on NVPTX and had whole program visibility, we can use + // CUDA in non-RDC mode. + if (WholeProgram && TheTriple.isNVPTX()) { + assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); + LinkedImages.push_back(LinkerInput.getSecond().front()); + continue; + } + auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); if (!ImageOrErr) return ImageOrErr.takeError(); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits