jhuber6 created this revision. jhuber6 added reviewers: jdoerfert, tianshilei1992, tra, yaxunl, JonChesterfield, ronlieb. Herald added a project: All. jhuber6 requested review of this revision. Herald added a project: clang. Herald added a subscriber: cfe-commits.
This patch changes the device linking steps to be performed in parallel when multiple offloading architectures are being used. We use the LLVM parallelism support to accomplish this by simply running each individual device linking job on a single thread. This change required re-parsing the input arguments, as these arguments have internal state that would not otherwise be properly shared between the threads. By default, the parallelism uses all available threads, but this can be controlled with the `--wrapper-jobs=` option. This option was required in a few tests to ensure the ordering was still deterministic. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D136701 Files: clang/test/Driver/linker-wrapper.c clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
Index: clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td =================================================================== --- clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -59,6 +59,10 @@ Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">, HelpText<"Set the granularity of time-trace updates">; +def wrapper_jobs : Joined<["--"], "wrapper-jobs=">, + Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">, + HelpText<"Sets the number of parallel jobs to use for device linking">; + // Flags passed to the device linker. def arch_EQ : Joined<["--"], "arch=">, Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">, Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp =================================================================== --- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Parallel.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" @@ -1082,6 +1083,7 @@ /// Returns a new ArgList containg arguments used for the device linking phase. DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input, const InputArgList &Args) { + DerivedArgList DAL = DerivedArgList(DerivedArgList(Args)); for (Arg *A : Args) DAL.append(A); @@ -1119,19 +1121,34 @@ /// be registered by the runtime. 
Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles, - const InputArgList &Args) { + const InputArgList &Args, char **Argv, int Argc) { llvm::TimeTraceScope TimeScope("Handle all device input"); - DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile, 4>> InputsForTarget; + DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap; for (auto &File : LinkerInputFiles) - InputsForTarget[File].emplace_back(std::move(File)); + InputMap[File].emplace_back(std::move(File)); LinkerInputFiles.clear(); - DenseMap<OffloadKind, SmallVector<OffloadingImage, 2>> Images; - for (auto &[ID, Input] : InputsForTarget) { + SmallVector<SmallVector<OffloadFile>> InputsForTarget; + for (auto &[ID, Input] : InputMap) + InputsForTarget.emplace_back(std::move(Input)); + InputMap.clear(); + + std::mutex ImageMtx; + DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images; + auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error { llvm::TimeTraceScope TimeScope("Link device input"); - auto LinkerArgs = getLinkerArgs(Input, Args); + // Each thread needs its own copy of the base arguments to maintain + // per-device argument storage of synthetic strings. + const OptTable &Tbl = getOptTable(); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + auto BaseArgs = + Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [](StringRef Err) { + reportError(createStringError(inconvertibleErrorCode(), Err)); + }); + auto LinkerArgs = getLinkerArgs(Input, BaseArgs); DenseSet<OffloadKind> ActiveOffloadKinds; for (const auto &File : Input) @@ -1142,7 +1159,7 @@ if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) return std::move(Err); - // Write any remaining device inputs to an output file for the linker job. + // Write any remaining device inputs to an output file for the linker. 
for (const OffloadFile &File : Input) { auto FileNameOrErr = writeOffloadFile(File); if (!FileNameOrErr) @@ -1150,7 +1167,7 @@ InputFiles.emplace_back(*FileNameOrErr); } - // Link the remaining device files, if necessary, using the device linker. + // Link the remaining device files using the device linker. llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ)); bool RequiresLinking = !Args.hasArg(OPT_embed_bitcode) && @@ -1171,12 +1188,19 @@ TheImage.TheImageKind = IMG_Object; TheImage.TheOffloadKind = Kind; TheImage.StringData = { - {"triple", LinkerArgs.getLastArgValue(OPT_triple_EQ)}, - {"arch", LinkerArgs.getLastArgValue(OPT_arch_EQ)}}; + {"triple", + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ))}, + {"arch", + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ))}}; TheImage.Image = std::move(*FileOrErr); + + std::lock_guard<decltype(ImageMtx)> Guard(ImageMtx); Images[Kind].emplace_back(std::move(TheImage)); } - } + return Error::success(); + }); + if (Err) + return std::move(Err); // Create a binary image of each offloading image and embed it into a new // object file. @@ -1351,6 +1375,15 @@ if (!CudaBinaryPath.empty()) CudaBinaryPath = CudaBinaryPath + "/bin"; + if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) { + unsigned Threads = 0; + if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0) + reportError(createStringError( + inconvertibleErrorCode(), "%s: expected a positive integer, got '%s'", + Arg->getSpelling().data(), Arg->getValue())); + parallel::strategy = hardware_concurrency(Threads); + } + if (Args.hasArg(OPT_wrapper_time_trace_eq)) { unsigned Granularity; Args.getLastArgValue(OPT_wrapper_time_trace_granularity, "500") @@ -1367,7 +1400,8 @@ reportError(DeviceInputFiles.takeError()); // Link and wrap the device images extracted from the linker input. 
- auto FilesOrErr = linkAndWrapDeviceFiles(*DeviceInputFiles, Args); + auto FilesOrErr = + linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, Argc); if (!FilesOrErr) reportError(FilesOrErr.takeError()); Index: clang/test/Driver/linker-wrapper.c =================================================================== --- clang/test/Driver/linker-wrapper.c +++ clang/test/Driver/linker-wrapper.c @@ -102,7 +102,7 @@ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ // RUN: -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \ +// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=1 \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_52 {{.*}}.o @@ -116,7 +116,7 @@ // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ // RUN: -fembed-offload-object=%t.out // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -linker-path \ -// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP +// RUN: --wrapper-jobs=1 /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o @@ -127,7 +127,7 @@ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ // RUN: -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \ +// RUN: clang-linker-wrapper --dry-run 
--host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=1 \ // RUN: --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b -- \ // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LINKER_ARGS
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits