[PATCH] D123810: [Cuda] Add initial support for wrapping CUDA images in the new driver.

Joseph Huber via Phabricator via cfe-commits Thu, 14 Apr 2022 12:27:06 -0700

jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, JonChesterfield, tra, yaxunl.
Herald added a subscriber: carlosgalvezp.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1.
Herald added a project: clang.


This patch adds the initial support for wrapping CUDA images. This
requires changing some of the logic for how we bundle images. We now
need to copy the image for all kinds that are active for the
architecture. Then we need to run a separate wrapping job if the Kind is
Cuda. For CUDA wrapping we need to use the `fatbinary` program from the
CUDA SDK to bundle all the binaries together. This is then passed to a
new function to perform the actual module code generation that will be
implemented in a later patch.

Depends on D120273 <https://reviews.llvm.org/D120273> D123471 
<https://reviews.llvm.org/D123471>


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D123810

Files:
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
  clang/tools/clang-linker-wrapper/OffloadWrapper.h

Index: clang/tools/clang-linker-wrapper/OffloadWrapper.h
===================================================================
--- clang/tools/clang-linker-wrapper/OffloadWrapper.h
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.h
@@ -1,4 +1,4 @@
-//===- OffloadWrapper.h -------------------------------------------*- C++ -*-===//
+//===- OffloadWrapper.h --r-------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 
 /// Wrap the input device images into the module \p M as global symbols and
 /// registers the images with the OpenMP Offloading runtime libomptarget.
-llvm::Error wrapBinaries(llvm::Module &M,
-                         llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+llvm::Error wrapOpenMPBinaries(llvm::Module &M,
+                               llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+
+/// Wrap the input fatbinary image into the module \p M as global symbols and
+/// registers the images with the CUDA runtime.
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images);
 
 #endif
Index: clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -257,7 +257,7 @@
 
 } // namespace
 
-Error wrapBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
+Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
   GlobalVariable *Desc = createBinDesc(M, Images);
   if (!Desc)
     return createStringError(inconvertibleErrorCode(),
@@ -266,3 +266,8 @@
   createUnregisterFunction(M, Desc);
   return Error::success();
 }
+
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images) {
+  return createStringError(inconvertibleErrorCode(),
+                           "Cuda wrapping is not yet supported.");
+}
Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -691,6 +691,45 @@
 
   return static_cast<std::string>(TempFile);
 }
+
+Expected<std::string> fatbinary(ArrayRef<StringRef> InputFiles,
+                                Triple TheTriple, ArrayRef<StringRef> Archs) {
+  // NVPTX uses the fatbinary program to bundle the linked images.
+  ErrorOr<std::string> FatBinaryPath =
+      sys::findProgramByName("fatbinary", {CudaBinaryPath});
+  if (!FatBinaryPath)
+    FatBinaryPath = sys::findProgramByName("fatbinary");
+  if (!FatBinaryPath)
+    return createStringError(FatBinaryPath.getError(),
+                             "Unable to find 'fatbinary' in path");
+
+  // Create a new file to write the linked device image to.
+  SmallString<128> TempFile;
+  if (Error Err = createOutputFile(sys::path::filename(ExecutableName) +
+                                       "-device-" + TheTriple.getArchName(),
+                                   "fatbin", TempFile))
+    return std::move(Err);
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 16> CmdArgs;
+  CmdArgs.push_back(*FatBinaryPath);
+  CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back("--create");
+  CmdArgs.push_back(TempFile);
+  for (const auto &FileAndArch : llvm::zip(InputFiles, Archs))
+    CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) +
+                                 ",file=" + std::get<0>(FileAndArch)));
+
+  if (Verbose)
+    printCommands(CmdArgs);
+
+  if (sys::ExecuteAndWait(*FatBinaryPath, CmdArgs))
+    return createStringError(inconvertibleErrorCode(), "'fatbinary' failed");
+
+  return static_cast<std::string>(TempFile);
+}
 } // namespace nvptx
 namespace amdgcn {
 Expected<std::string> link(ArrayRef<std::string> InputFiles, Triple TheTriple,
@@ -1143,34 +1182,42 @@
 /// Runs the appropriate linking action on all the device files specified in \p
 /// DeviceFiles. The linked device images are returned in \p LinkedImages.
 Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
-                      SmallVectorImpl<std::string> &LinkedImages) {
-  // Get the list of inputs for a specific device.
-  DenseMap<DeviceFile, SmallVector<std::string, 4>> LinkerInputMap;
-  for (auto &File : DeviceFiles)
-    LinkerInputMap[File].push_back(File.Filename);
+                      SmallVectorImpl<DeviceFile> &LinkedImages) {
+  // Get the list of inputs and active offload kinds for a specific device.
+  DenseMap<DeviceFile,
+           std::pair<DenseSet<StringRef>, SmallVector<std::string, 4>>>
+      LinkerInputMap;
+  for (auto &File : DeviceFiles) {
+    LinkerInputMap[File].first.insert(File.Kind);
+    LinkerInputMap[File].second.push_back(File.Filename);
+  }
 
   // Try to link each device toolchain.
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    auto &LinkerInputFiles = LinkerInput.getSecond().second;
 
     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err =
-            linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))
+    if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch))
       return Err;
 
     // If we are embedding bitcode for JIT, skip the final device linking.
     if (EmbedBitcode) {
-      assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed");
-      LinkedImages.push_back(LinkerInput.getSecond().front());
+      assert(!LinkerInputFiles.empty() && "No bitcode image to embed");
+      LinkedImages.emplace_back("openmp", TheTriple.getTriple(), File.Arch,
+                                LinkerInputFiles.front());
       continue;
     }
 
-    auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
+    auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();
 
-    LinkedImages.push_back(*ImageOrErr);
+    // Create separate images for all the active offload kinds.
+    for (StringRef Kind : LinkerInput.getSecond().first)
+      LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch,
+                                *ImageOrErr);
   }
   return Error::success();
 }
@@ -1215,29 +1262,84 @@
   return static_cast<std::string>(ObjectFile);
 }
 
-/// Creates the object file containing the device image and runtime registration
-/// code from the device images stored in \p Images.
-Expected<std::string> wrapDeviceImages(ArrayRef<std::string> Images) {
+/// Load all of the OpenMP images into a buffer and pass it to the binary
+/// wrapping function to create the registration code in the module \p M.
+Error wrapOpenMPImages(Module &M, ArrayRef<DeviceFile> Images) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<ArrayRef<char>, 4> ImagesToWrap;
-
-  for (StringRef ImageFilename : Images) {
+  for (const DeviceFile &File : Images) {
     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
-        llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename);
+        llvm::MemoryBuffer::getFileOrSTDIN(File.Filename);
     if (std::error_code EC = ImageOrError.getError())
-      return createFileError(ImageFilename, EC);
+      return createFileError(File.Filename, EC);
     ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(),
                               (*ImageOrError)->getBufferSize());
     SavedBuffers.emplace_back(std::move(*ImageOrError));
   }
 
-  LLVMContext Context;
-  Module M("offload.wrapper.module", Context);
-  M.setTargetTriple(HostTriple);
-  if (Error Err = wrapBinaries(M, ImagesToWrap))
+  if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap))
+    return std::move(Err);
+  return Error::success();
+}
+
+/// Combine all of the CUDA images into a single fatbinary and pass it to the
+/// binary wrapping function to create the registration code in the module \p M.
+Error wrapCudaImages(Module &M, ArrayRef<DeviceFile> Images) {
+  SmallVector<StringRef, 4> InputFiles;
+  SmallVector<StringRef, 4> Architectures;
+  for (const DeviceFile &File : Images) {
+    InputFiles.push_back(File.Filename);
+    Architectures.push_back(File.Arch);
+  }
+
+  // CUDA expects its embedded device images to be a fatbinary.
+  Triple TheTriple = Triple(Images.front().TheTriple);
+  auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures);
+  if (!FileOrErr)
+    return FileOrErr.takeError();
+
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
+      llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr);
+  if (std::error_code EC = ImageOrError.getError())
+    return createFileError(*FileOrErr, EC);
+
+  auto ImageToWrap = ArrayRef<char>((*ImageOrError)->getBufferStart(),
+                                    (*ImageOrError)->getBufferSize());
+
+  if (Error Err = wrapCudaBinary(M, ImageToWrap))
     return std::move(Err);
+  return Error::success();
+}
+
+/// Creates the object file containing the device image and runtime
+/// registration code from the device images stored in \p Images.
+Expected<SmallVector<std::string, 2>>
+wrapDeviceImages(ArrayRef<DeviceFile> Images) {
+  StringMap<SmallVector<DeviceFile, 2>> ImagesForKind;
+  for (const DeviceFile &Image : Images)
+    ImagesForKind[Image.Kind].push_back(Image);
+
+  SmallVector<std::string, 2> WrappedImages;
+  for (const auto &KindAndImages : ImagesForKind) {
+    LLVMContext Context;
+    Module M("offload.wrapper.module", Context);
+    M.setTargetTriple(HostTriple);
+
+    if (KindAndImages.getKey() == "openmp") {
+      if (Error Err = wrapOpenMPImages(M, KindAndImages.getValue()))
+        return std::move(Err);
+    } else if (KindAndImages.getKey() == "cuda") {
+      if (Error Err = wrapCudaImages(M, KindAndImages.getValue()))
+        return std::move(Err);
+    }
+
+    auto FileOrErr = compileModule(M);
+    if (!FileOrErr)
+      return FileOrErr.takeError();
+    WrappedImages.push_back(*FileOrErr);
+  }
 
-  return compileModule(M);
+  return WrappedImages;
 }
 
 Optional<std::string> findFile(StringRef Dir, const Twine &Name) {
@@ -1368,7 +1470,7 @@
     DeviceFiles.push_back(getBitcodeLibrary(LibraryStr));
 
   // Link the device images extracted from the linker input.
-  SmallVector<std::string, 16> LinkedImages;
+  SmallVector<DeviceFile, 4> LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages))
     return reportError(std::move(Err));
 
@@ -1377,7 +1479,7 @@
   auto FileOrErr = wrapDeviceImages(LinkedImages);
   if (!FileOrErr)
     return reportError(FileOrErr.takeError());
-  LinkerArgs.push_back(*FileOrErr);
+  LinkerArgs.append(*FileOrErr);
 
   // Run the host linking job.
   if (Error Err = runLinker(LinkerUserPath, LinkerArgs))

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D123810: [Cuda] Add initial support for wrapping CUDA images in the new driver.

Reply via email to