[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-31 Thread Joseph Huber via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG3762111aa960: [OpenMP] Link the bitcode library late for 
device LTO (authored by jhuber6).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

Files:
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -68,9 +68,14 @@
 
 static cl::opt OptLevel("opt-level",
  cl::desc("Optimization level for LTO"),
- cl::init("O0"),
+ cl::init("O2"),
  cl::cat(ClangLinkerWrapperCategory));
 
+static cl::opt
+BitcodeLibrary("target-library",
+   cl::desc("Path for the target bitcode library"),
+   cl::cat(ClangLinkerWrapperCategory));
+
 // Do not parse linker options.
 static cl::list
 HostLinkerArgs(cl::Sink, cl::desc("..."));
@@ -197,7 +202,7 @@
   std::unique_ptr Output = std::move(*OutputOrErr);
   std::copy(Contents->begin(), Contents->end(), Output->getBufferStart());
   if (Error E = Output->commit())
-return E;
+return std::move(E);
 
   DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
   ToBeStripped.push_back(*Name);
@@ -225,7 +230,7 @@
 std::unique_ptr Output = std::move(*OutputOrErr);
 std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
 if (Error E = Output->commit())
-  return E;
+  return std::move(E);
 StripFile = TempFile;
   }
 
@@ -307,7 +312,7 @@
 std::unique_ptr Output = std::move(*OutputOrErr);
 std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
 if (Error E = Output->commit())
-  return E;
+  return std::move(E);
 
 DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
 ToBeDeleted.push_back();
@@ -318,7 +323,7 @@
 
   // We need to materialize the lazy module before we make any changes.
   if (Error Err = M->materializeAll())
-return Err;
+return std::move(Err);
 
   // Remove the global from the module and write it to a new file.
   for (GlobalVariable *GV : ToBeDeleted) {
@@ -392,7 +397,7 @@
   }
 
   if (Err)
-return Err;
+return std::move(Err);
 
   if (!NewMembers)
 return None;
@@ -406,9 +411,9 @@
 
   std::unique_ptr Buffer =
   MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false);
-  if (Error WriteErr = writeArchive(TempFile, Members, true, Library.kind(),
+  if (Error Err = writeArchive(TempFile, Members, true, Library.kind(),
 true, Library.isThin(), std::move(Buffer)))
-return WriteErr;
+return std::move(Err);
 
   return static_cast(TempFile);
 }
@@ -726,7 +731,7 @@
 
 // Add the bitcode file with its resolved symbols to the LTO job.
 if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions))
-  return Err;
+  return std::move(Err);
   }
 
   // Run the LTO job to compile the bitcode.
@@ -744,7 +749,7 @@
 std::make_unique(FD, true));
   };
   if (Error Err = LTOBackend->run(AddStream))
-return Err;
+return std::move(Err);
 
   for (auto  : Files) {
 if (!TheTriple.isNVPTX())
@@ -957,6 +962,17 @@
 }
   }
 
+  // Add the device bitcode library to the device files if it was passed in.
+  if (!BitcodeLibrary.empty()) {
+// FIXME: Hacky workaround to avoid a backend crash at O0.
+if (OptLevel[1] - '0' == 0)
+  OptLevel[1] = '1';
+auto DeviceAndPath = StringRef(BitcodeLibrary).split('=');
+auto TripleAndArch = DeviceAndPath.first.rsplit('-');
+DeviceFiles.emplace_back(TripleAndArch.first, TripleAndArch.second,
+ DeviceAndPath.second);
+  }
+
   // Link the device images extracted from the linker input.
   SmallVector LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkerArgs, LinkedImages))
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -744,6 +744,10 @@
   return;
 }
 
+// Link the bitcode library late if we're using device LTO.
+if (getDriver().isUsingLTO(/* IsOffload */ true))
+  return;
+
 std::string BitcodeSuffix;
 if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
options::OPT_fno_openmp_target_new_runtime, true))
Index: 

[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-31 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:75
+static cl::opt
+BitcodeLibrary("target-library",
+   cl::desc("Path for the target bitcode library"),

tianshilei1992 wrote:
> `target-library` is not the common name we call it. Maybe 
> `device-runtime-library`?
I think I called it `bitcode-library` later, since technically it's just any 
bitcode file that's linked in along LTO.



Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:986
+  if (!BitcodeLibrary.empty()) {
+// FIXME: Hacky workaround to avoid a backend crash at O0.
+if (OptLevel[1] - '0' == 0)

tianshilei1992 wrote:
> Is this still needed now?
It still breaks, but I removed it later. My ordering for these patches is a bit 
of a mess, sorry.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-31 Thread Shilei Tian via Phabricator via cfe-commits
tianshilei1992 accepted this revision.
tianshilei1992 added a comment.
This revision is now accepted and ready to land.

LGTM with two nits.




Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:75
+static cl::opt
+BitcodeLibrary("target-library",
+   cl::desc("Path for the target bitcode library"),

`target-library` is not the common name we call it. Maybe 
`device-runtime-library`?



Comment at: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:986
+  if (!BitcodeLibrary.empty()) {
+// FIXME: Hacky workaround to avoid a backend crash at O0.
+if (OptLevel[1] - '0' == 0)

Is this still needed now?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-25 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 updated this revision to Diff 402925.
jhuber6 added a comment.

Squash other uncommitted changes.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

Files:
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -68,9 +68,14 @@
 
 static cl::opt OptLevel("opt-level",
  cl::desc("Optimization level for LTO"),
- cl::init("O0"),
+ cl::init("O2"),
  cl::cat(ClangLinkerWrapperCategory));
 
+static cl::opt
+BitcodeLibrary("target-library",
+   cl::desc("Path for the target bitcode library"),
+   cl::cat(ClangLinkerWrapperCategory));
+
 // Do not parse linker options.
 static cl::list
 HostLinkerArgs(cl::Sink, cl::desc("..."));
@@ -201,7 +206,7 @@
   std::unique_ptr Output = std::move(*OutputOrErr);
   std::copy(Contents->begin(), Contents->end(), Output->getBufferStart());
   if (Error E = Output->commit())
-return E;
+return std::move(E);
 
   DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
   ToBeStripped.push_back(*Name);
@@ -229,7 +234,7 @@
 std::unique_ptr Output = std::move(*OutputOrErr);
 std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
 if (Error E = Output->commit())
-  return E;
+  return std::move(E);
 StripFile = TempFile;
   }
 
@@ -318,7 +323,7 @@
 std::unique_ptr Output = std::move(*OutputOrErr);
 std::copy(Contents.begin(), Contents.end(), Output->getBufferStart());
 if (Error E = Output->commit())
-  return E;
+  return std::move(E);
 
 DeviceFiles.emplace_back(DeviceTriple, Arch, TempFile);
 ToBeDeleted.push_back();
@@ -329,7 +334,7 @@
 
   // We need to materialize the lazy module before we make any changes.
   if (Error Err = M->materializeAll())
-return Err;
+return std::move(Err);
 
   // Remove the global from the module and write it to a new file.
   for (GlobalVariable *GV : ToBeDeleted) {
@@ -403,7 +408,7 @@
   }
 
   if (Err)
-return Err;
+return std::move(Err);
 
   if (!NewMembers)
 return None;
@@ -417,9 +422,9 @@
 
   std::unique_ptr Buffer =
   MemoryBuffer::getMemBuffer(Library.getMemoryBufferRef(), false);
-  if (Error WriteErr = writeArchive(TempFile, Members, true, Library.kind(),
+  if (Error Err = writeArchive(TempFile, Members, true, Library.kind(),
 true, Library.isThin(), std::move(Buffer)))
-return WriteErr;
+return std::move(Err);
 
   return static_cast(TempFile);
 }
@@ -740,7 +745,7 @@
 
 // Add the bitcode file with its resolved symbols to the LTO job.
 if (Error Err = LTOBackend->add(std::move(BitcodeFile), Resolutions))
-  return Err;
+  return std::move(Err);
   }
 
   // Run the LTO job to compile the bitcode.
@@ -758,7 +763,7 @@
 std::make_unique(FD, true));
   };
   if (Error Err = LTOBackend->run(AddStream))
-return Err;
+return std::move(Err);
 
   for (auto  : Files) {
 if (!TheTriple.isNVPTX())
@@ -976,6 +981,17 @@
 }
   }
 
+  // Add the device bitcode library to the device files if it was passed in.
+  if (!BitcodeLibrary.empty()) {
+// FIXME: Hacky workaround to avoid a backend crash at O0.
+if (OptLevel[1] - '0' == 0)
+  OptLevel[1] = '1';
+auto DeviceAndPath = StringRef(BitcodeLibrary).split('=');
+auto TripleAndArch = DeviceAndPath.first.rsplit('-');
+DeviceFiles.emplace_back(TripleAndArch.first, TripleAndArch.second,
+ DeviceAndPath.second);
+  }
+
   // Link the device images extracted from the linker input.
   SmallVector LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkerArgs, LinkedImages))
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -744,6 +744,10 @@
   return;
 }
 
+// Link the bitcode library late if we're using device LTO.
+if (getDriver().isUsingLTO(/* IsOffload */ true))
+  return;
+
 std::string BitcodeSuffix;
 if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
options::OPT_fno_openmp_target_new_runtime, true))
Index: clang/lib/Driver/ToolChains/Clang.cpp
===
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ 

[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-11 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added inline comments.



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:8159
+  std::string BitcodeSuffix;
+  if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
+ options::OPT_fno_openmp_target_new_runtime, true))

JonChesterfield wrote:
> This looks familiar - is there a function in commonargs that already does 
> this?
It's implemented individually when we select it in the CUDA and AMDGPUOpenMP 
toolchains, could probably try to unify it into a utility function to save a 
few lines.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-11 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added inline comments.



Comment at: clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp:268
+  // Link the bitcode library late if we're using device LTO.
+  if (getDriver().isUsingLTO(/* IsOffload */ true))
+return;

Can/should probably always link it late on amdgpu



Comment at: clang/lib/Driver/ToolChains/Clang.cpp:8159
+  std::string BitcodeSuffix;
+  if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
+ options::OPT_fno_openmp_target_new_runtime, true))

This looks familiar - is there a function in commonargs that already does this?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-11 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

Oh, right. Nvptx is still lowering to machine code per-tu. We don't want the 
devicertl linking as machine code, so it has to go in per-tu. Or we could link 
nvptx as IR instead, and send that plus amdgpu down the same code path. 
Probably makes applications faster and compiles slower. Which sort of brings us 
to this patch with a different default.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-11 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 added a comment.

In D117048#3235763 , @JonChesterfield 
wrote:

> Do we have a hook where we could link it at, uh, link time on nvtptx without 
> LTO? Amdgpu had a llvm-link already there. Always bothered me a little that 
> we link a copy per TU then hope the optimiser sorts it out nicely.

I think it would be difficult, we could compile the library stand-alone and 
link it normally in the linker wrapper, but the performance would probably be 
bad. We could potentially only link everything once, but when we compile we 
don't really know how many files will need it right? It would definitely 
improve compile times if we could somehow do it though.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-11 Thread Jon Chesterfield via Phabricator via cfe-commits
JonChesterfield added a comment.

Do we have a hook where we could link it at, uh, link time on nvtptx without 
LTO? Amdgpu had a llvm-link already there. Always bothered me a little that we 
link a copy per TU then hope the optimiser sorts it out nicely.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D117048/new/

https://reviews.llvm.org/D117048

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D117048: [OpenMP] Link the bitcode library late for device LTO

2022-01-11 Thread Joseph Huber via Phabricator via cfe-commits
jhuber6 created this revision.
jhuber6 added reviewers: jdoerfert, ronlieb, gregrodgers, JonChesterfield.
Herald added subscribers: kerbowa, guansong, inglorion, yaxunl, nhaehnle, 
jvesely.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1.
Herald added a project: clang.

This patch adds support for linking the OpenMP device bitcode library
late when doing LTO. This simply passes it in as an additional device
file when doing the final device linking phase with LTO. This has the
advantage that we don't link it multiple times, and the device
references do not get inlined and prevent us from doing needed OpenMP
optimizations when we have visiblity of the whole module.

Depends on D116975 


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D117048

Files:
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Driver/ToolChains/Cuda.cpp
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp


Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -71,6 +71,11 @@
  cl::init("O0"),
  cl::cat(ClangLinkerWrapperCategory));
 
+static cl::opt
+BitcodeLibrary("target-library",
+   cl::desc("Path for the target bitcode library"),
+   cl::cat(ClangLinkerWrapperCategory));
+
 // Do not parse linker options.
 static cl::list
 HostLinkerArgs(cl::Sink, cl::desc("..."));
@@ -976,6 +981,14 @@
 }
   }
 
+  // Add the device bitcode library to the device files if it was passed in.
+  if (!BitcodeLibrary.empty()) {
+auto DeviceAndPath = StringRef(BitcodeLibrary).split('=');
+auto TripleAndArch = DeviceAndPath.first.rsplit('-');
+DeviceFiles.emplace_back(TripleAndArch.first, TripleAndArch.second,
+ DeviceAndPath.second);
+  }
+
   // Link the device images extracted from the linker input.
   SmallVector LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkerArgs, LinkedImages))
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -744,6 +744,10 @@
   return;
 }
 
+// Link the bitcode library late if we're using device LTO.
+if (getDriver().isUsingLTO(/* IsOffload */ true))
+  return;
+
 std::string BitcodeSuffix;
 if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
options::OPT_fno_openmp_target_new_runtime, true))
Index: clang/lib/Driver/ToolChains/Clang.cpp
===
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -8147,6 +8147,34 @@
   "-target-feature=" + TC->getTripleString() + "=" + *(FeatureIt + 
1)));
 }
 
+// Pass in the bitcode library to be linked during LTO.
+for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE;
+ ++TI) {
+  const ToolChain *TC = TI->second;
+  const Driver  = TC->getDriver();
+  const ArgList  = C.getArgsForToolChain(TC, "", 
Action::OFK_OpenMP);
+  StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ);
+
+  std::string BitcodeSuffix;
+  if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
+ options::OPT_fno_openmp_target_new_runtime, true))
+BitcodeSuffix += "new-";
+  if (TC->getTriple().isNVPTX())
+BitcodeSuffix += "nvptx-";
+  else if (TC->getTriple().isAMDGPU())
+BitcodeSuffix += "amdgpu-";
+  BitcodeSuffix += Arch;
+
+  ArgStringList BitcodeLibrary;
+  addOpenMPDeviceRTL(D, TCArgs, BitcodeLibrary, BitcodeSuffix,
+ TC->getTriple());
+
+  if (!BitcodeLibrary.empty())
+CmdArgs.push_back(
+Args.MakeArgString("-target-library=" + TC->getTripleString() +
+   "-" + Arch + "=" + BitcodeLibrary.back()));
+}
+
 // Pass in the optimization level to use for LTO.
 if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) {
   StringRef OOpt;
Index: clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
===
--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -264,6 +264,10 @@
   if (DriverArgs.hasArg(options::OPT_nogpulib))
 return;
 
+  // Link the bitcode library late if we're using device LTO.
+  if (getDriver().isUsingLTO(/* IsOffload */ true))
+return;
+
   std::string BitcodeSuffix;
   if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,