llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: Yaxun (Sam) Liu (yxsamliu)

<details>
<summary>Changes</summary>

Non-RDC HIP does not need LTO, but the new offload driver compiles all
AMDGPU device code through the LTO pipeline. That makes non-RDC builds pay
full LTO codegen cost for no benefit.

Fix this in clang-linker-wrapper instead of the driver, so device codegen
still runs in the wrapper's parallel device-link step (kept fast by
--offload-jobs) rather than being serialized back in the driver.

The driver passes a new --no-lto flag for the non-RDC fat-binary job
(unless the user asked for -foffload-lto). With it, the wrapper drops -flto
and, because the device images are bitcode stored in object-extension
files, also passes -x ir so clang actually compiles them (cc1 -emit-obj)
instead of handing the bitcode to lld for an LTO link.



---
Full diff: https://github.com/llvm/llvm-project/pull/201135.diff


5 Files Affected:

- (modified) clang/lib/Driver/ToolChains/Clang.cpp (+5) 
- (modified) clang/test/Driver/hip-toolchain-no-rdc.hip (+2) 
- (modified) clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c (+12) 
- (modified) clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp (+11-2) 
- (modified) clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td (+3) 


``````````diff
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp 
b/clang/lib/Driver/ToolChains/Clang.cpp
index 7657afb14f077..41148dc8306b9 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9830,6 +9830,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const 
JobAction &JA,
          JA.getType() == types::TY_Image);
   if (JA.getType() == types::TY_HIP_FATBIN) {
     CmdArgs.push_back("--emit-fatbin-only");
+    // Non-RDC HIP uses the conventional non-LTO pipeline unless the user opts
+    // into offload LTO. The device backend then runs in the linker wrapper's
+    // parallel device-link step rather than being deferred to the LTO link.
+    if (C.getDriver().getOffloadLTOMode() == LTOK_None)
+      CmdArgs.push_back("--no-lto");
     CmdArgs.append({"-o", Output.getFilename()});
     for (auto Input : Inputs)
       CmdArgs.push_back(Input.getFilename());
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip 
b/clang/test/Driver/hip-toolchain-no-rdc.hip
index f4cd703547ac0..37e5741584fb4 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -109,6 +109,7 @@
 // NEW: [[WRAPPER:".*clang-linker-wrapper]]"
 // NEW-SAME: "--host-triple=x86_64-unknown-linux-gnu"
 // NEW-SAME: "--emit-fatbin-only"
+// NEW-SAME: "--no-lto"
 // NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
@@ -181,6 +182,7 @@
 
 // NEW: [[WRAPPER:".*clang-linker-wrapper]]"
 // NEW-SAME: "--emit-fatbin-only"
+// NEW-SAME: "--no-lto"
 // NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
diff --git 
a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c 
b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c
index aae330bd3f6de..5c5b7b1eabfab 100644
--- a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c
+++ b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c
@@ -55,3 +55,15 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: test -s %t.gfx9-4-generic-xnack+.co
 // RUN: test -f %t.gfx1200.co
 // RUN: test -s %t.gfx1200.co
+
+// Without --no-lto the AMDGPU device compilation uses the LTO pipeline
+// (-flto).
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu 
--wrapper-verbose --dry-run --emit-fatbin-only --linker-path=/usr/bin/ld %t.out 
-o %t.lto.hipfb 2>&1 | FileCheck %s --check-prefix=LTO
+// LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -flto
+
+// With --no-lto the AMDGPU device compilation uses the conventional non-LTO
+// pipeline: -flto must not be passed, and '-x ir' must be passed so Clang
+// compiles the bitcode (stored in an object-extension file) instead of
+// handing it to the LTO link.
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu 
--wrapper-verbose --dry-run --no-lto --emit-fatbin-only 
--linker-path=/usr/bin/ld %t.out -o %t.nolto.hipfb 2>&1 | FileCheck %s 
--check-prefix=NO-LTO --implicit-check-not=-flto
+// NO-LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -x ir
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp 
b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 24900a43dbcc2..84bb5d04f5843 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -535,8 +535,11 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, 
const ArgList &Args,
     Triple.isAMDGPU() ? CmdArgs.push_back(Args.MakeArgString("-mcpu=" + Arch))
                       : CmdArgs.push_back(Args.MakeArgString("-march=" + 
Arch));
 
-  // AMDGPU is always in LTO mode currently.
-  if (Triple.isAMDGPU())
+  // AMDGPU defaults to the LTO pipeline. Non-RDC HIP uses the conventional
+  // non-LTO pipeline so device codegen still runs here, in parallel, instead
+  // of being deferred to the LTO link.
+  bool NonLTOAMDGPU = Triple.isAMDGPU() && Args.hasArg(OPT_no_lto);
+  if (Triple.isAMDGPU() && !NonLTOAMDGPU)
     CmdArgs.push_back("-flto");
 
   // Forward all of the `--offload-opt` and `-mllvm` options to the device.
@@ -548,6 +551,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, 
const ArgList &Args,
   if (!Triple.isNVPTX() && !Triple.isSPIRV())
     CmdArgs.push_back("-Wl,--no-undefined");
 
+  // The device inputs are bitcode stored in files with an object extension.
+  // Force the IR input language so Clang runs the compile and backend phases
+  // instead of treating them as linker inputs, which would defer codegen to
+  // the LTO link and defeat the non-LTO pipeline.
+  if (NonLTOAMDGPU)
+    CmdArgs.append({"-x", "ir"});
   for (StringRef InputFile : InputFiles)
     CmdArgs.push_back(InputFile);
 
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td 
b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
index 53b6c596de291..87a26ca90a66f 100644
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -39,6 +39,9 @@ def print_wrapped_module : Flag<["--"], 
"print-wrapped-module">,
   HelpText<"Print the wrapped module's IR for testing">;
 def save_temps : Flag<["--"], "save-temps">,
   Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
+def no_lto : Flag<["--"], "no-lto">,
+             Flags<[WrapperOnlyOption]>,
+             HelpText<"Use the non-LTO device compilation pipeline">;
 def compress : Flag<["--"], "compress">,
   Flags<[WrapperOnlyOption]>, HelpText<"Compress bundled files">;
 def compression_level_eq : Joined<["--"], "compression-level=">,

``````````

</details>


https://github.com/llvm/llvm-project/pull/201135
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to