https://github.com/yxsamliu created https://github.com/llvm/llvm-project/pull/201135
Non-RDC HIP does not need LTO, but the new offload driver compiles all AMDGPU device code through the LTO pipeline. That makes non-RDC builds pay full LTO codegen cost for no benefit.

Fix this in clang-linker-wrapper instead of the driver, so device codegen still runs in the wrapper's parallel device-link step (kept fast by --offload-jobs) rather than being serialized back in the driver.

The driver passes a new --no-lto flag for the non-RDC fat-binary job (unless the user asked for -foffload-lto). With it, the wrapper drops -flto and, because the device images are bitcode stored in object-extension files, also passes -x ir so clang actually compiles them (cc1 -emit-obj) instead of handing the bitcode to lld for an LTO link.

From e58827413b516ded2a6818f7d1a7e8c0e0dd4175 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Tue, 2 Jun 2026 10:12:23 -0400
Subject: [PATCH] [HIP][AMDGPU] Use non-LTO pipeline for non-RDC in the linker wrapper

Non-RDC HIP does not need LTO, but the new offload driver compiles all AMDGPU device code through the LTO pipeline. That makes non-RDC builds pay full LTO codegen cost for no benefit.

Fix this in clang-linker-wrapper instead of the driver, so device codegen still runs in the wrapper's parallel device-link step (kept fast by --offload-jobs) rather than being serialized back in the driver.

The driver passes a new --no-lto flag for the non-RDC fat-binary job (unless the user asked for -foffload-lto). With it, the wrapper drops -flto and, because the device images are bitcode stored in object-extension files, also passes -x ir so clang actually compiles them (cc1 -emit-obj) instead of handing the bitcode to lld for an LTO link.
--- clang/lib/Driver/ToolChains/Clang.cpp | 5 +++++ clang/test/Driver/hip-toolchain-no-rdc.hip | 2 ++ .../linker-wrapper-hip-no-rdc.c | 12 ++++++++++++ .../clang-linker-wrapper/ClangLinkerWrapper.cpp | 13 +++++++++++-- .../tools/clang-linker-wrapper/LinkerWrapperOpts.td | 3 +++ 5 files changed, 33 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7657afb14f077..41148dc8306b9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9830,6 +9830,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, JA.getType() == types::TY_Image); if (JA.getType() == types::TY_HIP_FATBIN) { CmdArgs.push_back("--emit-fatbin-only"); + // Non-RDC HIP uses the conventional non-LTO pipeline unless the user opts + // into offload LTO. The device backend then runs in the linker wrapper's + // parallel device-link step rather than being deferred to the LTO link. + if (C.getDriver().getOffloadLTOMode() == LTOK_None) + CmdArgs.push_back("--no-lto"); CmdArgs.append({"-o", Output.getFilename()}); for (auto Input : Inputs) CmdArgs.push_back(Input.getFilename()); diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index f4cd703547ac0..37e5741584fb4 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -109,6 +109,7 @@ // NEW: [[WRAPPER:".*clang-linker-wrapper]]" // NEW-SAME: "--host-triple=x86_64-unknown-linux-gnu" // NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "--no-lto" // NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" @@ -181,6 +182,7 @@ // NEW: [[WRAPPER:".*clang-linker-wrapper]]" // NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "--no-lto" // NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" diff --git 
a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c index aae330bd3f6de..5c5b7b1eabfab 100644 --- a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c +++ b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c @@ -55,3 +55,15 @@ __attribute__((visibility("protected"), used)) int x; // RUN: test -s %t.gfx9-4-generic-xnack+.co // RUN: test -f %t.gfx1200.co // RUN: test -s %t.gfx1200.co + +// Without --no-lto the AMDGPU device compilation uses the LTO pipeline +// (-flto). +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.lto.hipfb 2>&1 | FileCheck %s --check-prefix=LTO +// LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -flto + +// With --no-lto the AMDGPU device compilation uses the conventional non-LTO +// pipeline: -flto must not be passed, and '-x ir' must be passed so Clang +// compiles the bitcode (stored in an object-extension file) instead of +// handing it to the LTO link. +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --no-lto --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.nolto.hipfb 2>&1 | FileCheck %s --check-prefix=NO-LTO --implicit-check-not=-flto +// NO-LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -x ir diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 24900a43dbcc2..84bb5d04f5843 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -535,8 +535,11 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args, Triple.isAMDGPU() ? CmdArgs.push_back(Args.MakeArgString("-mcpu=" + Arch)) : CmdArgs.push_back(Args.MakeArgString("-march=" + Arch)); - // AMDGPU is always in LTO mode currently. 
- if (Triple.isAMDGPU()) + // AMDGPU defaults to the LTO pipeline. Non-RDC HIP uses the conventional + // non-LTO pipeline so device codegen still runs here, in parallel, instead + // of being deferred to the LTO link. + bool NonLTOAMDGPU = Triple.isAMDGPU() && Args.hasArg(OPT_no_lto); + if (Triple.isAMDGPU() && !NonLTOAMDGPU) CmdArgs.push_back("-flto"); // Forward all of the `--offload-opt` and `-mllvm` options to the device. @@ -548,6 +551,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args, if (!Triple.isNVPTX() && !Triple.isSPIRV()) CmdArgs.push_back("-Wl,--no-undefined"); + // The device inputs are bitcode stored in files with an object extension. + // Force the IR input language so Clang runs the compile and backend phases + // instead of treating them as linker inputs, which would defer codegen to + // the LTO link and defeat the non-LTO pipeline. + if (NonLTOAMDGPU) + CmdArgs.append({"-x", "ir"}); for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 53b6c596de291..87a26ca90a66f 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -39,6 +39,9 @@ def print_wrapped_module : Flag<["--"], "print-wrapped-module">, HelpText<"Print the wrapped module's IR for testing">; def save_temps : Flag<["--"], "save-temps">, Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">; +def no_lto : Flag<["--"], "no-lto">, + Flags<[WrapperOnlyOption]>, + HelpText<"Use the non-LTO device compilation pipeline">; def compress : Flag<["--"], "compress">, Flags<[WrapperOnlyOption]>, HelpText<"Compress bundled files">; def compression_level_eq : Joined<["--"], "compression-level=">, _______________________________________________ cfe-commits mailing list [email protected] 
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
