llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Yaxun (Sam) Liu (yxsamliu) <details> <summary>Changes</summary> Non-RDC HIP does not need LTO, but the new offload driver compiles all AMDGPU device code through the LTO pipeline. That makes non-RDC builds pay full LTO codegen cost for no benefit. Fix this in clang-linker-wrapper instead of the driver, so device codegen still runs in the wrapper's parallel device-link step (kept fast by --offload-jobs) rather than being serialized back in the driver. The driver passes a new --no-lto flag for the non-RDC fat-binary job (unless the user asked for -foffload-lto). With it, the wrapper drops -flto and, because the device images are bitcode stored in object-extension files, also passes -x ir so clang actually compiles them (cc1 -emit-obj) instead of handing the bitcode to lld for an LTO link. --- Full diff: https://github.com/llvm/llvm-project/pull/201135.diff 5 Files Affected: - (modified) clang/lib/Driver/ToolChains/Clang.cpp (+5) - (modified) clang/test/Driver/hip-toolchain-no-rdc.hip (+2) - (modified) clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c (+12) - (modified) clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp (+11-2) - (modified) clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td (+3) ``````````diff diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7657afb14f077..41148dc8306b9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9830,6 +9830,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, JA.getType() == types::TY_Image); if (JA.getType() == types::TY_HIP_FATBIN) { CmdArgs.push_back("--emit-fatbin-only"); + // Non-RDC HIP uses the conventional non-LTO pipeline unless the user opts + // into offload LTO. The device backend then runs in the linker wrapper's + // parallel device-link step rather than being deferred to the LTO link. 
+ if (C.getDriver().getOffloadLTOMode() == LTOK_None) + CmdArgs.push_back("--no-lto"); CmdArgs.append({"-o", Output.getFilename()}); for (auto Input : Inputs) CmdArgs.push_back(Input.getFilename()); diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index f4cd703547ac0..37e5741584fb4 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -109,6 +109,7 @@ // NEW: [[WRAPPER:".*clang-linker-wrapper]]" // NEW-SAME: "--host-triple=x86_64-unknown-linux-gnu" // NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "--no-lto" // NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" @@ -181,6 +182,7 @@ // NEW: [[WRAPPER:".*clang-linker-wrapper]]" // NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "--no-lto" // NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" diff --git a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c index aae330bd3f6de..5c5b7b1eabfab 100644 --- a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c +++ b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c @@ -55,3 +55,15 @@ __attribute__((visibility("protected"), used)) int x; // RUN: test -s %t.gfx9-4-generic-xnack+.co // RUN: test -f %t.gfx1200.co // RUN: test -s %t.gfx1200.co + +// Without --no-lto the AMDGPU device compilation uses the LTO pipeline +// (-flto). 
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.lto.hipfb 2>&1 | FileCheck %s --check-prefix=LTO +// LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -flto + +// With --no-lto the AMDGPU device compilation uses the conventional non-LTO +// pipeline: -flto must not be passed, and '-x ir' must be passed so Clang +// compiles the bitcode (stored in an object-extension file) instead of +// handing it to the LTO link. +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --no-lto --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.nolto.hipfb 2>&1 | FileCheck %s --check-prefix=NO-LTO --implicit-check-not=-flto +// NO-LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -x ir diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 24900a43dbcc2..84bb5d04f5843 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -535,8 +535,11 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args, Triple.isAMDGPU() ? CmdArgs.push_back(Args.MakeArgString("-mcpu=" + Arch)) : CmdArgs.push_back(Args.MakeArgString("-march=" + Arch)); - // AMDGPU is always in LTO mode currently. - if (Triple.isAMDGPU()) + // AMDGPU defaults to the LTO pipeline. Non-RDC HIP uses the conventional + // non-LTO pipeline so device codegen still runs here, in parallel, instead + // of being deferred to the LTO link. + bool NonLTOAMDGPU = Triple.isAMDGPU() && Args.hasArg(OPT_no_lto); + if (Triple.isAMDGPU() && !NonLTOAMDGPU) CmdArgs.push_back("-flto"); // Forward all of the `--offload-opt` and `-mllvm` options to the device. 
@@ -548,6 +551,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args, if (!Triple.isNVPTX() && !Triple.isSPIRV()) CmdArgs.push_back("-Wl,--no-undefined"); + // The device inputs are bitcode stored in files with an object extension. + // Force the IR input language so Clang runs the compile and backend phases + // instead of treating them as linker inputs, which would defer codegen to + // the LTO link and defeat the non-LTO pipeline. + if (NonLTOAMDGPU) + CmdArgs.append({"-x", "ir"}); for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 53b6c596de291..87a26ca90a66f 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -39,6 +39,9 @@ def print_wrapped_module : Flag<["--"], "print-wrapped-module">, HelpText<"Print the wrapped module's IR for testing">; def save_temps : Flag<["--"], "save-temps">, Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">; +def no_lto : Flag<["--"], "no-lto">, + Flags<[WrapperOnlyOption]>, + HelpText<"Use the non-LTO device compilation pipeline">; def compress : Flag<["--"], "compress">, Flags<[WrapperOnlyOption]>, HelpText<"Compress bundled files">; def compression_level_eq : Joined<["--"], "compression-level=">, `````````` </details> https://github.com/llvm/llvm-project/pull/201135 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
