https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/81700
>From 1e9c29c4794ed6d60cf54c902cd0f854afd1aace Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <yaxun....@amd.com> Date: Tue, 13 Feb 2024 10:00:21 -0500 Subject: [PATCH] [HIP] Allow partial linking for `-fgpu-rdc` `-fgpu-rdc` mode allows device functions call device functions in different TU. However, currently all device objects have to be linked together since only one fat binary is supported. This is time consuming for AMDGPU backend since it only supports LTO. There are use cases that objects can be divided into groups in which device functions are self-contained but host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, whereas partially link the host code with `ld -r` or generate a static library by using the `-emit-static-lib` option of clang. This avoids linking all device code together, therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emits an external symbol `__hip_fatbin` for all objects for `-fgpu-rdc`. With this patch, clang emits an unique external symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a group of objects are linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fab binary symbols only to avoid repeated linking. `__hip_gpubin_handle` is also uniquefied and merged to avoid repeated registering. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. Fixes: https://github.com/llvm/llvm-project/issues/77018 --- clang/lib/CodeGen/CGCUDANV.cpp | 22 +- clang/lib/CodeGen/CodeGenModule.cpp | 10 +- clang/lib/Driver/OffloadBundler.cpp | 40 ++- clang/lib/Driver/ToolChains/HIPUtility.cpp | 258 +++++++++++++++++- clang/test/CMakeLists.txt | 1 + clang/test/CodeGenCUDA/device-stub.cu | 10 +- .../test/CodeGenCUDA/host-used-device-var.cu | 5 +- clang/test/Driver/Inputs/hip.h | 25 ++ clang/test/Driver/clang-offload-bundler.c | 13 +- clang/test/Driver/hip-partial-link.hip | 97 +++++++ clang/test/Driver/hip-toolchain-rdc.hip | 38 ++- 11 files changed, 469 insertions(+), 50 deletions(-) create mode 100644 clang/test/Driver/Inputs/hip.h create mode 100644 clang/test/Driver/hip-partial-link.hip diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 5b43272bfa62f4..49f93451db7bbb 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // to contain the fat binary but will be populated somewhere else, // e.g. by lld through link script. FatBinStr = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, - /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, - "__hip_fatbin", nullptr, - llvm::GlobalVariable::NotThreadLocal); + CGM.getModule(), CGM.Int8Ty, + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, + "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr, + llvm::GlobalVariable::NotThreadLocal); cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); } @@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { - auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : - llvm::GlobalValue::LinkOnceAnyLinkage; + auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage + : llvm::GlobalValue::ExternalLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( TheModule, PtrTy, /*isConstant=*/false, Linkage, - /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), - "__hip_gpubin_handle"); - if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) - GpuBinaryHandle->setComdat( - CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); + /*Initializer=*/ + CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr, + CudaGpuBinary + ? "__hip_gpubin_handle" + : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash()); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 77fb3a62b356e1..95e457bef28ed3 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -915,7 +915,15 @@ void CodeGenModule::Release() { llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external"); addCompilerUsedGlobal(GV); } - + if (LangOpts.HIP) { + // Emit a unique ID so that host and device binaries from the same + // compilation unit can be associated. + auto *GV = new llvm::GlobalVariable( + getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage, + llvm::Constant::getNullValue(Int8Ty), + "__hip_cuid_" + getContext().getCUIDHash()); + addCompilerUsedGlobal(GV); + } emitLLVMUsed(); if (SanStats) SanStats->finish(); diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index b1091aca5616f8..99a34d25cfcd56 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -588,8 +588,15 @@ class ObjectFileHandler final : public FileHandler { StringRef Content = *ContentOrErr; // Copy fat object contents to the output when extracting host bundle. - if (Content.size() == 1u && Content.front() == 0) - Content = StringRef(Input.getBufferStart(), Input.getBufferSize()); + std::string ModifiedContent; + if (Content.size() == 1u && Content.front() == 0) { + auto HostBundleOrErr = getHostBundle(); + if (!HostBundleOrErr) + return HostBundleOrErr.takeError(); + + ModifiedContent = std::move(*HostBundleOrErr); + Content = ModifiedContent; + } OS.write(Content.data(), Content.size()); return Error::success(); @@ -692,6 +699,35 @@ class ObjectFileHandler final : public FileHandler { } return Error::success(); } + + Expected<std::string> getHostBundle() { + TempFileHandlerRAII TempFiles; + + auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt); + if (!ModifiedObjPathOrErr) + return ModifiedObjPathOrErr.takeError(); + StringRef ModifiedObjPath = *ModifiedObjPathOrErr; + + BumpPtrAllocator Alloc; + StringSaver SS{Alloc}; + SmallVector<StringRef, 16> ObjcopyArgs{"llvm-objcopy"}; + + ObjcopyArgs.push_back("--regex"); + ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*"); + ObjcopyArgs.push_back("--"); + ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front()); + ObjcopyArgs.push_back(ModifiedObjPath); + + if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs)) + return std::move(Err); + + auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath); + if (!BufOrErr) + return createStringError(BufOrErr.getError(), + "Failed to read back the modified object file"); + + return BufOrErr->get()->getBuffer().str(); + } }; /// Handler for text files. The bundled file will have the following format. diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index f692458b775de2..fcecf2e1313bb5 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,13 +9,24 @@ #include "HIPUtility.h" #include "CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/Options.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" +#include <deque> +#include <set> +using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace llvm::opt; +using llvm::dyn_cast; #if defined(_WIN32) || defined(_WIN64) #define NULL_FILE "nul" @@ -36,6 +47,169 @@ static std::string normalizeForBundler(const llvm::Triple &T, : T.normalize(); } +// Collect undefined __hip_fatbin* and __hip_gpubin_handle* symbols from all +// input object or archive files. +class HIPUndefinedFatBinSymbols { +public: + HIPUndefinedFatBinSymbols(const Compilation &C) + : C(C), DiagID(C.getDriver().getDiags().getCustomDiagID( + DiagnosticsEngine::Error, + "Error collecting HIP undefined fatbin symbols: %0")), + Quiet(C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)), + Verbose(C.getArgs().hasArg(options::OPT_v)) { + populateSymbols(); + if (Verbose) { + for (auto Name : FatBinSymbols) + llvm::errs() << "Found undefined HIP fatbin symbol: " << Name << "\n"; + for (auto Name : GPUBinHandleSymbols) + llvm::errs() << "Found undefined HIP gpubin handle symbol: " << Name + << "\n"; + } + } + + const std::set<std::string> &getFatBinSymbols() const { + return FatBinSymbols; + } + + const std::set<std::string> &getGPUBinHandleSymbols() const { + return GPUBinHandleSymbols; + } + +private: + const Compilation &C; + unsigned DiagID; + bool Quiet; + bool Verbose; + std::set<std::string> FatBinSymbols; + std::set<std::string> GPUBinHandleSymbols; + std::set<std::string> DefinedFatBinSymbols; + std::set<std::string> DefinedGPUBinHandleSymbols; + const std::string FatBinPrefix = "__hip_fatbin"; + const std::string GPUBinHandlePrefix = "__hip_gpubin_handle"; + + void populateSymbols() { + std::deque<const Action *> WorkList; + std::set<const Action *> Visited; + + for (const auto &Action : C.getActions()) + WorkList.push_back(Action); + + while (!WorkList.empty()) { + const Action *CurrentAction = WorkList.front(); + WorkList.pop_front(); + + if (!CurrentAction || !Visited.insert(CurrentAction).second) + continue; + + if (const auto *IA = dyn_cast<InputAction>(CurrentAction)) { + std::string ID = IA->getId().str(); + if (!ID.empty()) { + ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true); + FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str()); + GPUBinHandleSymbols.insert( + Twine(GPUBinHandlePrefix + "_" + ID).str()); + continue; + } + if (IA->getInputArg().getNumValues() == 0) + continue; + const char *Filename = IA->getInputArg().getValue(); + if (!Filename) + continue; + auto BufferOrErr = llvm::MemoryBuffer::getFile(Filename); + // Input action could be options to linker, therefore, ignore it + // if cannot read it. If it turns out to be a file that cannot be read, + // the error will be caught by the linker. + if (!BufferOrErr) + continue; + + processInput(BufferOrErr.get()->getMemBufferRef()); + } else + WorkList.insert(WorkList.end(), CurrentAction->getInputs().begin(), + CurrentAction->getInputs().end()); + } + } + + void processInput(const llvm::MemoryBufferRef &Buffer) { + // Try processing as object file first. + auto ObjFileOrErr = llvm::object::ObjectFile::createObjectFile(Buffer); + if (ObjFileOrErr) { + processSymbols(**ObjFileOrErr); + return; + } + + // Then try processing as archive files. + llvm::consumeError(ObjFileOrErr.takeError()); + auto ArchiveOrErr = llvm::object::Archive::create(Buffer); + if (ArchiveOrErr) { + llvm::Error Err = llvm::Error::success(); + llvm::object::Archive &Archive = *ArchiveOrErr.get(); + for (auto &Child : Archive.children(Err)) { + auto ChildBufOrErr = Child.getMemoryBufferRef(); + if (ChildBufOrErr) + processInput(*ChildBufOrErr); + else + errorHandler(ChildBufOrErr.takeError()); + } + + if (Err) + errorHandler(std::move(Err)); + return; + } + + // Ignore other files. + llvm::consumeError(ArchiveOrErr.takeError()); + } + + void processSymbols(const llvm::object::ObjectFile &Obj) { + for (const auto &Symbol : Obj.symbols()) { + auto FlagOrErr = Symbol.getFlags(); + if (!FlagOrErr) { + errorHandler(FlagOrErr.takeError()); + continue; + } + + auto NameOrErr = Symbol.getName(); + if (!NameOrErr) { + errorHandler(NameOrErr.takeError()); + continue; + } + llvm::StringRef Name = *NameOrErr; + + bool isUndefined = + FlagOrErr.get() & llvm::object::SymbolRef::SF_Undefined; + bool isFatBinSymbol = Name.starts_with(FatBinPrefix); + bool isGPUBinHandleSymbol = Name.starts_with(GPUBinHandlePrefix); + + // Handling for defined symbols + if (!isUndefined) { + if (isFatBinSymbol) { + DefinedFatBinSymbols.insert(Name.str()); + FatBinSymbols.erase(Name.str()); + } else if (isGPUBinHandleSymbol) { + DefinedGPUBinHandleSymbols.insert(Name.str()); + GPUBinHandleSymbols.erase(Name.str()); + } + continue; + } + + // Add undefined symbols if they are not in the defined sets + if (isFatBinSymbol && + DefinedFatBinSymbols.find(Name.str()) == DefinedFatBinSymbols.end()) + FatBinSymbols.insert(Name.str()); + else if (isGPUBinHandleSymbol && + DefinedGPUBinHandleSymbols.find(Name.str()) == + DefinedGPUBinHandleSymbols.end()) + GPUBinHandleSymbols.insert(Name.str()); + } + } + + void errorHandler(llvm::Error Err) { + if (Quiet) + return; + C.getDriver().Diag(DiagID) << llvm::toString(std::move(Err)); + } +}; + // Construct a clang-offload-bundler command to bundle code objects for // different devices into a HIP fat binary. void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, @@ -130,26 +304,84 @@ void HIP::constructGenerateObjFileFromHIPFatBinary( auto HostTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()->getTriple(); + HIPUndefinedFatBinSymbols Symbols(C); + + std::string PrimaryHipFatbinSymbol; + std::string PrimaryGpuBinHandleSymbol; + bool FoundPrimaryHipFatbinSymbol = false; + bool FoundPrimaryGpuBinHandleSymbol = false; + + std::vector<std::string> AliasHipFatbinSymbols; + std::vector<std::string> AliasGpuBinHandleSymbols; + + // Iterate through symbols to find the primary ones and collect others for + // aliasing + for (const auto &Symbol : Symbols.getFatBinSymbols()) { + if (!FoundPrimaryHipFatbinSymbol) { + PrimaryHipFatbinSymbol = Symbol; + FoundPrimaryHipFatbinSymbol = true; + } else + AliasHipFatbinSymbols.push_back(Symbol); + } + + for (const auto &Symbol : Symbols.getGPUBinHandleSymbols()) { + if (!FoundPrimaryGpuBinHandleSymbol) { + PrimaryGpuBinHandleSymbol = Symbol; + FoundPrimaryGpuBinHandleSymbol = true; + } else + AliasGpuBinHandleSymbols.push_back(Symbol); + } + // Add MC directives to embed target binaries. We ensure that each // section and image is 16-byte aligned. This is not mandatory, but // increases the likelihood of data to be aligned with a cache block // in several main host machines. ObjStream << "# HIP Object Generator\n"; ObjStream << "# *** Automatically generated by Clang ***\n"; - if (HostTriple.isWindowsMSVCEnvironment()) { - ObjStream << " .section .hip_fatbin, \"dw\"\n"; - } else { - ObjStream << " .protected __hip_fatbin\n"; - ObjStream << " .type __hip_fatbin,@object\n"; - ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; + if (FoundPrimaryGpuBinHandleSymbol) { + // Define the first gpubin handle symbol + if (HostTriple.isWindowsMSVCEnvironment()) + ObjStream << " .section .hip_gpubin_handle,\"dw\"\n"; + else { + ObjStream << " .protected " << PrimaryGpuBinHandleSymbol << "\n"; + ObjStream << " .type " << PrimaryGpuBinHandleSymbol << ",@object\n"; + ObjStream << " .section .hip_gpubin_handle,\"aw\"\n"; + } + ObjStream << " .globl " << PrimaryGpuBinHandleSymbol << "\n"; + ObjStream << " .p2align 3\n"; // Align 8 + ObjStream << PrimaryGpuBinHandleSymbol << ":\n"; + ObjStream << " .zero 8\n"; // Size 8 + + // Generate alias directives for other gpubin handle symbols + for (const auto &AliasSymbol : AliasGpuBinHandleSymbols) { + ObjStream << " .globl " << AliasSymbol << "\n"; + ObjStream << " .set " << AliasSymbol << "," << PrimaryGpuBinHandleSymbol + << "\n"; + } + } + if (FoundPrimaryHipFatbinSymbol) { + // Define the first fatbin symbol + if (HostTriple.isWindowsMSVCEnvironment()) + ObjStream << " .section .hip_fatbin,\"dw\"\n"; + else { + ObjStream << " .protected " << PrimaryHipFatbinSymbol << "\n"; + ObjStream << " .type " << PrimaryHipFatbinSymbol << ",@object\n"; + ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; + } + ObjStream << " .globl " << PrimaryHipFatbinSymbol << "\n"; + ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) + << "\n"; + // Generate alias directives for other fatbin symbols + for (const auto &AliasSymbol : AliasHipFatbinSymbols) { + ObjStream << " .globl " << AliasSymbol << "\n"; + ObjStream << " .set " << AliasSymbol << "," << PrimaryHipFatbinSymbol + << "\n"; + } + ObjStream << PrimaryHipFatbinSymbol << ":\n"; + ObjStream << " .incbin "; + llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); + ObjStream << "\n"; } - ObjStream << " .globl __hip_fatbin\n"; - ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) - << "\n"; - ObjStream << "__hip_fatbin:\n"; - ObjStream << " .incbin "; - llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); - ObjStream << "\n"; if (HostTriple.isOSLinux() && HostTriple.isOSBinFormatELF()) ObjStream << " .section .note.GNU-stack, \"\", @progbits\n"; ObjStream.flush(); diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 6b5cb0a18457bd..fcfca354f4a75f 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -136,6 +136,7 @@ if( NOT CLANG_BUILT_STANDALONE ) llvm-strip llvm-symbolizer llvm-windres + obj2yaml opt split-file yaml2obj diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index d7a7b1bb9fe956..60304647bd4c54 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -50,21 +50,19 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ +// RUN: %clang_cc1 -cuid=123 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN -// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ +// RUN: %clang_cc1 -cuid=123 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN,HIP,HIPNEF #include "Inputs/cuda.h" -// HIPNEF: $__hip_gpubin_handle = comdat any - #ifndef NOGLOBALS // NORDC-DAG: @device_var = internal global i32 // RDC-DAG: @device_var = global i32 @@ -161,7 +159,7 @@ __device__ void device_use() { // * constant unnamed string with GPU binary // CUDA: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.", // HIPEF: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.",{{.*}}align 4096 -// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" +// HIPNEF: @[[FATBIN:__hip_fatbin_[0-9a-f]+]] = external constant i8, section ".hip_fatbin" // CUDANORDC-SAME: section ".nv_fatbin", align 8 // CUDARDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary @@ -177,7 +175,7 @@ __device__ void device_use() { // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global ptr null -// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global ptr null +// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = external hidden global ptr, align 8 // * constant unnamed string with NVModuleID // CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 diff --git a/clang/test/CodeGenCUDA/host-used-device-var.cu b/clang/test/CodeGenCUDA/host-used-device-var.cu index 7cb31aff84264e..5328660c9dc9df 100644 --- a/clang/test/CodeGenCUDA/host-used-device-var.cu +++ b/clang/test/CodeGenCUDA/host-used-device-var.cu @@ -1,9 +1,9 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ // RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ -// RUN: | FileCheck -check-prefix=DEV %s +// RUN: -cuid=123 | FileCheck -check-prefix=DEV %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -x hip %s \ -// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s +// RUN: -std=c++17 -O3 -emit-llvm -o - -cuid=123 | FileCheck -check-prefix=HOST %s // Negative tests. @@ -187,6 +187,7 @@ public: // DEV-SAME: {{^[^@]*}} @_ZL2u3 // DEV-SAME: {{^[^@]*}} @_ZZ4fun1vE11static_var1 // DEV-SAME: {{^[^@]*}} @_ZZZN21TestStaticVarInLambda3funEvENKUlPcE_clES0_E4var2 +// DEV-SAME: {{^[^@]*}} @__hip_cuid_{{[0-9a-f]+}} // DEV-SAME: {{^[^@]*}} @constexpr_var2b // DEV-SAME: {{^[^@]*}} @inline_var // DEV-SAME: {{^[^@]*}} @u1 diff --git a/clang/test/Driver/Inputs/hip.h b/clang/test/Driver/Inputs/hip.h new file mode 100644 index 00000000000000..5be772a7b34132 --- /dev/null +++ b/clang/test/Driver/Inputs/hip.h @@ -0,0 +1,25 @@ +/* Minimal declarations for HIP support. Testing purposes only. */ + +#define __constant__ __attribute__((constant)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#define __managed__ __attribute__((managed)) + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, unsigned long long sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + unsigned long long sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + unsigned long long sharedMem, + hipStream_t stream); diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c index 7d0b6b27a60aea..9d8b81ee9806ee 100644 --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -10,6 +10,7 @@ // RUN: %clang -O0 -target %itanium_abi_triple %s -c -emit-llvm -o %t.bc // RUN: %clang -O0 -target %itanium_abi_triple %s -S -o %t.s // RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.o +// RUN: obj2yaml %t.o > %t.o.yaml // RUN: %clang -O0 -target %itanium_abi_triple %s -emit-ast -o %t.ast // @@ -305,11 +306,13 @@ // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle3.o // RUN: clang-offload-bundler -type=o -input=%t.bundle3.o -list | FileCheck -check-prefix=CKLST %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: diff %t.bundle3.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: diff %t.bundle3.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu -output=%t.res.tgt1 -input=%t.bundle3.o -unbundle @@ -318,11 +321,13 @@ // Check if we can unbundle a file with no magic strings. // RUN: clang-offload-bundler -type=o -input=%t.o -list | FileCheck -check-prefix=CKLST2 --allow-empty %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: diff %t.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: diff %t.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 diff --git a/clang/test/Driver/hip-partial-link.hip b/clang/test/Driver/hip-partial-link.hip new file mode 100644 index 00000000000000..a1d31f9a651951 --- /dev/null +++ b/clang/test/Driver/hip-partial-link.hip @@ -0,0 +1,97 @@ +// REQUIRES: x86-registered-target, amdgpu-registered-target, lld, system-linux + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.1.o + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DLIB \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.2.o + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DMAIN \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.main.o + +// RUN: llvm-nm %t.1.o | FileCheck -check-prefix=OBJ1 %s +// OBJ1: B __hip_cuid_[[ID:[0-9a-f]+]] +// OBJ1: U __hip_fatbin_[[ID]] +// OBJ1: U __hip_gpubin_handle_[[ID]] + +// RUN: llvm-nm %t.2.o | FileCheck -check-prefix=OBJ2 %s +// OBJ2: B __hip_cuid_[[ID:[0-9a-f]+]] +// OBJ2: U __hip_fatbin_[[ID]] +// OBJ2: U __hip_gpubin_handle_[[ID]] + +// Link %t.1.o and %t.2.o by -r and then link with %t.main.o + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -r -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.lib.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LD-R %s +// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// LD-R: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle +// LD-R: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu +// LD-R: "{{.*}}/clang-offload-bundler" +// LD-R: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu +// LD-R: "{{.*}}/ld.lld" {{.*}} -r + +// RUN: llvm-nm %t.lib.o | FileCheck -check-prefix=OBJ %s +// OBJ: B __hip_cuid_[[ID1:[0-9a-f]+]] +// OBJ: B __hip_cuid_[[ID2:[0-9a-f]+]] +// OBJ: R __hip_fatbin_[[ID1]] +// OBJ: R __hip_fatbin_[[ID2]] +// OBJ: D __hip_gpubin_handle_[[ID1]] +// OBJ: D __hip_gpubin_handle_[[ID2]] + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.lib.o -o %t.final.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LINK-O %s +// LINK-O-NOT: Found undefined HIP {{.*}}symbol + +// Generate a static lib with %t.1.o and %t.2.o then link with %t.main.o + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ +// RUN: --emit-static-lib -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.a \ +// RUN: 2>&1 | FileCheck -check-prefix=STATIC %s +// STATIC: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// STATIC: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// STATIC: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle +// STATIC: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu +// STATIC: "{{.*}}/clang-offload-bundler" +// STATIC: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu +// STATIC: "{{.*}}/llvm-ar" + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.a -o %t.final.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LINK-A %s +// LINK-A-NOT: Found undefined HIP {{.*}}symbol + +#include "hip.h" + +#ifdef LIB +__device__ int x; +__device__ void libfun() { + x = 1; +} +#elif !defined(MAIN) +__device__ void libfun(); +__global__ void kern() { + libfun(); +} +void run() { + kern<<<1,1>>>(); +} +#else +extern void run(); +int main() { + run(); +} +#endif diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 1827531f9cab7a..d19d8ccd6cb29e 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -1,7 +1,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: %clang -### --target=x86_64-linux-gnu -v \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -12,7 +12,7 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LNX %s -// RUN: %clang -### --target=x86_64-pc-windows-msvc \ +// RUN: %clang -### --target=x86_64-pc-windows-msvc -v \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -23,15 +23,31 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,MSVC %s -// check code object alignment in dumped llvm-mc input -// LNX: .protected __hip_fatbin -// LNX: .type __hip_fatbin,@object -// LNX: .section .hip_fatbin,"a",@progbits -// MSVC: .section .hip_fatbin, "dw" -// CHECK: .globl __hip_fatbin -// CHECK: .p2align 12 -// CHECK: __hip_fatbin: -// CHECK: .incbin "[[BUNDLE:.*hipfb]]" +// check HIP fatbin and gpubin handle symbols and code object alignment in dumped llvm-mc input +// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// LNX: .protected __hip_gpubin_handle_[[ID1]] +// LNX: .type __hip_gpubin_handle_[[ID1]] +// LNX-LABEL: .section .hip_gpubin_handle,"aw" +// MSVC-LABEL: .section .hip_gpubin_handle,"dw" +// CHECK: .globl __hip_gpubin_handle_[[ID1]] +// CHECK-NEXT: .p2align 3 +// CHECK-NEXT:__hip_gpubin_handle_[[ID1]]: +// CHECK-NEXT: .zero 8 +// CHECK-NEXT: .globl __hip_gpubin_handle_[[ID2]] +// CHECK-NEXT: .set __hip_gpubin_handle_[[ID2]],__hip_gpubin_handle_[[ID1]] +// LNX: .protected __hip_fatbin_[[ID1]] +// LNX: .type __hip_fatbin_[[ID1]],@object +// LNX-LABEL: .section .hip_fatbin,"a",@progbits +// MSVC-LABEL: .section .hip_fatbin,"dw" +// CHECK: .globl __hip_fatbin_[[ID1]] +// CHECK-NEXT: .p2align 12 +// CHECK-NEXT: .globl __hip_fatbin_[[ID2]] +// CHECK-NEXT: .set __hip_fatbin_[[ID2]],__hip_fatbin_[[ID1]] +// CHECK-NEXT: __hip_fatbin_[[ID1]]: +// CHECK-NEXT: .incbin "[[BUNDLE:.*hipfb]]" // LNX: .section .note.GNU-stack, "", @progbits // MSVC-NOT: .note.GNU-stack _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits