saiislam updated this revision to Diff 552085.
saiislam marked an inline comment as done.
saiislam added a comment.

  Addressed reviewer's comments.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D139730/new/

https://reviews.llvm.org/D139730

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/CodeGenModule.h
  clang/lib/CodeGen/TargetInfo.h
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChain.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
  clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
  clang/test/CodeGenOpenCL/opencl_types.cl
  clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
Index: openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/YAMLTraits.h"

+using namespace llvm::ELF;
 namespace llvm {
 namespace omp {
@@ -34,17 +35,25 @@

 // The implicit arguments of AMDGPU kernels.
 struct AMDGPUImplicitArgsTy {
-  uint64_t OffsetX;
-  uint64_t OffsetY;
-  uint64_t OffsetZ;
-  uint64_t HostcallPtr;
-  uint64_t Unused0;
-  uint64_t Unused1;
-  uint64_t Unused2;
+  uint32_t BlockCountX;
+  uint32_t BlockCountY;
+  uint32_t BlockCountZ;
+  uint16_t GroupSizeX;
+  uint16_t GroupSizeY;
+  uint16_t GroupSizeZ;
+  uint8_t Unused0[46]; // 46 bytes of padding.
+  uint16_t GridDims;
+  uint8_t Unused1[190]; // 190 bytes of padding.
 };

-static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
-              "Unexpected size of implicit arguments");
+enum IMPLICITARGS : uint32_t {
+  COV4_SIZE = 56,
+  COV5_SIZE = 256,
+};
+
+uint16_t getImplicitArgsSize(uint16_t Version) {
+  return Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? COV4_SIZE : COV5_SIZE;
+}

 /// Parse a TargetID to get processor arch and feature map.
 /// Returns processor subarch.
@@ -295,7 +304,8 @@
 /// Reads the AMDGPU specific metadata from the ELF file and propagates the
 /// KernelInfoMap
 Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
-                                  StringMap<KernelMetaDataTy> &KernelInfoMap) {
+                                  StringMap<KernelMetaDataTy> &KernelInfoMap,
+                                  uint16_t &ELFABIVersion) {
   Error Err = Error::success(); // Used later as out-parameter

   auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@@ -305,6 +315,12 @@
   const object::ELF64LEFile ELFObj = ELFOrError.get();
   ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
   KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from the ELF image header.
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
+
   for (const auto &S : Sections) {
     if (S.sh_type != ELF::SHT_NOTE)
       continue;
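To make the new layout easier to audit, here is a minimal compile-time sketch (illustrative, not part of the patch; it assumes the struct and constants above are in scope) of the offsets the field order and padding arrays imply:

  #include <cstddef> // offsetof

  // BlockCountX/Y/Z occupy bytes 0-11 and GroupSizeX/Y/Z bytes 12-17;
  // the 46-byte pad places GridDims at byte 64, and the 190-byte tail
  // pad brings the total to the 256-byte COV5 implicit-arg block.
  static_assert(offsetof(AMDGPUImplicitArgsTy, GroupSizeX) == 12, "COV5 layout");
  static_assert(offsetof(AMDGPUImplicitArgsTy, GridDims) == 64, "COV5 layout");
  static_assert(sizeof(AMDGPUImplicitArgsTy) == COV5_SIZE, "COV5 layout");

getImplicitArgsSize then maps ELFABIVERSION_AMDGPU_HSA_V4 (and anything older) to COV4_SIZE (56) and V5 (and newer) to COV5_SIZE (256).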
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -381,6 +381,9 @@
   /// Get the executable.
   hsa_executable_t getExecutable() const { return Executable; }

+  /// Get the code object version of the ELF.
+  uint16_t getELFABIVersion() const { return ELFABIVersion; }
+
   /// Find an HSA device symbol by its name on the executable.
   Expected<hsa_executable_symbol_t> findDeviceSymbol(GenericDeviceTy &Device,
                                                      StringRef SymbolName) const;
@@ -401,6 +404,7 @@
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
   StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  uint16_t ELFABIVersion;
 };

 /// Class implementing the AMDGPU kernel functionalities which derives from the
@@ -408,8 +412,7 @@
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
   AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-      : GenericKernelTy(Name, ExecutionMode),
-        ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
+      : GenericKernelTy(Name, ExecutionMode) {}

   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -450,6 +453,9 @@
     // TODO: Read the kernel descriptor for the max threads per block. May be
     // read from the image.

+    ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
+    DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
+
     // Get additional kernel info read from image
     KernelInfo = AMDImage.getKernelInfo(getName());
     if (!KernelInfo.has_value())
@@ -476,6 +482,10 @@
   /// Get the HSA kernel object representing the kernel function.
   uint64_t getKernelObject() const { return KernelObject; }

+  /// Get the size of the implicit arguments based on the code object version.
+  /// @return 56 for COV4 and 256 for COV5.
+  uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
+
 private:
   /// The kernel object to execute.
   uint64_t KernelObject;
@@ -486,7 +496,7 @@
   uint32_t PrivateSize;

   /// The size of implicit kernel arguments.
-  const uint32_t ImplicitArgsSize;
+  uint32_t ImplicitArgsSize;

   /// Additional Info for the AMD GPU Kernel
   std::optional<utils::KernelMetaDataTy> KernelInfo;
@@ -2627,8 +2637,8 @@
   if (Result)
     return Plugin::error("Loaded HSA executable does not validate");

-  if (auto Err =
-          utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
+  if (auto Err = utils::readAMDGPUMetaDataFromImage(
+          getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
     return Err;

   return Plugin::success();
@@ -2993,6 +3003,14 @@
   if (GenericDevice.getRPCServer())
     Stream->setRPCServer(GenericDevice.getRPCServer());

+  if (getImplicitArgsSize() >= utils::COV5_SIZE) {
+    ImplArgs->BlockCountX = NumBlocks;
+    ImplArgs->GroupSizeX = NumThreads;
+    ImplArgs->GroupSizeY = 1;
+    ImplArgs->GroupSizeZ = 1;
+    ImplArgs->GridDims = 1;
+  }
+
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                   GroupSize, ArgsMemoryManager);
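A usage note on the launch-path hunk above: the plugin issues 1-D launches, so only the X components and GridDims are populated. A sketch of the resulting implicit-argument contents for an assumed launch of 128 blocks of 256 threads (values are illustrative; field names come from the patch, and BlockCountY/Z are simply not written by this hunk):

  // COV5 image, NumBlocks = 128, NumThreads = 256:
  ImplArgs->BlockCountX = 128;  // blocks in X
  ImplArgs->GroupSizeX  = 256;  // work-items per block in X
  ImplArgs->GroupSizeY  = 1;
  ImplArgs->GroupSizeZ  = 1;
  ImplArgs->GridDims    = 1;    // 1-D grid
  // Total work-items: BlockCountX * GroupSizeX = 32768.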
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -282,7 +282,7 @@
 add_custom_target(omptarget.devicertl.amdgpu)
 foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
   if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa)
+    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
   elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
     compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
   else()
Index: clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
===================================================================
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -403,11 +403,19 @@
     llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
   }

+  // Pass on -mllvm options to the nested clang invocation.
+  for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(Arg->getValue());
+  }
+
   if (Args.hasArg(OPT_debug))
     CmdArgs.push_back("-g");

-  if (SaveTemps)
+  if (SaveTemps) {
     CmdArgs.push_back("-save-temps");
+    // CmdArgs.push_back(Args.MakeArgString("--amdhsa-code-object-version=5"));
+  }

   if (Verbose)
     CmdArgs.push_back("-v");
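Two notes on the hunks above: the DeviceRTL is now compiled ABI-agnostic (-mcode-object-version=none via -Xclang), so its bitcode carries the runtime abi.version check instead of a baked-in choice, and clang-linker-wrapper now replays any -mllvm options on the clang job it spawns. An illustrative wrapper invocation (assumed, not taken from the patch):

  clang-linker-wrapper ... -mllvm --amdhsa-code-object-version=5 -- <linker args>
  # The nested device-link job then conceptually runs:
  #   clang ... -mllvm --amdhsa-code-object-version=5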
Index: clang/test/CodeGenOpenCL/opencl_types.cl
===================================================================
--- clang/test/CodeGenOpenCL/opencl_types.cl
+++ clang/test/CodeGenOpenCL/opencl_types.cl
@@ -7,7 +7,6 @@
 #define CLK_FILTER_LINEAR       0x20

 constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE|CLK_NORMALIZED_COORDS_TRUE|CLK_FILTER_NEAREST;
-// CHECK-COM-NOT: constant i32

 void fnc1(image1d_t img) {}
 // CHECK-SPIR: @fnc1(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)
Index: clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -7,6 +7,10 @@
 // RUN:     -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
 // RUN:     | FileCheck -check-prefix=COV5 %s

+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN:     -fcuda-is-device -mcode-object-version=none -emit-llvm -o - -x hip %s \
+// RUN:     | FileCheck -check-prefix=COVNONE %s
+
 #include "Inputs/cuda.h"

 // PRECOV5-LABEL: test_get_workgroup_size
@@ -26,6 +30,36 @@
 // COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
 // COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
 // COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+
+// COVNONE-LABEL: test_get_workgroup_size
+// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
+// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
+// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COVNONE: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
+// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
+// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
+// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
+// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COVNONE: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
+// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
+// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
+// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
+// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// COVNONE: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
+// COVNONE: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
+// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
 __device__ void test_get_workgroup_size(int d, int *out)
 {
   switch (d) {
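Read together, the COVNONE checks pin down the ABI-agnostic lowering. A condensed sketch of the IR for the X dimension, reconstructed from the CHECK lines above (value names are illustrative):

  %abi     = load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr)
  %is.cov5 = icmp sge i32 %abi, 500
  %implarg = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %gep5    = getelementptr i8, ptr addrspace(4) %implarg, i32 12
  %disp    = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  %gep4    = getelementptr i8, ptr addrspace(4) %disp, i32 4
  %addr    = select i1 %is.cov5, ptr addrspace(4) %gep5, ptr addrspace(4) %gep4
  %size.x  = load i16, ptr addrspace(4) %addr, align 2

The Y and Z dimensions differ only in the byte offsets (14/6 and 16/8, respectively).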
Index: clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -mcode-object-version=4 -DUSER -x hip -o %t_4 %s
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -mcode-object-version=5 -DUSER -x hip -o %t_5 %s
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -mcode-object-version=none -DDEVICELIB -x hip -o %t_0 %s
+
+// RUN: llvm-link %t_0 %t_4 -o - | llvm-dis -o - | FileCheck -check-prefix=LINKED4 %s
+// RUN: llvm-link %t_0 %t_5 -o - | llvm-dis -o - | FileCheck -check-prefix=LINKED5 %s
+
+#include "Inputs/cuda.h"
+
+// LINKED4: llvm.amdgcn.abi.version = weak_odr hidden addrspace(4) constant i32 400, align 4
+// LINKED4-LABEL: bar
+// LINKED4: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
+// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
+// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED4: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
+// LINKED4: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
+// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED4: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
+// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
+// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED4: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
+// LINKED4: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
+// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED4: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED4: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
+// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
+// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED4: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
+// LINKED4: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
+// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+// LINKED4: "amdgpu_code_object_version", i32 400
+
+// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden addrspace(4) constant i32 500, align 4
+// LINKED5-LABEL: bar
+// LINKED5: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
+// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
+// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED5: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
+// LINKED5: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
+// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED5: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
+// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
+// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED5: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
+// LINKED5: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
+// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+
+// LINKED5: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
+// LINKED5: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
+// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
+// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// LINKED5: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
+// LINKED5: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
+// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
+// LINKED5: "amdgpu_code_object_version", i32 500
+
+#ifdef DEVICELIB
+__device__ void bar(int *x, int *y, int *z)
+{
+  *x = __builtin_amdgcn_workgroup_size_x();
+  *y = __builtin_amdgcn_workgroup_size_y();
+  *z = __builtin_amdgcn_workgroup_size_z();
+}
+#endif
+
+#ifdef USER
+__device__ void bar(int *x, int *y, int *z);
+__device__ void foo()
+{
+  int *x, *y, *z;
+  bar(x, y, z);
+}
+#endif
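The key property this new test locks in: the DEVICELIB TU (built with -mcode-object-version=none) only references @llvm.amdgcn.abi.version, while the USER TU defines it, so after llvm-link the module carries a link-time constant. For the LINKED4 case the linked IR contains (first line copied from the CHECK lines above; the module-flag line is an abbreviated, assumed printing):

  @llvm.amdgcn.abi.version = weak_odr hidden addrspace(4) constant i32 400, align 4
  ; plus a module flag: ... !"amdgpu_code_object_version", i32 400

Every icmp/select in bar() then compares against a constant that later optimization passes can fold away.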
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -8647,6 +8647,14 @@
     CmdArgs.push_back("--device-debug");
   }

+  // -mcode-object-version=X needs to be passed to clang-linker-wrapper to
+  // ensure that it is used by lld.
+  if (const Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) {
+    CmdArgs.push_back(Args.MakeArgString("-mllvm"));
+    CmdArgs.push_back(Args.MakeArgString(
+        Twine("--amdhsa-code-object-version=") + A->getValue()));
+  }
+
   for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString("--ptxas-arg=" + A));

Index: clang/lib/Driver/ToolChain.cpp
===================================================================
--- clang/lib/Driver/ToolChain.cpp
+++ clang/lib/Driver/ToolChain.cpp
@@ -1365,7 +1365,10 @@
       // matches the current toolchain triple. If it is not present
      // at all, target and host share a toolchain.
      if (A->getOption().matches(options::OPT_m_Group)) {
-        if (SameTripleAsHost)
+        // Pass the code object version to the device toolchain
+        // to correctly set metadata in intermediate files.
+        if (SameTripleAsHost ||
+            A->getOption().matches(options::OPT_mcode_object_version_EQ))
          DAL->append(A);
        else
          Modified = true;
Index: clang/lib/CodeGen/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -8,6 +8,7 @@

 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"

 using namespace clang;
 using namespace clang::CodeGen;
@@ -274,6 +275,8 @@
   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                  CodeGenModule &CGM) const;

+  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
+
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
   unsigned getOpenCLKernelCallingConv() const override;
@@ -354,6 +357,35 @@
   }
 }

+/// Emits control constants used to change per-architecture behaviour in the
+/// AMDGPU ROCm device libraries.
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+    CodeGen::CodeGenModule &CGM) const {
+  auto AddGlobal = [&](StringRef Name,
+                       clang::TargetOptions::CodeObjectVersionKind Value,
+                       unsigned Size,
+                       llvm::GlobalValue::LinkageTypes Linkage =
+                           llvm::GlobalValue::WeakODRLinkage) {
+    if (CGM.getModule().getNamedGlobal(Name))
+      return;
+
+    auto *Type =
+        llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+    auto *GV = new llvm::GlobalVariable(
+        CGM.getModule(), Type, true, Linkage,
+        llvm::ConstantInt::get(Type, Value), Name, nullptr,
+        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
+    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+    GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+    GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type));
+  };
+
+  AddGlobal("llvm.amdgcn.abi.version",
+            CGM.getTarget().getTargetOpts().CodeObjectVersion, /*Size=*/32,
+            llvm::GlobalValue::WeakODRLinkage);
+}
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (requiresAMDGPUProtectedVisibility(D, GV)) {
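End-to-end, the two driver hunks above plus the earlier linker-wrapper hunk thread one user-facing flag through every stage. An assumed invocation (illustrative, not from the patch):

  clang -fopenmp --offload-arch=gfx90a -mcode-object-version=5 foo.c
  # ToolChain.cpp hunk: -mcode-object-version=5 is replayed on the device-side
  #   compile, so emitTargetGlobals above defines
  #   @llvm.amdgcn.abi.version = 500 in the device IR.
  # Clang.cpp hunk: the clang-linker-wrapper job receives
  #   -mllvm --amdhsa-code-object-version=5, which the wrapper forwards to the
  #   nested clang/lld device link.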
Index: clang/lib/CodeGen/TargetInfo.h
===================================================================
--- clang/lib/CodeGen/TargetInfo.h
+++ clang/lib/CodeGen/TargetInfo.h
@@ -81,6 +81,9 @@
       CodeGen::CodeGenModule &CGM,
       const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}

+  /// Provides a convenient hook to handle extra target-specific globals.
+  virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
+
   /// Any further codegen related checks that need to be done on a function
   /// call in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
Index: clang/lib/CodeGen/CodeGenModule.h
===================================================================
--- clang/lib/CodeGen/CodeGenModule.h
+++ clang/lib/CodeGen/CodeGenModule.h
@@ -1573,6 +1573,11 @@
   void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
                                   const AMDGPUWavesPerEUAttr *A);

+  llvm::Constant *
+  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
+                        const VarDecl *D,
+                        ForDefinition_t IsForDefinition = NotForDefinition);
+
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
       StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@@ -1595,11 +1600,6 @@
   void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD,
                                StringRef &CurName);

-  llvm::Constant *
-  GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
-                        const VarDecl *D,
-                        ForDefinition_t IsForDefinition = NotForDefinition);
-
   bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
                                    llvm::AttrBuilder &AttrBuilder,
                                    bool SetTargetFeatures = true);
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -1203,6 +1203,8 @@
     getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
                               getContext().getTargetInfo().getMaxTLSAlign());

+  getTargetCodeGenInfo().emitTargetGlobals(*this);
+
  getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);

  EmitBackendOptionsMetadata(getCodeGenOpts());
"clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TargetOptions.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/ADT/APFloat.h" @@ -17041,24 +17042,66 @@ } // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively. +/// Emit code based on Code Object ABI version. +/// COV_4 : Emit code to use dispatch ptr +/// COV_5 : Emit code to use implicitarg ptr +/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version" +/// and use its value for COV_4 or COV_5 approach. It is used for +/// compiling device libraries in an ABI-agnostic way. +/// +/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and intialized by +/// clang during compilation of user code. Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { - bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion == - clang::TargetOptions::COV_5; - Constant *Offset; - Value *DP; - if (IsCOV_5) { + llvm::LoadInst *LD; + + auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion; + + if (Cov == clang::TargetOptions::COV_None) { + auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal( + "llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr, + CodeGen::NotForDefinition); + + Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC, + CGF.CGM.getIntAlign()); + + Value *Iscov5 = CGF.Builder.CreateICmpSGE( + ABIVersion, + llvm::ConstantInt::get(CGF.Int32Ty, clang::TargetOptions::COV_5)); + // Indexing the implicit kernarg segment. - Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2); - DP = EmitAMDGPUImplicitArgPtr(CGF); - } else { + Constant *ImplicitOffset = + llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2); + Value *ImplicitArgPtr = EmitAMDGPUImplicitArgPtr(CGF); + auto *ImplicitGEP = + CGF.Builder.CreateGEP(CGF.Int8Ty, ImplicitArgPtr, ImplicitOffset); + // Indexing the HSA kernel_dispatch_packet struct. - Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2); - DP = EmitAMDGPUDispatchPtr(CGF); + Constant *DispatchOffset = + llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2); + Value *DispatchPtr = EmitAMDGPUDispatchPtr(CGF); + auto *DispatchGEP = + CGF.Builder.CreateGEP(CGF.Int8Ty, DispatchPtr, DispatchOffset); + + auto Result = CGF.Builder.CreateSelect(Iscov5, ImplicitGEP, DispatchGEP); + LD = CGF.Builder.CreateLoad( + Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2))); + } else { + Value *ArgPtr = nullptr; + Constant *Offset = nullptr; + if (Cov == clang::TargetOptions::COV_5) { + // Indexing the implicit kernarg segment. + Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2); + ArgPtr = EmitAMDGPUImplicitArgPtr(CGF); + } else { + // Indexing the HSA kernel_dispatch_packet struct. + Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2); + ArgPtr = EmitAMDGPUDispatchPtr(CGF); + } + auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, ArgPtr, Offset); + LD = CGF.Builder.CreateLoad( + Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2))); } - auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset); - auto *LD = CGF.Builder.CreateLoad( - Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2))); llvm::MDBuilder MDHelper(CGF.getLLVMContext()); llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1), APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));